Blame bin/enc2xs

Packit d0f5c2
#!./perl
Packit d0f5c2
BEGIN {
Packit d0f5c2
    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
Packit d0f5c2
    # with $ENV{PERL_CORE} set
Packit d0f5c2
    # In case we need it in future...
Packit d0f5c2
    require Config; import Config;
Packit d0f5c2
    pop @INC if $INC[-1] eq '.';
Packit d0f5c2
}
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use Getopt::Std;
Packit d0f5c2
use Config;
Packit d0f5c2
my @orig_ARGV = @ARGV;
Packit d0f5c2
our $VERSION  = do { my @r = (q$Revision: 2.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
Packit d0f5c2
Packit d0f5c2
# These may get re-ordered.
Packit d0f5c2
# RAW is a do_now as inserted by &enter
Packit d0f5c2
# AGG is an aggregated do_now, as built up by &process
Packit d0f5c2
Packit d0f5c2
use constant {
Packit d0f5c2
  RAW_NEXT => 0,
Packit d0f5c2
  RAW_IN_LEN => 1,
Packit d0f5c2
  RAW_OUT_BYTES => 2,
Packit d0f5c2
  RAW_FALLBACK => 3,
Packit d0f5c2
Packit d0f5c2
  AGG_MIN_IN => 0,
Packit d0f5c2
  AGG_MAX_IN => 1,
Packit d0f5c2
  AGG_OUT_BYTES => 2,
Packit d0f5c2
  AGG_NEXT => 3,
Packit d0f5c2
  AGG_IN_LEN => 4,
Packit d0f5c2
  AGG_OUT_LEN => 5,
Packit d0f5c2
  AGG_FALLBACK => 6,
Packit d0f5c2
};
Packit d0f5c2
Packit d0f5c2
# (See the algorithm in encengine.c - we're building structures for it)
Packit d0f5c2
Packit d0f5c2
# There are two sorts of structures.
Packit d0f5c2
# "do_now" (an array, two variants of what needs storing) is whatever we need
Packit d0f5c2
# to do now we've read an input byte.
Packit d0f5c2
# It's housed in a "do_next" (which is how we got to it), and in turn points
Packit d0f5c2
# to a "do_next" which contains all the "do_now"s for the next input byte.
Packit d0f5c2
Packit d0f5c2
# There will be a "do_next" which is the start state.
Packit d0f5c2
# For a single byte encoding it's the only "do_next" - each "do_now" points
Packit d0f5c2
# back to it, and each "do_now" will cause bytes. There is no state.
Packit d0f5c2
Packit d0f5c2
# For a multi-byte encoding where all characters in the input are the same
Packit d0f5c2
# length, then there will be a tree of "do_now"->"do_next"->"do_now"
Packit d0f5c2
# branching out from the start state, one step for each input byte.
Packit d0f5c2
# The leaf "do_now"s will all be at the same distance from the start state,
Packit d0f5c2
# only the leaf "do_now"s cause output bytes, and they in turn point back to
Packit d0f5c2
# the start state.
Packit d0f5c2
Packit d0f5c2
# For an encoding where there are variable length input byte sequences, you
Packit d0f5c2
# will encounter a leaf "do_now" sooner for the shorter input sequences, but
Packit d0f5c2
# as before the leaves will point back to the start state.
Packit d0f5c2
Packit d0f5c2
# The system will cope with escape encodings (imagine them as a mostly
Packit d0f5c2
# self-contained tree for each escape state, and cross links between trees
Packit d0f5c2
# at the state-switching characters) but so far no input format defines these.
Packit d0f5c2
Packit d0f5c2
# The system will also cope with having output "leaves" in the middle of
Packit d0f5c2
# the bifurcating branches, not just at the extremities, but again no
Packit d0f5c2
# input format does this yet.
Packit d0f5c2
Packit d0f5c2
# There are two variants of the "do_now" structure. The first, smaller variant
Packit d0f5c2
# is generated by &enter as the input file is read. There is one structure
Packit d0f5c2
# for each input byte. Say we are mapping a single byte encoding to a
Packit d0f5c2
# single byte encoding, with  "ABCD" going "abcd". There will be
Packit d0f5c2
# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
Packit d0f5c2
Packit d0f5c2
# &process then walks the tree, building aggregate "do_now" structures for
Packit d0f5c2
# adjacent bytes where possible. The aggregate is for a contiguous range of
Packit d0f5c2
# bytes which each produce the same length of output, each move to the
Packit d0f5c2
# same next state, and each have the same fallback flag.
Packit d0f5c2
# So our 4 RAW "do_now"s above become replaced by a single structure
Packit d0f5c2
# containing:
Packit d0f5c2
# ["A", "D", "abcd", 1, ...]
Packit d0f5c2
# ie, for an input byte $_ in "A".."D", output 1 byte, found as
Packit d0f5c2
# substr ("abcd", (ord $_ - ord "A") * 1, 1)
Packit d0f5c2
# which maps very nicely into pointer arithmetic in C for encengine.c
Packit d0f5c2
Packit d0f5c2
sub encode_U
Packit d0f5c2
{
Packit d0f5c2
 # UTF-8 encode long hand - only covers part of perl's range
Packit d0f5c2
 ## my $uv = shift;
Packit d0f5c2
 # chr() works in native space so convert value from table
Packit d0f5c2
 # into that space before using chr().
Packit d0f5c2
 my $ch = chr(utf8::unicode_to_native($_[0]));
Packit d0f5c2
 # Now get core perl to encode that the way it likes.
Packit d0f5c2
 utf8::encode($ch);
Packit d0f5c2
 return $ch;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub encode_S
Packit d0f5c2
{
Packit d0f5c2
 # encode single byte
Packit d0f5c2
 ## my ($ch,$page) = @_; return chr($ch);
Packit d0f5c2
 return chr $_[0];
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub encode_D
Packit d0f5c2
{
Packit d0f5c2
 # encode double byte MS byte first
Packit d0f5c2
 ## my ($ch,$page) = @_; return chr($page).chr($ch);
Packit d0f5c2
 return chr ($_[1]) . chr $_[0];
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub encode_M
Packit d0f5c2
{
Packit d0f5c2
 # encode Multi-byte - single for 0..255 otherwise double
Packit d0f5c2
 ## my ($ch,$page) = @_;
Packit d0f5c2
 ## return &encode_D if $page;
Packit d0f5c2
 ## return &encode_S;
Packit d0f5c2
 return chr ($_[1]) . chr $_[0] if $_[1];
Packit d0f5c2
 return chr $_[0];
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
my %encode_types = (U => \&encode_U,
Packit d0f5c2
                    S => \&encode_S,
Packit d0f5c2
                    D => \&encode_D,
Packit d0f5c2
                    M => \&encode_M,
Packit d0f5c2
                   );
Packit d0f5c2
Packit d0f5c2
# Win32 does not expand globs on command line
Packit d0f5c2
if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) {
Packit d0f5c2
    eval "\@ARGV = map(glob(\$_),\@ARGV)";
Packit d0f5c2
    @ARGV = @orig_ARGV unless @ARGV;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
my %opt;
Packit d0f5c2
# I think these are:
Packit d0f5c2
# -Q to disable the duplicate codepoint test
Packit d0f5c2
# -S make mapping errors fatal
Packit d0f5c2
# -q to remove comments written to output files
Packit d0f5c2
# -O to enable the (brute force) substring optimiser
Packit d0f5c2
# -o <output> to specify the output file name (else it's the first arg)
Packit d0f5c2
# -f <inlist> to give a file with a list of input files (else use the args)
Packit d0f5c2
# -n <name> to name the encoding (else use the basename of the input file.
Packit d0f5c2
#Getopt::Long::Configure("bundling");
Packit d0f5c2
#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v));
Packit d0f5c2
getopts('CM:SQqOo:f:n:v',\%opt);
Packit d0f5c2
Packit d0f5c2
$opt{M} and make_makefile_pl($opt{M}, @ARGV);
Packit d0f5c2
$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
Packit d0f5c2
$opt{v} ||= $ENV{ENC2XS_VERBOSE};
Packit d0f5c2
Packit d0f5c2
sub verbose {
Packit d0f5c2
    print STDERR @_ if $opt{v};
Packit d0f5c2
}
Packit d0f5c2
sub verbosef {
Packit d0f5c2
    printf STDERR @_ if $opt{v};
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
# ($cpp, $static, $sized) = compiler_info($declaration)
Packit d0f5c2
#
Packit d0f5c2
# return some information about the compiler and compile options we're using:
Packit d0f5c2
#
Packit d0f5c2
#   $declaration - true if we're doing a declaration rather than a definition.
Packit d0f5c2
#
Packit d0f5c2
#   $cpp    - we're using C++
Packit d0f5c2
#   $static - ok to declare the arrays as static
Packit d0f5c2
#   $sized  - the array declarations should be sized
Packit d0f5c2
Packit d0f5c2
sub compiler_info {
Packit d0f5c2
    my ($declaration) = @_;
Packit d0f5c2
Packit d0f5c2
    my $ccflags = $Config{ccflags};
Packit d0f5c2
    if (defined $Config{ccwarnflags}) {
Packit d0f5c2
        $ccflags .= " " . $Config{ccwarnflags};
Packit d0f5c2
    }
Packit d0f5c2
    my $compat   = $ccflags =~ /\Q-Wc++-compat/;
Packit d0f5c2
    my $pedantic = $ccflags =~ /-pedantic/;
Packit d0f5c2
Packit d0f5c2
    my $cpp      = ($Config{d_cplusplus} || '') eq 'define';
Packit d0f5c2
Packit d0f5c2
    # The encpage_t tables contain recursive and mutually recursive
Packit d0f5c2
    # references. To allow them to compile under C++ and some restrictive
Packit d0f5c2
    # cc options, it may be necessary to make the tables non-static/const
Packit d0f5c2
    # (thus moving them from the text to the data segment) and/or not
Packit d0f5c2
    # include the size in the declaration.
Packit d0f5c2
Packit d0f5c2
    my $static = !(
Packit d0f5c2
                        $cpp
Packit d0f5c2
                     || ($compat && $pedantic)
Packit d0f5c2
                     || ($^O eq 'MacOS' && $declaration)
Packit d0f5c2
                  );
Packit d0f5c2
Packit d0f5c2
    # -Wc++-compat on its own warns if the array declaration is sized.
Packit d0f5c2
    # The easiest way to avoid this warning is simply not to include
Packit d0f5c2
    # the size in the declaration.
Packit d0f5c2
    # With -pedantic as well, the issue doesn't arise because $static
Packit d0f5c2
    # above becomes false.
Packit d0f5c2
    my $sized  = $declaration && !($compat && !$pedantic);
Packit d0f5c2
Packit d0f5c2
    return ($cpp, $static, $sized);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
# This really should go first, else the die here causes empty (non-erroneous)
Packit d0f5c2
# output files to be written.
Packit d0f5c2
my @encfiles;
Packit d0f5c2
if (exists $opt{f}) {
Packit d0f5c2
    # -F is followed by name of file containing list of filenames
Packit d0f5c2
    my $flist = $opt{f};
Packit d0f5c2
    open(FLIST,$flist) || die "Cannot open $flist:$!";
Packit d0f5c2
    chomp(@encfiles = <FLIST>);
Packit d0f5c2
    close(FLIST);
Packit d0f5c2
} else {
Packit d0f5c2
    @encfiles = @ARGV;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
my $cname = $opt{o} ? $opt{o} : shift(@ARGV);
Packit d0f5c2
unless ($cname) { #debuging a win32 nmake error-only. works via cmdline
Packit d0f5c2
    print "\nARGV:";
Packit d0f5c2
    print "$_ " for @ARGV;
Packit d0f5c2
    print "\nopt:";
Packit d0f5c2
    print "  $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt;
Packit d0f5c2
}
Packit d0f5c2
chmod(0666,$cname) if -f $cname && !-w $cname;
Packit d0f5c2
open(C,">", $cname) || die "Cannot open $cname:$!";
Packit d0f5c2
Packit d0f5c2
my $dname = $cname;
Packit d0f5c2
my $hname = $cname;
Packit d0f5c2
Packit d0f5c2
my ($doC,$doEnc,$doUcm,$doPet);
Packit d0f5c2
Packit d0f5c2
if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
Packit d0f5c2
 {
Packit d0f5c2
  $doC = 1;
Packit d0f5c2
  $dname =~ s/(\.[^\.]*)?$/.exh/;
Packit d0f5c2
  chmod(0666,$dname) if -f $cname && !-w $dname;
Packit d0f5c2
  open(D,">", $dname) || die "Cannot open $dname:$!";
Packit d0f5c2
  $hname =~ s/(\.[^\.]*)?$/.h/;
Packit d0f5c2
  chmod(0666,$hname) if -f $cname && !-w $hname;
Packit d0f5c2
  open(H,">", $hname) || die "Cannot open $hname:$!";
Packit d0f5c2
Packit d0f5c2
  foreach my $fh (\*C,\*D,\*H)
Packit d0f5c2
  {
Packit d0f5c2
   print $fh <<"END" unless $opt{'q'};
Packit d0f5c2
/*
Packit d0f5c2
 !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
Packit d0f5c2
 This file was autogenerated by:
Packit d0f5c2
 $^X $0 @orig_ARGV
Packit d0f5c2
 enc2xs VERSION $VERSION
Packit d0f5c2
*/
Packit d0f5c2
END
Packit d0f5c2
  }
Packit d0f5c2
Packit d0f5c2
  if ($cname =~ /(\w+)\.xs$/)
Packit d0f5c2
   {
Packit d0f5c2
    print C "#define PERL_NO_GET_CONTEXT\n";
Packit d0f5c2
    print C "#include <EXTERN.h>\n";
Packit d0f5c2
    print C "#include <perl.h>\n";
Packit d0f5c2
    print C "#include <XSUB.h>\n";
Packit d0f5c2
   }
Packit d0f5c2
  print C "#include \"encode.h\"\n\n";
Packit d0f5c2
Packit d0f5c2
 }
Packit d0f5c2
elsif ($cname =~ /\.enc$/)
Packit d0f5c2
 {
Packit d0f5c2
  $doEnc = 1;
Packit d0f5c2
 }
Packit d0f5c2
elsif ($cname =~ /\.ucm$/)
Packit d0f5c2
 {
Packit d0f5c2
  $doUcm = 1;
Packit d0f5c2
 }
Packit d0f5c2
elsif ($cname =~ /\.pet$/)
Packit d0f5c2
 {
Packit d0f5c2
  $doPet = 1;
Packit d0f5c2
 }
Packit d0f5c2
Packit d0f5c2
my %encoding;
Packit d0f5c2
my %strings;
Packit d0f5c2
my $string_acc;
Packit d0f5c2
my %strings_in_acc;
Packit d0f5c2
Packit d0f5c2
my $saved = 0;
Packit d0f5c2
my $subsave = 0;
Packit d0f5c2
my $strings = 0;
Packit d0f5c2
Packit d0f5c2
sub cmp_name
Packit d0f5c2
{
Packit d0f5c2
 if ($a =~ /^.*-(\d+)/)
Packit d0f5c2
  {
Packit d0f5c2
   my $an = $1;
Packit d0f5c2
   if ($b =~ /^.*-(\d+)/)
Packit d0f5c2
    {
Packit d0f5c2
     my $r = $an <=> $1;
Packit d0f5c2
     return $r if $r;
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
 return $a cmp $b;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
foreach my $enc (sort cmp_name @encfiles)
Packit d0f5c2
 {
Packit d0f5c2
  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
Packit d0f5c2
  $name = $opt{'n'} if exists $opt{'n'};
Packit d0f5c2
  if (open(E,$enc))
Packit d0f5c2
   {
Packit d0f5c2
    if ($sfx eq 'enc')
Packit d0f5c2
     {
Packit d0f5c2
      compile_enc(\*E,lc($name));
Packit d0f5c2
     }
Packit d0f5c2
    else
Packit d0f5c2
     {
Packit d0f5c2
      compile_ucm(\*E,lc($name));
Packit d0f5c2
     }
Packit d0f5c2
   }
Packit d0f5c2
  else
Packit d0f5c2
   {
Packit d0f5c2
    warn "Cannot open $enc for $name:$!";
Packit d0f5c2
   }
Packit d0f5c2
 }
Packit d0f5c2
Packit d0f5c2
if ($doC)
Packit d0f5c2
 {
Packit d0f5c2
  verbose "Writing compiled form\n";
Packit d0f5c2
  foreach my $name (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
Packit d0f5c2
    process($name.'_utf8',$e2u);
Packit d0f5c2
    addstrings(\*C,$e2u);
Packit d0f5c2
Packit d0f5c2
    process('utf8_'.$name,$u2e);
Packit d0f5c2
    addstrings(\*C,$u2e);
Packit d0f5c2
   }
Packit d0f5c2
  outbigstring(\*C,"enctable");
Packit d0f5c2
  foreach my $name (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
Packit d0f5c2
    outtable(\*C,$e2u, "enctable");
Packit d0f5c2
    outtable(\*C,$u2e, "enctable");
Packit d0f5c2
Packit d0f5c2
    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
Packit d0f5c2
   }
Packit d0f5c2
  my ($cpp) = compiler_info(0);
Packit d0f5c2
  my $ext  = $cpp ? 'extern "C"' : "extern";
Packit d0f5c2
  my $exta = $cpp ? 'extern "C"' : "static";
Packit d0f5c2
  my $extb = $cpp ? 'extern "C"' : "";
Packit d0f5c2
  foreach my $enc (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
Packit d0f5c2
    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
Packit d0f5c2
    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
Packit d0f5c2
    my $replen = 0; 
Packit d0f5c2
    $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
Packit d0f5c2
    my $sym = "${enc}_encoding";
Packit d0f5c2
    $sym =~ s/\W+/_/g;
Packit d0f5c2
    my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
Packit d0f5c2
        $min_el,$max_el);
Packit d0f5c2
    print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
Packit d0f5c2
    print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
Packit d0f5c2
    print C "${extb} const encode_t $sym = \n";
Packit d0f5c2
    # This is to make null encoding work -- dankogai
Packit d0f5c2
    for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
Packit d0f5c2
    $info[$i] ||= 1;
Packit d0f5c2
    }
Packit d0f5c2
    # end of null tweak -- dankogai
Packit d0f5c2
    print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
Packit d0f5c2
   }
Packit d0f5c2
Packit d0f5c2
  foreach my $enc (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    my $sym = "${enc}_encoding";
Packit d0f5c2
    $sym =~ s/\W+/_/g;
Packit d0f5c2
    print H "${ext} encode_t $sym;\n";
Packit d0f5c2
    print D " Encode_XSEncoding(aTHX_ &$sym);\n";
Packit d0f5c2
   }
Packit d0f5c2
Packit d0f5c2
  if ($cname =~ /(\w+)\.xs$/)
Packit d0f5c2
   {
Packit d0f5c2
    my $mod = $1;
Packit d0f5c2
    print C <<'END';
Packit d0f5c2
Packit d0f5c2
static void
Packit d0f5c2
Encode_XSEncoding(pTHX_ encode_t *enc)
Packit d0f5c2
{
Packit d0f5c2
 dSP;
Packit d0f5c2
 HV *stash = gv_stashpv("Encode::XS", TRUE);
Packit d0f5c2
 SV *iv    = newSViv(PTR2IV(enc));
Packit d0f5c2
 SV *sv    = sv_bless(newRV_noinc(iv),stash);
Packit d0f5c2
 int i = 0;
Packit d0f5c2
 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
Packit d0f5c2
 constness, in the hope that perl won't mess with it. */
Packit d0f5c2
 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
Packit d0f5c2
 SvFLAGS(iv) |= SVp_POK;
Packit d0f5c2
 SvPVX(iv) = (char*) enc->name[0];
Packit d0f5c2
 PUSHMARK(sp);
Packit d0f5c2
 XPUSHs(sv);
Packit d0f5c2
 while (enc->name[i])
Packit d0f5c2
  {
Packit d0f5c2
   const char *name = enc->name[i++];
Packit d0f5c2
   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
Packit d0f5c2
  }
Packit d0f5c2
 PUTBACK;
Packit d0f5c2
 call_pv("Encode::define_encoding",G_DISCARD);
Packit d0f5c2
 SvREFCNT_dec(sv);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
END
Packit d0f5c2
Packit d0f5c2
    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
Packit d0f5c2
    print C "BOOT:\n{\n";
Packit d0f5c2
    print C "#include \"$dname\"\n";
Packit d0f5c2
    print C "}\n";
Packit d0f5c2
   }
Packit d0f5c2
  # Close in void context is bad, m'kay
Packit d0f5c2
  close(D) or warn "Error closing '$dname': $!";
Packit d0f5c2
  close(H) or warn "Error closing '$hname': $!";
Packit d0f5c2
Packit d0f5c2
  my $perc_saved    = $saved/($strings + $saved) * 100;
Packit d0f5c2
  my $perc_subsaved = $subsave/($strings + $subsave) * 100;
Packit d0f5c2
  verbosef "%d bytes in string tables\n",$strings;
Packit d0f5c2
  verbosef "%d bytes (%.3g%%) saved spotting duplicates\n",
Packit d0f5c2
    $saved, $perc_saved              if $saved;
Packit d0f5c2
  verbosef "%d bytes (%.3g%%) saved using substrings\n",
Packit d0f5c2
    $subsave, $perc_subsaved         if $subsave;
Packit d0f5c2
 }
Packit d0f5c2
elsif ($doEnc)
Packit d0f5c2
 {
Packit d0f5c2
  foreach my $name (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
Packit d0f5c2
    output_enc(\*C,$name,$e2u);
Packit d0f5c2
   }
Packit d0f5c2
 }
Packit d0f5c2
elsif ($doUcm)
Packit d0f5c2
 {
Packit d0f5c2
  foreach my $name (sort cmp_name keys %encoding)
Packit d0f5c2
   {
Packit d0f5c2
    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
Packit d0f5c2
    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
Packit d0f5c2
   }
Packit d0f5c2
 }
Packit d0f5c2
Packit d0f5c2
# writing half meg files and then not checking to see if you just filled the
Packit d0f5c2
# disk is bad, m'kay
Packit d0f5c2
close(C) or die "Error closing '$cname': $!";
Packit d0f5c2
Packit d0f5c2
# End of the main program.
Packit d0f5c2
Packit d0f5c2
sub compile_ucm
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$name) = @_;
Packit d0f5c2
 my $e2u = {};
Packit d0f5c2
 my $u2e = {};
Packit d0f5c2
 my $cs;
Packit d0f5c2
 my %attr;
Packit d0f5c2
 while (<$fh>)
Packit d0f5c2
  {
Packit d0f5c2
   s/#.*$//;
Packit d0f5c2
   last if /^\s*CHARMAP\s*$/i;
Packit d0f5c2
   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
Packit d0f5c2
    {
Packit d0f5c2
     $attr{$1} = $2;
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
 if (!defined($cs =  $attr{'code_set_name'}))
Packit d0f5c2
  {
Packit d0f5c2
   warn "No <code_set_name> in $name\n";
Packit d0f5c2
  }
Packit d0f5c2
 else
Packit d0f5c2
  {
Packit d0f5c2
   $name = $cs unless exists $opt{'n'};
Packit d0f5c2
  }
Packit d0f5c2
 my $erep;
Packit d0f5c2
 my $urep;
Packit d0f5c2
 my $max_el;
Packit d0f5c2
 my $min_el;
Packit d0f5c2
 if (exists $attr{'subchar'})
Packit d0f5c2
  {
Packit d0f5c2
   #my @byte;
Packit d0f5c2
   #$attr{'subchar'} =~ /^\s*/cg;
Packit d0f5c2
   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
Packit d0f5c2
   #$erep = join('',map(chr(hex($_)),@byte));
Packit d0f5c2
   $erep = $attr{'subchar'}; 
Packit d0f5c2
   $erep =~ s/^\s+//; $erep =~ s/\s+$//;
Packit d0f5c2
  }
Packit d0f5c2
 print "Reading $name ($cs)\n"
Packit d0f5c2
   unless defined $ENV{MAKEFLAGS}
Packit d0f5c2
      and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/;
Packit d0f5c2
 my $nfb = 0;
Packit d0f5c2
 my $hfb = 0;
Packit d0f5c2
 while (<$fh>)
Packit d0f5c2
  {
Packit d0f5c2
   s/#.*$//;
Packit d0f5c2
   last if /^\s*END\s+CHARMAP\s*$/i;
Packit d0f5c2
   next if /^\s*$/;
Packit d0f5c2
   my (@uni, @byte) = ();
Packit d0f5c2
   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
Packit d0f5c2
       or die "Bad line: $_";
Packit d0f5c2
   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
Packit d0f5c2
       push @uni, map { substr($_, 1) } split(/\+/, $1);
Packit d0f5c2
   }
Packit d0f5c2
   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
Packit d0f5c2
       push @byte, $1;
Packit d0f5c2
   }
Packit d0f5c2
   if (@uni)
Packit d0f5c2
    {
Packit d0f5c2
     my $uch =  join('', map { encode_U(hex($_)) } @uni );
Packit d0f5c2
     my $ech = join('',map(chr(hex($_)),@byte));
Packit d0f5c2
     my $el  = length($ech);
Packit d0f5c2
     $max_el = $el if (!defined($max_el) || $el > $max_el);
Packit d0f5c2
     $min_el = $el if (!defined($min_el) || $el < $min_el);
Packit d0f5c2
     if (length($fb))
Packit d0f5c2
      {
Packit d0f5c2
       $fb = substr($fb,1);
Packit d0f5c2
       $hfb++;
Packit d0f5c2
      }
Packit d0f5c2
     else
Packit d0f5c2
      {
Packit d0f5c2
       $nfb++;
Packit d0f5c2
       $fb = '0';
Packit d0f5c2
      }
Packit d0f5c2
     # $fb is fallback flag
Packit d0f5c2
     # 0 - round trip safe
Packit d0f5c2
     # 1 - fallback for unicode -> enc
Packit d0f5c2
     # 2 - skip sub-char mapping
Packit d0f5c2
     # 3 - fallback enc -> unicode
Packit d0f5c2
     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
Packit d0f5c2
     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
Packit d0f5c2
    }
Packit d0f5c2
   else
Packit d0f5c2
    {
Packit d0f5c2
     warn $_;
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
 if ($nfb && $hfb)
Packit d0f5c2
  {
Packit d0f5c2
   die "$nfb entries without fallback, $hfb entries with\n";
Packit d0f5c2
  }
Packit d0f5c2
 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
sub compile_enc
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$name) = @_;
Packit d0f5c2
 my $e2u = {};
Packit d0f5c2
 my $u2e = {};
Packit d0f5c2
Packit d0f5c2
 my $type;
Packit d0f5c2
 while ($type = <$fh>)
Packit d0f5c2
  {
Packit d0f5c2
   last if $type !~ /^\s*#/;
Packit d0f5c2
  }
Packit d0f5c2
 chomp($type);
Packit d0f5c2
 return if $type eq 'E';
Packit d0f5c2
 # Do the hash lookup once, rather than once per function call. 4% speedup.
Packit d0f5c2
 my $type_func = $encode_types{$type};
Packit d0f5c2
 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
Packit d0f5c2
 warn "$type encoded $name\n";
Packit d0f5c2
 my $rep = '';
Packit d0f5c2
 # Save a defined test by setting these to defined values.
Packit d0f5c2
 my $min_el = ~0; # A very big integer
Packit d0f5c2
 my $max_el = 0;  # Anything must be longer than 0
Packit d0f5c2
 {
Packit d0f5c2
  my $v = hex($def);
Packit d0f5c2
  $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
Packit d0f5c2
 }
Packit d0f5c2
 my $errors;
Packit d0f5c2
 my $seen;
Packit d0f5c2
 # use -Q to silence the seen test. Makefile.PL uses this by default.
Packit d0f5c2
 $seen = {} unless $opt{Q};
Packit d0f5c2
 do
Packit d0f5c2
  {
Packit d0f5c2
   my $line = <$fh>;
Packit d0f5c2
   chomp($line);
Packit d0f5c2
   my $page = hex($line);
Packit d0f5c2
   my $ch = 0;
Packit d0f5c2
   my $i = 16;
Packit d0f5c2
   do
Packit d0f5c2
    {
Packit d0f5c2
     # So why is it 1% faster to leave the my here?
Packit d0f5c2
     my $line = <$fh>;
Packit d0f5c2
     $line =~ s/\r\n$/\n/;
Packit d0f5c2
     die "$.:${line}Line should be exactly 65 characters long including
Packit d0f5c2
     newline (".length($line).")" unless length ($line) == 65;
Packit d0f5c2
     # Split line into groups of 4 hex digits, convert groups to ints
Packit d0f5c2
     # This takes 65.35		
Packit d0f5c2
     # map {hex $_} $line =~ /(....)/g
Packit d0f5c2
     # This takes 63.75 (2.5% less time)
Packit d0f5c2
     # unpack "n*", pack "H*", $line
Packit d0f5c2
     # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
Packit d0f5c2
     # Doing it as while ($line =~ /(....)/g) took 74.63
Packit d0f5c2
     foreach my $val (unpack "n*", pack "H*", $line)
Packit d0f5c2
      {
Packit d0f5c2
       next if $val == 0xFFFD;
Packit d0f5c2
       my $ech = &$type_func($ch,$page);
Packit d0f5c2
       if ($val || (!$ch && !$page))
Packit d0f5c2
        {
Packit d0f5c2
         my $el  = length($ech);
Packit d0f5c2
         $max_el = $el if $el > $max_el;
Packit d0f5c2
         $min_el = $el if $el < $min_el;
Packit d0f5c2
         my $uch = encode_U($val);
Packit d0f5c2
         if ($seen) {
Packit d0f5c2
           # We're doing the test.
Packit d0f5c2
           # We don't need to read this quickly, so storing it as a scalar,
Packit d0f5c2
           # rather than 3 (anon array, plus the 2 scalars it holds) saves
Packit d0f5c2
           # RAM and may make us faster on low RAM systems. [see __END__]
Packit d0f5c2
           if (exists $seen->{$uch})
Packit d0f5c2
             {
Packit d0f5c2
               warn sprintf("U%04X is %02X%02X and %04X\n",
Packit d0f5c2
                            $val,$page,$ch,$seen->{$uch});
Packit d0f5c2
               $errors++;
Packit d0f5c2
             }
Packit d0f5c2
           else
Packit d0f5c2
             {
Packit d0f5c2
               $seen->{$uch} = $page << 8 | $ch;
Packit d0f5c2
             }
Packit d0f5c2
         }
Packit d0f5c2
         # Passing 2 extra args each time is 3.6% slower!
Packit d0f5c2
         # Even with having to add $fallback ||= 0 later
Packit d0f5c2
         enter_fb0($e2u,$ech,$uch);
Packit d0f5c2
         enter_fb0($u2e,$uch,$ech);
Packit d0f5c2
        }
Packit d0f5c2
       else
Packit d0f5c2
        {
Packit d0f5c2
         # No character at this position
Packit d0f5c2
         # enter($e2u,$ech,undef,$e2u);
Packit d0f5c2
        }
Packit d0f5c2
       $ch++;
Packit d0f5c2
      }
Packit d0f5c2
    } while --$i;
Packit d0f5c2
  } while --$pages;
Packit d0f5c2
 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
Packit d0f5c2
   if $min_el > $max_el;
Packit d0f5c2
 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
Packit d0f5c2
 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
# my ($a,$s,$d,$t,$fb) = @_;
Packit d0f5c2
sub enter {
Packit d0f5c2
  my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
Packit d0f5c2
  # state we shift to after this (multibyte) input character defaults to same
Packit d0f5c2
  # as current state.
Packit d0f5c2
  $next ||= $current;
Packit d0f5c2
  # Making sure it is defined seems to be faster than {no warnings;} in
Packit d0f5c2
  # &process, or passing it in as 0 explicitly.
Packit d0f5c2
  # XXX $fallback ||= 0;
Packit d0f5c2
Packit d0f5c2
  # Start at the beginning and work forwards through the string to zero.
Packit d0f5c2
  # effectively we are removing 1 character from the front each time
Packit d0f5c2
  # but we don't actually edit the string. [this alone seems to be 14% speedup]
Packit d0f5c2
  # Hence -$pos is the length of the remaining string.
Packit d0f5c2
  my $pos = -length $inbytes;
Packit d0f5c2
  while (1) {
Packit d0f5c2
    my $byte = substr $inbytes, $pos, 1;
Packit d0f5c2
    #  RAW_NEXT => 0,
Packit d0f5c2
    #  RAW_IN_LEN => 1,
Packit d0f5c2
    #  RAW_OUT_BYTES => 2,
Packit d0f5c2
    #  RAW_FALLBACK => 3,
Packit d0f5c2
    # to unicode an array would seem to be better, because the pages are dense.
Packit d0f5c2
    # from unicode can be very sparse, favouring a hash.
Packit d0f5c2
    # hash using the bytes (all length 1) as keys rather than ord value,
Packit d0f5c2
    # as it's easier to sort these in &process.
Packit d0f5c2
Packit d0f5c2
    # It's faster to always add $fallback even if it's undef, rather than
Packit d0f5c2
    # choosing between 3 and 4 element array. (hence why we set it defined
Packit d0f5c2
    # above)
Packit d0f5c2
    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
Packit d0f5c2
    # When $pos was -1 we were at the last input character.
Packit d0f5c2
    unless (++$pos) {
Packit d0f5c2
      $do_now->[RAW_OUT_BYTES] = $outbytes;
Packit d0f5c2
      $do_now->[RAW_NEXT] = $next;
Packit d0f5c2
      return;
Packit d0f5c2
    }
Packit d0f5c2
    # Tail recursion. The intermediate state may not have a name yet.
Packit d0f5c2
    $current = $do_now->[RAW_NEXT];
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
# This is purely for optimisation. It's just &enter hard coded for $fallback
Packit d0f5c2
# of 0, using only a 3 entry array ref to save memory for every entry.
Packit d0f5c2
sub enter_fb0 {
Packit d0f5c2
  my ($current,$inbytes,$outbytes,$next) = @_;
Packit d0f5c2
  $next ||= $current;
Packit d0f5c2
Packit d0f5c2
  my $pos = -length $inbytes;
Packit d0f5c2
  while (1) {
Packit d0f5c2
    my $byte = substr $inbytes, $pos, 1;
Packit d0f5c2
    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
Packit d0f5c2
    unless (++$pos) {
Packit d0f5c2
      $do_now->[RAW_OUT_BYTES] = $outbytes;
Packit d0f5c2
      $do_now->[RAW_NEXT] = $next;
Packit d0f5c2
      return;
Packit d0f5c2
    }
Packit d0f5c2
    $current = $do_now->[RAW_NEXT];
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub process
Packit d0f5c2
{
Packit d0f5c2
  my ($name,$a) = @_;
Packit d0f5c2
  $name =~ s/\W+/_/g;
Packit d0f5c2
  $a->{Cname} = $name;
Packit d0f5c2
  my $raw = $a->{Raw};
Packit d0f5c2
  my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
Packit d0f5c2
  my @ent;
Packit d0f5c2
  $agg_max_in = 0;
Packit d0f5c2
  foreach my $key (sort keys %$raw) {
Packit d0f5c2
    #  RAW_NEXT => 0,
Packit d0f5c2
    #  RAW_IN_LEN => 1,
Packit d0f5c2
    #  RAW_OUT_BYTES => 2,
Packit d0f5c2
    #  RAW_FALLBACK => 3,
Packit d0f5c2
    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
Packit d0f5c2
    # Now we are converting from raw to aggregate, switch from 1 byte strings
Packit d0f5c2
    # to numbers
Packit d0f5c2
    my $b = ord $key;
Packit d0f5c2
    $fallback ||= 0;
Packit d0f5c2
    if ($l &&
Packit d0f5c2
        # If this == fails, we're going to reset $agg_max_in below anyway.
Packit d0f5c2
        $b == ++$agg_max_in &&
Packit d0f5c2
        # References in numeric context give the pointer as an int.
Packit d0f5c2
        $agg_next == $next &&
Packit d0f5c2
        $agg_in_len == $in_len &&
Packit d0f5c2
        $agg_out_len == length $out_bytes &&
Packit d0f5c2
        $agg_fallback == $fallback
Packit d0f5c2
        # && length($l->[AGG_OUT_BYTES]) < 16
Packit d0f5c2
       ) {
Packit d0f5c2
      #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
Packit d0f5c2
      # we can aggregate this byte onto the end.
Packit d0f5c2
      $l->[AGG_MAX_IN] = $b;
Packit d0f5c2
      $l->[AGG_OUT_BYTES] .= $out_bytes;
Packit d0f5c2
    } else {
Packit d0f5c2
      # AGG_MIN_IN => 0,
Packit d0f5c2
      # AGG_MAX_IN => 1,
Packit d0f5c2
      # AGG_OUT_BYTES => 2,
Packit d0f5c2
      # AGG_NEXT => 3,
Packit d0f5c2
      # AGG_IN_LEN => 4,
Packit d0f5c2
      # AGG_OUT_LEN => 5,
Packit d0f5c2
      # AGG_FALLBACK => 6,
Packit d0f5c2
      # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
Packit d0f5c2
      # (only gains .6% on euc-jp  -- is it worth it?)
Packit d0f5c2
      push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
Packit d0f5c2
                       $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
Packit d0f5c2
                       $agg_fallback = $fallback];
Packit d0f5c2
    }
Packit d0f5c2
    if (exists $next->{Cname}) {
Packit d0f5c2
      $next->{'Forward'} = 1 if $next != $a;
Packit d0f5c2
    } else {
Packit d0f5c2
      process(sprintf("%s_%02x",$name,$b),$next);
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
  # encengine.c rules say that last entry must be for 255
Packit d0f5c2
  if ($agg_max_in < 255) {
Packit d0f5c2
    push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
Packit d0f5c2
  }
Packit d0f5c2
  $a->{'Entries'} = \@ent;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
sub addstrings
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$a) = @_;
Packit d0f5c2
 my $name = $a->{'Cname'};
Packit d0f5c2
 # String tables
Packit d0f5c2
 foreach my $b (@{$a->{'Entries'}})
Packit d0f5c2
  {
Packit d0f5c2
   next unless $b->[AGG_OUT_LEN];
Packit d0f5c2
   $strings{$b->[AGG_OUT_BYTES]} = undef;
Packit d0f5c2
  }
Packit d0f5c2
 if ($a->{'Forward'})
Packit d0f5c2
  {
Packit d0f5c2
   my ($cpp, $static, $sized) = compiler_info(1);
Packit d0f5c2
   my $count = $sized ? scalar(@{$a->{'Entries'}}) : '';
Packit d0f5c2
   if ($static) {
Packit d0f5c2
     # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline
Packit d0f5c2
     print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
Packit d0f5c2
     print $fh "extern encpage_t $name\[$count];\n";
Packit d0f5c2
     print $fh "#else\n";
Packit d0f5c2
     print $fh "static const encpage_t $name\[$count];\n";
Packit d0f5c2
     print $fh "#endif\n";
Packit d0f5c2
   } else {
Packit d0f5c2
     print $fh "extern encpage_t $name\[$count];\n";
Packit d0f5c2
   }
Packit d0f5c2
  }
Packit d0f5c2
 $a->{'DoneStrings'} = 1;
Packit d0f5c2
 foreach my $b (@{$a->{'Entries'}})
Packit d0f5c2
  {
Packit d0f5c2
   my ($s,$e,$out,$t,$end,$l) = @$b;
Packit d0f5c2
   addstrings($fh,$t) unless $t->{'DoneStrings'};
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub outbigstring
Packit d0f5c2
{
Packit d0f5c2
  my ($fh,$name) = @_;
Packit d0f5c2
Packit d0f5c2
  $string_acc = '';
Packit d0f5c2
Packit d0f5c2
  # Make the big string in the string accumulator. Longest first, on the hope
Packit d0f5c2
  # that this makes it more likely that we find the short strings later on.
Packit d0f5c2
  # Not sure if it helps sorting strings of the same length lexically.
Packit d0f5c2
  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
Packit d0f5c2
    my $index = index $string_acc, $s;
Packit d0f5c2
    if ($index >= 0) {
Packit d0f5c2
      $saved += length($s);
Packit d0f5c2
      $strings_in_acc{$s} = $index;
Packit d0f5c2
    } else {
Packit d0f5c2
    OPTIMISER: {
Packit d0f5c2
    if ($opt{'O'}) {
Packit d0f5c2
      my $sublength = length $s;
Packit d0f5c2
      while (--$sublength > 0) {
Packit d0f5c2
        # progressively lop characters off the end, to see if the start of
Packit d0f5c2
        # the new string overlaps the end of the accumulator.
Packit d0f5c2
        if (substr ($string_acc, -$sublength)
Packit d0f5c2
        eq substr ($s, 0, $sublength)) {
Packit d0f5c2
          $subsave += $sublength;
Packit d0f5c2
          $strings_in_acc{$s} = length ($string_acc) - $sublength;
Packit d0f5c2
          # append the last bit on the end.
Packit d0f5c2
          $string_acc .= substr ($s, $sublength);
Packit d0f5c2
          last OPTIMISER;
Packit d0f5c2
        }
Packit d0f5c2
        # or if the end of the new string overlaps the start of the
Packit d0f5c2
        # accumulator
Packit d0f5c2
        next unless substr ($string_acc, 0, $sublength)
Packit d0f5c2
          eq substr ($s, -$sublength);
Packit d0f5c2
        # well, the last $sublength characters of the accumulator match.
Packit d0f5c2
        # so as we're prepending to the accumulator, need to shift all our
Packit d0f5c2
        # existing offsets forwards
Packit d0f5c2
        $_ += $sublength foreach values %strings_in_acc;
Packit d0f5c2
        $subsave += $sublength;
Packit d0f5c2
        $strings_in_acc{$s} = 0;
Packit d0f5c2
        # append the first bit on the start.
Packit d0f5c2
        $string_acc = substr ($s, 0, -$sublength) . $string_acc;
Packit d0f5c2
        last OPTIMISER;
Packit d0f5c2
      }
Packit d0f5c2
    }
Packit d0f5c2
    # Optimiser (if it ran) found nothing, so just going have to tack the
Packit d0f5c2
    # whole thing on the end.
Packit d0f5c2
    $strings_in_acc{$s} = length $string_acc;
Packit d0f5c2
    $string_acc .= $s;
Packit d0f5c2
      };
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
Packit d0f5c2
  $strings = length $string_acc;
Packit d0f5c2
  my ($cpp) = compiler_info(0);
Packit d0f5c2
  my $var = $cpp ? '' : 'static';
Packit d0f5c2
  my $definition = "\n$var const U8 $name\[$strings] = { " .
Packit d0f5c2
    join(',',unpack "C*",$string_acc);
Packit d0f5c2
  # We have a single long line. Split it at convenient commas.
Packit d0f5c2
  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
Packit d0f5c2
  print $fh substr ($definition, pos $definition), " };\n";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub findstring {
Packit d0f5c2
  my ($name,$s) = @_;
Packit d0f5c2
  my $offset = $strings_in_acc{$s};
Packit d0f5c2
  die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
Packit d0f5c2
    unless defined $offset;
Packit d0f5c2
  "$name + $offset";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub outtable
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$a,$bigname) = @_;
Packit d0f5c2
 my $name = $a->{'Cname'};
Packit d0f5c2
 $a->{'Done'} = 1;
Packit d0f5c2
 foreach my $b (@{$a->{'Entries'}})
Packit d0f5c2
  {
Packit d0f5c2
   my ($s,$e,$out,$t,$end,$l) = @$b;
Packit d0f5c2
   outtable($fh,$t,$bigname) unless $t->{'Done'};
Packit d0f5c2
  }
Packit d0f5c2
 my ($cpp, $static) = compiler_info(0);
Packit d0f5c2
 my $count = scalar(@{$a->{'Entries'}});
Packit d0f5c2
 if ($static) {
Packit d0f5c2
     print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
Packit d0f5c2
     print $fh "encpage_t $name\[$count] = {\n";
Packit d0f5c2
     print $fh "#else\n";
Packit d0f5c2
     print $fh "static const encpage_t $name\[$count] = {\n";
Packit d0f5c2
     print $fh "#endif\n";
Packit d0f5c2
 } else {
Packit d0f5c2
   print $fh "\nencpage_t $name\[$count] = {\n";
Packit d0f5c2
 }
Packit d0f5c2
 foreach my $b (@{$a->{'Entries'}})
Packit d0f5c2
  {
Packit d0f5c2
   my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
Packit d0f5c2
   # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
Packit d0f5c2
   print  $fh "{";
Packit d0f5c2
   if ($l)
Packit d0f5c2
    {
Packit d0f5c2
     printf $fh findstring($bigname,$out);
Packit d0f5c2
    }
Packit d0f5c2
   else
Packit d0f5c2
    {
Packit d0f5c2
     print  $fh "0";
Packit d0f5c2
    }
Packit d0f5c2
   print  $fh ",",$t->{Cname};
Packit d0f5c2
   printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
Packit d0f5c2
  }
Packit d0f5c2
 print $fh "};\n";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub output_enc
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$name,$a) = @_;
Packit d0f5c2
 die "Changed - fix me for new structure";
Packit d0f5c2
 foreach my $b (sort keys %$a)
Packit d0f5c2
  {
Packit d0f5c2
   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub decode_U
Packit d0f5c2
{
Packit d0f5c2
 my $s = shift;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
my @uname;
Packit d0f5c2
sub char_names
Packit d0f5c2
{
Packit d0f5c2
 my $s = do "unicore/Name.pl";
Packit d0f5c2
 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
Packit d0f5c2
 pos($s) = 0;
Packit d0f5c2
 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
Packit d0f5c2
  {
Packit d0f5c2
   my $name = $3;
Packit d0f5c2
   my $s = hex($1);
Packit d0f5c2
   last if $s >= 0x10000;
Packit d0f5c2
   my $e = length($2) ? hex($2) : $s;
Packit d0f5c2
   for (my $i = $s; $i <= $e; $i++)
Packit d0f5c2
    {
Packit d0f5c2
     $uname[$i] = $name;
Packit d0f5c2
#    print sprintf("U%04X $name\n",$i);
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub output_ucm_page
Packit d0f5c2
{
Packit d0f5c2
  my ($cmap,$a,$t,$pre) = @_;
Packit d0f5c2
  # warn sprintf("Page %x\n",$pre);
Packit d0f5c2
  my $raw = $t->{Raw};
Packit d0f5c2
  foreach my $key (sort keys %$raw) {
Packit d0f5c2
    #  RAW_NEXT => 0,
Packit d0f5c2
    #  RAW_IN_LEN => 1,
Packit d0f5c2
    #  RAW_OUT_BYTES => 2,
Packit d0f5c2
    #  RAW_FALLBACK => 3,
Packit d0f5c2
    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
Packit d0f5c2
    my $u = ord $key;
Packit d0f5c2
    $fallback ||= 0;
Packit d0f5c2
Packit d0f5c2
    if ($next != $a && $next != $t) {
Packit d0f5c2
      output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
Packit d0f5c2
    } elsif (length $out_bytes) {
Packit d0f5c2
      if ($pre) {
Packit d0f5c2
        $u = $pre|($u &0x3f);
Packit d0f5c2
      }
Packit d0f5c2
      my $s = sprintf "<U%04X> ",$u;
Packit d0f5c2
      #foreach my $c (split(//,$out_bytes)) {
Packit d0f5c2
      #  $s .= sprintf "\\x%02X",ord($c);
Packit d0f5c2
      #}
Packit d0f5c2
      # 9.5% faster changing that loop to this:
Packit d0f5c2
      $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
Packit d0f5c2
      $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
Packit d0f5c2
      push(@$cmap,$s);
Packit d0f5c2
    } else {
Packit d0f5c2
      warn join(',',$u, @{$raw->{$key}},$a,$t);
Packit d0f5c2
    }
Packit d0f5c2
  }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub output_ucm
Packit d0f5c2
{
Packit d0f5c2
 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
Packit d0f5c2
 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
Packit d0f5c2
 print $fh "<code_set_name> \"$name\"\n";
Packit d0f5c2
 char_names();
Packit d0f5c2
 if (defined $min_el)
Packit d0f5c2
  {
Packit d0f5c2
   print $fh "<mb_cur_min> $min_el\n";
Packit d0f5c2
  }
Packit d0f5c2
 if (defined $max_el)
Packit d0f5c2
  {
Packit d0f5c2
   print $fh "<mb_cur_max> $max_el\n";
Packit d0f5c2
  }
Packit d0f5c2
 if (defined $rep)
Packit d0f5c2
  {
Packit d0f5c2
   print $fh "<subchar> ";
Packit d0f5c2
   foreach my $c (split(//,$rep))
Packit d0f5c2
    {
Packit d0f5c2
     printf $fh "\\x%02X",ord($c);
Packit d0f5c2
    }
Packit d0f5c2
   print $fh "\n";
Packit d0f5c2
  }
Packit d0f5c2
 my @cmap;
Packit d0f5c2
 output_ucm_page(\@cmap,$h,$h,0);
Packit d0f5c2
 print $fh "#\nCHARMAP\n";
Packit d0f5c2
 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
Packit d0f5c2
  {
Packit d0f5c2
   print $fh $line;
Packit d0f5c2
  }
Packit d0f5c2
 print $fh "END CHARMAP\n";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
use vars qw(
Packit d0f5c2
    $_Enc2xs
Packit d0f5c2
    $_Version
Packit d0f5c2
    $_Inc
Packit d0f5c2
    $_E2X 
Packit d0f5c2
    $_Name
Packit d0f5c2
    $_TableFiles
Packit d0f5c2
    $_Now
Packit d0f5c2
);
Packit d0f5c2
Packit d0f5c2
sub find_e2x{
Packit d0f5c2
    eval { require File::Find; };
Packit d0f5c2
    my (@inc, %e2x_dir);
Packit d0f5c2
    for my $inc (@INC){
Packit d0f5c2
    push @inc, $inc unless $inc eq '.'; #skip current dir
Packit d0f5c2
    }
Packit d0f5c2
    File::Find::find(
Packit d0f5c2
         sub {
Packit d0f5c2
         my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
Packit d0f5c2
             $atime,$mtime,$ctime,$blksize,$blocks)
Packit d0f5c2
             = lstat($_) or return;
Packit d0f5c2
         -f _ or return;
Packit d0f5c2
         if (/^.*\.e2x$/o){
Packit d0f5c2
             no warnings 'once';
Packit d0f5c2
             $e2x_dir{$File::Find::dir} ||= $mtime;
Packit d0f5c2
         }
Packit d0f5c2
         return;
Packit d0f5c2
         }, @inc);
Packit d0f5c2
    warn join("\n", keys %e2x_dir), "\n";
Packit d0f5c2
    for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
Packit d0f5c2
    $_E2X = $d;
Packit d0f5c2
    # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
Packit d0f5c2
    return $_E2X;
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub make_makefile_pl
Packit d0f5c2
{
Packit d0f5c2
    eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n";
Packit d0f5c2
    # our used for variable expansion
Packit d0f5c2
    $_Enc2xs = $0;
Packit d0f5c2
    $_Version = $VERSION;
Packit d0f5c2
    $_E2X = find_e2x();
Packit d0f5c2
    $_Name = shift;
Packit d0f5c2
    $_TableFiles = join(",", map {qq('$_')} @_);
Packit d0f5c2
    $_Now = scalar localtime();
Packit d0f5c2
Packit d0f5c2
    eval { require File::Spec; };
Packit d0f5c2
    _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
Packit d0f5c2
    _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
Packit d0f5c2
    _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
Packit d0f5c2
    _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
Packit d0f5c2
    _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
Packit d0f5c2
    exit;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
use vars qw(
Packit d0f5c2
        $_ModLines
Packit d0f5c2
        $_LocalVer
Packit d0f5c2
        );
Packit d0f5c2
Packit d0f5c2
sub make_configlocal_pm {
Packit d0f5c2
    eval { require Encode } or die "Unable to require Encode: $@\n";
Packit d0f5c2
    eval { require File::Spec; };
Packit d0f5c2
Packit d0f5c2
    # our used for variable expantion
Packit d0f5c2
    my %in_core = map { $_ => 1 } (
Packit d0f5c2
        'ascii',      'iso-8859-1', 'utf8',
Packit d0f5c2
        'ascii-ctrl', 'null',       'utf-8-strict'
Packit d0f5c2
    );
Packit d0f5c2
    my %LocalMod = ();
Packit d0f5c2
    # check @enc;
Packit d0f5c2
    use File::Find ();
Packit d0f5c2
    my $wanted = sub{
Packit d0f5c2
	-f $_ or return;
Packit d0f5c2
	$File::Find::name =~ /\A\./        and return;
Packit d0f5c2
	$File::Find::name =~ /\.pm\z/      or  return;
Packit d0f5c2
	$File::Find::name =~ m/\bEncode\b/ or  return;
Packit d0f5c2
	my $mod = $File::Find::name;
Packit d0f5c2
	$mod =~ s/.*\bEncode\b/Encode/o;
Packit d0f5c2
	$mod =~ s/\.pm\z//o;
Packit d0f5c2
	$mod =~ s,/,::,og;
Packit d0f5c2
	eval qq{ require $mod; } or return;
Packit d0f5c2
        warn qq{ require $mod;\n};
Packit d0f5c2
	for my $enc ( Encode->encodings() ) {
Packit d0f5c2
	    no warnings;
Packit d0f5c2
	    $in_core{$enc}                   and next;
Packit d0f5c2
	    $Encode::Config::ExtModule{$enc} and next;
Packit d0f5c2
	    $LocalMod{$enc} ||= $mod;
Packit d0f5c2
	}
Packit d0f5c2
    };
Packit d0f5c2
    File::Find::find({wanted => $wanted}, @INC);
Packit d0f5c2
    $_ModLines = "";
Packit d0f5c2
    for my $enc ( sort keys %LocalMod ) {
Packit d0f5c2
        $_ModLines .=
Packit d0f5c2
          qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
Packit d0f5c2
    }
Packit d0f5c2
    warn $_ModLines if $_ModLines;
Packit d0f5c2
    $_LocalVer = _mkversion();
Packit d0f5c2
    $_E2X      = find_e2x();
Packit d0f5c2
    $_Inc      = $INC{"Encode.pm"};
Packit d0f5c2
    $_Inc =~ s/\.pm$//o;
Packit d0f5c2
    _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
Packit d0f5c2
        File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
Packit d0f5c2
    exit;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub _mkversion{
Packit d0f5c2
    # v-string is now depreciated; use time() instead;
Packit d0f5c2
    #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
Packit d0f5c2
    #$yyyy += 1900, $mo +=1;
Packit d0f5c2
    #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
Packit d0f5c2
    return time();
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub _print_expand{
Packit d0f5c2
    eval { require File::Basename } or die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
Packit d0f5c2
    File::Basename->import();
Packit d0f5c2
    my ($src, $dst, $clobber) = @_;
Packit d0f5c2
    if (!$clobber and -e $dst){
Packit d0f5c2
    warn "$dst exists. skipping\n";
Packit d0f5c2
    return;
Packit d0f5c2
    }
Packit d0f5c2
    warn "Generating $dst...\n";
Packit d0f5c2
    open my $in, $src or die "$src : $!";
Packit d0f5c2
    if ((my $d = dirname($dst)) ne '.'){
Packit d0f5c2
    -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
Packit d0f5c2
    }	   
Packit d0f5c2
    open my $out, ">", $dst or die "$!";
Packit d0f5c2
    my $asis = 0;
Packit d0f5c2
    while (<$in>){ 
Packit d0f5c2
    if (/^#### END_OF_HEADER/){
Packit d0f5c2
        $asis = 1; next;
Packit d0f5c2
    }	  
Packit d0f5c2
    s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
Packit d0f5c2
    print $out $_;
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
enc2xs -- Perl Encode Module Generator
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  enc2xs -[options]
Packit d0f5c2
  enc2xs -M ModName mapfiles...
Packit d0f5c2
  enc2xs -C
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
F<enc2xs> builds a Perl extension for use by Encode from either
Packit d0f5c2
Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
Packit d0f5c2
Besides being used internally during the build process of the Encode
Packit d0f5c2
module, you can use F<enc2xs> to add your own encoding to perl.
Packit d0f5c2
No knowledge of XS is necessary.
Packit d0f5c2
Packit d0f5c2
=head1 Quick Guide
Packit d0f5c2
Packit d0f5c2
If you want to know as little about Perl as possible but need to
Packit d0f5c2
add a new encoding, just read this chapter and forget the rest.
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item 0.Z<>
Packit d0f5c2
Packit d0f5c2
Have a .ucm file ready.  You can get it from somewhere or you can write
Packit d0f5c2
your own from scratch or you can grab one from the Encode distribution
Packit d0f5c2
and customize it.  For the UCM format, see the next Chapter.  In the
Packit d0f5c2
example below, I'll call my theoretical encoding myascii, defined
Packit d0f5c2
in I<my.ucm>.  C<$> is a shell prompt.
Packit d0f5c2
Packit d0f5c2
  $ ls -F
Packit d0f5c2
  my.ucm
Packit d0f5c2
Packit d0f5c2
=item 1.Z<>
Packit d0f5c2
Packit d0f5c2
Issue a command as follows;
Packit d0f5c2
Packit d0f5c2
  $ enc2xs -M My my.ucm
Packit d0f5c2
  generating Makefile.PL
Packit d0f5c2
  generating My.pm
Packit d0f5c2
  generating README
Packit d0f5c2
  generating Changes
Packit d0f5c2
Packit d0f5c2
Now take a look at your current directory.  It should look like this.
Packit d0f5c2
Packit d0f5c2
  $ ls -F
Packit d0f5c2
  Makefile.PL   My.pm         my.ucm        t/
Packit d0f5c2
Packit d0f5c2
The following files were created.
Packit d0f5c2
Packit d0f5c2
  Makefile.PL - MakeMaker script
Packit d0f5c2
  My.pm       - Encode submodule
Packit d0f5c2
  t/My.t      - test file
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item 1.1.Z<>
Packit d0f5c2
Packit d0f5c2
If you want *.ucm installed together with the modules, do as follows;
Packit d0f5c2
Packit d0f5c2
  $ mkdir Encode
Packit d0f5c2
  $ mv *.ucm Encode
Packit d0f5c2
  $ enc2xs -M My Encode/*ucm
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=item 2.Z<>
Packit d0f5c2
Packit d0f5c2
Edit the files generated.  You don't have to if you have no time AND no
Packit d0f5c2
intention to give it to someone else.  But it is a good idea to edit
Packit d0f5c2
the pod and to add more tests.
Packit d0f5c2
Packit d0f5c2
=item 3.Z<>
Packit d0f5c2
Packit d0f5c2
Now issue a command all Perl Mongers love:
Packit d0f5c2
Packit d0f5c2
  $ perl Makefile.PL
Packit d0f5c2
  Writing Makefile for Encode::My
Packit d0f5c2
Packit d0f5c2
=item 4.Z<>
Packit d0f5c2
Packit d0f5c2
Now all you have to do is make.
Packit d0f5c2
Packit d0f5c2
  $ make
Packit d0f5c2
  cp My.pm blib/lib/Encode/My.pm
Packit d0f5c2
  /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
Packit d0f5c2
    -o encode_t.c -f encode_t.fnm
Packit d0f5c2
  Reading myascii (myascii)
Packit d0f5c2
  Writing compiled form
Packit d0f5c2
  128 bytes in string tables
Packit d0f5c2
  384 bytes (75%) saved spotting duplicates
Packit d0f5c2
  1 bytes (0.775%) saved using substrings
Packit d0f5c2
  ....
Packit d0f5c2
  chmod 644 blib/arch/auto/Encode/My/My.bs
Packit d0f5c2
  $
Packit d0f5c2
Packit d0f5c2
The time it takes varies depending on how fast your machine is and
Packit d0f5c2
how large your encoding is.  Unless you are working on something big
Packit d0f5c2
like euc-tw, it won't take too long.
Packit d0f5c2
Packit d0f5c2
=item 5.Z<>
Packit d0f5c2
Packit d0f5c2
You can "make install" already but you should test first.
Packit d0f5c2
Packit d0f5c2
  $ make test
Packit d0f5c2
  PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
Packit d0f5c2
    -e 'use Test::Harness  qw(&runtests $verbose); \
Packit d0f5c2
    $verbose=0; runtests @ARGV;' t/*.t
Packit d0f5c2
  t/My....ok
Packit d0f5c2
  All tests successful.
Packit d0f5c2
  Files=1, Tests=2,  0 wallclock secs
Packit d0f5c2
   ( 0.09 cusr + 0.01 csys = 0.09 CPU)
Packit d0f5c2
Packit d0f5c2
=item 6.Z<>
Packit d0f5c2
Packit d0f5c2
If you are content with the test result, just "make install"
Packit d0f5c2
Packit d0f5c2
=item 7.Z<>
Packit d0f5c2
Packit d0f5c2
If you want to add your encoding to Encode's demand-loading list
Packit d0f5c2
(so you don't have to "use Encode::YourEncoding"), run
Packit d0f5c2
Packit d0f5c2
  enc2xs -C
Packit d0f5c2
Packit d0f5c2
to update Encode::ConfigLocal, a module that controls local settings.
Packit d0f5c2
After that, "use Encode;" is enough to load your encodings on demand.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 The Unicode Character Map
Packit d0f5c2
Packit d0f5c2
Encode uses the Unicode Character Map (UCM) format for source character
Packit d0f5c2
mappings.  This format is used by IBM's ICU package and was adopted
Packit d0f5c2
by Nick Ing-Simmons for use with the Encode module.  Since UCM is
Packit d0f5c2
more flexible than Tcl's Encoding Map and far more user-friendly,
Packit d0f5c2
this is the recommended format for Encode now.
Packit d0f5c2
Packit d0f5c2
A UCM file looks like this.
Packit d0f5c2
Packit d0f5c2
  #
Packit d0f5c2
  # Comments
Packit d0f5c2
  #
Packit d0f5c2
  <code_set_name> "US-ascii" # Required
Packit d0f5c2
  <code_set_alias> "ascii"   # Optional
Packit d0f5c2
  <mb_cur_min> 1             # Required; usually 1
Packit d0f5c2
  <mb_cur_max> 1             # Max. # of bytes/char
Packit d0f5c2
  <subchar> \x3F             # Substitution char
Packit d0f5c2
  #
Packit d0f5c2
  CHARMAP
Packit d0f5c2
  <U0000> \x00 |0 # <control>
Packit d0f5c2
  <U0001> \x01 |0 # <control>
Packit d0f5c2
  <U0002> \x02 |0 # <control>
Packit d0f5c2
  ....
Packit d0f5c2
  <U007C> \x7C |0 # VERTICAL LINE
Packit d0f5c2
  <U007D> \x7D |0 # RIGHT CURLY BRACKET
Packit d0f5c2
  <U007E> \x7E |0 # TILDE
Packit d0f5c2
  <U007F> \x7F |0 # <control>
Packit d0f5c2
  END CHARMAP
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Anything that follows C<#> is treated as a comment.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
The header section continues until a line containing the word
Packit d0f5c2
CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
Packit d0f5c2
pair per line.  Strings used as values must be quoted. Barewords are
Packit d0f5c2
treated as numbers.  I<\xXX> represents a byte.
Packit d0f5c2
Packit d0f5c2
Most of the keywords are self-explanatory. I<subchar> means
Packit d0f5c2
substitution character, not subcharacter.  When you decode a Unicode
Packit d0f5c2
sequence to this encoding but no matching character is found, the byte
Packit d0f5c2
sequence defined here will be used.  For most cases, the value here is
Packit d0f5c2
\x3F; in ASCII, this is a question mark.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
CHARMAP starts the character map section.  Each line has a form as
Packit d0f5c2
follows:
Packit d0f5c2
Packit d0f5c2
  <UXXXX> \xXX.. |0 # comment
Packit d0f5c2
    ^     ^      ^
Packit d0f5c2
    |     |      +- Fallback flag
Packit d0f5c2
    |     +-------- Encoded byte sequence
Packit d0f5c2
    +-------------- Unicode Character ID in hex
Packit d0f5c2
Packit d0f5c2
The format is roughly the same as a header section except for the
Packit d0f5c2
fallback flag: | followed by 0..3.   The meaning of the possible
Packit d0f5c2
values is as follows:
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item |0 
Packit d0f5c2
Packit d0f5c2
Round trip safe.  A character decoded to Unicode encodes back to the
Packit d0f5c2
same byte sequence.  Most characters have this flag.
Packit d0f5c2
Packit d0f5c2
=item |1
Packit d0f5c2
Packit d0f5c2
Fallback for unicode -> encoding.  When seen, enc2xs adds this
Packit d0f5c2
character for the encode map only.
Packit d0f5c2
Packit d0f5c2
=item |2 
Packit d0f5c2
Packit d0f5c2
Skip sub-char mapping should there be no code point.
Packit d0f5c2
Packit d0f5c2
=item |3 
Packit d0f5c2
Packit d0f5c2
Fallback for encoding -> unicode.  When seen, enc2xs adds this
Packit d0f5c2
character for the decode map only.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
And finally, END OF CHARMAP ends the section.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
When you are manually creating a UCM file, you should copy ascii.ucm
Packit d0f5c2
or an existing encoding which is close to yours, rather than write
Packit d0f5c2
your own from scratch.
Packit d0f5c2
Packit d0f5c2
When you do so, make sure you leave at least B<U0000> to B<U0020> as
Packit d0f5c2
is, unless your environment is EBCDIC.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: not all features in UCM are implemented.  For example,
Packit d0f5c2
icu:state is not used.  Because of that, you need to write a perl
Packit d0f5c2
module if you want to support algorithmical encodings, notably
Packit d0f5c2
the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
Packit d0f5c2
L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
Packit d0f5c2
Packit d0f5c2
=head2 Coping with duplicate mappings
Packit d0f5c2
Packit d0f5c2
When you create a map, you SHOULD make your mappings round-trip safe.
Packit d0f5c2
That is, C
Packit d0f5c2
$data> stands for all characters that are marked as C<|0>.  Here is
Packit d0f5c2
how to make sure:
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item * 
Packit d0f5c2
Packit d0f5c2
Sort your map in Unicode order.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
When you have a duplicate entry, mark either one with '|1' or '|3'.
Packit d0f5c2
  
Packit d0f5c2
=item * 
Packit d0f5c2
Packit d0f5c2
And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
Here is an example from big5-eten.
Packit d0f5c2
Packit d0f5c2
  <U2550> \xF9\xF9 |0
Packit d0f5c2
  <U2550> \xA2\xA4 |3
Packit d0f5c2
Packit d0f5c2
Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
Packit d0f5c2
this;
Packit d0f5c2
Packit d0f5c2
  E to U               U to E
Packit d0f5c2
  --------------------------------------
Packit d0f5c2
  \xF9\xF9 => U2550    U2550 => \xF9\xF9
Packit d0f5c2
  \xA2\xA4 => U2550
Packit d0f5c2
 
Packit d0f5c2
So it is round-trip safe for \xF9\xF9.  But if the line above is upside
Packit d0f5c2
down, here is what happens.
Packit d0f5c2
Packit d0f5c2
  E to U               U to E
Packit d0f5c2
  --------------------------------------
Packit d0f5c2
  \xA2\xA4 => U2550    U2550 => \xF9\xF9
Packit d0f5c2
  (\xF9\xF9 => U2550 is now overwritten!)
Packit d0f5c2
Packit d0f5c2
The Encode package comes with F<ucmlint>, a crude but sufficient
Packit d0f5c2
utility to check the integrity of a UCM file.  Check under the
Packit d0f5c2
Encode/bin directory for this.
Packit d0f5c2
Packit d0f5c2
When in doubt, you can use F<ucmsort>, yet another utility under
Packit d0f5c2
Encode/bin directory.
Packit d0f5c2
Packit d0f5c2
=head1 Bookmarks
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
ICU Home Page 
Packit d0f5c2
L<http://www.icu-project.org/>
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
ICU Character Mapping Tables
Packit d0f5c2
L<http://site.icu-project.org/charts/charset>
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
ICU:Conversion Data
Packit d0f5c2
L<http://www.icu-project.org/userguide/conversion-data.html>
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode>,
Packit d0f5c2
L<perlmod>,
Packit d0f5c2
L<perlpod>
Packit d0f5c2
Packit d0f5c2
=cut
Packit d0f5c2
Packit d0f5c2
# -Q to disable the duplicate codepoint test
Packit d0f5c2
# -S make mapping errors fatal
Packit d0f5c2
# -q to remove comments written to output files
Packit d0f5c2
# -O to enable the (brute force) substring optimiser
Packit d0f5c2
# -o <output> to specify the output file name (else it's the first arg)
Packit d0f5c2
# -f <inlist> to give a file with a list of input files (else use the args)
Packit d0f5c2
# -n <name> to name the encoding (else use the basename of the input file.
Packit d0f5c2
Packit d0f5c2
With %seen holding array refs:
Packit d0f5c2
Packit d0f5c2
      865.66 real        28.80 user         8.79 sys
Packit d0f5c2
      7904  maximum resident set size
Packit d0f5c2
      1356  average shared memory size
Packit d0f5c2
     18566  average unshared data size
Packit d0f5c2
       229  average unshared stack size
Packit d0f5c2
     46080  page reclaims
Packit d0f5c2
     33373  page faults
Packit d0f5c2
Packit d0f5c2
With %seen holding simple scalars:
Packit d0f5c2
Packit d0f5c2
      342.16 real        27.11 user         3.54 sys
Packit d0f5c2
      8388  maximum resident set size
Packit d0f5c2
      1394  average shared memory size
Packit d0f5c2
     14969  average unshared data size
Packit d0f5c2
       236  average unshared stack size
Packit d0f5c2
     28159  page reclaims
Packit d0f5c2
      9839  page faults
Packit d0f5c2
Packit d0f5c2
Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
Packit d0f5c2
how %seen is storing things its seen. So it is pathalogically bad on a 16M
Packit d0f5c2
RAM machine, but it's going to help even on modern machines.
Packit d0f5c2
Swapping is bad, m'kay :-)