Blame bin/ucmlint

Packit d0f5c2
#!/usr/local/bin/perl
Packit d0f5c2
#
Packit d0f5c2
# $Id: ucmlint,v 2.4 2017/06/10 17:23:50 dankogai Exp $
Packit d0f5c2
#
Packit d0f5c2
Packit d0f5c2
BEGIN { pop @INC if $INC[-1] eq '.' }
Packit d0f5c2
use strict;
Packit d0f5c2
our  $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
Packit d0f5c2
Packit d0f5c2
use Getopt::Std;
Packit d0f5c2
our %Opt;
Packit d0f5c2
getopts("Dehfv", \%Opt);
Packit d0f5c2
Packit d0f5c2
if ($Opt{e}){
Packit d0f5c2
   eval { require Encode } or die "can't load Encode : $@";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
$Opt{h} and help();
Packit d0f5c2
@ARGV or help();
Packit d0f5c2
Packit d0f5c2
sub help{
Packit d0f5c2
    print <<"";
Packit d0f5c2
$0 -[Dehfv] [ucm files ...]
Packit d0f5c2
  -D debug mode on
Packit d0f5c2
  -e test with Encode module also (requires perl 5.7.3 or higher)
Packit d0f5c2
  -h shows this message
Packit d0f5c2
  -f forces roundtrip check even for |[123]
Packit d0f5c2
  -v verbose mode
Packit d0f5c2
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
$| = 1;
Packit d0f5c2
my (%Hdr, %U2E, %E2U, %Fallback);
Packit d0f5c2
my $in_charmap = 0;
Packit d0f5c2
my $nerror = 0;
Packit d0f5c2
my $nwarning = 0;
Packit d0f5c2
Packit d0f5c2
sub nit($;$){
Packit d0f5c2
    my ($msg, $level) = @_;
Packit d0f5c2
    my $lstr;
Packit d0f5c2
    if ($level == 2){
Packit d0f5c2
        $lstr = 'notice';
Packit d0f5c2
    }elsif ($level == 1){
Packit d0f5c2
        $lstr = 'warning'; $nwarning++;
Packit d0f5c2
    }else{
Packit d0f5c2
        $lstr = 'error'; $nerror++;
Packit d0f5c2
    }
Packit d0f5c2
    print "$ARGV:$lstr in line $.: $msg\n";
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
for $ARGV (@ARGV){
Packit d0f5c2
    open UCM, $ARGV or die "$ARGV:$!";
Packit d0f5c2
    %Hdr = %U2E = %E2U = %Fallback = ();
Packit d0f5c2
    $in_charmap = $nerror = $nwarning = 0;
Packit d0f5c2
    $. = 0;
Packit d0f5c2
    while(<UCM>){
Packit d0f5c2
        chomp;
Packit d0f5c2
        s/\s*#.*$//o; /^$/ and next;
Packit d0f5c2
        if ($_ eq "CHARMAP"){ 
Packit d0f5c2
            $in_charmap = 1;
Packit d0f5c2
            for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
Packit d0f5c2
                exists $Hdr{$must} or nit "<$must> nonexistent";
Packit d0f5c2
            }
Packit d0f5c2
            $Hdr{mb_cur_min} > $Hdr{mb_cur_max}
Packit d0f5c2
                and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
Packit d0f5c2
                                $Hdr{mb_cur_min},$Hdr{mb_cur_max});
Packit d0f5c2
            $in_charmap = 1;
Packit d0f5c2
            next;
Packit d0f5c2
        }
Packit d0f5c2
        unless ($in_charmap){
Packit d0f5c2
            my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
Packit d0f5c2
            $Opt{D} and warn "$hkey => $hvalue";
Packit d0f5c2
            if ($hkey eq "code_set_name"){ # name check
Packit d0f5c2
                exists $Hdr{code_set_name} 
Packit d0f5c2
                    and nit "Duplicate <code_set_name>: $hkey";
Packit d0f5c2
            }
Packit d0f5c2
            if ($hkey eq "code_set_alias"){ # alias check
Packit d0f5c2
                $hvalue eq $Hdr{code_set_name}
Packit d0f5c2
                    and nit qq(alias "$hvalue" is already in <code_set_name>);
Packit d0f5c2
            }
Packit d0f5c2
            $Hdr{$hkey} = $hvalue;
Packit d0f5c2
        }else{
Packit d0f5c2
            my $name = $Hdr{code_set_name};
Packit d0f5c2
            my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
Packit d0f5c2
            $Opt{v} and nit $_, 2;
Packit d0f5c2
            my $uni = uniparse($unistr);
Packit d0f5c2
            my $enc = encparse($encstr);
Packit d0f5c2
            $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
Packit d0f5c2
            $fb = $1; 
Packit d0f5c2
            $Opt{f} and $fb = 0;
Packit d0f5c2
            unless ($fb == 3){ # check uni -> enc
Packit d0f5c2
                if (exists $U2E{$uni}){
Packit d0f5c2
                    nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
Packit d0f5c2
                }else{
Packit d0f5c2
                    $U2E{$uni} = $enc;
Packit d0f5c2
                    $Fallback{$uni}{$enc} = 1 if $fb == 1;
Packit d0f5c2
                    if ($Opt{e}) {
Packit d0f5c2
                        my $e = hex2enc($enc);
Packit d0f5c2
                        my $u = hex2uni($uni);
Packit d0f5c2
                        my $eu = Encode::encode($name, $u);
Packit d0f5c2
                        $e eq $eu
Packit d0f5c2
                            or nit qq(encode('$name', $uni) != $enc);
Packit d0f5c2
                    }
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
            unless ($fb == 1){  # check enc -> uni
Packit d0f5c2
                if (exists $E2U{$enc}){
Packit d0f5c2
                    nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
Packit d0f5c2
                }else{
Packit d0f5c2
                    $E2U{$enc} = $uni;
Packit d0f5c2
                    $Fallback{$enc}{$uni} = 1 if $fb == 3;
Packit d0f5c2
                    if ($Opt{e}) {
Packit d0f5c2
                        my $e = hex2enc($enc);
Packit d0f5c2
                        my $u = hex2uni($uni);
Packit d0f5c2
                        $Opt{D} and warn "$uni, $enc";
Packit d0f5c2
                        my $de = Encode::decode($name, $e);
Packit d0f5c2
                        $de eq $u
Packit d0f5c2
                            or nit qq(decode('$name', $enc) != $uni);
Packit d0f5c2
                    }
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
            # warn "$uni, $enc, $fb";
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    $in_charmap or nit "Where is CHARMAP?";
Packit d0f5c2
    checkRT();
Packit d0f5c2
    printf ("$ARGV: %s error%s found\n", 
Packit d0f5c2
            ($nerror == 0 ? 'no' : $nerror),
Packit d0f5c2
            ($nerror > 1 ? 's' : ''));
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
exit;
Packit d0f5c2
Packit d0f5c2
sub hex2enc{
Packit d0f5c2
    pack("C*", map {hex($_)} split(",", shift));
Packit d0f5c2
}
Packit d0f5c2
sub hex2uni{
Packit d0f5c2
    join("", map { chr(hex($_)) } split(",", shift));
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub checkRT{
Packit d0f5c2
    for my $uni (keys %E2U){
Packit d0f5c2
        my $enc = $U2E{$uni} or next; # okay
Packit d0f5c2
        $E2U{$U2E{$uni}} eq $uni or $Fallback{$uni}{$enc} or
Packit d0f5c2
            nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
Packit d0f5c2
    }
Packit d0f5c2
    for my $enc (keys %E2U){
Packit d0f5c2
        my $uni = $E2U{$enc} or next; # okay
Packit d0f5c2
        $U2E{$E2U{$enc}} eq $enc or $Fallback{$enc}{$uni} or
Packit d0f5c2
            nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
sub uniparse{
Packit d0f5c2
    my $str = shift;
Packit d0f5c2
    my @u;
Packit d0f5c2
    push @u, $1 while($str =~ /\G<U(.*?)>/ig);
Packit d0f5c2
    for my $u (@u){
Packit d0f5c2
        $u =~ /^([0-9A-Za-z]+)$/o
Packit d0f5c2
            or nit "malformed Unicode character: $u";
Packit d0f5c2
    }
Packit d0f5c2
    return join(',', @u);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub encparse{
Packit d0f5c2
    my $str = shift;
Packit d0f5c2
    my @e;
Packit d0f5c2
    for my $e (split /\\x/io, $str){
Packit d0f5c2
        $e or next; # first \x
Packit d0f5c2
        $e =~ /^([0-9A-Za-z]{1,2})$/io
Packit d0f5c2
            or nit "Hex $e in $str is bogus";
Packit d0f5c2
        push @e, $1;
Packit d0f5c2
    }
Packit d0f5c2
    return join(',', @e);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
A UCM file looks like this.
Packit d0f5c2
Packit d0f5c2
  #
Packit d0f5c2
  # Comments
Packit d0f5c2
  #
Packit d0f5c2
  <code_set_name> "US-ascii" # Required
Packit d0f5c2
  <code_set_alias> "ascii"   # Optional
Packit d0f5c2
  <mb_cur_min> 1             # Required; usually 1
Packit d0f5c2
  <mb_cur_max> 1             # Max. # of bytes/char
Packit d0f5c2
  <subchar> \x3F             # Substitution char
Packit d0f5c2
  #
Packit d0f5c2
  CHARMAP
Packit d0f5c2
  <U0000> \x00 |0 # <control>
Packit d0f5c2
  <U0001> \x01 |0 # <control>
Packit d0f5c2
  <U0002> \x02 |0 # <control>
Packit d0f5c2
  ....
Packit d0f5c2
  <U007C> \x7C |0 # VERTICAL LINE
Packit d0f5c2
  <U007D> \x7D |0 # RIGHT CURLY BRACKET
Packit d0f5c2
  <U007E> \x7E |0 # TILDE
Packit d0f5c2
  <U007F> \x7F |0 # <control>
Packit d0f5c2
  END CHARMAP
Packit d0f5c2