|
Packit |
d0f5c2 |
#!/usr/local/bin/perl
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
# $Id: ucmlint,v 2.4 2017/06/10 17:23:50 dankogai Exp $
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
BEGIN { pop @INC if $INC[-1] eq '.' }
|
|
Packit |
d0f5c2 |
use strict;
|
|
Packit |
d0f5c2 |
our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
use Getopt::Std;
|
|
Packit |
d0f5c2 |
our %Opt;
|
|
Packit |
d0f5c2 |
getopts("Dehfv", \%Opt);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if ($Opt{e}){
|
|
Packit |
d0f5c2 |
eval { require Encode } or die "can't load Encode : $@";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
$Opt{h} and help();
|
|
Packit |
d0f5c2 |
@ARGV or help();
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub help{
|
|
Packit |
d0f5c2 |
print <<"";
|
|
Packit |
d0f5c2 |
$0 -[Dehfv] [ucm files ...]
|
|
Packit |
d0f5c2 |
-D debug mode on
|
|
Packit |
d0f5c2 |
-e test with Encode module also (requires perl 5.7.3 or higher)
|
|
Packit |
d0f5c2 |
-h shows this message
|
|
Packit |
d0f5c2 |
-f forces roundtrip check even for |[123]
|
|
Packit |
d0f5c2 |
-v verbose mode
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
$| = 1;
|
|
Packit |
d0f5c2 |
my (%Hdr, %U2E, %E2U, %Fallback);
|
|
Packit |
d0f5c2 |
my $in_charmap = 0;
|
|
Packit |
d0f5c2 |
my $nerror = 0;
|
|
Packit |
d0f5c2 |
my $nwarning = 0;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub nit($;$){
|
|
Packit |
d0f5c2 |
my ($msg, $level) = @_;
|
|
Packit |
d0f5c2 |
my $lstr;
|
|
Packit |
d0f5c2 |
if ($level == 2){
|
|
Packit |
d0f5c2 |
$lstr = 'notice';
|
|
Packit |
d0f5c2 |
}elsif ($level == 1){
|
|
Packit |
d0f5c2 |
$lstr = 'warning'; $nwarning++;
|
|
Packit |
d0f5c2 |
}else{
|
|
Packit |
d0f5c2 |
$lstr = 'error'; $nerror++;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
print "$ARGV:$lstr in line $.: $msg\n";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
for $ARGV (@ARGV){
|
|
Packit |
d0f5c2 |
open UCM, $ARGV or die "$ARGV:$!";
|
|
Packit |
d0f5c2 |
%Hdr = %U2E = %E2U = %Fallback = ();
|
|
Packit |
d0f5c2 |
$in_charmap = $nerror = $nwarning = 0;
|
|
Packit |
d0f5c2 |
$. = 0;
|
|
Packit |
d0f5c2 |
while(<UCM>){
|
|
Packit |
d0f5c2 |
chomp;
|
|
Packit |
d0f5c2 |
s/\s*#.*$//o; /^$/ and next;
|
|
Packit |
d0f5c2 |
if ($_ eq "CHARMAP"){
|
|
Packit |
d0f5c2 |
$in_charmap = 1;
|
|
Packit |
d0f5c2 |
for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
|
|
Packit |
d0f5c2 |
exists $Hdr{$must} or nit "<$must> nonexistent";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
$Hdr{mb_cur_min} > $Hdr{mb_cur_max}
|
|
Packit |
d0f5c2 |
and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
|
|
Packit |
d0f5c2 |
$Hdr{mb_cur_min},$Hdr{mb_cur_max});
|
|
Packit |
d0f5c2 |
$in_charmap = 1;
|
|
Packit |
d0f5c2 |
next;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
unless ($in_charmap){
|
|
Packit |
d0f5c2 |
my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
|
|
Packit |
d0f5c2 |
$Opt{D} and warn "$hkey => $hvalue";
|
|
Packit |
d0f5c2 |
if ($hkey eq "code_set_name"){ # name check
|
|
Packit |
d0f5c2 |
exists $Hdr{code_set_name}
|
|
Packit |
d0f5c2 |
and nit "Duplicate <code_set_name>: $hkey";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if ($hkey eq "code_set_alias"){ # alias check
|
|
Packit |
d0f5c2 |
$hvalue eq $Hdr{code_set_name}
|
|
Packit |
d0f5c2 |
and nit qq(alias "$hvalue" is already in <code_set_name>);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
$Hdr{$hkey} = $hvalue;
|
|
Packit |
d0f5c2 |
}else{
|
|
Packit |
d0f5c2 |
my $name = $Hdr{code_set_name};
|
|
Packit |
d0f5c2 |
my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
|
|
Packit |
d0f5c2 |
$Opt{v} and nit $_, 2;
|
|
Packit |
d0f5c2 |
my $uni = uniparse($unistr);
|
|
Packit |
d0f5c2 |
my $enc = encparse($encstr);
|
|
Packit |
d0f5c2 |
$fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
|
|
Packit |
d0f5c2 |
$fb = $1;
|
|
Packit |
d0f5c2 |
$Opt{f} and $fb = 0;
|
|
Packit |
d0f5c2 |
unless ($fb == 3){ # check uni -> enc
|
|
Packit |
d0f5c2 |
if (exists $U2E{$uni}){
|
|
Packit |
d0f5c2 |
nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
|
|
Packit |
d0f5c2 |
}else{
|
|
Packit |
d0f5c2 |
$U2E{$uni} = $enc;
|
|
Packit |
d0f5c2 |
$Fallback{$uni}{$enc} = 1 if $fb == 1;
|
|
Packit |
d0f5c2 |
if ($Opt{e}) {
|
|
Packit |
d0f5c2 |
my $e = hex2enc($enc);
|
|
Packit |
d0f5c2 |
my $u = hex2uni($uni);
|
|
Packit |
d0f5c2 |
my $eu = Encode::encode($name, $u);
|
|
Packit |
d0f5c2 |
$e eq $eu
|
|
Packit |
d0f5c2 |
or nit qq(encode('$name', $uni) != $enc);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
unless ($fb == 1){ # check enc -> uni
|
|
Packit |
d0f5c2 |
if (exists $E2U{$enc}){
|
|
Packit |
d0f5c2 |
nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
|
|
Packit |
d0f5c2 |
}else{
|
|
Packit |
d0f5c2 |
$E2U{$enc} = $uni;
|
|
Packit |
d0f5c2 |
$Fallback{$enc}{$uni} = 1 if $fb == 3;
|
|
Packit |
d0f5c2 |
if ($Opt{e}) {
|
|
Packit |
d0f5c2 |
my $e = hex2enc($enc);
|
|
Packit |
d0f5c2 |
my $u = hex2uni($uni);
|
|
Packit |
d0f5c2 |
$Opt{D} and warn "$uni, $enc";
|
|
Packit |
d0f5c2 |
my $de = Encode::decode($name, $e);
|
|
Packit |
d0f5c2 |
$de eq $u
|
|
Packit |
d0f5c2 |
or nit qq(decode('$name', $enc) != $uni);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
# warn "$uni, $enc, $fb";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
$in_charmap or nit "Where is CHARMAP?";
|
|
Packit |
d0f5c2 |
checkRT();
|
|
Packit |
d0f5c2 |
printf ("$ARGV: %s error%s found\n",
|
|
Packit |
d0f5c2 |
($nerror == 0 ? 'no' : $nerror),
|
|
Packit |
d0f5c2 |
($nerror > 1 ? 's' : ''));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
exit;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub hex2enc{
|
|
Packit |
d0f5c2 |
pack("C*", map {hex($_)} split(",", shift));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
sub hex2uni{
|
|
Packit |
d0f5c2 |
join("", map { chr(hex($_)) } split(",", shift));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub checkRT{
|
|
Packit |
d0f5c2 |
for my $uni (keys %E2U){
|
|
Packit |
d0f5c2 |
my $enc = $U2E{$uni} or next; # okay
|
|
Packit |
d0f5c2 |
$E2U{$U2E{$uni}} eq $uni or $Fallback{$uni}{$enc} or
|
|
Packit |
d0f5c2 |
nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
for my $enc (keys %E2U){
|
|
Packit |
d0f5c2 |
my $uni = $E2U{$enc} or next; # okay
|
|
Packit |
d0f5c2 |
$U2E{$E2U{$enc}} eq $enc or $Fallback{$enc}{$uni} or
|
|
Packit |
d0f5c2 |
nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub uniparse{
|
|
Packit |
d0f5c2 |
my $str = shift;
|
|
Packit |
d0f5c2 |
my @u;
|
|
Packit |
d0f5c2 |
push @u, $1 while($str =~ /\G<U(.*?)>/ig);
|
|
Packit |
d0f5c2 |
for my $u (@u){
|
|
Packit |
d0f5c2 |
$u =~ /^([0-9A-Za-z]+)$/o
|
|
Packit |
d0f5c2 |
or nit "malformed Unicode character: $u";
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
return join(',', @u);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub encparse{
|
|
Packit |
d0f5c2 |
my $str = shift;
|
|
Packit |
d0f5c2 |
my @e;
|
|
Packit |
d0f5c2 |
for my $e (split /\\x/io, $str){
|
|
Packit |
d0f5c2 |
$e or next; # first \x
|
|
Packit |
d0f5c2 |
$e =~ /^([0-9A-Za-z]{1,2})$/io
|
|
Packit |
d0f5c2 |
or nit "Hex $e in $str is bogus";
|
|
Packit |
d0f5c2 |
push @e, $1;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
return join(',', @e);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
__END__
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
A UCM file looks like this.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
# Comments
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
<code_set_name> "US-ascii" # Required
|
|
Packit |
d0f5c2 |
<code_set_alias> "ascii" # Optional
|
|
Packit |
d0f5c2 |
<mb_cur_min> 1 # Required; usually 1
|
|
Packit |
d0f5c2 |
<mb_cur_max> 1 # Max. # of bytes/char
|
|
Packit |
d0f5c2 |
<subchar> \x3F # Substitution char
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
CHARMAP
|
|
Packit |
d0f5c2 |
<U0000> \x00 |0 # <control>
|
|
Packit |
d0f5c2 |
<U0001> \x01 |0 # <control>
|
|
Packit |
d0f5c2 |
<U0002> \x02 |0 # <control>
|
|
Packit |
d0f5c2 |
....
|
|
Packit |
d0f5c2 |
<U007C> \x7C |0 # VERTICAL LINE
|
|
Packit |
d0f5c2 |
<U007D> \x7D |0 # RIGHT CURLY BRACKET
|
|
Packit |
d0f5c2 |
<U007E> \x7E |0 # TILDE
|
|
Packit |
d0f5c2 |
<U007F> \x7F |0 # <control>
|
|
Packit |
d0f5c2 |
END CHARMAP
|
|
Packit |
d0f5c2 |
|