#! /usr/bin/perl

# Feed this script ftp://unicode.org/MappingTables/UnicodeData-1.1.4.txt
# (as a file argument or on standard input).

# Code points at or above this value are left out of the name-start and
# name-character classes printed later in this script.
$compat_start = 0xfb00;

# Small Kana
# Each code point listed here is treated as the lower-case form of the code
# point immediately after it, and that successor is flagged as an upper-case
# form.  Both entries are derived from the one list so that a pair can never
# disagree; the former hand-written table mapped 12430 to itself rather than
# to 12431, unlike every other pair.
foreach my $small_kana (
    12353, 12355, 12357, 12359, 12361,
    12387,
    12419, 12421, 12423,
    12430,
    12449, 12451, 12453, 12455, 12457,
    12483,
    12515, 12517, 12519,
    12526,
) {
    my $full_size = $small_kana + 1;
    $uc[$small_kana]   = $full_size;
    $is_uc[$full_size] = 1;
}

# Read the semicolon-separated records from the files named on the command
# line (or from standard input) and record, per code point:
#   $type[$code]  - field 2 of the record (compared against category codes
#                   such as "Cc" and "Zs" here and further down),
#   $uc[$code]    - field 12 parsed as hex, when present and above 128,
#   $name[$code]  - field 1, kept only for "Zs" records.
# Lines starting with '#' are ignored.
while (<>) {
    next if /^#/;
    @F = split(';');
    $code = hex($F[0]);

    # "Cc" records below 0x200c are skipped entirely.
    next if $code < 0x200c && $F[2] eq "Cc";

    $type[$code] = $F[2];

    # 0x17f maps to 0x53, but SGML doesn't allow that.
    if ($F[12] && hex($F[12]) > 128) {
        my $upper = hex($F[12]);
        $uc[$code]     = $upper;
        $is_uc[$upper] = 1;
    }

    $name[$code] = $F[1] if $F[2] eq "Zs";

    # The category of the 0x4e00 record is copied to every code point in
    # 0x4e00-0x9fa5 and 0xf900-0xfa2e.
    # NOTE(review): presumably the data file lists these ranges through a
    # single record -- confirm against the input file.
    if ($code == 0x4e00) {
        foreach my $ideograph (0x4e00 .. 0x9fa5, 0xf900 .. 0xfa2e) {
            $type[$ideograph] = $F[2];
        }
    }
}

# Print the fixed head of the output, ending with the standard FUNCTION
# entries.  The text contains nothing to interpolate, so the heredoc is
# single-quoted.
print <<'EOF';
SHUNCHAR CONTROLS
BASESET "ISO Registration Number 176//CHARSET
ISO/IEC 10646-1:1993 UCS-2 with implementation level 3//ESC 2/5 2/15 4/5"

DESCSET         0               65536   0        -- 16 bit --

FUNCTION        RE                      13
                RS                      10
                SPACE                   32       
                TAB             SEPCHAR 9
EOF

# Continue the FUNCTION entries with one SEPCHAR line per "Zs" code point
# from 128 upwards, named after the record's name with spaces turned into
# hyphens.
foreach my $code (128 .. $#type) {
    next unless $type[$code] eq "Zs";
    (my $function_name = $name[$code]) =~ s/ /-/g;
    print("\"$function_name\" SEPCHAR $code\n");
}

print "NAMING\n";

# Category codes whose code points may start a name (used for NAMESTRT).
$is_nmstrt{$_} = 1 foreach qw(Lu Ll Lo No);
$is_nmstrt{"Cc"} = 1; # >= 0x200c

# Category codes whose code points may continue a name (used for NAMECHAR).
$is_nmchar{$_} = 1 foreach qw(Lm Mn Mc Nd);

# Category codes whose code points are printed after the SHORTREF line.
$is_sr{"Cc"} = 1; # >= 0x200c
$is_sr{$_} = 1 foreach qw(Pd Ps Pe Po Sm Sc So Zs Zl Zp);

# Each section below prints its keyword, then feeds the selected code points
# to output() and finishes with flush() so the last pending run is printed.
# Only code points from 128 up to (but excluding) $compat_start take part.

# Code points that have an upper-case mapping.
print "LCNMSTRT\n";
foreach my $code (128 .. $#type) {
    output($code) if $uc[$code] && $code < $compat_start;
}
flush();

# The upper-case counterparts, visited in the same order as LCNMSTRT so the
# two lists correspond entry for entry.
print "UCNMSTRT\n";
foreach my $code (128 .. $#type) {
    output($uc[$code]) if $uc[$code] && $code < $compat_start;
}
flush();

# Name-start code points that are on neither side of a case mapping.
print "NAMESTRT\n";
foreach my $code (128 .. $#type) {
    next if $uc[$code] || $is_uc[$code];
    output($code) if $is_nmstrt{$type[$code]} && $code < $compat_start;
}
flush();

# "-" and "." are listed in both the lower- and upper-case name-character
# classes.
print "LCNMCHAR\n";
output(ord("-"));
output(ord("."));
flush();

print "UCNMCHAR\n";
output(ord("-"));
output(ord("."));
flush();

# Remaining name characters, selected by category.
print "NAMECHAR\n";
foreach my $code (128 .. $#type) {
    output($code) if $is_nmchar{$type[$code]} && $code < $compat_start;
}
flush();

# Print the NAMECASE and DELIM sections.  The text contains nothing to
# interpolate, so the heredoc is single-quoted.
print <<'EOF';
NAMECASE   GENERAL    YES
           ENTITY     NO
DELIM      GENERAL    SGMLREF
	   SHORTREF   SGMLREF
EOF

# Follow the SHORTREF line with every code point from 128 upwards whose
# category is listed in %is_sr; no $compat_start limit applies here.
foreach my $code (128 .. $#type) {
    output($code) if $is_sr{$type[$code]};
}

flush();

# Print the closing NAMES and QUANTITY sections.  The text contains nothing
# to interpolate, so the heredoc is single-quoted.
print <<'EOF';
NAMES           SGMLREF 

QUANTITY        SGMLREF         -- To be determined --
        ATTSPLEN        1920    -- ?? --
        LITLEN          240     -- ?? --
        NAMELEN         240     -- ?? --
        PILEN           1920    -- ?? --
        TAGLEN          1920    -- ?? --
EOF


# output($ch)
#
# Queue the code point $ch for printing.  Consecutive code points are
# collected into a single run described by the globals $base (first code
# point of the run) and $pending (length of the run).  A code point that
# does not directly follow the current run causes that run to be printed
# via flush() and starts a new run of length one.
sub output {
    my ($ch) = @_;

    if ($pending > 0 && $base + $pending == $ch) {
        # $ch extends the current run by one.
        $pending++;
        return;
    }

    flush();
    $base    = $ch;
    $pending = 1;
}

# flush()
#
# Print the run collected by output() and reset it.  Does nothing when no
# run is pending.  The run is printed as:
#   one code point     ->  "BASE\n"
#   two code points    ->  "BASE\nLAST\n"   (each on its own line)
#   three or more      ->  "BASE-LAST\n"    (a range)
# The run length is added to the global $count and $pending is set to 0.
sub flush {
    return unless $pending > 0;

    my $last = $base + ($pending - 1);
    if ($pending == 1) {
        printf("%d\n", $base);
    }
    elsif ($pending == 2) {
        printf("%d\n%d\n", $base, $last);
    }
    else {
        printf("%d-%d\n", $base, $last);
    }

    $count += $pending;
    $pending = 0;
}