Blob Blame History Raw
#!/usr/bin/perl 
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

######################################################################
#
# Initialize global variables
#
######################################################################
# Start from zeroed $ui / $li and an empty %utot table.
($ui, $li) = (0, 0);
%utot      = ();

######################################################################
#
# Open the unicode database file
#
######################################################################
# All handles are opened with the three-argument form so the mode is never
# parsed out of the path string, and every failure message carries $! so
# the reason (missing file, permissions, ...) is visible.
open ( UNICODATA , "<" , "../../unicharutil/tools/UnicodeData-Latest.txt")
   || die "cannot find UnicodeData-Latest.txt: $!";

######################################################################
#
# Open the JIS X 4051 Class file
#
######################################################################
open ( CLASS , "<" , "jisx4051class.txt")
   || die "cannot find jisx4051class.txt: $!";

######################################################################
#
# Open the JIS X 4051 Class simplified mapping
#
######################################################################
open ( SIMP , "<" , "jisx4051simp.txt")
   || die "cannot find jisx4051simp.txt: $!";

######################################################################
#
# Open the HTML report output file
#
######################################################################
open ( OUT , ">" , "anzx4051.html")
  || die "cannot open output anzx4051.html file: $!";

######################################################################
#
# Open the generated C header output file
#
######################################################################
open ( HEADER , ">" , "../jisx4051class.h")
  || die "cannot open output ../jisx4051class.h file: $!";

######################################################################
#
# Generate the license notice and page header of the HTML report
#
######################################################################
# Preamble of the HTML report: MPL notice, document title and top-level
# heading.  The text is kept in $hthmlheader and written to OUT once.
$hthmlheader = <<"END_OF_HTML";
<!-- This Source Code Form is subject to the terms of the Mozilla Public
   - License, v. 2.0. If a copy of the MPL was not distributed with this
   - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->

<HTML>
<HEAD>
<TITLE>
Analysis of JIS X 4051 to Unicode General Category Mapping
</TITLE>
</HEAD>
<BODY>
<H1>
Analysis of JIS X 4051 to Unicode General Category Mapping
</H1>
END_OF_HTML
print {*OUT} $hthmlheader;

######################################################################
#
# Generate the license notice and do-not-edit banner of the C header
#
######################################################################
# Preamble of the generated C header: editor mode line, MPL notice and a
# "do not edit" banner naming this script.  Kept in $npl, written to HEADER.
$npl = <<"END_OF_NPL";
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
    DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
    mozilla/intl/lwbrk/tools/anzx4051.pl
 */
END_OF_NPL
print {*HEADER} $npl;

# Tables filled from UnicodeData.
%gcat   = ();      # code point (hex string) -> General Category field
%dcat   = ();      # code point (hex string) -> first letter of that field
%gcount = ();      # General Category        -> number of UnicodeData lines
%dcount = ();      # first letter            -> number of UnicodeData lines

# Tables filled from the simplified-class mapping.
%simp    = ();     # class name              -> simplified class name
%sccount = ();     # simplified class name   -> number of mapping lines

# Tables filled while walking the class file.
%occ        = ();  # code point (number)     -> "simplified | class" text
%rangecount = ();  # " HH:simplified"        -> number of code points

######################################################################
#
# Read the Unicode database line by line and record each category
#
######################################################################
# Read the Unicode database.  For every line, remember the General
# Category of the code point (%gcat), the first letter of that category
# (%dcat), and count how many lines carry each of the two (%gcount, %dcount).
while (my $line = <UNICODATA>) {
   # chomp strips only a real line terminator; chop would remove a data
   # character from a final line that has no trailing newline.
   chomp $line;

   ######################################################################
   #
   # Get value from fields
   #
   ######################################################################
   my @fields   = split(/;/ , $line);
   my $code     = $fields[0];               # The unicode value (hex text)
   my $category = $fields[2];               # General Category field
   my $major    = substr($category, 0, 1);  # its first letter

   $gcat{$code} = $category;
   $dcat{$code} = $major;
   $gcount{$category}++;
   $dcount{$major}++;
}
# The handle opened for this file is UNICODATA.  The previous
# close(UNIDATA) named a handle that was never opened, so the database
# file stayed open until the script exited.
close(UNICODATA);

# Read the simplified-class mapping: field 0 is a class name, field 1 the
# simplified class it maps to.  %simp holds the mapping and %sccount
# counts how many lines name each simplified class.
while (my $line = <SIMP>) {
   # chomp, not chop: field 1 is the last field used here, so chopping an
   # unterminated final line would cut a character off the class name.
   chomp $line;

   ######################################################################
   #
   # Get value from fields
   #
   ######################################################################
   my ($class, $simplified) = split(/;/ , $line);

   $simp{$class} = $simplified;
   $sccount{$simplified}++;
}
close(SIMP);

# GetClass($u)
#
# Return the General Category recorded in %gcat for the numeric code
# point $u.  Code points with no entry of their own fall back to a fixed
# table of ranges.  If nothing matches, a warning is printed to STDOUT and
# the empty string is returned.
#
# Two fixes over the earlier version:
#   * the lookup result is held in a lexical instead of overwriting the
#     package-global $g;
#   * the not-found path returns "" explicitly.  It used to fall off the
#     end of the sub, so callers received printf's success flag (1) and
#     treated it as a category name.
sub GetClass{
  my ($u) = @_;
  my $hex = DecToHex($u);

  my $category = $gcat{$hex};
  return $category if defined($category) && $category ne "";

  # [ first, last, category ] for ranges without per-code-point entries.
  my @fallback = (
    [ 0x3400, 0x9fa5, "Han" ],
    [ 0xac00, 0xd7a3, "Lo"  ],
    [ 0xd800, 0xdb7f, "Cs"  ],
    [ 0xdb80, 0xdbff, "Cs"  ],
    [ 0xdc00, 0xdfff, "Cs"  ],
    [ 0xe000, 0xf8ff, "Co"  ],
  );
  foreach my $range (@fallback) {
    my ($first, $last, $name) = @$range;
    return $name if ($first <= $u) && ($u <= $last);
  }

  printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
  return "";
}
# GetDClass($u)
#
# Return the one-letter category recorded in %dcat (the first letter of
# the General Category) for the numeric code point $u.  Code points with
# no entry of their own fall back to a fixed table of ranges.  If nothing
# matches, a warning is printed to STDOUT and the empty string is returned.
#
# Two fixes over the earlier version:
#   * the lookup result is held in a lexical instead of overwriting the
#     package-global $g;
#   * the not-found path returns "" explicitly.  It used to fall off the
#     end of the sub, so callers received printf's success flag (1) and
#     treated it as a category name.
sub GetDClass{
  my ($u) = @_;
  my $hex = DecToHex($u);

  my $category = $dcat{$hex};
  return $category if defined($category) && $category ne "";

  # [ first, last, category ] for ranges without per-code-point entries.
  my @fallback = (
    [ 0x3400, 0x9fa5, "Han" ],
    [ 0xac00, 0xd7a3, "L"   ],
    [ 0xd800, 0xdb7f, "C"   ],
    [ 0xdb80, 0xdbff, "C"   ],
    [ 0xdc00, 0xdfff, "C"   ],
    [ 0xe000, 0xf8ff, "C"   ],
  );
  foreach my $range (@fallback) {
    my ($first, $last, $name) = @$range;
    return $name if ($first <= $u) && ($u <= $last);
  }

  printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
  return "";
}
# DecToHex($d)
#
# Format the number $d as upper-case hexadecimal, zero-padded on the left
# to at least four digits (larger values simply use more digits).
sub DecToHex{
     my $number = shift;
     my $text   = sprintf("%04X", $number);
     return $text;
}
# Walk the JIS X 4051 class file.  Each line is
#   first;last;class;...
# with first/last as hex code points (an empty "last" means a single code
# point).  For every code point not seen before this records:
#   %occ        "simplified | class" text for the code point
#   %gtotal     count per (simplified class . General Category)
#   %dtotal     count per (simplified class . one-letter category)
#   %rangecount count per (" " . first two hex digits . ":" . simplified)
%gtotal = ();
%dtotal = ();
while (my $line = <CLASS>) {
   # chomp, not chop: an unterminated final line must not lose a character.
   chomp $line;

   ######################################################################
   #
   # Get value from fields
   #
   ######################################################################
   my ($first_hex, $last_hex, $class) = (split(/;/ , $line))[0, 1, 2];

   # A line with no class field (for example a blank line) defines
   # nothing.  Without this guard it was processed as "U+0000 with an
   # empty class" and could occupy that slot in %occ.
   next unless defined($class);

   # Classes whose name starts with "a" are not analysed.
   next if substr($class, 0, 1) eq "a";

   my $sc    = $simp{$class};
   my $first = hex($first_hex);
   my $last  = ($last_hex eq "") ? $first : hex($last_hex);

   foreach my $k ($first .. $last)
   {
     # The first definition of a code point wins; later ones are ignored
     # silently (a "conflicting definition" warning used to be printed
     # here and was disabled).
     next if exists($occ{$k});

     $occ{$k} = $sc . " | " . $class;
     my $gclass = GetClass($k);
     my $dclass = GetDClass($k);
     $gtotal{$sc . $gclass}++;
     $dtotal{$sc . $dclass}++;

     # Bucket by the first two hex digits of the code point.
     my $u  = DecToHex($k);
     my $rk = " " . substr($u,0,2) . ":" . $sc;
     $rangecount{$rk}++;
   }
}

#print %gtotal;
#print %dtotal;

# printreport()
#
# Write two HTML tables to the OUT handle.
#
#   1. One row per simplified class (keys of %sccount).  Columns are the
#      one-letter categories (keys of %dcount, counts from %dtotal), then a
#      "Total" column summing those, then the General Categories (keys of
#      %gcount, counts from %gtotal).
#   2. One row per two-hex-digit prefix 00..4E, one column per simplified
#      class, counts from %rangecount.  Rows whose counts sum to zero are
#      omitted.
#
# Takes no arguments; reads only the global tables named above.  All
# scratch variables are lexical (they used to leak into package globals)
# and the zero test on the row sum is numeric.  The emitted text is
# unchanged.
sub printreport
{
    my @dcats   = sort(keys %dcount);
    my @gcats   = sort(keys %gcount);
    my @classes = sort(keys %sccount);

    # ---- Table 1: simplified class x category --------------------------
    print OUT "<TABLE BORDER=3>\n";
    print OUT "<TR BGCOLOR=blue><TH><TH>\n";

    foreach my $d (@dcats) {
       print OUT "<TD BGCOLOR=red>$d</TD>\n";
    }

    print OUT "<TD BGCOLOR=white>Total</TD>\n";
    foreach my $g (@gcats) {
       print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
    }
    print OUT "</TR>\n";

    foreach my $sc (@classes) {

       print OUT "<TR><TH>$sc<TH>\n";

       my $total = 0;
       foreach my $d (@dcats) {
         # A missing entry prints as an empty cell and adds 0.
         my $count = $dtotal{$sc . $d};
         $total += $count;
         print OUT "<TD>$count</TD>\n";
       }

       print OUT "<TD BGCOLOR=white>$total</TD>\n";

       foreach my $g (@gcats) {
         my $count = $gtotal{$sc . $g};
         print OUT "<TD>$count</TD>\n";
       }

       print OUT "</TR>\n";
    }
    print OUT "</TABLE>\n";

    # ---- Table 2: code point prefix x simplified class -----------------
    print OUT "<TABLE BORDER=3>\n";
    print OUT "<TR BGCOLOR=blue><TH><TH>\n";

    foreach my $sc (@classes)
    {
       print OUT "<TD BGCOLOR=red>$sc</TD>\n";
    }

    print OUT "</TR>\n";

    # NOTE(review): only prefixes 00..4E are reported; the reason for the
    # 0x4f bound is not visible in this file -- confirm before changing.
    for (my $rr = 0; $rr < 0x4f; $rr++)
    {
       my $r      = sprintf("%02X" , $rr);
       my $row    = "<TR><TH>" . $r . "<TH>\n";
       my $rowsum = 0;

       foreach my $sc (@classes) {
         my $count = $rangecount{ " " . $r . ":" . $sc};
         $row    .= sprintf("<TD>%s</TD>\n", $count);
         $rowsum += $count;
       }

       $row .= "</TR>\n";

       # Only rows with at least one counted code point are written.
       print OUT $row if $rowsum != 0;
    }
    print OUT "</TABLE>\n";

}
printreport();

# printarray($r, $def)
#
# Emit one C array, gLBClass<$r>, to the HEADER handle for the 256 code
# points whose upper byte is the two-hex-digit string $r.
#
#   $r    two hex digits naming the block (used verbatim in the array name)
#   $def  the character written for code points that have no %occ entry
#
# The array has 32 entries.  Each entry is "0x" followed by eight
# characters, one per code point, written from the highest code point of
# the group of eight down to the lowest.  The character for a code point
# is the second character of its %occ text (presumably a hex digit of the
# simplified class name -- confirm against jisx4051simp.txt).
#
# A "[r || def]" progress line is also printed to STDOUT.
#
# Scratch variables are lexical (they used to leak into package globals,
# including $k) and no string comparison is applied to integers.  The
# emitted text is unchanged.
sub printarray
{
   my($r, $def) = @_;
   printf "[%s || %s]\n", $r, $def;

   my $base = hex($r) * 256;
   printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;

   for (my $i = 0 ; $i < 256; $i += 8)
   {
      # Collect the eight characters of this entry, highest code point first.
      my $word = "";
      for (my $j = 7 ; $j >= 0; $j--)
      {
          my $v = $base + $i + $j;
          $word .= exists($occ{$v}) ? substr($occ{$v}, 1, 1) : $def;
      }
      printf HEADER "0x%s, // U+%04X - U+%04X\n", $word, $base + $i, ($base + $i + 7);
   }
   print HEADER "};\n\n";
}
printarray("00", "7");
printarray("20", "7");
printarray("21", "7");
printarray("30", "5");
printarray("0E", "8");
printarray("17", "7");
#print %rangecount;

######################################################################
#
# Close files
#
######################################################################
# Buffered output is flushed at close, so a write failure (full disk,
# I/O error) only shows up here.  Check the two output handles so a
# truncated generated file cannot go unnoticed.  CLASS is read-only and
# needs no check.
close(HEADER) || die "cannot finish writing ../jisx4051class.h: $!";
close(CLASS);
close(OUT)    || die "cannot finish writing anzx4051.html: $!";