#!/usr/bin/env perl
use strict;
use warnings;

use Getopt::Std;

# Parse the command line: -h is a flag, -s and -n each take a value.
# getopts() returns false when it finds an invalid option; show the usage
# text in that case instead of continuing with a half-parsed command line.
my %options;
getopts("hs:n:", \%options) or usage();

# Write the one-line synopsis to STDERR and terminate with exit status 1.
# Never returns.
sub usage {
    my $synopsis = "Usage: cat CHARSET_to_uni.trans | $0 -s START -n NAME\n";
    print {*STDERR} $synopsis;
    exit 1;
}

# Show the usage text when help is requested, when -s or -n is missing, or
# when stray positional arguments are present.  -s is tested with defined()
# rather than for truth so that a start code of 0 is accepted.
usage() if $options{h} || !defined($options{s}) || !$options{n};
usage() if @ARGV;

# START must be a non-negative integer, written either in decimal or as
# 0x-prefixed hexadecimal (the notation the input lines use for their codes).
# Anything else is rejected instead of being silently coerced to 0.
usage() unless $options{s} =~ /\A(?:0[xX][0-9a-fA-F]+|[0-9]+)\z/;

my $start = $options{s} =~ /\A0[xX]/ ? hex($options{s}) : int($options{s});
my $charset = $options{n};

# Overrides applied after a character name has been rewritten by the input
# loop: the key is the rewritten name, the value is the keysym string that is
# printed in its place.  Names without an entry here are printed unchanged.
my $keysyms = {
    # Printed as an empty keysym string.
    'undefined' => '',

    # Cyrillic letters whose keysym carries a language prefix.
    'cyrillic_small_letter_ukrainian_ie' => 'ukrainian_cyrillic_small_letter_ie',
    'cyrillic_small_letter_byelorussian_ukrainian_i' => 'ukrainian_cyrillic_small_letter_i',
    'cyrillic_small_letter_yi' => 'ukrainian_cyrillic_small_letter_yi',
    'cyrillic_capital_letter_ukrainian_ie' => 'ukrainian_cyrillic_capital_letter_ie',
    'cyrillic_capital_letter_byelorussian_ukrainian_i' => 'ukrainian_cyrillic_capital_letter_i',
    'cyrillic_capital_letter_yi' => 'ukrainian_cyrillic_capital_letter_yi',
    'cyrillic_capital_letter_dje' => 'serbocroatian_cyrillic_capital_letter_dje',
    'cyrillic_capital_letter_gje' => 'macedonian_cyrillic_capital_letter_gje',
    'cyrillic_capital_letter_dze' => 'macedonian_cyrillic_capital_letter_dze',
    'cyrillic_capital_letter_tshe' => 'serbocroatian_cyrillic_capital_letter_chje',
    'cyrillic_capital_letter_kje' => 'macedonian_cyrillic_capital_letter_kje',
    'cyrillic_capital_letter_short_u' => 'bielorussian_cyrillic_capital_letter_short_u',
    'cyrillic_small_letter_dje' => 'serbocroatian_cyrillic_small_letter_dje',
    'cyrillic_small_letter_gje' => 'macedonian_cyrillic_small_letter_gje',
    'cyrillic_small_letter_dze' => 'macedonian_cyrillic_small_letter_dze',
    'cyrillic_small_letter_tshe' => 'serbocroatian_cyrillic_small_letter_chje',
    'cyrillic_small_letter_kje' => 'macedonian_cyrillic_small_letter_kje',
    'cyrillic_small_letter_short_u' => 'bielorussian_cyrillic_small_letter_short_u',

    # Latin, Greek, punctuation and symbol names.
    'middle_dot' => 'periodcentered',
    'not' => 'notsign',
    'sharp_s' => 'ssharp',
    'Ostroke' => 'Ooblique',
    'ostroke' => 'oslash',
    'Tstroke' => 'Tslash',
    'tstroke' => 'tslash',
    'greek_tonos' => 'accent',
    'greek_dialytika_tonos' => 'diaeresisaccent',
    'Xi' => 'Ksi',
    'Chi' => 'Khi',
    'xi' => 'ksi',
    'chi' => 'khi',
    'finalsigma' => 'terminalsigma',
    'overline' => 'overscore',
    'double_low_line' => 'doubleunderscore',
    'no_break_space' => 'nobreakspace',
    'inverted_exclamation_mark' => 'exclamdown',
    'pound' => 'sterling',
    'broken_bar' => 'brokenbar',
    'feminine_ordinal_indicator' => 'ordfeminine',
    'left_pointing_double_angle_quotation_mark' => 'guillemotleft',
    'soft_hyphen' => 'hyphen',
    'plus_minus' => 'plusminus',
    'superscript_one' => 'onesuperior',
    'superscript_two' => 'twosuperior',
    'superscript_digit_two' => 'twosuperior',
    'superscript_three' => 'threesuperior',
    'micro' => 'mu',
    'pilcrow' => 'paragraph',
    'masculine_ordinal_indicator' => 'masculine',
    'right_pointing_double_angle_quotation_mark' => 'guillemotright',
    'inverted_question_mark' => 'questiondown',
    'multiplication' => 'multiply',
    'dotless_i' => 'idotless',
    'numero' => 'number_acronym',
    'horizontal_ellipsis' => 'ellipsis',
    'double_dagger' => 'doubledagger',
    'per_mille' => 'permille',
    'em_dash' => 'emdash',
    'en_dash' => 'endash',
    'trade_mark' => 'trademark',

    # Cyrillic soft/hard signs.  Open question carried over from the original
    # author: should these be synonyms rather than replacements?
    'cyrillic_small_letter_soft' => 'cyrillic_small_soft_sign',
    'cyrillic_small_letter_hard' => 'cyrillic_small_hard_sign',
    'cyrillic_capital_letter_soft' => 'cyrillic_capital_soft_sign',
    'cyrillic_capital_letter_hard' => 'cyrillic_capital_hard_sign',
};

# Rewrite the character name found in an input line's trailing comment (for
# example "LATIN SMALL LETTER A WITH ACUTE") into the compact spelling used
# for keysym names ("aacute").  Takes the raw comment text and returns the
# rewritten name; the $keysyms override lookup is left to the caller.
sub _normalize_name {
    my ($name) = @_;

    # Trim ALL surrounding whitespace.  Anything left at the end (extra
    # blanks, a CR from CRLF input) would stop the $-anchored rewrites below
    # from matching.
    $name =~ s/^\s+//;
    $name =~ s/\s+$//;
    $name =~ tr/A-Z/a-z/;
    $name =~ s/[ -]+/_/g;

    # Script prefixes: capital Latin letters/ligatures are upper-cased as a
    # whole, capital Greek letters only get an upper-case initial.
    $name =~ s/^latin_capital_(?:letter|ligature)_([a-z]+)/uc($1)/e;
    $name =~ s/^latin_small_(?:letter|ligature)_//;
    $name =~ s/^greek_capital_letter_([a-z])/uc($1)/e;
    $name =~ s/^greek_small_letter_//;
    $name =~ s/^hebrew_letter_//;

    # Diacritics: rename the Greek ones, then glue "<letter>_with_<mark>"
    # together into "<letter><mark>".
    $name =~ s/_with_tonos$/_with_accent/;
    $name =~ s/_with_dialytika$/_with_diaeresis/;
    $name =~ s/_with_dialytika_and_tonos$/_with_diaeresisaccent/;
    $name =~ s/^([A-Za-z]+)_with_/$1/;
    $name =~ s/^vulgar_fraction_([a-z]+)_([a-z]+)$/$1$2/;

    $name =~ s/acute_accent$/acute/;
    $name =~ s/dot_above$/abovedot/;
    $name =~ s/double_acute$/doubleacute/;
    $name =~ s/ring_above$/ring/;
    $name =~ s/_single_quotation_mark$/quote/;

    # Drop a trailing "_sign", join a leading "final_", and remove a trailing
    # parenthesised remark.
    $name =~ s/_sign$//;
    $name =~ s/^final_/final/;
    $name =~ s/_\(.*\)$//;

    return $name;
}

# Read "0xNN<ws>U+XXXX<ws>#NAME" lines from STDIN and fill $table, indexed by
# (code - $start), with the lower-cased Unicode value and the keysym name.
# Codes below $start are skipped; any line of another shape is fatal.
my $table = [];
while (my $line = <STDIN>) {
    my ($c, $uni, $comment) =
        $line =~ /^0x([0-9a-fA-F]{2})[ \t]+U\+([0-9a-fA-F]{4})[ \t]+#(.*)/
        or die "invalid line: $line";

    my $code = hex($c);
    next if $code < $start;

    my $name = _normalize_name($comment);

    # Prefer the explicit override; otherwise use the rewritten name as-is.
    $table->[$code - $start] = {
        uni    => lc($uni),
        keysym => $keysyms->{$name} // $name,
    };
}

# Emit the C array definition: one row per code from $start up to 0xff.
# Codes for which no input line was stored get the placeholder row.
my $placeholder = "\t{ 0xfffd, \"\" },\n";

my @rows = map {
    defined($_) ? "\t{ 0x$_->{uni}, \"$_->{keysym}\" },\n" : $placeholder
} @$table;

# Pad the tail so the rows reach code 255.  A zero or negative repeat count
# yields an empty list, so nothing is added when the table is already full.
my $missing = 256 - $start - scalar(@rows);
push @rows, ($placeholder) x $missing;

print "static sym\n",
      "const ${charset}[] = {\n",
      @rows,
      "};\n";