|
Packit |
5d935b |
# This utility script interprets HTML files from MS's version of the OT spec
|
|
Packit |
5d935b |
# to generate tags script for OTTags.pm
|
|
Packit |
5d935b |
# The three files processed are scripttags.htm, featurelist.htm, and languagetags.htm
|
|
Packit |
5d935b |
# These files are assumed to be in "C:\Reference\Microsoft\OpenType 1.6" unless
|
|
Packit |
5d935b |
# a folder name is supplied as the sole argument on the command line.
|
|
Packit |
5d935b |
#
|
|
Packit |
5d935b |
# Output (to stdout) is in perl syntax for the hash initialization, e.g.:
|
|
Packit |
5d935b |
# "Arabic" => "arab",
|
|
Packit |
5d935b |
# "Armenian" => "armn",
|
|
Packit |
5d935b |
# This output can then be transferred to Tags.pm
|
|
Packit |
5d935b |
#
|
|
Packit |
5d935b |
# Bob Hallissy 2010-09-16
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
use strict;
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
use File::Spec::Functions;
|
|
Packit |
5d935b |
use HTML::Parser;
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Folder holding the three spec pages: the first command-line argument when
# one is given (and is a true value), otherwise the built-in default path.
my $dir = ($ARGV[0] ? $ARGV[0] : "/Reference/Microsoft/OpenType 1.6");

# Stop right away unless every page we need exists as a plain file in $dir.
die "Cannot locate .HTM files in '$dir'.\n" unless (
    -f catfile($dir, "languagetags.htm") and
    -f catfile($dir, "featurelist.htm") and
    -f catfile($dir, "scripttags.htm")
);
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# State shared between the main loop and the HTML::Parser callbacks below.
my $filename;   # Name of the .htm file currently being parsed
my $which;      # either LANGUAGE, FEATURE, or SCRIPT

my $curText;    # Text accumulator.
my $curCol;     # Which column of the table we're processing -- reset to 0 by each <tr> start tag
my $td;         # ref to array of cell text from the <td> elements of the containing <tr>

my (%tttags, %iso639list);  # Accumulated data
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Text callback for the HTML parser.
#   $self - the parser object (unused)
#   $text - the chunk of text just seen
# Appends the chunk to the $curText accumulator; the accumulator is cleared
# by start() and consumed by end().
sub text
{
    my ($self, $text) = @_;
    $curText .= $text;
}
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Start-tag callback for the HTML parser.
#   $self    - the parser object (unused)
#   $tagname - name of the tag being opened
# Every start tag clears the text accumulator.  A <tr> additionally begins a
# new table row: the column counter goes back to 0 and any cell data kept
# from the previous row is discarded.
sub start
{
    my ($self, $tagname) = @_;
    $curText = '';
    if ($tagname eq 'tr')
    {
        $curCol = 0;
        undef $td;
    }
}
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# End-tag callback for the HTML parser.
#   $self    - the parser object (unused)
#   $tagname - name of the tag being closed
#
# </th>: for the first header cell of a row, the first word of the cell text
#        (upper-cased) becomes $which, and must match the start of $filename
#        case-insensitively -- otherwise we die.
# </td>: the accumulated text is trimmed, dash-folded and stored as the next
#        cell of the current row.
# </tr>: if the row collected any <td> cells, they are turned into entries
#        of %tttags (and %iso639list when a third cell is present).
sub end
{
    my ($self, $tagname) = @_;
    if ($tagname eq 'th')
    {
        if ($curCol++ == 0)
        {
            # confirm which table we have:
            # Capture in list context so a failed match yields an empty
            # $which instead of whatever $1 held from an earlier match.
            my ($firstWord) = $curText =~ /^(\S+)/;
            $which = uc(defined $firstWord ? $firstWord : '');
            die "Unexpected table header '$curText' in '$filename'.\n" unless $filename =~ /^${which}/i;
        }
    }
    elsif ($tagname eq 'td')
    {
        # strip leading and trailing whitespace and quotes:
        $curText =~ s/[\s']+$//;
        $curText =~ s/^[\s']+//;
        # fold dashes to hyphen-minus:
        $curText =~ s/[\x{2010}-\x{201F}]/-/g;
        $td->[$curCol++] = $curText;
    }
    elsif ($tagname eq 'tr' && defined $td)
    {
        # Ok -- got a complete row of data to work with

        # Feature table is reversed with tag being first:
        $td = [ reverse @{$td} ] if $which eq "FEATURE";

        # So now
        #   $td->[0] is the name (of script, language, or feature(s))
        #   $td->[1] is the tag name plus possibly extra stuff
        #   $td->[2], if exists, is comma-separated iso639 language codes

        my ($name, $tag, $iso639list) = @{$td};

        if ($tag =~ /^(\S+)\s+(.+)$/)
        {
            # Extra text after the tag name, such as Dhivehi has "(deprecated)" after the "DHV " tag -- move it to name.
            $tag = $1;
            $name .= " $2";
        }

        if ($tag =~ /^(.{1,4})-(.{1,4})$/)
        {
            # Special handling for feature names like 'cv01-cv99'
            my ($tag1, $tag2) = ($1, $2);
            for my $rangeTag ($tag1 .. $tag2)
            {
                # Trailing digits of the tag distinguish the generated names;
                # empty when the tag has none.
                my ($index) = $rangeTag =~ /(\d+)$/;
                $index = '' unless defined $index;
                # Pad a copy of the tag rather than the loop variable itself.
                my $padded = $rangeTag . ' ' x (4 - length($rangeTag));
                $tttags{$which}{"$name $index"} = $padded;
            }
        }
        else
        {
            # Normal tags
            # Pad the tag:
            $tag .= ' ' x (4 - length($tag));
            $tttags{$which}{$name} = $tag;
        }

        if (defined $iso639list)
        {
            $iso639list =~ s/[, ]+/ /g;         # Strip commas, leaving space.
            $iso639list{$tag} = $iso639list;    # Save for later
        }
    }
}
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Warn on STDERR when a string contains anything outside the 7-bit range
# \x00-\x7F.
#   $str (only argument) - the string to check
# When such characters are present, prints "Wide data:" followed by a copy
# with each of them replaced by '?', then the original string.  Prints
# nothing for a string that is already within range.
sub VerifyAnsi
{
    my $str = shift;
    my $strA = $str;
    $strA =~ s/[^\x00-\x7F]/?/g;
    print STDERR "Wide data:\n$strA\n$str\n" if $str ne $strA;
}
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Build the HTML parser (version 3 API) and hook up the three callbacks
# defined above; each one receives the parser object plus the tag name
# (start/end) or the text chunk (text).  report_tags limits tag reporting
# to the table-related elements listed.
my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [\&start, 'self,tagname'],
    end_h       => [\&end,   'self,tagname'],
    text_h      => [\&text,  'self,text'],
    report_tags => [qw(table th tr td)],
);
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Run each of the three spec pages through the parser.  $filename is kept in
# the file-level variable because the end() callback checks table headers
# against it.
foreach (qw (scripttags.htm languagetags.htm featurelist.htm))
{
    $filename = $_;
    my $fh;
    # Pages are read as UTF-8; give up with the OS error if one cannot be opened.
    open($fh, "<:utf8", catfile($dir, $filename)) or die "cannot open '$filename': $!\n";
    $p->parse_file($fh);
    close $fh;
}
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Emit the fixed opening lines of the generated Perl text.  The heredoc is
# single-quoted so the body is printed literally.
print <<'EOF';
# All data below derived Microsoft OpenType specification 1.6

%tttags = (

EOF
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
# Write one sub-hash of name => tag pairs per table type, closing the
# %tttags initializer opened by the preamble.
for $which (qw (SCRIPT LANGUAGE FEATURE))
{
    print "'$which' => {\n";
    # Alpha order by name (not tag)
    foreach my $name (sort keys (%{$tttags{$which}}))
    {
        # Flag (on STDERR) any entry containing non-ASCII characters.
        VerifyAnsi("$name => $tttags{$which}{$name}");
        print " \"$name\" => '$tttags{$which}{$name}',\n";
    }
    print " },\n\n";
}
print ");\n\n";

# Then the tag => iso639 code list mapping, ordered by tag.
print "\%iso639 = (\n";
foreach my $tag (sort keys(%iso639list))
{
    VerifyAnsi("$tag => $iso639list{$tag}");
    # Plain print: the line is data, not a format, so a '%' in it must not
    # be interpreted as a conversion.
    print " '$tag' => '$iso639list{$tag}',\n";
}
print ");\n";
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
=head1 AUTHOR
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
Bob Hallissy L<http://scripts.sil.org/FontUtils>.
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
=head1 LICENSING
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
Copyright (c) 1998-2014, SIL International (http://www.sil.org)
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
This script is released under the terms of the Artistic License 2.0.
|
|
Packit |
5d935b |
For details, see the full text of the license in the file LICENSE.
|
|
Packit |
5d935b |
|
|
Packit |
5d935b |
=cut
|
|
Packit |
5d935b |
|