# buildutils/convert_tags_from_html.pl
#
# This utility script interprets HTML files from MS's version of the OT spec
# to generate tags script for OTTags.pm
# The three files processed are scripttags.htm, featurelist.htm, and languagetags.htm
# These files are assumed to be in "C:\Reference\Microsoft\OpenType 1.6" unless
#   a folder name is supplied as the sole argument on the command line.
#
# Output (to stdout) is in perl syntax for the hash initialization, e.g.:
#	    "Arabic" => "arab",
#	    "Armenian" => "armn",
# This output can then be transferred to Tags.pm
#
# Bob Hallissy 2010-09-16

use strict;
use File::Spec::Functions;
use HTML::Parser;
# Folder holding the spec's .htm files: the first command-line argument when
# one is given, otherwise the default location.  The argument is tested with
# defined/length rather than for truth so that a folder literally named "0"
# is honoured.
my $dir = (defined $ARGV[0] && length $ARGV[0])
	? $ARGV[0]
	: "/Reference/Microsoft/OpenType 1.6";

# Refuse to start unless all three input files are present in $dir.
die "Cannot locate .HTM files in '$dir'.\n" unless (
	-f catfile($dir, "languagetags.htm") and
	-f catfile($dir, "featurelist.htm") and
	-f catfile($dir, "scripttags.htm")
	);

# State shared between the main loop and the HTML::Parser handlers:
my $filename;	# Name of the file currently being parsed
my $which;		# either LANGUAGE, FEATURE, or SCRIPT

my $curText;	# Text accumulator.
my $curCol;		# Which column of the table we're processing -- reset to 0 by <tr>
my $td;			# ref to array of text from a <tr> containing <td>

my (%tttags, %iso639list);   # Accumulated data
# HTML::Parser text handler.
#
#   $self - the parser object (unused)
#   $text - the piece of text just seen by the parser
#
# Appends $text to the $curText accumulator; the accumulator is emptied again
# by start() and consumed by end().
sub text
{
	my ($self, $text) = @_;
	$curText .= $text;
}
# HTML::Parser start-tag handler.
#
#   $self    - the parser object (unused)
#   $tagname - name of the element being opened
#
# Every reported start tag empties the text accumulator, so that $curText
# holds only the text seen since the most recent start tag.  A <tr>
# additionally begins a new row: the column counter goes back to 0 and any
# cells collected for the previous row are discarded.
sub start
{
	my ($self, $tagname) = @_;
	$curText = '';
	if ($tagname eq 'tr')
	{
		$curCol = 0;
		undef $td;
	}
}
# HTML::Parser end-tag handler.
#
#   $self    - the parser object (unused)
#   $tagname - name of the element being closed
#
# </th>  The first header cell of a row identifies the table: its first word,
#        upper-cased, becomes $which and must match the start of $filename
#        (case-insensitively), otherwise the script dies.
# </td>  The accumulated cell text is cleaned up and stored in the next
#        column slot of the current row ($td).
# </tr>  If the row collected any <td> cells, it is turned into entries in
#        %tttags (name => 4-character tag) and, when a third column exists,
#        %iso639list (tag => space-separated codes).
sub end
{
	my ($self, $tagname) = @_;
	if ($tagname eq 'th')
	{
		if ($curCol++ == 0)
		{
			# Confirm which table we have.  The capture is taken from the
			# match's return value so a failed match can never leave a stale
			# $1 behind, and the word is quoted before being used as a pattern.
			my ($word) = $curText =~ /^(\S+)/;
			die "Unexpected table header '$curText' in '$filename'.\n"
				unless defined $word and $filename =~ /^\Q$word\E/i;
			$which = uc($word);
		}
	}
	elsif ($tagname eq 'td')
	{
		# Strip leading and trailing whitespace and quotes:
		$curText =~ s/[\s']+$//;
		$curText =~ s/^[\s']+//;
		# Fold dashes to hyphen-minus.
		# NOTE(review): the range U+2010..U+201F also contains the curly
		# quotation marks (U+2018..U+201F), so those become '-' as well --
		# confirm that this is intended.
		$curText =~ s/[\x{2010}-\x{201F}]/-/g;
		$td->[$curCol++] = $curText;
	}
	elsif ($tagname eq 'tr' && defined $td)
	{
		# Ok -- got a complete row of data to work with

		# Feature table is reversed with tag being first:
		$td = [ reverse @{$td} ] if $which eq "FEATURE";

		# So now
		#    $td->[0] is the name (of script, language, or feature(s))
		#    $td->[1] is the tag name plus possibly extra stuff
		#    $td->[2], if exists, is comma-separated iso639 language codes

		my ($name, $tag, $isoCodes) = @{$td};

		# A row without both a name and a tag cannot yield a mapping; report
		# it instead of emitting an entry whose tag is four blanks.
		unless (defined $name and defined $tag and length $tag)
		{
			warn "Skipping table row without a name and tag in '$filename'.\n";
			return;
		}

		if ($tag =~ /^(\S+)\s+(.+)$/)
		{
			# Extra text after the tag name, such as Dhivehi has "(deprecated)" after the "DHV " tag -- move it to name.
			$tag = $1;
			$name .= " $2";
		}

		if ($tag =~ /^(.{1,4})-(.{1,4})$/)
		{
			# Special handling for feature names like 'cv01-cv99': expand the
			# range and emit one entry per member, named "<name> <digits>".
			my ($first, $last) = ($1, $2);
			for my $member ($first .. $last)
			{
				my ($index) = $member =~ /(\d+)$/;
				# No trailing digits: use the member tag itself so that each
				# member still gets a distinct name.
				$index = $member unless defined $index;
				$tttags{$which}{"$name $index"} = sprintf('%-4s', $member);	# pad tag
			}
		}
		else
		{
			# Normal tags
			# Pad the tag to 4 characters:
			$tag = sprintf('%-4s', $tag);
			$tttags{$which}{$name} = $tag;
		}

		if (defined $isoCodes)
		{
			$isoCodes =~ s/[, ]+/ /g;  # Strip commas, leaving space.
			$iso639list{$tag} = $isoCodes; # Save for later
		}
	}
}
# Report (on STDERR) a string that contains characters outside 7-bit ASCII.
#
#   $str - the string to check
#
# Builds a copy of $str in which every character above U+007F is replaced by
# '?'.  If that copy differs from $str, prints "Wide data:" followed by the
# copy and the original, each on its own line.  Prints nothing otherwise.
sub VerifyAnsi
{
	my ($str) = @_;
	(my $ascii = $str) =~ s/[^\x00-\x7F]/?/g;
	print STDERR "Wide data:\n$ascii\n$str\n" if $str ne $ascii;
}
# Build the parser.  Only <table>, <th>, <tr> and <td> tags are reported to
# the start/end handlers; text is passed to the text handler.
# NOTE(review): the 'text' argspec is understood to deliver source text with
# character entities left undecoded ('dtext' decodes them) -- confirm against
# the HTML::Parser documentation that this is what the input files need.
my $p = HTML::Parser->new(
	api_version => 3,
	start_h     => [\&start, 'self,tagname'],
	end_h       => [\&end,   'self,tagname'],
	text_h      => [\&text,  'self,text'],
	report_tags => [qw(table th tr td)],
	);
# Parse the three input files in turn.  $filename is set first because the
# end handler uses it to validate table headers and in its diagnostics.
foreach my $file (qw(scripttags.htm languagetags.htm featurelist.htm))
{
	$filename = $file;
	open(my $fh, "<:utf8", catfile($dir, $filename))
		or die "cannot open '$filename': $!\n";
	$p->parse_file($fh);
	close $fh;
}
# Emit the fixed preamble of the generated code.  The here-document is
# single-quoted so that nothing in it is interpolated.
print <<'EOF';
# All data below derived Microsoft OpenType specification 1.6

%tttags = (

EOF
# Emit the body of %tttags: one sub-hash per table, in a fixed order.
for my $table (qw(SCRIPT LANGUAGE FEATURE))
{
	print "'$table' => {\n";
	# Alpha order by name (not tag)
	foreach my $name (sort keys %{$tttags{$table}})
	{
		my $tag = $tttags{$table}{$name};
		VerifyAnsi("$name => $tag");

		# The name is written as a double-quoted and the tag as a
		# single-quoted Perl literal, so escape the characters that are
		# special inside each kind of literal.
		(my $quotedName = $name) =~ s/([\\"\$\@])/\\$1/g;
		(my $quotedTag  = $tag)  =~ s/([\\'])/\\$1/g;
		print "    \"$quotedName\" => '$quotedTag',\n";
	}
	print "    },\n\n";
}
print ");\n\n";
# Emit %iso639: tag => space-separated ISO 639 codes, sorted by tag.
print "\%iso639 = (\n";
foreach my $tag (sort keys %iso639list)
{
	my $codes = $iso639list{$tag};
	VerifyAnsi("$tag => $codes");

	# Both values are written as single-quoted Perl literals.
	(my $quotedTag   = $tag)   =~ s/([\\'])/\\$1/g;
	(my $quotedCodes = $codes) =~ s/([\\'])/\\$1/g;

	# Plain print, not printf: the data must never be treated as a format
	# string (a '%' in it would be read as a conversion).
	print "    '$quotedTag' => '$quotedCodes',\n";
}
print ");\n";

=head1 AUTHOR

Bob Hallissy L<http://scripts.sil.org/FontUtils>.

=head1 LICENSING

Copyright (c) 1998-2014, SIL International (http://www.sil.org)

This script is released under the terms of the Artistic License 2.0.
For details, see the full text of the license in the file LICENSE.

=cut