#!/usr/bin/perl

# perl program to take a thesaurus structured text data file
# and create the proper sorted index file (.idx)
#
# typically invoked as follows:
# cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
#

sub by_entry {
    # Sort comparator for index records of the form "entry|offset".
    # Only the text in front of the first '|' takes part in the ordering;
    # it is compared as a string (cmp).  A record without any '|' is
    # compared as a whole.  Reads the $a / $b pair that sort() provides.
    my $a_bar = index($a, '|');
    my $b_bar = index($b, '|');
    my $a_key = $a_bar < 0 ? $a : substr($a, 0, $a_bar);
    my $b_key = $b_bar < 0 ? $b : substr($b, 0, $b_bar);
    return $a_key cmp $b_key;
}

# main routine
#
# Reads a thesaurus data file from STDIN and writes its index to STDOUT.
# Input, as consumed here: the first line is the encoding name; after it
# come blocks made of one header line "entry|count" followed by <count>
# meaning lines.
# Output: the encoding line, the number of entries, then one
# "entry|offset" line per block, sorted with by_entry.  offset is the
# position of the block's header line counted from the start of the input.
use strict;
use warnings;

# Offsets are summed up with length(), so both handles are switched to raw
# bytes: any decoding or CRLF-translation layer on STDIN would make
# length() differ from the number of bytes actually in the file.
# NOTE(review): presumably the reader of the .idx seeks to these offsets
# as byte positions in the .dat file -- confirm against the consumer.
binmode(STDIN);
binmode(STDOUT);

my @tindex  = ();    # the index itself, records of the form "entry|offset"
my $foffset = 0;     # offset of the next unread input line

# top line of thesaurus provides encoding
my $encoding = <STDIN>;
$encoding = '' unless defined $encoding;    # empty input gives an empty index
$foffset += length($encoding);
chomp($encoding);

# read thesaurus line by line
# first line of every block is an entry and meaning count
while (defined(my $rec = <STDIN>)) {
    my $rl = length($rec);    # length of the whole block, header included
    chomp($rec);

    my ($entry, $nm) = split(/\|/, $rec);
    # A blank header line still yields a record with an empty entry.
    $entry = '' unless defined $entry;
    # Use the leading digits of the count; a missing or non-numeric
    # count means the block has no meaning lines.
    my $count = (defined $nm && $nm =~ /^\s*(\d+)/) ? $1 : 0;

    # The meaning lines are only measured, their text is not needed here.
    for my $seen (1 .. $count) {
        my $meaning = <STDIN>;
        if (!defined $meaning) {
            # Input ended inside a block: report it instead of silently
            # counting nothing for the missing lines.
            warn "th_gen_idx: input ended inside the block for '$entry' "
               . "(expected $count meaning lines, got " . ($seen - 1) . ")\n";
            last;
        }
        $rl += length($meaning);
    }

    push(@tindex, "$entry|$foffset");
    $foffset += $rl;
}

# now we have all of the information
# so sort it and then output the encoding, count and index data
@tindex = sort by_entry @tindex;
print STDOUT "$encoding\n";
print STDOUT scalar(@tindex), "\n";
foreach my $line (@tindex) {
    print STDOUT "$line\n";
}