#!/usr/bin/perl

# perl program to take a thesaurus structured text data file
# and create the proper sorted index file (.idx)
#
# typically invoked as follows:
# cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
#

# Sort comparator for index lines of the form "entry|offset".
#
# Meant to be used as `sort by_entry @list`; sort() supplies the two lines
# being compared in the package variables $a and $b.  Only the text before
# the first '|' takes part in the comparison, the offset is ignored.
#
# Returns a negative, zero or positive value, exactly as `cmp` does.
sub by_entry {
    # A limit of 2 stops splitting after the first '|'; the offset part is
    # never needed here.
    my ($aent) = split(/\|/, $a, 2);
    my ($bent) = split(/\|/, $b, 2);

    # split() returns an empty list for an empty line; compare it as "".
    $aent = "" unless defined $aent;
    $bent = "" unless defined $bent;

    return $aent cmp $bent;
}

# main routine
#
# Reads a thesaurus data file from STDIN and writes its index to STDOUT:
#   line 1   the encoding line, copied from the first input line
#   line 2   the number of index entries
#   rest     one "entry|offset" line per thesaurus block, sorted by entry
#
# Offsets are measured with length() on the lines as read, line endings
# included; they are byte offsets as long as STDIN carries no decoding layer.
use strict;
use warnings;

my @tindex  = ();   # the index itself, one "entry|offset" string per block
my $foffset = 0;    # file position offset into thesaurus

# top line of thesaurus provides encoding
my $encoding = <STDIN>;
$encoding = "" unless defined $encoding;    # empty input: still emit a header
$foffset += length($encoding);
chomp($encoding);

# read thesaurus line by line
# first line of every block is an entry and meaning count
while (defined(my $rec = <STDIN>)) {
    my $rl = length($rec);      # running length of the whole block
    chomp($rec);

    my ($entry, $nm) = split(/\|/, $rec);
    $entry = "" unless defined $entry;      # blank line: no fields at all
    $nm    = 0  unless defined $nm;         # no count given: no meaning lines

    # The meaning lines themselves are not indexed; they are only read so
    # that their length can be added to the size of the block.
    for (my $p = 0; $p < $nm; $p++) {
        my $meaning = <STDIN>;
        # Input ended before the announced number of meanings was read.
        # Stop here instead of looping on EOF for the rest of the count.
        last unless defined $meaning;
        $rl += length($meaning);
    }

    push(@tindex, "$entry|$foffset");
    $foffset += $rl;
}

# now we have all of the information
# so sort it and then output the encoding, count and index data
my @sorted = sort by_entry @tindex;
print STDOUT "$encoding\n";
print STDOUT scalar(@sorted), "\n";
foreach my $one (@sorted) {
    print STDOUT "$one\n";
}