|
Packit |
575503 |
# From arnold@f7.net Sun Sep 5 12:30:53 2004
|
|
Packit |
575503 |
# Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
|
|
Packit |
575503 |
# From: William J Poser <wjposer@ldc.upenn.edu>
|
|
Packit |
575503 |
# To: arnold@skeeve.com
|
|
Packit |
575503 |
# Subject: gawk bug
|
|
Packit |
575503 |
# Message-ID: <20040903004347.W80049@lorax.ldc.upenn.edu>
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# Here is a revised version of my previous message, modified to describe
|
|
Packit |
575503 |
# the accompanying files.
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# IhSplit.awk should replicate every record with exactly one entry in the
|
|
Packit |
575503 |
# IH field, delete records lacking an IH field, and produce as many copies
|
|
Packit |
575503 |
# of records with two or more entries in the IH field as there are entries.
|
|
Packit |
575503 |
# In the latter case, the original IH field should be relabelled OIH and
|
|
Packit |
575503 |
# a new IH field be added at the beginning of the record.
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
|
|
Packit |
575503 |
# and possibly later versions. Unfortunately I didn't keep track of exactly what version it
|
|
Packit |
575503 |
# broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
|
|
Packit |
575503 |
# 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
|
|
Packit |
575503 |
# eliminated in version 3.1.4.
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# The problem was that an apparently random subset of records would lose some
|
|
Packit |
575503 |
# or all of their fields. Running the script on the same input always produces
|
|
Packit |
575503 |
# the same output with the same errors.
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# The file Input is a subset of a real lexicon that produces errors using
|
|
Packit |
575503 |
# gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
|
|
Packit |
575503 |
# output. A diff will show that there are actually two errors. One record
|
|
Packit |
575503 |
# has fields stripped as described above. Another is omitted in its entirety.
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
#
|
|
Packit |
575503 |
# Bill Poser, Linguistics, University of Pennsylvania
|
|
Packit |
575503 |
# http://www.ling.upenn.edu/~wjposer/ billposer@alum.mit.edu
|
|
Packit |
575503 |
# ----------------------------------------------------------------------------
|
|
Packit |
575503 |
#For each record that contains multiple items in its inverse headword (IH)
|
|
Packit |
575503 |
#field, generate a set of new records each containing exactly one item
|
|
Packit |
575503 |
#in the inverse headword field, otherwise copies of the original.
|
|
Packit |
575503 |
|
|
Packit |
575503 |
# CleanUp - empty the global rec array so that tags parsed from the current
# input record cannot carry over into the next one.
#
# Takes no arguments and returns nothing.  The extra name in the parameter
# list (set off by whitespace, per awk convention) only declares the loop
# index as a local, so calling this function no longer overwrites the
# global i that the main rule uses as its loop counter.
function CleanUp(    key)
{
  for(key in rec) delete rec[key];
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
BEGIN {
  # A field starts at each "%", optionally preceded by a newline.
  FS = "\n?%";
  # An empty RS selects paragraph mode: records are separated by blank lines.
  RS = "";
}
|
|
Packit |
575503 |
# Main rule, run once per input record: print one copy of the record for
# each "/"-separated entry of its IH field, each copy preceded by a new
# "%IH:" line holding that single entry.  Records with no IH field produce
# no output.
{

  # First, create an associative array with the tags as indices.
  # The tag is the text before the first ":"; the value is everything after it.
  for(i = 2; i <= NF; i++) { # The leading FS creates an initial empty field
       split($i, f, ":");
       rec[f[1]]=substr($i,index($i,":")+1);
  }

  # No IH field: drop the record.  Empty rec before skipping so the tags of
  # this record do not linger in the array while later records are parsed.
  if(!("IH" in rec)) {
    CleanUp();
    next;
  }

  # Parse out the inverse headwords

  items = split(rec["IH"],ihs,"/");

  # Replace the old IH field: relabel its first occurrence in the record
  # text as OIH.

  sub(/%IH:/,"%OIH:",$0);

  # Generate a new copy of the record for each inverse headword

  for(i = 1; i <= items; i++){
    entries+=1;                     # running count of copies emitted
    printf("%%IH:%s\n",ihs[i]);     # new single-entry IH line comes first
    printf("%s\n\n",$0);            # then the relabelled record and a blank line
  }
  CleanUp();
}
|