Blame test/wjposer1.awk

Packit 575503
# From arnold@f7.net  Sun Sep  5 12:30:53 2004
Packit 575503
# Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
Packit 575503
# From: William J Poser <wjposer@ldc.upenn.edu>
Packit 575503
# To: arnold@skeeve.com
Packit 575503
# Subject: gawk bug
Packit 575503
# Message-ID: <20040903004347.W80049@lorax.ldc.upenn.edu>
Packit 575503
# 
Packit 575503
# Here is a revised version of my previous message, modified to describe
Packit 575503
# the accompanying files.
Packit 575503
# 
Packit 575503
# IhSplit.awk should replicate every record with exactly one entry in the
Packit 575503
# IH field, delete records lacking an IH field, and produce as many copies
Packit 575503
# of records with two or more entries in the IH field as there are entries.
Packit 575503
# In the latter case, the original IH field should be relabelled OIH and
Packit 575503
# a new IH field be added at the beginning of the record.
Packit 575503
# 
Packit 575503
# This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
Packit 575503
# and possibly later versions. Unfortunately I didn't keep track of exactly what version it
Packit 575503
# broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
Packit 575503
# 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
Packit 575503
# eliminated in version 3.1.4.
Packit 575503
# 
Packit 575503
# The problem was that an apparently random subset of records would lose some
Packit 575503
# or all of their fields. Running the script on the same input always produces
Packit 575503
# the same output with the same errors.
Packit 575503
# 
Packit 575503
# The file Input is a subset of a real lexicon that produces errors using
Packit 575503
# gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
Packit 575503
# output. A diff will show that there are actually two errors. One record
Packit 575503
# has fields stripped as described above. Another is omitted in its entirety.
Packit 575503
# 
Packit 575503
# 
Packit 575503
# Bill Poser, Linguistics, University of Pennsylvania
Packit 575503
# http://www.ling.upenn.edu/~wjposer/ billposer@alum.mit.edu
Packit 575503
# ----------------------------------------------------------------------------
Packit 575503
#For each record that contains multiple items in its inverse headword (IH)
Packit 575503
#field, generate a set of new records each containing exactly one item
Packit 575503
#in the inverse headword field, otherwise copies of the original.
Packit 575503
Packit 575503
# Empty the global array rec so that tags stored for one record are gone
# before the next record is parsed.
# Note: i is not declared as a parameter, so it is the same global i that
# the main rule uses as its loop counter.
function CleanUp() #Clean up for next input record.
Packit 575503
{
Packit 575503
  for(i in rec) delete rec[i];
Packit 575503
}
Packit 575503
Packit 575503
# Set the record and field separators before any input is read.
BEGIN {
Packit 575503
RS = ""; # empty RS selects awk's paragraph mode: records are separated by blank lines
Packit 575503
FS = "\n?%" # regex separator: a "%" together with an optional newline right before it
Packit 575503
}
Packit 575503
# Main rule, run for every input record:
#   1. store each field's tag (text before the first ":") and value (text
#      after it) in the global array rec;
#   2. skip the record if it has no IH tag;
#   3. split the IH value on "/" into ihs[1..items];
#   4. relabel the first "%IH:" in the record as "%OIH:";
#   5. print one copy of the relabelled record per item, each preceded by
#      a "%IH:<item>" line and followed by an empty line;
#   6. clear rec for the next record.
{
Packit 575503
Packit 575503
# First, create an associative array with the tags as indices.
Packit 575503
  for(i = 2; i <= NF; i++) { # The leading FS creates an initial empty field
Packit 575503
       split($i, f, ":"); # f[1] is the tag; later pieces of f are not used
Packit 575503
       rec[f[1]]=substr($i,index($i,":")+1); # value = everything after the first ":" (the whole field if there is none)
Packit 575503
  }
Packit 575503
Packit 575503
  # Records without an IH tag produce no output. This path skips CleanUp(),
  # so rec keeps this record's tags; none of them is "IH", and only
  # rec["IH"] is read below.
  if(!("IH" in rec)) next;
Packit 575503
Packit 575503
# Parse out the inverse headwords
Packit 575503
Packit 575503
     items = split(rec["IH"],ihs,"/"); # items = number of "/"-separated entries
Packit 575503
Packit 575503
# Replace the old IH field.
Packit 575503
Packit 575503
     sub(/%IH:/,"%OIH:",$0); # sub() rewrites only the first match in $0
Packit 575503
Packit 575503
# Generate a new copy of the record for each inverse headword
Packit 575503
Packit 575503
       for(i = 1; i <= items; i++){
Packit 575503
	 entries+=1; # running count of copies written; not read anywhere in this rule
Packit 575503
         printf("%%IH:%s\n",ihs[i]); # new single-item IH line goes first
Packit 575503
         printf("%s\n\n",$0); # then the relabelled record and an empty separator line
Packit 575503
       }
Packit 575503
       CleanUp(); # reuses the global i, which is safe here because the loop above has finished
Packit 575503
  }