Blame specfilter.c

Packit ca9683
/* Copyright 2005-2009 Nadav Har'El and Dan Kenigsberg */
Packit ca9683
Packit ca9683
/* specfilter.c - word prefix-specifier 
Packit ca9683
 * The way prefixes currently work in Hspell is that each word has an 8-bit
Packit ca9683
 * (only 6 are used) "specifier", and each prefix has a bit mask, and the
Packit ca9683
 * word+prefix combination is accepted if one of the bits is 1 in both the
Packit ca9683
 * word specifier and prefix mask. 
Packit ca9683
 * The idea in this is that each bit corresponds to a prefix feature that
Packit ca9683
 * words need, and certain prefixes supply. If a word has two or more meanings,
Packit ca9683
 * the word's specifier might get two or more 1 bits.
Packit ca9683
 *
Packit ca9683
 * Now, it turns out (see bug 74) that while the word specifiers can take
Packit ca9683
 * on many values (currently, 32), some specifiers are actually equivalent,
Packit ca9683
 * in the sense that they end up allowing or disallowing exactly the same
Packit ca9683
 * set of prefixes. For example, a word with specifier 2 (PS_L) and a word
Packit ca9683
 * with specifier 3 (PS_L | PS_B) accepts the same prefixes because in our
Packit ca9683
 * existing prefix set (sett genprefixes.pl), every prefix which supplies
Packit ca9683
 * PS_B also supplies PS_L. Another example, a word with specifier  24
Packit ca9683
 * (PS_IMPER | PS_NONDEF) can get the same prefixes as a word with just a
Packit ca9683
 * specifier of 8 (PS_NONDEF).
Packit ca9683
 *
Packit ca9683
 * The goal of this program is to find which sets of word specifiers are
Packit ca9683
 * equivalent in the above sense, and given an input stream of word
Packit ca9683
 * specifiers (e.g., uncompressed hebrew.wgz.prefixes) it replaces all
Packit ca9683
 * different specifiers in one equivalence class to the same member
Packit ca9683
 * (the list of values left after this process is known in Mathematics
Packit ca9683
 * as a quotient set).
Packit ca9683
 * 
Packit ca9683
 * The purpose of all this is to reduce the number of different word
Packit ca9683
 * specifiers present in hebrew.wgz.prefixes. For example, in Hspell 0.9
Packit ca9683
 * this brought the number from 32 down to 9. This allows slightly (10%)
Packit ca9683
 * better compression of hebrew.wgz.prefixes, but more importantly,
Packit ca9683
 * allows us to generate a much smaller affix file for aspell (see
Packit ca9683
 * mk_he_affix.c), because it will contain just 9 prefix sets instead of 32.
Packit ca9683
 *
Packit ca9683
 * CAVEAT:
Packit ca9683
 *
Packit ca9683
 * I'm still not sure whether it is wise or not to use this process on
Packit ca9683
 * the final hebrew.wgz.prefixes used by Hspell. One one hand it will make
Packit ca9683
 * it slightly smaller, but on the other hand it makes use of information
Packit ca9683
 * previously available only in the code (prefixes.c - the list of prefixes
Packit ca9683
 * and their masks) inside the word list. Meaning that if the prefix list
Packit ca9683
 * code changes, the word data will need to be changed as well - a situation
Packit ca9683
 * we didn't have previously.
Packit ca9683
 * Moreover, it will also mean that we will not be able to have run-time
Packit ca9683
 * options that chose among different prefix sets. Luckily, the "-h" option
Packit ca9683
 * is fine in this respect (see comment below on why), but it is conceivable
Packit ca9683
 * (but not likely) that in the future we might want to use completely
Packit ca9683
 * different sets of prefix that behave differently. (?? what actually
Packit ca9683
 * matters is just the bag of different masks in the masks[] array))
Packit ca9683
 */
Packit ca9683
Packit ca9683
#include "prefixes.c"
Packit ca9683
Packit ca9683
#include <stdlib.h>
Packit ca9683
Packit ca9683
/* NOTE: currently, the equivalence of two word specifiers does not depend
Packit ca9683
   on whether He Hashe'ela is allowed or not. This is because He Hashe'ela
Packit ca9683
   only adds another prefix like shin hashimush - so whatever specifiers
Packit ca9683
   that can be distinguished by He Hashe'ela can already be distinguished
Packit ca9683
   by the shin.
Packit ca9683
   It is important to remember that this fact may not remain true if we add
Packit ca9683
   more options of prefix bitmasks arrays, so this decision might need to
Packit ca9683
   be revisited.
Packit ca9683
*/
Packit ca9683
#define MASKS masks_noH
Packit ca9683
#define SPECBITS 6 /* maximum number of bits in PrefixBits.pl */
Packit ca9683
Packit ca9683
#define NMASKS (sizeof(MASKS)/sizeof(MASKS[0])-1)
Packit ca9683
#define NSPECS (1<
Packit ca9683
Packit ca9683
/* filled by genequiv: */
Packit ca9683
int masks[NSPECS], nmasks;
Packit ca9683
int equivalent[NSPECS];
Packit ca9683
Packit ca9683
/* Check if two word specs, mask1 and mask2, are equivalent with all
Packit ca9683
   known prefixes, in the sense that each prefix is either allowed or
Packit ca9683
   disallowed the same for both masks.
Packit ca9683
*/
Packit ca9683
static int checkequiv(int spec1, int spec2)
Packit ca9683
{
Packit ca9683
	int k;
Packit ca9683
	for(k=0; k
Packit ca9683
		if( ((masks[k]&spec1) != 0) != ((masks[k]&spec2) != 0) )
Packit ca9683
			return 0;
Packit ca9683
	return 1;
Packit ca9683
}
Packit ca9683
Packit ca9683
/* Fill the equivalent[] array, giving to each specifier an equivalent
Packit ca9683
 * specifier chosen as the representative of its equivalence class.
Packit ca9683
 *. As a by product, also fill masks[] array (different masks).
Packit ca9683
 */
Packit ca9683
static void genequiv(void)
Packit ca9683
{
Packit ca9683
	int i,j;
Packit ca9683
        /* Stage 1: prepare a smaller masks[] array from the MASKS[] array.
Packit ca9683
	   (this is just a silly optimization - we could have used the MASKS
Packit ca9683
	   array directly just fine).
Packit ca9683
Packit ca9683
	   In checkequiv() we need to check all the prefix's masks. But doing
Packit ca9683
	   the loop directly over all masks is quite wasteful, because prefixes
Packit ca9683
	   are generated in groups and many of them have the same masks. For
Packit ca9683
	   example, Hspell 0,9's masks_noH has 242 entries, but just five
Packit ca9683
	   distinct values: 60 (two prefixes - namely waw and the null prefix),
Packit ca9683
	   43 (4 prefixes), 44 (14 prefixes), 32 (48 prefixes) and
Packit ca9683
	   42 (174 prefixes). So we copy the unique masks in MASKS into
Packit ca9683
	   the masks[] array. This makes finding the equivalent[] array 5
Packit ca9683
	   times quicker on my machine (just 1/40,000 of a second!)
Packit ca9683
	   
Packit ca9683
	   TODO: can this knowledge be somehow be used to squeeze he_affix.dat
Packit ca9683
	   even further, because this means that each of the 9 prefix sets
Packit ca9683
	   mentioned above are all just combinations of these 5 disjoint
Packit ca9683
	   (and therefore smaller) sets?
Packit ca9683
	*/
Packit ca9683
	for(i=0;i
Packit ca9683
	for(i=0;i
Packit ca9683
		masks[MASKS[i]]++;/* TODO: check that 0<=MASKS[i]
Packit ca9683
	nmasks=0;
Packit ca9683
	for(i=0;i
Packit ca9683
		if(masks[i])
Packit ca9683
			masks[nmasks++] = i;
Packit ca9683
Packit ca9683
	/* Stage 2: find the equivalent[] array */
Packit ca9683
	for(i=0;i
Packit ca9683
Packit ca9683
	for(i=0;i
Packit ca9683
		if(equivalent[i]) /*already in equivalence class*/
Packit ca9683
			continue;
Packit ca9683
		equivalent[i]=i; /*new equivalence class, find more members:*/
Packit ca9683
		for(j=i+1; j
Packit ca9683
			if(equivalent[j])
Packit ca9683
				continue;
Packit ca9683
			if(checkequiv(i, j))
Packit ca9683
				equivalent[j]=i;
Packit ca9683
		}
Packit ca9683
	}
Packit ca9683
}
Packit ca9683
Packit ca9683
#include <stdio.h>
Packit ca9683
int main(void){
Packit ca9683
	int i,j;
Packit ca9683
	int num;
Packit ca9683
Packit ca9683
	genequiv();
Packit ca9683
Packit ca9683
#if 0
Packit ca9683
	fprintf(stderr, "Prefix specifier equivalences:\n");
Packit ca9683
	for(i=0;i
Packit ca9683
		fprintf(stderr, "%d: %d\n",i,equivalent[i]);
Packit ca9683
	}
Packit ca9683
	fprintf(stderr, "\n");
Packit ca9683
#endif
Packit ca9683
Packit ca9683
	fprintf(stderr, "%d unique prefix masks:", nmasks);
Packit ca9683
	for(i=0;i
Packit ca9683
		fprintf(stderr, " %d", masks[i]);
Packit ca9683
	}
Packit ca9683
	fprintf(stderr, "\n");
Packit ca9683
Packit ca9683
	fprintf(stderr, "Prefix specifier equivalence classes:\n");
Packit ca9683
	num=0;
Packit ca9683
	for(i=0;i
Packit ca9683
		if(equivalent[i]
Packit ca9683
			continue;
Packit ca9683
		fprintf(stderr,"%d: %d",num++,i);
Packit ca9683
		for(j=i+1; j
Packit ca9683
			if(equivalent[j]==i){
Packit ca9683
				fprintf(stderr," %d",j);
Packit ca9683
			}
Packit ca9683
		}
Packit ca9683
		fprintf(stderr,"\n");
Packit ca9683
	}
Packit ca9683
Packit ca9683
	/* Read hebrew.wgz.prefixes, and replace each specifier by the
Packit ca9683
	   canonic member of its equivalence class */
Packit ca9683
	while ((i=getchar())!= EOF) {
Packit ca9683
		if(i >= NSPECS){
Packit ca9683
			fprintf(stderr, "value %d in hebrew.wgz.prefixes out of bound\n", i);
Packit ca9683
			exit(1);
Packit ca9683
		}
Packit ca9683
		putchar(equivalent[i]);
Packit ca9683
	}
Packit ca9683
	return 0;
Packit ca9683
}