|
Packit |
ca9683 |
/* Copyright 2005-2009 Nadav Har'El and Dan Kenigsberg */
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* specfilter.c - word prefix-specifier
|
|
Packit |
ca9683 |
* The way prefixes currently work in Hspell is that each word has an 8-bit
|
|
Packit |
ca9683 |
* (only 6 are used) "specifier", and each prefix has a bit mask, and the
|
|
Packit |
ca9683 |
* word+prefix combination is accepted if one of the bits is 1 in both the
|
|
Packit |
ca9683 |
* word specifier and prefix mask.
|
|
Packit |
ca9683 |
* The idea in this is that each bit corresponds to a prefix feature that
|
|
Packit |
ca9683 |
* words need, and certain prefixes supply. If a word has two or more meanings,
|
|
Packit |
ca9683 |
* the word's specifier might get two or more 1 bits.
|
|
Packit |
ca9683 |
*
|
|
Packit |
ca9683 |
* Now, it turns out (see bug 74) that while the word specifiers can take
|
|
Packit |
ca9683 |
* on many values (currently, 32), some specifiers are actually equivalent,
|
|
Packit |
ca9683 |
* in the sense that they end up allowing or disallowing exactly the same
|
|
Packit |
ca9683 |
* set of prefixes. For example, a word with specifier 2 (PS_L) and a word
|
|
Packit |
ca9683 |
* with specifier 3 (PS_L | PS_B) accepts the same prefixes because in our
|
|
Packit |
ca9683 |
* existing prefix set (sett genprefixes.pl), every prefix which supplies
|
|
Packit |
ca9683 |
* PS_B also supplies PS_L. Another example, a word with specifier 24
|
|
Packit |
ca9683 |
* (PS_IMPER | PS_NONDEF) can get the same prefixes as a word with just a
|
|
Packit |
ca9683 |
* specifier of 8 (PS_NONDEF).
|
|
Packit |
ca9683 |
*
|
|
Packit |
ca9683 |
* The goal of this program is to find which sets of word specifiers are
|
|
Packit |
ca9683 |
* equivalent in the above sense, and given an input stream of word
|
|
Packit |
ca9683 |
* specifiers (e.g., uncompressed hebrew.wgz.prefixes) it replaces all
|
|
Packit |
ca9683 |
* different specifiers in one equivalence class to the same member
|
|
Packit |
ca9683 |
* (the list of values left after this process is known in Mathematics
|
|
Packit |
ca9683 |
* as a quotient set).
|
|
Packit |
ca9683 |
*
|
|
Packit |
ca9683 |
* The purpose of all this is to reduce the number of different word
|
|
Packit |
ca9683 |
* specifiers present in hebrew.wgz.prefixes. For example, in Hspell 0.9
|
|
Packit |
ca9683 |
* this brought the number from 32 down to 9. This allows slightly (10%)
|
|
Packit |
ca9683 |
* better compression of hebrew.wgz.prefixes, but more importantly,
|
|
Packit |
ca9683 |
* allows us to generate a much smaller affix file for aspell (see
|
|
Packit |
ca9683 |
* mk_he_affix.c), because it will contain just 9 prefix sets instead of 32.
|
|
Packit |
ca9683 |
*
|
|
Packit |
ca9683 |
* CAVEAT:
|
|
Packit |
ca9683 |
*
|
|
Packit |
ca9683 |
* I'm still not sure whether it is wise or not to use this process on
|
|
Packit |
ca9683 |
* the final hebrew.wgz.prefixes used by Hspell. One one hand it will make
|
|
Packit |
ca9683 |
* it slightly smaller, but on the other hand it makes use of information
|
|
Packit |
ca9683 |
* previously available only in the code (prefixes.c - the list of prefixes
|
|
Packit |
ca9683 |
* and their masks) inside the word list. Meaning that if the prefix list
|
|
Packit |
ca9683 |
* code changes, the word data will need to be changed as well - a situation
|
|
Packit |
ca9683 |
* we didn't have previously.
|
|
Packit |
ca9683 |
* Moreover, it will also mean that we will not be able to have run-time
|
|
Packit |
ca9683 |
* options that chose among different prefix sets. Luckily, the "-h" option
|
|
Packit |
ca9683 |
* is fine in this respect (see comment below on why), but it is conceivable
|
|
Packit |
ca9683 |
* (but not likely) that in the future we might want to use completely
|
|
Packit |
ca9683 |
* different sets of prefix that behave differently. (?? what actually
|
|
Packit |
ca9683 |
* matters is just the bag of different masks in the masks[] array))
|
|
Packit |
ca9683 |
*/
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
#include "prefixes.c"
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
#include <stdlib.h>
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* NOTE: currently, the equivalence of two word specifiers does not depend
|
|
Packit |
ca9683 |
on whether He Hashe'ela is allowed or not. This is because He Hashe'ela
|
|
Packit |
ca9683 |
only adds another prefix like shin hashimush - so whatever specifiers
|
|
Packit |
ca9683 |
that can be distinguished by He Hashe'ela can already be distinguished
|
|
Packit |
ca9683 |
by the shin.
|
|
Packit |
ca9683 |
It is important to remember that this fact may not remain true if we add
|
|
Packit |
ca9683 |
more options of prefix bitmasks arrays, so this decision might need to
|
|
Packit |
ca9683 |
be revisited.
|
|
Packit |
ca9683 |
*/
|
|
Packit |
ca9683 |
#define MASKS masks_noH
|
|
Packit |
ca9683 |
#define SPECBITS 6 /* maximum number of bits in PrefixBits.pl */
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
#define NMASKS (sizeof(MASKS)/sizeof(MASKS[0])-1)
|
|
Packit |
ca9683 |
#define NSPECS (1<
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* filled by genequiv: */
|
|
Packit |
ca9683 |
int masks[NSPECS], nmasks;
|
|
Packit |
ca9683 |
int equivalent[NSPECS];
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* Check if two word specs, mask1 and mask2, are equivalent with all
|
|
Packit |
ca9683 |
known prefixes, in the sense that each prefix is either allowed or
|
|
Packit |
ca9683 |
disallowed the same for both masks.
|
|
Packit |
ca9683 |
*/
|
|
Packit |
ca9683 |
static int checkequiv(int spec1, int spec2)
|
|
Packit |
ca9683 |
{
|
|
Packit |
ca9683 |
int k;
|
|
Packit |
ca9683 |
for(k=0; k
|
|
Packit |
ca9683 |
if( ((masks[k]&spec1) != 0) != ((masks[k]&spec2) != 0) )
|
|
Packit |
ca9683 |
return 0;
|
|
Packit |
ca9683 |
return 1;
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* Fill the equivalent[] array, giving to each specifier an equivalent
|
|
Packit |
ca9683 |
* specifier chosen as the representative of its equivalence class.
|
|
Packit |
ca9683 |
*. As a by product, also fill masks[] array (different masks).
|
|
Packit |
ca9683 |
*/
|
|
Packit |
ca9683 |
static void genequiv(void)
|
|
Packit |
ca9683 |
{
|
|
Packit |
ca9683 |
int i,j;
|
|
Packit |
ca9683 |
/* Stage 1: prepare a smaller masks[] array from the MASKS[] array.
|
|
Packit |
ca9683 |
(this is just a silly optimization - we could have used the MASKS
|
|
Packit |
ca9683 |
array directly just fine).
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
In checkequiv() we need to check all the prefix's masks. But doing
|
|
Packit |
ca9683 |
the loop directly over all masks is quite wasteful, because prefixes
|
|
Packit |
ca9683 |
are generated in groups and many of them have the same masks. For
|
|
Packit |
ca9683 |
example, Hspell 0,9's masks_noH has 242 entries, but just five
|
|
Packit |
ca9683 |
distinct values: 60 (two prefixes - namely waw and the null prefix),
|
|
Packit |
ca9683 |
43 (4 prefixes), 44 (14 prefixes), 32 (48 prefixes) and
|
|
Packit |
ca9683 |
42 (174 prefixes). So we copy the unique masks in MASKS into
|
|
Packit |
ca9683 |
the masks[] array. This makes finding the equivalent[] array 5
|
|
Packit |
ca9683 |
times quicker on my machine (just 1/40,000 of a second!)
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
TODO: can this knowledge be somehow be used to squeeze he_affix.dat
|
|
Packit |
ca9683 |
even further, because this means that each of the 9 prefix sets
|
|
Packit |
ca9683 |
mentioned above are all just combinations of these 5 disjoint
|
|
Packit |
ca9683 |
(and therefore smaller) sets?
|
|
Packit |
ca9683 |
*/
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
masks[MASKS[i]]++;/* TODO: check that 0<=MASKS[i]
|
|
Packit |
ca9683 |
nmasks=0;
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
if(masks[i])
|
|
Packit |
ca9683 |
masks[nmasks++] = i;
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* Stage 2: find the equivalent[] array */
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
if(equivalent[i]) /*already in equivalence class*/
|
|
Packit |
ca9683 |
continue;
|
|
Packit |
ca9683 |
equivalent[i]=i; /*new equivalence class, find more members:*/
|
|
Packit |
ca9683 |
for(j=i+1; j
|
|
Packit |
ca9683 |
if(equivalent[j])
|
|
Packit |
ca9683 |
continue;
|
|
Packit |
ca9683 |
if(checkequiv(i, j))
|
|
Packit |
ca9683 |
equivalent[j]=i;
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
#include <stdio.h>
|
|
Packit |
ca9683 |
int main(void){
|
|
Packit |
ca9683 |
int i,j;
|
|
Packit |
ca9683 |
int num;
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
genequiv();
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
#if 0
|
|
Packit |
ca9683 |
fprintf(stderr, "Prefix specifier equivalences:\n");
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
fprintf(stderr, "%d: %d\n",i,equivalent[i]);
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
fprintf(stderr, "\n");
|
|
Packit |
ca9683 |
#endif
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
fprintf(stderr, "%d unique prefix masks:", nmasks);
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
fprintf(stderr, " %d", masks[i]);
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
fprintf(stderr, "\n");
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
fprintf(stderr, "Prefix specifier equivalence classes:\n");
|
|
Packit |
ca9683 |
num=0;
|
|
Packit |
ca9683 |
for(i=0;i
|
|
Packit |
ca9683 |
if(equivalent[i]
|
|
Packit |
ca9683 |
continue;
|
|
Packit |
ca9683 |
fprintf(stderr,"%d: %d",num++,i);
|
|
Packit |
ca9683 |
for(j=i+1; j
|
|
Packit |
ca9683 |
if(equivalent[j]==i){
|
|
Packit |
ca9683 |
fprintf(stderr," %d",j);
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
fprintf(stderr,"\n");
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
|
|
Packit |
ca9683 |
/* Read hebrew.wgz.prefixes, and replace each specifier by the
|
|
Packit |
ca9683 |
canonic member of its equivalence class */
|
|
Packit |
ca9683 |
while ((i=getchar())!= EOF) {
|
|
Packit |
ca9683 |
if(i >= NSPECS){
|
|
Packit |
ca9683 |
fprintf(stderr, "value %d in hebrew.wgz.prefixes out of bound\n", i);
|
|
Packit |
ca9683 |
exit(1);
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
putchar(equivalent[i]);
|
|
Packit |
ca9683 |
}
|
|
Packit |
ca9683 |
return 0;
|
|
Packit |
ca9683 |
}
|