Tree - source-git/hspell - CentOS Git server

source-git / hspell

Blame specfilter.c

Blob History Raw

Packit	ca9683	`/* Copyright 2005-2009 Nadav Har'El and Dan Kenigsberg */`
Packit	ca9683
Packit	ca9683	`/* specfilter.c - word prefix-specifier`
Packit	ca9683	`* The way prefixes currently work in Hspell is that each word has an 8-bit`
Packit	ca9683	`* (only 6 are used) "specifier", and each prefix has a bit mask, and the`
Packit	ca9683	`* word+prefix combination is accepted if one of the bits is 1 in both the`
Packit	ca9683	`* word specifier and prefix mask.`
Packit	ca9683	`* The idea in this is that each bit corresponds to a prefix feature that`
Packit	ca9683	`* words need, and certain prefixes supply. If a word has two or more meanings,`
Packit	ca9683	`* the word's specifier might get two or more 1 bits.`
Packit	ca9683	`*`
Packit	ca9683	`* Now, it turns out (see bug 74) that while the word specifiers can take`
Packit	ca9683	`* on many values (currently, 32), some specifiers are actually equivalent,`
Packit	ca9683	`* in the sense that they end up allowing or disallowing exactly the same`
Packit	ca9683	`* set of prefixes. For example, a word with specifier 2 (PS_L) and a word`
Packit	ca9683	`* with specifier 3 (PS_L \| PS_B) accepts the same prefixes because in our`
Packit	ca9683	`* existing prefix set (sett genprefixes.pl), every prefix which supplies`
Packit	ca9683	`* PS_B also supplies PS_L. Another example, a word with specifier 24`
Packit	ca9683	`* (PS_IMPER \| PS_NONDEF) can get the same prefixes as a word with just a`
Packit	ca9683	`* specifier of 8 (PS_NONDEF).`
Packit	ca9683	`*`
Packit	ca9683	`* The goal of this program is to find which sets of word specifiers are`
Packit	ca9683	`* equivalent in the above sense, and given an input stream of word`
Packit	ca9683	`* specifiers (e.g., uncompressed hebrew.wgz.prefixes) it replaces all`
Packit	ca9683	`* different specifiers in one equivalence class to the same member`
Packit	ca9683	`* (the list of values left after this process is known in Mathematics`
Packit	ca9683	`* as a quotient set).`
Packit	ca9683	`*`
Packit	ca9683	`* The purpose of all this is to reduce the number of different word`
Packit	ca9683	`* specifiers present in hebrew.wgz.prefixes. For example, in Hspell 0.9`
Packit	ca9683	`* this brought the number from 32 down to 9. This allows slightly (10%)`
Packit	ca9683	`* better compression of hebrew.wgz.prefixes, but more importantly,`
Packit	ca9683	`* allows us to generate a much smaller affix file for aspell (see`
Packit	ca9683	`* mk_he_affix.c), because it will contain just 9 prefix sets instead of 32.`
Packit	ca9683	`*`
Packit	ca9683	`* CAVEAT:`
Packit	ca9683	`*`
Packit	ca9683	`* I'm still not sure whether it is wise or not to use this process on`
Packit	ca9683	`* the final hebrew.wgz.prefixes used by Hspell. One one hand it will make`
Packit	ca9683	`* it slightly smaller, but on the other hand it makes use of information`
Packit	ca9683	`* previously available only in the code (prefixes.c - the list of prefixes`
Packit	ca9683	`* and their masks) inside the word list. Meaning that if the prefix list`
Packit	ca9683	`* code changes, the word data will need to be changed as well - a situation`
Packit	ca9683	`* we didn't have previously.`
Packit	ca9683	`* Moreover, it will also mean that we will not be able to have run-time`
Packit	ca9683	`* options that chose among different prefix sets. Luckily, the "-h" option`
Packit	ca9683	`* is fine in this respect (see comment below on why), but it is conceivable`
Packit	ca9683	`* (but not likely) that in the future we might want to use completely`
Packit	ca9683	`* different sets of prefix that behave differently. (?? what actually`
Packit	ca9683	`* matters is just the bag of different masks in the masks[] array))`
Packit	ca9683	`*/`
Packit	ca9683
Packit	ca9683	`#include "prefixes.c"`
Packit	ca9683
Packit	ca9683	`#include <stdlib.h>`
Packit	ca9683
Packit	ca9683	`/* NOTE: currently, the equivalence of two word specifiers does not depend`
Packit	ca9683	`on whether He Hashe'ela is allowed or not. This is because He Hashe'ela`
Packit	ca9683	`only adds another prefix like shin hashimush - so whatever specifiers`
Packit	ca9683	`that can be distinguished by He Hashe'ela can already be distinguished`
Packit	ca9683	`by the shin.`
Packit	ca9683	`It is important to remember that this fact may not remain true if we add`
Packit	ca9683	`more options of prefix bitmasks arrays, so this decision might need to`
Packit	ca9683	`be revisited.`
Packit	ca9683	`*/`
Packit	ca9683	`#define MASKS masks_noH`
Packit	ca9683	`#define SPECBITS 6 /* maximum number of bits in PrefixBits.pl */`
Packit	ca9683
Packit	ca9683	`#define NMASKS (sizeof(MASKS)/sizeof(MASKS[0])-1)`
Packit	ca9683	`#define NSPECS (1<`
Packit	ca9683
Packit	ca9683	`/* filled by genequiv: */`
Packit	ca9683	`int masks[NSPECS], nmasks;`
Packit	ca9683	`int equivalent[NSPECS];`
Packit	ca9683
Packit	ca9683	`/* Check if two word specs, mask1 and mask2, are equivalent with all`
Packit	ca9683	`known prefixes, in the sense that each prefix is either allowed or`
Packit	ca9683	`disallowed the same for both masks.`
Packit	ca9683	`*/`
Packit	ca9683	`static int checkequiv(int spec1, int spec2)`
Packit	ca9683	`{`
Packit	ca9683	`int k;`
Packit	ca9683	`for(k=0; k`
Packit	ca9683	`if( ((masks[k]&spec1) != 0) != ((masks[k]&spec2) != 0) )`
Packit	ca9683	`return 0;`
Packit	ca9683	`return 1;`
Packit	ca9683	`}`
Packit	ca9683
Packit	ca9683	`/* Fill the equivalent[] array, giving to each specifier an equivalent`
Packit	ca9683	`* specifier chosen as the representative of its equivalence class.`
Packit	ca9683	`*. As a by product, also fill masks[] array (different masks).`
Packit	ca9683	`*/`
Packit	ca9683	`static void genequiv(void)`
Packit	ca9683	`{`
Packit	ca9683	`int i,j;`
Packit	ca9683	`/* Stage 1: prepare a smaller masks[] array from the MASKS[] array.`
Packit	ca9683	`(this is just a silly optimization - we could have used the MASKS`
Packit	ca9683	`array directly just fine).`
Packit	ca9683
Packit	ca9683	`In checkequiv() we need to check all the prefix's masks. But doing`
Packit	ca9683	`the loop directly over all masks is quite wasteful, because prefixes`
Packit	ca9683	`are generated in groups and many of them have the same masks. For`
Packit	ca9683	`example, Hspell 0,9's masks_noH has 242 entries, but just five`
Packit	ca9683	`distinct values: 60 (two prefixes - namely waw and the null prefix),`
Packit	ca9683	`43 (4 prefixes), 44 (14 prefixes), 32 (48 prefixes) and`
Packit	ca9683	`42 (174 prefixes). So we copy the unique masks in MASKS into`
Packit	ca9683	`the masks[] array. This makes finding the equivalent[] array 5`
Packit	ca9683	`times quicker on my machine (just 1/40,000 of a second!)`
Packit	ca9683
Packit	ca9683	`TODO: can this knowledge be somehow be used to squeeze he_affix.dat`
Packit	ca9683	`even further, because this means that each of the 9 prefix sets`
Packit	ca9683	`mentioned above are all just combinations of these 5 disjoint`
Packit	ca9683	`(and therefore smaller) sets?`
Packit	ca9683	`*/`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`masks[MASKS[i]]++;/* TODO: check that 0<=MASKS[i]`
Packit	ca9683	`nmasks=0;`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`if(masks[i])`
Packit	ca9683	`masks[nmasks++] = i;`
Packit	ca9683
Packit	ca9683	`/* Stage 2: find the equivalent[] array */`
Packit	ca9683	`for(i=0;i`
Packit	ca9683
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`if(equivalent[i]) /already in equivalence class/`
Packit	ca9683	`continue;`
Packit	ca9683	`equivalent[i]=i; /new equivalence class, find more members:/`
Packit	ca9683	`for(j=i+1; j`
Packit	ca9683	`if(equivalent[j])`
Packit	ca9683	`continue;`
Packit	ca9683	`if(checkequiv(i, j))`
Packit	ca9683	`equivalent[j]=i;`
Packit	ca9683	`}`
Packit	ca9683	`}`
Packit	ca9683	`}`
Packit	ca9683
Packit	ca9683	`#include <stdio.h>`
Packit	ca9683	`int main(void){`
Packit	ca9683	`int i,j;`
Packit	ca9683	`int num;`
Packit	ca9683
Packit	ca9683	`genequiv();`
Packit	ca9683
Packit	ca9683	`#if 0`
Packit	ca9683	`fprintf(stderr, "Prefix specifier equivalences:\n");`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`fprintf(stderr, "%d: %d\n",i,equivalent[i]);`
Packit	ca9683	`}`
Packit	ca9683	`fprintf(stderr, "\n");`
Packit	ca9683	`#endif`
Packit	ca9683
Packit	ca9683	`fprintf(stderr, "%d unique prefix masks:", nmasks);`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`fprintf(stderr, " %d", masks[i]);`
Packit	ca9683	`}`
Packit	ca9683	`fprintf(stderr, "\n");`
Packit	ca9683
Packit	ca9683	`fprintf(stderr, "Prefix specifier equivalence classes:\n");`
Packit	ca9683	`num=0;`
Packit	ca9683	`for(i=0;i`
Packit	ca9683	`if(equivalent[i]`
Packit	ca9683	`continue;`
Packit	ca9683	`fprintf(stderr,"%d: %d",num++,i);`
Packit	ca9683	`for(j=i+1; j`
Packit	ca9683	`if(equivalent[j]==i){`
Packit	ca9683	`fprintf(stderr," %d",j);`
Packit	ca9683	`}`
Packit	ca9683	`}`
Packit	ca9683	`fprintf(stderr,"\n");`
Packit	ca9683	`}`
Packit	ca9683
Packit	ca9683	`/* Read hebrew.wgz.prefixes, and replace each specifier by the`
Packit	ca9683	`canonic member of its equivalence class */`
Packit	ca9683	`while ((i=getchar())!= EOF) {`
Packit	ca9683	`if(i >= NSPECS){`
Packit	ca9683	`fprintf(stderr, "value %d in hebrew.wgz.prefixes out of bound\n", i);`
Packit	ca9683	`exit(1);`
Packit	ca9683	`}`
Packit	ca9683	`putchar(equivalent[i]);`
Packit	ca9683	`}`
Packit	ca9683	`return 0;`
Packit	ca9683	`}`

source-git / hspell

Source Code

Blame specfilter.c