Blame dict_radix.c

Packit ca9683
/* Copyright (C) 2003-2009 Nadav Har'El and Dan Kenigsberg */
Packit ca9683
Packit ca9683
#include <stdio.h>
Packit ca9683
#include <sys/types.h>
Packit ca9683
#include <stdlib.h>
Packit ca9683
#include <string.h>
Packit ca9683
Packit ca9683
/* This is for declaring the uint32_t type, a type holding a 32-bit unsigned
Packit ca9683
   integer. It exists on Linux and on fairly modern Solaris, but
Packit ca9683
   maybe not anywhere else. We should use autoconf to solve this portability
Packit ca9683
   nightmare.
Packit ca9683
*/
Packit ca9683
#include <inttypes.h>
Packit ca9683
Packit ca9683
/* If the Zlib library is available, we use it for reading the compressed
Packit ca9683
   dictionaries, rather than opening a pipe to an external gzip process.
Packit ca9683
   In one measurement, this halved the loading time (0.1 sec to 0.05 sec).
Packit ca9683
   It also allowed Hspell to be used on systems where the Zlib library
Packit ca9683
   is available, but the gzip program is not (e.g., OpenOffice on MS-Windows).
Packit ca9683
Packit ca9683
   The definitions which are pretty bizarre, but help us convert the existing
Packit ca9683
   code into something that will work with zlib without ugly ifdefs
Packit ca9683
   everywhere (further ifdefs are only needed in some places). Note that
Packit ca9683
   when BUFFERED_ZLIB is enabled (and it is enabled by default here) we
Packit ca9683
   enable special buffered version of zlib (gzbuffered.h) instead of the
Packit ca9683
   normal zlib functions.
Packit ca9683
*/
Packit ca9683
#ifdef HAVE_ZLIB
Packit ca9683
#define BUFFERED_ZLIB
Packit ca9683
#undef FILE
Packit ca9683
#undef pclose
Packit ca9683
#undef getc
Packit ca9683
#ifdef BUFFERED_ZLIB
Packit ca9683
#include "gzbuffered.h"
Packit ca9683
#undef gzopen
Packit ca9683
#undef gzdopen
Packit ca9683
#define FILE void /* void* can be either normal FILE* or gzbFile*. Eek. */
Packit ca9683
#define gzopen(path,mode) gzb_open(path,mode)
Packit ca9683
#define gzdopen(path,mode) gzb_dopen(path,mode)
Packit ca9683
#define pclose(f) (gzb_close((gzbFile *)(f)))
Packit ca9683
#define getc(f) (gzb_getc(((gzbFile *)(f))))
Packit ca9683
#else
Packit ca9683
#include <zlib.h>
Packit ca9683
#define FILE void    /* FILE* is void*, a.k.a. voidp or gzFile */
Packit ca9683
#define pclose(f) (gzclose((f)))
Packit ca9683
#define getc(f) (gzgetc((f)))
Packit ca9683
#endif
Packit ca9683
#endif /* HAVE_ZLIB */
Packit ca9683
Packit ca9683
/* Our radix tree has four types of "nodes": leaf nodes, small nodes
Packit ca9683
 * (carrying up to SMALL_NODE_CHILDREN children), medium nodes (carrying up to
Packit ca9683
 * MEDIUM_NODE_CHILDREN) and full nodes carrying exactly NUM_LETTERS children.
Packit ca9683
 *
Packit ca9683
 * Since there are plenty of leaf nodes, we want these to be tiny, containing
Packit ca9683
 * basically just a value. Therefore we overload the same 32-bit "val_or_index"
Packit ca9683
 * position to be one of:
Packit ca9683
 * 1.  Empty  (in this case val_or_index==0)
Packit ca9683
 * 2.  Value  (value must be non-zero and 30 bit only!)
Packit ca9683
 * 3.  Index of full node (3 on highest 2 bits, the 30 lowest are the index)
Packit ca9683
 * 4.  Index of medium node (2 on highest 2 bits, the 30 lowest are the index)
Packit ca9683
 * 5.  Index of small node (1 on highest 2 bits, the 30 lowest are the index)
Packit ca9683
 */
Packit ca9683
#define CONST32(x) ((uint32_t)(x))
Packit ca9683
#define HIGHBITS ((CONST32(1)<<31) | (CONST32(1)<<30))
Packit ca9683
#define HIGHBITS_VALUE  (CONST32(0) << 30)
Packit ca9683
#define HIGHBITS_SMALL  (CONST32(1) << 30)
Packit ca9683
#define HIGHBITS_MEDIUM (CONST32(2) << 30)
Packit ca9683
#define HIGHBITS_FULL   (CONST32(3) << 30)
Packit ca9683
#define VALUEMASK (~HIGHBITS)
Packit ca9683
Packit ca9683
#define NUM_LETTERS 29  /* 27 Hebrew letters, " and ' */
Packit ca9683
/* When trying on the Hebrew dictionary, when there are only small and
Packit ca9683
 * full nodes, small_node_children=4 was the clear winner, taking 3363K
Packit ca9683
 * of memory.
Packit ca9683
 * When added medium nodes, there are two ties for minimal space usage
Packit ca9683
 * (at 2260K each): 2,8 and 3,8. Both have 1831 full nodes, 2,8 results in
Packit ca9683
 * 61771/25072 small/medium nodes, and 3,8 results in 71856/14987 small/medium
Packit ca9683
 * nodes.
Packit ca9683
 * One way to choose among them is to minimize search time. On average
Packit ca9683
 * searching a node with N children takes N/2 comparisons. If we pass
Packit ca9683
 * all nodes (and I doubt this is a meaningful measure... :( ) the 2,8
Packit ca9683
 * will make 162059 comparisons and 3,8 will make 167732. Again, roughly
Packit ca9683
 * the same, so I can't decide :(
Packit ca9683
 * Another deciding factor: read time. 2,8 is slightly quicker - I have
Packit ca9683
 * no idea why.
Packit ca9683
 *
Packit ca9683
 * Note: to minimize search time we might want to choose a set of sizes
Packit ca9683
 * which does not assure the smallest size. HOWEVER, one interesting thing
Packit ca9683
 * to note: the children in small and medium nodes are sorted. This might
Packit ca9683
 * mean that it is quicker to search the medium node using a binary search,
Packit ca9683
 * rather than linear? I don't know. Maybe for N=8, it ain't worth it.
Packit ca9683
 */
Packit ca9683
/*#define SMALL_NODE_CHILDREN 4*/
Packit ca9683
#define SMALL_NODE_CHILDREN 2
Packit ca9683
#define MEDIUM_NODE_CHILDREN 8
Packit ca9683
Packit ca9683
#if 0
Packit ca9683
 /*
Packit ca9683
 * NOTE:  SMALL-MEDIUM = 1-4 has an interesting advantage. At 2876K It wasn't
Packit ca9683
 * smallest (2-8 was, with 2257K) but it makes a lot of nodes full or
Packit ca9683
 * 1-child only (and therefore very quick to search) and only some nodes
Packit ca9683
 * with 4 children which is only slightly harder to search (only 2.5
Packit ca9683
 * comparisons needed on average).
Packit ca9683
 * */
Packit ca9683
/* search speed optimization */
Packit ca9683
#define SMALL_NODE_CHILDREN 1
Packit ca9683
#define MEDIUM_NODE_CHILDREN 4
Packit ca9683
#endif
Packit ca9683
Packit ca9683
struct node_index {
Packit ca9683
	/* if most-significant bit of val is on, it's an index. Otherwise,
Packit ca9683
	 * it's only a value (31 bit and nonzero).
Packit ca9683
	*/
Packit ca9683
	uint32_t val_or_index;
Packit ca9683
};
Packit ca9683
Packit ca9683
struct node {
Packit ca9683
	uint32_t value;
Packit ca9683
	struct node_index children[NUM_LETTERS];
Packit ca9683
};
Packit ca9683
struct node_small {
Packit ca9683
	uint32_t value;
Packit ca9683
	char chars[SMALL_NODE_CHILDREN];
Packit ca9683
	struct node_index children[SMALL_NODE_CHILDREN];
Packit ca9683
};
Packit ca9683
struct node_medium {
Packit ca9683
	uint32_t value;
Packit ca9683
	char chars[MEDIUM_NODE_CHILDREN];
Packit ca9683
	struct node_index children[MEDIUM_NODE_CHILDREN];
Packit ca9683
};
Packit ca9683
Packit ca9683
Packit ca9683
/* Note: char_to_letter prints a message when it comes across an invalid
Packit ca9683
   letter, so it should not be used in lookup(), only in reading the
Packit ca9683
   dictionary (which is assumed to contain only valid words). lookup()
Packit ca9683
   has its own implementation of this function inside it.
Packit ca9683
*/
Packit ca9683
static inline int char_to_letter(unsigned char c)
Packit ca9683
{
Packit ca9683
	if(c>=(unsigned char)'à' && c<(unsigned char)'à'+27){
Packit ca9683
		return c - (unsigned char)'à' + 2;
Packit ca9683
	} else if (c=='"'){
Packit ca9683
		return 0;
Packit ca9683
	} else if (c=='\''){
Packit ca9683
		return 1;
Packit ca9683
	} else {
Packit ca9683
		fprintf(stderr,"Hspell: unknown letter %c...\n",c);
Packit ca9683
		/* a silly thing to do, but what the heck */
Packit ca9683
		return 0;
Packit ca9683
	}
Packit ca9683
}
Packit ca9683
Packit ca9683
static inline unsigned char letter_to_char(int l)
Packit ca9683
{
Packit ca9683
	if(l>=2 && l<29){
Packit ca9683
		return l+(unsigned char)'à'-2;
Packit ca9683
	} else if(l==0){
Packit ca9683
		return '"';
Packit ca9683
	} else if(l==1){
Packit ca9683
		return '\'';
Packit ca9683
	} else {
Packit ca9683
		/* this will never happen in the current code: */
Packit ca9683
		fprintf(stderr,"Hspell: internal error: unknown letter %d... "
Packit ca9683
				"exiting.\n",l);
Packit ca9683
		exit(1);
Packit ca9683
	}
Packit ca9683
}
Packit ca9683
Packit ca9683
/* This routine was written for debugging purposes only, and not for
Packit ca9683
 * absolute efficiency.
Packit ca9683
 */
Packit ca9683
static void
Packit ca9683
do_print_tree(struct node *nodes, struct node_small *nodes_small,
Packit ca9683
	   struct node_medium *nodes_medium,
Packit ca9683
           struct node_index head, char *word, int len, int maxlen){
Packit ca9683
	int i;
Packit ca9683
	if(len>=maxlen){
Packit ca9683
		fprintf(stderr,"Hspell: do_print_tree(): warning: buffer overflow.\n");
Packit ca9683
		return;
Packit ca9683
	}
Packit ca9683
	if((head.val_or_index & HIGHBITS) == HIGHBITS_FULL){
Packit ca9683
		struct node *n = &nodes[head.val_or_index & VALUEMASK];
Packit ca9683
		if(n->value){
Packit ca9683
			word[len]='\0';
Packit ca9683
			printf("%s %d\n", word, n->value);
Packit ca9683
		}
Packit ca9683
		for(i=0;i
Packit ca9683
			word[len]=letter_to_char(i);
Packit ca9683
			do_print_tree(nodes,nodes_small,nodes_medium,
Packit ca9683
					n->children[i],word,len+1,maxlen);
Packit ca9683
		}
Packit ca9683
	} else if((head.val_or_index & HIGHBITS) == HIGHBITS_SMALL){
Packit ca9683
		struct node_small *n = &nodes_small[head.val_or_index & VALUEMASK];
Packit ca9683
		if(n->value){
Packit ca9683
			word[len]='\0';
Packit ca9683
			printf("%s %d\n", word, n->value);
Packit ca9683
		}
Packit ca9683
		for(i=0;i
Packit ca9683
			if(n->chars[i]){
Packit ca9683
				word[len]=n->chars[i];
Packit ca9683
				do_print_tree(nodes,nodes_small,nodes_medium,
Packit ca9683
					n->children[i],word,len+1,maxlen);
Packit ca9683
			}
Packit ca9683
		}
Packit ca9683
	} else if((head.val_or_index & HIGHBITS) == HIGHBITS_MEDIUM){
Packit ca9683
		struct node_medium *n = &nodes_medium[head.val_or_index & VALUEMASK];
Packit ca9683
		if(n->value){
Packit ca9683
			word[len]='\0';
Packit ca9683
			printf("%s %d\n", word, n->value);
Packit ca9683
		}
Packit ca9683
		for(i=0;i
Packit ca9683
			if(n->chars[i]){
Packit ca9683
				word[len]=n->chars[i];
Packit ca9683
				do_print_tree(nodes,nodes_small,nodes_medium,
Packit ca9683
					n->children[i],word,len+1,maxlen);
Packit ca9683
			}
Packit ca9683
		}
Packit ca9683
	} else if(head.val_or_index){
Packit ca9683
		word[len]='\0';
Packit ca9683
		printf("%s %d\n", word, head.val_or_index);
Packit ca9683
	}
Packit ca9683
}
Packit ca9683
Packit ca9683
struct dict_radix {
Packit ca9683
	/* The nodes used by the radix tree representation of the dictionary */
Packit ca9683
	int nnodes_small, size_nodes_small;
Packit ca9683
	struct node_small *nodes_small;
Packit ca9683
Packit ca9683
	int nnodes_medium, size_nodes_medium;
Packit ca9683
	struct node_medium *nodes_medium;
Packit ca9683
Packit ca9683
	int nnodes, size_nodes;
Packit ca9683
	struct node *nodes;
Packit ca9683
Packit ca9683
	struct node_index head;
Packit ca9683
Packit ca9683
	/* Freelist of recycled small nodes. As more words are added to the
Packit ca9683
	   dictionary in the process of read_dict(), small nodes become
Packit ca9683
	   medium and medium nodes become full. Because these small/medium
Packit ca9683
	   nodes that are no longer needed are in the middle of the node
Packit ca9683
	   list, we keep them aside in a freelist. They are recycled quickly,
Packit ca9683
	   as new small/medium nodes are continued to be created.
Packit ca9683
	 */
Packit ca9683
	int free_nodes_small[16], nfree_nodes_small;
Packit ca9683
	int free_nodes_medium[16], nfree_nodes_medium;
Packit ca9683
Packit ca9683
	int nwords;
Packit ca9683
};
Packit ca9683
Packit ca9683
/* new_dict_radix is the constructor for an opaque (to the includer of
Packit ca9683
   dict_radix.h) object.
Packit ca9683
*/
Packit ca9683
struct dict_radix *
Packit ca9683
new_dict_radix(void)
Packit ca9683
{
Packit ca9683
	struct dict_radix *dict;
Packit ca9683
	dict= (struct dict_radix *) malloc(sizeof(struct dict_radix));
Packit ca9683
	/* By default, zero all fields in dict_radix */
Packit ca9683
	if(dict)
Packit ca9683
		memset(dict, 0, sizeof(*dict));
Packit ca9683
	return dict;
Packit ca9683
}
Packit ca9683
Packit ca9683
/* Note that delete_dict_radix frees everything inside a dict_radix, and
Packit ca9683
   the dict_radix structure itself. The pointer given to it is no longer
Packit ca9683
   a valid pointer after this call.
Packit ca9683
*/
Packit ca9683
void
Packit ca9683
delete_dict_radix(struct dict_radix *dict)
Packit ca9683
{
Packit ca9683
	if(!dict)
Packit ca9683
		return; /* allow deleting null object, like in C++... */
Packit ca9683
	if(dict->nodes_small)
Packit ca9683
		free(dict->nodes_small);
Packit ca9683
	if(dict->nodes_medium)
Packit ca9683
		free(dict->nodes_medium);
Packit ca9683
	if(dict->nodes)
Packit ca9683
		free(dict->nodes);
Packit ca9683
	free(dict);
Packit ca9683
}
Packit ca9683
Packit ca9683
int
Packit ca9683
allocate_nodes(struct dict_radix *dict, int nsmall, int nmedium, int nfull)
Packit ca9683
{
Packit ca9683
	/* if already allocated, it's an error */
Packit ca9683
	if(dict->nodes)
Packit ca9683
		return -1;
Packit ca9683
Packit ca9683
	dict->nodes_small = malloc(sizeof(struct node_small)*nsmall);
Packit ca9683
	dict->size_nodes_small = nsmall;
Packit ca9683
Packit ca9683
	dict->nodes_medium = malloc(sizeof(struct node_medium)*nmedium);
Packit ca9683
	dict->size_nodes_medium = nmedium;
Packit ca9683
Packit ca9683
	dict->nodes = malloc(sizeof(struct node)*nfull);
Packit ca9683
	dict->size_nodes = nfull;
Packit ca9683
Packit ca9683
	if(dict->nodes_small==NULL || dict->nodes_medium==NULL ||
Packit ca9683
	   dict->nodes==NULL)
Packit ca9683
		return -2;
Packit ca9683
Packit ca9683
	return 0;
Packit ca9683
}
Packit ca9683
Packit ca9683
Packit ca9683
/* Efficiently read a compressed dictionary from the given directory.
Packit ca9683
   Use memory pre-allocation hints from another file in this directory.
Packit ca9683
Packit ca9683
   returns 1 on success, 0 on failure.
Packit ca9683
Packit ca9683
   TODO: there are too many printouts here. We need to return error
Packit ca9683
   numbers instead of all those printouts.
Packit ca9683
*/
Packit ca9683
Packit ca9683
#define PREFIX_FILE
Packit ca9683
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict);
Packit ca9683
#else
Packit ca9683
static int do_read_dict(FILE *fp, struct dict_radix *dict);
Packit ca9683
#endif
Packit ca9683
Packit ca9683
int
Packit ca9683
read_dict(struct dict_radix *dict, const char *dir)
Packit ca9683
{
Packit ca9683
	if(dir){
Packit ca9683
		FILE *fp;
Packit ca9683
		char s[1024];
Packit ca9683
		int small,medium,full,ret;
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
		FILE *prefixes;
Packit ca9683
#endif
Packit ca9683
Packit ca9683
		snprintf(s,sizeof(s),"%s.sizes",dir);
Packit ca9683
		if(!(fp=fopen(s,"r"))){
Packit ca9683
			fprintf(stderr,"Hspell: can't open %s.\n",s);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
		if(fscanf(fp,"%d %d %d",&small,&medium,&full)!=3){
Packit ca9683
			fprintf(stderr,"Hspell: can't read from %s.\n",s);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
		fclose(fp);
Packit ca9683
Packit ca9683
#ifdef HAVE_ZLIB
Packit ca9683
		if(!(fp=gzopen(dir,"r"))){
Packit ca9683
			fprintf(stderr,"Hspell: can't open %s.\n",dir);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
#else
Packit ca9683
		snprintf(s,sizeof(s),"gzip -dc '%s'",dir);
Packit ca9683
		if(!(fp=popen(s,"r"))){
Packit ca9683
			fprintf(stderr,"Hspell: can't run %s.\n",s);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
#endif /* HAVE_ZLIB */
Packit ca9683
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
#ifdef HAVE_ZLIB
Packit ca9683
		snprintf(s,sizeof(s),"%s.prefixes",dir);
Packit ca9683
		if(!(prefixes=gzopen(s,"rb"))){
Packit ca9683
			fprintf(stderr,"Hspell: can't open %s.\n",s);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
#else
Packit ca9683
		snprintf(s,sizeof(s),"gzip -dc '%s.prefixes'",dir);
Packit ca9683
		if(!(prefixes=popen(s,"rb"))){
Packit ca9683
			fprintf(stderr,"Hspell: can't run %s.\n",s);
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
#endif /* HAVE_ZLIB */
Packit ca9683
#endif
Packit ca9683
Packit ca9683
		allocate_nodes(dict,small,medium,full);
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
		ret=do_read_dict(fp, prefixes, dict);
Packit ca9683
		pclose(prefixes);
Packit ca9683
#else
Packit ca9683
		ret=do_read_dict(fp, dict);
Packit ca9683
#endif
Packit ca9683
		pclose(fp);
Packit ca9683
		return ret;
Packit ca9683
	} else {
Packit ca9683
#ifdef HAVE_ZLIB
Packit ca9683
		/* note that gzopen also works on non-gzipped files */
Packit ca9683
		FILE *in=gzdopen(fileno(stdin),"r");
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
		FILE *zero=gzopen("/dev/zero","r");
Packit ca9683
#endif
Packit ca9683
#else
Packit ca9683
		FILE *in=stdin;
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
		FILE *zero=fopen("/dev/zero","r");
Packit ca9683
#endif
Packit ca9683
#endif /* HAVE_ZLIB */
Packit ca9683
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
		return do_read_dict(in, zero, dict);
Packit ca9683
#else
Packit ca9683
		return do_read_dict(in, dict);
Packit ca9683
#endif
Packit ca9683
	}
Packit ca9683
}
Packit ca9683
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict)
Packit ca9683
#else
Packit ca9683
static int do_read_dict(FILE *fp, struct dict_radix *dict)
Packit ca9683
#endif
Packit ca9683
{
Packit ca9683
	struct node_index *stack[256];
Packit ca9683
	int sdepth=0;
Packit ca9683
	int c,n,cc;
Packit ca9683
	/* Local copies of dict-> variables, for efficiency. */
Packit ca9683
	int nwords=0;
Packit ca9683
	struct node *nodes = dict->nodes;
Packit ca9683
	struct node_small *nodes_small = dict->nodes_small;
Packit ca9683
	struct node_medium *nodes_medium = dict->nodes_medium;
Packit ca9683
	int nnodes_small=0, nnodes_medium=0, nnodes=0;
Packit ca9683
Packit ca9683
	if(dict->nnodes||dict->nnodes_small||dict->nnodes_medium||
Packit ca9683
	   dict->nwords){
Packit ca9683
		fprintf(stderr, "Hspell: do_read_dict(): called for a non-"
Packit ca9683
			"empty dictionary\n");
Packit ca9683
		return 0;
Packit ca9683
	}
Packit ca9683
	if(!nodes||!nodes_small||!nodes_medium){
Packit ca9683
		fprintf(stderr, "Hspell: do_read_dict(): allocate_nodes() must"
Packit ca9683
			" be called first\n");
Packit ca9683
		return 0;
Packit ca9683
	}
Packit ca9683
Packit ca9683
	memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
Packit ca9683
	dict->head.val_or_index=(nnodes++) | HIGHBITS_FULL;
Packit ca9683
	stack[0]=&dict->head;
Packit ca9683
	sdepth=0;
Packit ca9683
	while((c=getc(fp))!=EOF){
Packit ca9683
		if(c>='0' && c<='9'){
Packit ca9683
			/* new word - finalize old word first (set value) */
Packit ca9683
			nwords++; /* statistics */
Packit ca9683
			/* assert(!stack[sdepth]->val_or_index) */
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
			stack[sdepth]->val_or_index=getc(prefixes);
Packit ca9683
#else
Packit ca9683
			stack[sdepth]->val_or_index=nwords; /** TODO: different values */
Packit ca9683
#endif
Packit ca9683
			/* and read how much to go back */
Packit ca9683
			n=0;
Packit ca9683
			do {
Packit ca9683
				/* base 10... */
Packit ca9683
				n*=10;
Packit ca9683
				n+=(c-'0');
Packit ca9683
			} while ((c=getc(fp))!=EOF && c>='0' && c<='9');
Packit ca9683
			sdepth-=n;
Packit ca9683
			if(sdepth<0 || sdepth >= (sizeof(stack)/sizeof(stack[0]))-1){
Packit ca9683
				fprintf(stderr,"Hspell: bad backlength %d... giving up\n", sdepth);
Packit ca9683
				return 0;
Packit ca9683
			}
Packit ca9683
			/* we got a new letter c - continue the loop */
Packit ca9683
		}
Packit ca9683
		/* word letter - add it */
Packit ca9683
		if(sdepth>=sizeof(stack)/sizeof(stack[0])-1){
Packit ca9683
			fprintf(stderr,"Hspell: word too long... giving up\n");
Packit ca9683
			return 0;
Packit ca9683
		}
Packit ca9683
		cc=char_to_letter(c);
Packit ca9683
		/* make sure previous node is a small or full node, not just a
Packit ca9683
		 * value, and if it is small, that it's not full */
Packit ca9683
		if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_VALUE){
Packit ca9683
			int chosen;
Packit ca9683
			if(dict->nfree_nodes_small){
Packit ca9683
				chosen=dict->free_nodes_small
Packit ca9683
					[--(dict->nfree_nodes_small)];
Packit ca9683
			} else {
Packit ca9683
				chosen=nnodes_small;
Packit ca9683
				if(nnodes_small>=dict->size_nodes_small){
Packit ca9683
					fprintf(stderr,"Hspell: Realloc needed (small) - failing.\n");
Packit ca9683
					return 0;
Packit ca9683
				}
Packit ca9683
				nnodes_small++;
Packit ca9683
			}
Packit ca9683
			memset(&nodes_small[chosen], 0, sizeof(nodes_small[chosen]));
Packit ca9683
			nodes_small[chosen].value = stack[sdepth]->val_or_index;
Packit ca9683
			stack[sdepth]->val_or_index = chosen | HIGHBITS_SMALL;
Packit ca9683
Packit ca9683
			nodes_small[chosen].chars[0]=c;
Packit ca9683
			stack[sdepth+1] = &nodes_small[chosen].children[0];
Packit ca9683
		} else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_SMALL){
Packit ca9683
			int j;
Packit ca9683
			struct node_small *n=
Packit ca9683
			   &nodes_small[stack[sdepth]->val_or_index&VALUEMASK];
Packit ca9683
			/* is the small node not full yet? */
Packit ca9683
			for(j=0;j
Packit ca9683
				if(!n->chars[j]){
Packit ca9683
					n->chars[j]=c;
Packit ca9683
					stack[sdepth+1] = &n->children[j];
Packit ca9683
					break;
Packit ca9683
				}
Packit ca9683
			if(j==SMALL_NODE_CHILDREN){
Packit ca9683
				/* small node full! convert it to medium node */
Packit ca9683
				int chosen;
Packit ca9683
				if(dict->nfree_nodes_medium){
Packit ca9683
					chosen=dict->free_nodes_medium
Packit ca9683
						[--(dict->nfree_nodes_medium)];
Packit ca9683
				} else {
Packit ca9683
					chosen=nnodes_medium;
Packit ca9683
					if(nnodes_medium>=dict->size_nodes_medium){
Packit ca9683
						fprintf(stderr,"Hspell: Realloc needed (medium) - failing.\n");
Packit ca9683
						return 0;
Packit ca9683
					}
Packit ca9683
					nnodes_medium++;
Packit ca9683
				}
Packit ca9683
				memset(&nodes_medium[chosen], 0, sizeof(nodes_medium[chosen]));
Packit ca9683
				if(dict->nfree_nodes_small>=
Packit ca9683
				   sizeof(dict->free_nodes_small)/
Packit ca9683
				   sizeof(dict->free_nodes_small[0])){
Packit ca9683
					fprintf(stderr,"Hspell: overflow in free_nodes_small.\n");
Packit ca9683
					return 0;
Packit ca9683
				}
Packit ca9683
				dict->free_nodes_small
Packit ca9683
					[(dict->nfree_nodes_small)++]=
Packit ca9683
					stack[sdepth]->val_or_index & VALUEMASK;
Packit ca9683
				stack[sdepth]->val_or_index = chosen | HIGHBITS_MEDIUM;
Packit ca9683
				/* copy the children from n to nodes[nnodes]: */
Packit ca9683
				/* TODO: use memcpy instead! */
Packit ca9683
				nodes_medium[chosen].value = n->value;
Packit ca9683
				for(j=0;j
Packit ca9683
					nodes_medium[chosen].chars[j]=
Packit ca9683
						n->chars[j];
Packit ca9683
					nodes_medium[chosen].children[j]=
Packit ca9683
						n->children[j];
Packit ca9683
				}
Packit ca9683
				/* and finally choose the next child */
Packit ca9683
				nodes_medium[chosen].chars[SMALL_NODE_CHILDREN]=
Packit ca9683
					c;
Packit ca9683
				stack[sdepth+1] = &nodes_medium[chosen].
Packit ca9683
					children[SMALL_NODE_CHILDREN];
Packit ca9683
			}
Packit ca9683
		} else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_MEDIUM){
Packit ca9683
			int j;
Packit ca9683
			struct node_medium *n=
Packit ca9683
			   &nodes_medium[stack[sdepth]->val_or_index&VALUEMASK];
Packit ca9683
			/* is the medium node not full yet? */
Packit ca9683
			for(j=0;j
Packit ca9683
				if(!n->chars[j]){
Packit ca9683
					n->chars[j]=c;
Packit ca9683
					stack[sdepth+1] = &n->children[j];
Packit ca9683
					break;
Packit ca9683
				}
Packit ca9683
			if(j==MEDIUM_NODE_CHILDREN){
Packit ca9683
				/* medium node full! convert it to full node */
Packit ca9683
				if(nnodes>=dict->size_nodes){
Packit ca9683
					fprintf(stderr,"Hspell: Realloc needed (full) - failing.\n");
Packit ca9683
					return 0;
Packit ca9683
				}
Packit ca9683
				memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
Packit ca9683
				nodes[nnodes].value = n->value;
Packit ca9683
				if(dict->nfree_nodes_medium>=
Packit ca9683
				   sizeof(dict->free_nodes_medium)/
Packit ca9683
				   sizeof(dict->free_nodes_medium[0])){
Packit ca9683
					fprintf(stderr,"Hspell: overflow in free_nodes_medium.\n");
Packit ca9683
					return 0;
Packit ca9683
				}
Packit ca9683
				dict->free_nodes_medium
Packit ca9683
					[(dict->nfree_nodes_medium)++]=
Packit ca9683
					stack[sdepth]->val_or_index & VALUEMASK;
Packit ca9683
				stack[sdepth]->val_or_index = nnodes | HIGHBITS_FULL;
Packit ca9683
				/* copy the children from n to nodes[nnodes]: */
Packit ca9683
				for(j=0;j
Packit ca9683
					nodes[nnodes].children[char_to_letter(
Packit ca9683
						n->chars[j])]=
Packit ca9683
						n->children[j];
Packit ca9683
				/* and finally choose the next child */
Packit ca9683
				stack[sdepth+1] = &nodes[nnodes].children[cc];
Packit ca9683
				nnodes++;
Packit ca9683
			}
Packit ca9683
		} else { /* HIGHBITS_FULL */
Packit ca9683
			stack[sdepth+1] = &nodes[
Packit ca9683
			  stack[sdepth]->val_or_index & VALUEMASK].children[cc];
Packit ca9683
		}
Packit ca9683
		sdepth++;
Packit ca9683
	}
Packit ca9683
	/* output last word */
Packit ca9683
	nwords++; /* statistics */
Packit ca9683
#ifdef PREFIX_FILE
Packit ca9683
	stack[sdepth]->val_or_index=getc(prefixes);
Packit ca9683
#else
Packit ca9683
	stack[sdepth]->val_or_index=nwords; /** TODO: different values */
Packit ca9683
#endif
Packit ca9683
Packit ca9683
	/* return local copies to dict-> structure */
Packit ca9683
	dict->nwords=nwords;
Packit ca9683
	dict->nnodes_small=nnodes_small;
Packit ca9683
	dict->nnodes_medium=nnodes_medium;
Packit ca9683
	dict->nnodes=nnodes;
Packit ca9683
Packit ca9683
	return 1;
Packit ca9683
}
Packit ca9683
Packit ca9683
void
Packit ca9683
print_stats(struct dict_radix *dict)
Packit ca9683
{
Packit ca9683
	fprintf(stderr,	"%d words in %d full nodes, %d medium nodes, "
Packit ca9683
		"%d small nodes.\n", dict->nwords, dict->nnodes,
Packit ca9683
		dict->nnodes_medium, dict->nnodes_small);
Packit ca9683
	fprintf(stderr, "%d nfree_nodes_small %d nfree_nodes_medium.\n",
Packit ca9683
		dict->nfree_nodes_small,dict->nfree_nodes_medium);
Packit ca9683
	fprintf(stderr, "node memory filled: %d K\n",
Packit ca9683
	       (int)(dict->nnodes*sizeof(struct node)
Packit ca9683
		+ dict->nnodes_small*sizeof(struct node_small)
Packit ca9683
		+ dict->nnodes_medium*sizeof(struct node_medium)
Packit ca9683
		       )/1024);
Packit ca9683
}
Packit ca9683
Packit ca9683
void
Packit ca9683
print_tree(struct dict_radix *dict)
Packit ca9683
{
Packit ca9683
	char word[256];
Packit ca9683
	do_print_tree(dict->nodes,dict->nodes_small,dict->nodes_medium,
Packit ca9683
		      dict->head,word,0,sizeof(word));
Packit ca9683
Packit ca9683
}
Packit ca9683
Packit ca9683
void
Packit ca9683
print_sizes(struct dict_radix *dict)
Packit ca9683
{
Packit ca9683
	printf("%d %d %d\n", dict->nnodes_small, dict->nnodes_medium,
Packit ca9683
		dict->nnodes);
Packit ca9683
}
Packit ca9683
Packit ca9683
int
Packit ca9683
lookup(const struct dict_radix *dict, const char *word)
Packit ca9683
{
Packit ca9683
	struct node_index current = dict->head;
Packit ca9683
	for(;;){
Packit ca9683
		switch(current.val_or_index & HIGHBITS){
Packit ca9683
		case HIGHBITS_VALUE:
Packit ca9683
			if(*word){
Packit ca9683
				/* The word isn't over yet but we reached a
Packit ca9683
				   leaf node. So the word isn't in the dict */
Packit ca9683
				return 0;
Packit ca9683
			} else {
Packit ca9683
				return current.val_or_index & VALUEMASK;
Packit ca9683
			}
Packit ca9683
			break;
Packit ca9683
		case HIGHBITS_SMALL:
Packit ca9683
			if(*word){
Packit ca9683
				struct node_small *n =
Packit ca9683
				      &dict->nodes_small[current.val_or_index
Packit ca9683
							 & VALUEMASK];
Packit ca9683
#if SMALL_NODE_CHILDREN==2
Packit ca9683
				if(n->chars[0]==*word)
Packit ca9683
					current=n->children[0];
Packit ca9683
				else if(n->chars[1]==*word)
Packit ca9683
					current=n->children[1];
Packit ca9683
				else
Packit ca9683
					return 0; /* not found... */
Packit ca9683
#else
Packit ca9683
#error "small node lookup not implemented except for 2 children."
Packit ca9683
#endif
Packit ca9683
			} else {
Packit ca9683
				return dict->nodes_small[current.val_or_index
Packit ca9683
							 & VALUEMASK]
Packit ca9683
					.value;
Packit ca9683
			}
Packit ca9683
			break;
Packit ca9683
		case HIGHBITS_MEDIUM:
Packit ca9683
			if(*word){
Packit ca9683
				struct node_medium *n =
Packit ca9683
				      &dict->nodes_medium[current.val_or_index
Packit ca9683
							  & VALUEMASK];
Packit ca9683
#if MEDIUM_NODE_CHILDREN==8
Packit ca9683
				register char c=*word, *cs=n->chars;
Packit ca9683
				/* TODO: use binary search? stop searching
Packit ca9683
				   on the first 0? All these optimizations
Packit ca9683
				   are probably useless for 8 chars... */
Packit ca9683
				if(*(cs++)==c)      current=n->children[0];
Packit ca9683
				else if(*(cs++)==c) current=n->children[1];
Packit ca9683
				else if(*(cs++)==c) current=n->children[2];
Packit ca9683
				else if(*(cs++)==c) current=n->children[3];
Packit ca9683
				else if(*(cs++)==c) current=n->children[4];
Packit ca9683
				else if(*(cs++)==c) current=n->children[5];
Packit ca9683
				else if(*(cs++)==c) current=n->children[6];
Packit ca9683
				else if(*(cs++)==c) current=n->children[7];
Packit ca9683
				else
Packit ca9683
					return 0; /* not found... */
Packit ca9683
#else
Packit ca9683
#error "medium node lookup not implemented except for 8 children."
Packit ca9683
#endif
Packit ca9683
			} else {
Packit ca9683
				return dict->nodes_medium[current.val_or_index
Packit ca9683
							  & VALUEMASK]
Packit ca9683
					.value;
Packit ca9683
			}
Packit ca9683
			break;
Packit ca9683
		case HIGHBITS_FULL:
Packit ca9683
			if(*word){
Packit ca9683
				/* the following is a copy of char_to_letter */
Packit ca9683
				register int ind;
Packit ca9683
				register unsigned char c = *word;
Packit ca9683
				if(c>=(unsigned char)'à' &&
Packit ca9683
				   c<(unsigned char)'à'+27)
Packit ca9683
					ind = c - (unsigned char)'à' + 2;
Packit ca9683
				else if (c=='"')
Packit ca9683
					ind = 0;
Packit ca9683
				else if (c=='\'')
Packit ca9683
					ind = 1;
Packit ca9683
				else
Packit ca9683
					return 0; /* non-Hebrew letter */
Packit ca9683
				current=dict->nodes[current.val_or_index
Packit ca9683
						    & VALUEMASK]
Packit ca9683
					.children[ind];
Packit ca9683
			} else {
Packit ca9683
				return dict->nodes[current.val_or_index
Packit ca9683
						   & VALUEMASK].value;
Packit ca9683
			}
Packit ca9683
			break;
Packit ca9683
		}
Packit ca9683
		word++;
Packit ca9683
	}
Packit ca9683
}