/* Copyright (C) 1995 Bjoern Beutel. */
/* Description. =============================================================*/
/* This module implements a static trie structure */
/* Includes. ================================================================*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "pools.h"
#include "tries.h"
/* Macros. ==================================================================*/
#define ALIGN(addr, n) (((ptr_t) (addr) + (ptr_t) (n - 1)) & ~ (ptr_t) (n - 1))
/* Align ADDR to next multiple of N. */
#define INTS(n) (((n) + (ptr_t ) 3) >> 2)
/* Number of ints needed to store N bytes. */
/* Types. ===================================================================*/
typedef struct /* A node in a list of keys. */
{
list_node_t *next;
u_short_t key; /* Key for this node. */
int_t subnode; /* Index of node for KEY in trie. */
int_t content; /* Content associated with KEY. */
} key_node_t;
typedef struct /* A dynamic trie node. */
{
string_t prefix; /* The prefix of this node. */
int_t prefix_len; /* The length of PREFIX in bytes. */
list_t key_list; /* List of keys. */
} trie_node_t;
/* The trie is a vector of int_t that contains compact trie nodes.
* A compact trie node is int_t-aligned and looks as follows:
* u_byte_t prefix_len;
* char_t prefix[ prefix_len ];
* 0 or 1 pad bytes of value 0;
* u_short_t subnode_count;
* u_short_t content_count;
* u_short_t subnode_keys[ subnode_count ];
* u_short_t content_keys[ content_count ];
* 0 or 2 pad bytes of value 0;
* int_t subnodes[ subnode_count ];
* int_t contents[ content_count ]; */
typedef struct /* Pointers to the items of a compact trie node. */
{
u_byte_t *prefix_len; /* Number of chars that precede the keys. */
char_t *prefix; /* The chars that precede the keys. */
u_short_t *subnode_count; /* The number of subnodes in this node. */
u_short_t *content_count; /* The number of contents in this node. */
u_short_t *subnode_keys; /* The keys for the subnodes in this node. */
u_short_t *content_keys; /* The keys for the contents in this node. */
int_t *subnodes; /* Indexes of the subnodes. */
int_t *contents; /* Contents. */
} compact_node_t;
/*---------------------------------------------------------------------------*/
static int_t
store_trie_node( pool_t trie_pool, trie_node_t *node )
/* Store NODE in TRIE_POOL and return its index. */
{
int_t node_size; /* Size of the trie node in TRIE_POOL. */
int_t node_index; /* Index of the trie node in TRIE_POOL. */
int_t i, j, subnode_count, content_count;
key_node_t *key_node;
int_t *compact_node;
compact_node_t n; /* Pointers to the items of the compactified NODE. */
string_t s;
char_t *d;
/* Count the number of entries in NODE->KEY_LIST. */
subnode_count = content_count = 0;
FOREACH( key_node, node->key_list )
{
if (key_node->subnode != -1)
subnode_count++;
if (key_node->content != -1)
content_count++;
}
/* Get pool space for the new node. */
node_size = (INTS( sizeof( u_byte_t ) + sizeof( char_t ) * node->prefix_len
+ sizeof( u_short_t )
+ sizeof( u_short_t ) * subnode_count
+ sizeof( u_short_t )
+ sizeof( u_short_t ) * content_count )
+ subnode_count + content_count);
compact_node = get_pool_space( trie_pool, node_size, &node_index );
/* Set up the pointers to the items of the node. */
n.prefix_len = (u_byte_t *) compact_node;
n.prefix = (char_t *) (n.prefix_len + 1);
n.subnode_count = (u_short_t *) ALIGN(n.prefix + node->prefix_len, 2 );
n.content_count = (u_short_t *) (n.subnode_count + 1);
n.subnode_keys = (u_short_t *) (n.content_count + 1);
n.content_keys = (u_short_t *) (n.subnode_keys + subnode_count);
n.subnodes = (int_t *) ALIGN( n.content_keys + content_count, 4 );
n.contents = (int_t *) n.subnodes + subnode_count;
/* Copy the items. */
*n.prefix_len = node->prefix_len;
d = n.prefix;
for (s = node->prefix;
s < node->prefix + node->prefix_len;
s = g_utf8_next_char( s ))
{
d += g_unichar_to_utf8( g_unichar_tolower( g_utf8_get_char( s ) ), d );
}
*n.subnode_count = subnode_count;
*n.content_count = content_count;
i = j = 0;
FOREACH( key_node, node->key_list )
{
if (key_node->subnode != -1)
{
n.subnode_keys[i] = key_node->key;
n.subnodes[i] = key_node->subnode;
i++;
}
if (key_node->content != -1)
{
n.content_keys[j] = key_node->key;
n.contents[j] = key_node->content;
j++;
}
}
return node_index;
}
/*---------------------------------------------------------------------------*/
static int_t
new_subtrie( pool_t trie_pool, trie_entry_t entries[],
int_t start, int_t end, int_t index )
/* Create a subtrie for ENTRIES[ START ]..ENTRIES[ END - 1 ],
* store it in TRIE_POOL and return the index of its root node.
* The first INDEX bytes in the keys will be ignored, so they
* must match for the keys in ENTRIES[ START ]..ENTRIES[ END - 1 ]. */
{
trie_node_t node;
int_t key_idx; /* The key index. */
int_t node_index;
int_t sub_start, sub_end; /* Start and end of a subtrie. */
string_t first_key, last_key; /* Used as abbreviations. */
string_t f, l, e;
key_node_t *key_node;
if (start >= end)
return -1;
/* Look how many chars from INDEX onward are equal in the first and
* the last entry. This will be the prefix of the new node.
* If the key of the first entry is as long as the prefix, shorten the prefix
* by one character. */
first_key = entries[ start ].key;
last_key = entries[ end - 1 ].key;
for (f = first_key + index, l = last_key + index;
*g_utf8_next_char( f ) != EOS;
f = g_utf8_next_char( f ), l = g_utf8_next_char( l ))
{
if (g_unichar_tolower( g_utf8_get_char( f ) )
!= g_unichar_tolower( g_utf8_get_char( l ) ))
{
break;
}
}
key_idx = f - first_key;
node.prefix = first_key + index;
node.prefix_len = key_idx - index;
/* Scan list of entries for the first char behind the prefix. For each key,
* find the corresponding lower and upper index in the list of entries. */
clear_list( &node.key_list );
for (sub_start = start; sub_start < end; sub_start = sub_end)
{
/* Create a new node for the current key. */
key_node = new_node( &node.key_list, sizeof( key_node_t ), LIST_END );
f = entries[ sub_start ].key + key_idx;
key_node->key = g_unichar_tolower( g_utf8_get_char(f) );
e = g_utf8_next_char(f);
if (*e == EOS)
{
key_node->content = entries[ sub_start ].content;
sub_start++;
}
else
key_node->content = -1;
/* Find last entry with same key. */
for (sub_end = sub_start; sub_end < end; sub_end++)
{
l = entries[ sub_end ].key + key_idx;
if (g_unichar_tolower( g_utf8_get_char(l) ) != key_node->key)
break;
}
key_node->subnode = new_subtrie( trie_pool, entries, sub_start, sub_end,
key_idx + e - f );
}
/* Store compact node and clear the key list. */
node_index = store_trie_node( trie_pool, &node );
while (node.key_list.first != NULL)
free_first_node( &node.key_list );
return node_index;
}
/*---------------------------------------------------------------------------*/
void
new_trie( int_t n, trie_entry_t *entries,
pool_t *trie_pool, int_t *root_node_index )
/* Take N ENTRIES and build a trie of them.
* Return the trie in *TRIE_POOL
* and the index of its root node in *ROOT_NODE_INDEX.
* The keys in *ENTRIES must be non-empty, unique, and sorted. */
{
*trie_pool = new_pool( sizeof( int_t ) );
*root_node_index = new_subtrie( *trie_pool, entries, 0, n, 0 );
}
/*---------------------------------------------------------------------------*/
bool_t
lookup_trie( int_t *trie, int_t *node_index, string_t *input, int_t *content )
/* Test if a prefix of *INPUT matches the node at *NODE_INDEX in TRIE.
* If it does, return TRUE (else return FALSE) and:
* *CONTENT contains the associated content,
* *NODE contains the subnode for the matched input, and
* *INPUT points to the first char behind the prefix. */
{
int_t lower, upper, middle;
compact_node_t r; /* Pointers to the items of the root node; */
gunichar c;
while (*node_index != -1)
{
/* Test if node's prefix matches the given key. */
r.prefix_len = (u_byte_t *) (trie + *node_index);
r.prefix = (char_t *) (r.prefix_len + 1);
if (strncmp_no_case( *input, r.prefix, *r.prefix_len ) != 0)
return FALSE;
(*input) += *r.prefix_len;
/* Get the rest of the node. */
r.subnode_count = (u_short_t *) ALIGN(r.prefix + *r.prefix_len, 2);
r.content_count = (u_short_t *) (r.subnode_count + 1);
r.subnode_keys = (u_short_t *) (r.content_count + 1);
r.content_keys = (u_short_t *) (r.subnode_keys + *r.subnode_count);
r.subnodes = (int_t *) ALIGN( r.content_keys + *r.content_count, 4 );
r.contents = (int_t *) (r.subnodes + *r.subnode_count);
/* Perform binary search for subnode with given key. */
c = g_unichar_tolower( g_utf8_get_char( *input ) );
*node_index = -1;
lower = 0;
upper = *r.subnode_count - 1;
while (lower <= upper)
{
middle = (lower + upper) / 2;
if (c < r.subnode_keys[ middle ])
upper = middle - 1;
else if (c > r.subnode_keys[ middle ])
lower = middle + 1;
else /* This entry matches. */
{
*node_index = r.subnodes[ middle ];
break;
}
}
/* Perform binary search for content with given key. */
*content = -1;
lower = 0;
upper = *r.content_count - 1;
while (lower <= upper)
{
middle = (lower + upper) / 2;
if (c < r.content_keys[ middle ])
upper = middle - 1;
else if (c > r.content_keys[ middle ])
lower = middle + 1;
else /* This entry matches. */
{
*content = r.contents[ middle ];
break;
}
}
*input = g_utf8_next_char( *input );
if (*content != -1)
return TRUE;
}
return FALSE;
}
/* End of file. =============================================================*/