|
Packit |
d394d9 |
/* Copyright (C) 1995 Bjoern Beutel. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Description. =============================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* This module implements a static trie structure */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Includes. ================================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
#include <stdlib.h>
|
|
Packit |
d394d9 |
#include <stdio.h>
|
|
Packit |
d394d9 |
#include <string.h>
|
|
Packit |
d394d9 |
#include <setjmp.h>
|
|
Packit |
d394d9 |
#include <glib.h>
|
|
Packit |
d394d9 |
#include "basic.h"
|
|
Packit |
d394d9 |
#include "pools.h"
|
|
Packit |
d394d9 |
#include "tries.h"
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Macros. ==================================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
#define ALIGN(addr, n) (((ptr_t) (addr) + (ptr_t) (n - 1)) & ~ (ptr_t) (n - 1))
|
|
Packit |
d394d9 |
/* Align ADDR to next multiple of N. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
#define INTS(n) (((n) + (ptr_t ) 3) >> 2)
|
|
Packit |
d394d9 |
/* Number of ints needed to store N bytes. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Types. ===================================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
typedef struct /* A node in a list of keys. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
list_node_t *next;
|
|
Packit |
d394d9 |
u_short_t key; /* Key for this node. */
|
|
Packit |
d394d9 |
int_t subnode; /* Index of node for KEY in trie. */
|
|
Packit |
d394d9 |
int_t content; /* Content associated with KEY. */
|
|
Packit |
d394d9 |
} key_node_t;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
typedef struct /* A dynamic trie node. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
string_t prefix; /* The prefix of this node. */
|
|
Packit |
d394d9 |
int_t prefix_len; /* The length of PREFIX in bytes. */
|
|
Packit |
d394d9 |
list_t key_list; /* List of keys. */
|
|
Packit |
d394d9 |
} trie_node_t;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* The trie is a vector of int_t that contains compact trie nodes.
|
|
Packit |
d394d9 |
* A compact trie node is int_t-aligned and looks as follows:
|
|
Packit |
d394d9 |
* u_byte_t prefix_len;
|
|
Packit |
d394d9 |
* char_t prefix[ prefix_len ];
|
|
Packit |
d394d9 |
* 0 or 1 pad bytes of value 0;
|
|
Packit |
d394d9 |
* u_short_t subnode_count;
|
|
Packit |
d394d9 |
* u_short_t content_count;
|
|
Packit |
d394d9 |
* u_short_t subnode_keys[ subnode_count ];
|
|
Packit |
d394d9 |
* u_short_t content_keys[ content_count ];
|
|
Packit |
d394d9 |
* 0 or 2 pad bytes of value 0;
|
|
Packit |
d394d9 |
* int_t subnodes[ subnode_count ];
|
|
Packit |
d394d9 |
* int_t contents[ content_count ]; */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
typedef struct /* Pointers to the items of a compact trie node. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
u_byte_t *prefix_len; /* Number of chars that precede the keys. */
|
|
Packit |
d394d9 |
char_t *prefix; /* The chars that precede the keys. */
|
|
Packit |
d394d9 |
u_short_t *subnode_count; /* The number of subnodes in this node. */
|
|
Packit |
d394d9 |
u_short_t *content_count; /* The number of contents in this node. */
|
|
Packit |
d394d9 |
u_short_t *subnode_keys; /* The keys for the subnodes in this node. */
|
|
Packit |
d394d9 |
u_short_t *content_keys; /* The keys for the contents in this node. */
|
|
Packit |
d394d9 |
int_t *subnodes; /* Indexes of the subnodes. */
|
|
Packit |
d394d9 |
int_t *contents; /* Contents. */
|
|
Packit |
d394d9 |
} compact_node_t;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static int_t
|
|
Packit |
d394d9 |
store_trie_node( pool_t trie_pool, trie_node_t *node )
|
|
Packit |
d394d9 |
/* Store NODE in TRIE_POOL and return its index. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
int_t node_size; /* Size of the trie node in TRIE_POOL. */
|
|
Packit |
d394d9 |
int_t node_index; /* Index of the trie node in TRIE_POOL. */
|
|
Packit |
d394d9 |
int_t i, j, subnode_count, content_count;
|
|
Packit |
d394d9 |
key_node_t *key_node;
|
|
Packit |
d394d9 |
int_t *compact_node;
|
|
Packit |
d394d9 |
compact_node_t n; /* Pointers to the items of the compactified NODE. */
|
|
Packit |
d394d9 |
string_t s;
|
|
Packit |
d394d9 |
char_t *d;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Count the number of entries in NODE->KEY_LIST. */
|
|
Packit |
d394d9 |
subnode_count = content_count = 0;
|
|
Packit |
d394d9 |
FOREACH( key_node, node->key_list )
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (key_node->subnode != -1)
|
|
Packit |
d394d9 |
subnode_count++;
|
|
Packit |
d394d9 |
if (key_node->content != -1)
|
|
Packit |
d394d9 |
content_count++;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Get pool space for the new node. */
|
|
Packit |
d394d9 |
node_size = (INTS( sizeof( u_byte_t ) + sizeof( char_t ) * node->prefix_len
|
|
Packit |
d394d9 |
+ sizeof( u_short_t )
|
|
Packit |
d394d9 |
+ sizeof( u_short_t ) * subnode_count
|
|
Packit |
d394d9 |
+ sizeof( u_short_t )
|
|
Packit |
d394d9 |
+ sizeof( u_short_t ) * content_count )
|
|
Packit |
d394d9 |
+ subnode_count + content_count);
|
|
Packit |
d394d9 |
compact_node = get_pool_space( trie_pool, node_size, &node_index );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Set up the pointers to the items of the node. */
|
|
Packit |
d394d9 |
n.prefix_len = (u_byte_t *) compact_node;
|
|
Packit |
d394d9 |
n.prefix = (char_t *) (n.prefix_len + 1);
|
|
Packit |
d394d9 |
n.subnode_count = (u_short_t *) ALIGN(n.prefix + node->prefix_len, 2 );
|
|
Packit |
d394d9 |
n.content_count = (u_short_t *) (n.subnode_count + 1);
|
|
Packit |
d394d9 |
n.subnode_keys = (u_short_t *) (n.content_count + 1);
|
|
Packit |
d394d9 |
n.content_keys = (u_short_t *) (n.subnode_keys + subnode_count);
|
|
Packit |
d394d9 |
n.subnodes = (int_t *) ALIGN( n.content_keys + content_count, 4 );
|
|
Packit |
d394d9 |
n.contents = (int_t *) n.subnodes + subnode_count;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Copy the items. */
|
|
Packit |
d394d9 |
*n.prefix_len = node->prefix_len;
|
|
Packit |
d394d9 |
d = n.prefix;
|
|
Packit |
d394d9 |
for (s = node->prefix;
|
|
Packit |
d394d9 |
s < node->prefix + node->prefix_len;
|
|
Packit |
d394d9 |
s = g_utf8_next_char( s ))
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
d += g_unichar_to_utf8( g_unichar_tolower( g_utf8_get_char( s ) ), d );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
*n.subnode_count = subnode_count;
|
|
Packit |
d394d9 |
*n.content_count = content_count;
|
|
Packit |
d394d9 |
i = j = 0;
|
|
Packit |
d394d9 |
FOREACH( key_node, node->key_list )
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (key_node->subnode != -1)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
n.subnode_keys[i] = key_node->key;
|
|
Packit |
d394d9 |
n.subnodes[i] = key_node->subnode;
|
|
Packit |
d394d9 |
i++;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
if (key_node->content != -1)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
n.content_keys[j] = key_node->key;
|
|
Packit |
d394d9 |
n.contents[j] = key_node->content;
|
|
Packit |
d394d9 |
j++;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
return node_index;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static int_t
|
|
Packit |
d394d9 |
new_subtrie( pool_t trie_pool, trie_entry_t entries[],
|
|
Packit |
d394d9 |
int_t start, int_t end, int_t index )
|
|
Packit |
d394d9 |
/* Create a subtrie for ENTRIES[ START ]..ENTRIES[ END - 1 ],
|
|
Packit |
d394d9 |
* store it in TRIE_POOL and return the index of its root node.
|
|
Packit |
d394d9 |
* The first INDEX bytes in the keys will be ignored, so they
|
|
Packit |
d394d9 |
* must match for the keys in ENTRIES[ START ]..ENTRIES[ END - 1 ]. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
trie_node_t node;
|
|
Packit |
d394d9 |
int_t key_idx; /* The key index. */
|
|
Packit |
d394d9 |
int_t node_index;
|
|
Packit |
d394d9 |
int_t sub_start, sub_end; /* Start and end of a subtrie. */
|
|
Packit |
d394d9 |
string_t first_key, last_key; /* Used as abbreviations. */
|
|
Packit |
d394d9 |
string_t f, l, e;
|
|
Packit |
d394d9 |
key_node_t *key_node;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
if (start >= end)
|
|
Packit |
d394d9 |
return -1;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Look how many chars from INDEX onward are equal in the first and
|
|
Packit |
d394d9 |
* the last entry. This will be the prefix of the new node.
|
|
Packit |
d394d9 |
* If the key of the first entry is as long as the prefix, shorten the prefix
|
|
Packit |
d394d9 |
* by one character. */
|
|
Packit |
d394d9 |
first_key = entries[ start ].key;
|
|
Packit |
d394d9 |
last_key = entries[ end - 1 ].key;
|
|
Packit |
d394d9 |
for (f = first_key + index, l = last_key + index;
|
|
Packit |
d394d9 |
*g_utf8_next_char( f ) != EOS;
|
|
Packit |
d394d9 |
f = g_utf8_next_char( f ), l = g_utf8_next_char( l ))
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (g_unichar_tolower( g_utf8_get_char( f ) )
|
|
Packit |
d394d9 |
!= g_unichar_tolower( g_utf8_get_char( l ) ))
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
key_idx = f - first_key;
|
|
Packit |
d394d9 |
node.prefix = first_key + index;
|
|
Packit |
d394d9 |
node.prefix_len = key_idx - index;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Scan list of entries for the first char behind the prefix. For each key,
|
|
Packit |
d394d9 |
* find the corresponding lower and upper index in the list of entries. */
|
|
Packit |
d394d9 |
clear_list( &node.key_list );
|
|
Packit |
d394d9 |
for (sub_start = start; sub_start < end; sub_start = sub_end)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Create a new node for the current key. */
|
|
Packit |
d394d9 |
key_node = new_node( &node.key_list, sizeof( key_node_t ), LIST_END );
|
|
Packit |
d394d9 |
f = entries[ sub_start ].key + key_idx;
|
|
Packit |
d394d9 |
key_node->key = g_unichar_tolower( g_utf8_get_char(f) );
|
|
Packit |
d394d9 |
e = g_utf8_next_char(f);
|
|
Packit |
d394d9 |
if (*e == EOS)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
key_node->content = entries[ sub_start ].content;
|
|
Packit |
d394d9 |
sub_start++;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
key_node->content = -1;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Find last entry with same key. */
|
|
Packit |
d394d9 |
for (sub_end = sub_start; sub_end < end; sub_end++)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
l = entries[ sub_end ].key + key_idx;
|
|
Packit |
d394d9 |
if (g_unichar_tolower( g_utf8_get_char(l) ) != key_node->key)
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
key_node->subnode = new_subtrie( trie_pool, entries, sub_start, sub_end,
|
|
Packit |
d394d9 |
key_idx + e - f );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Store compact node and clear the key list. */
|
|
Packit |
d394d9 |
node_index = store_trie_node( trie_pool, &node );
|
|
Packit |
d394d9 |
while (node.key_list.first != NULL)
|
|
Packit |
d394d9 |
free_first_node( &node.key_list );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
return node_index;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
void
|
|
Packit |
d394d9 |
new_trie( int_t n, trie_entry_t *entries,
|
|
Packit |
d394d9 |
pool_t *trie_pool, int_t *root_node_index )
|
|
Packit |
d394d9 |
/* Take N ENTRIES and build a trie of them.
|
|
Packit |
d394d9 |
* Return the trie in *TRIE_POOL
|
|
Packit |
d394d9 |
* and the index of its root node in *ROOT_NODE_INDEX.
|
|
Packit |
d394d9 |
* The keys in *ENTRIES must be non-empty, unique, and sorted. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
*trie_pool = new_pool( sizeof( int_t ) );
|
|
Packit |
d394d9 |
*root_node_index = new_subtrie( *trie_pool, entries, 0, n, 0 );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
bool_t
|
|
Packit |
d394d9 |
lookup_trie( int_t *trie, int_t *node_index, string_t *input, int_t *content )
|
|
Packit |
d394d9 |
/* Test if a prefix of *INPUT matches the node at *NODE_INDEX in TRIE.
|
|
Packit |
d394d9 |
* If it does, return TRUE (else return FALSE) and:
|
|
Packit |
d394d9 |
* *CONTENT contains the associated content,
|
|
Packit |
d394d9 |
* *NODE contains the subnode for the matched input, and
|
|
Packit |
d394d9 |
* *INPUT points to the first char behind the prefix. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
int_t lower, upper, middle;
|
|
Packit |
d394d9 |
compact_node_t r; /* Pointers to the items of the root node; */
|
|
Packit |
d394d9 |
gunichar c;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
while (*node_index != -1)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Test if node's prefix matches the given key. */
|
|
Packit |
d394d9 |
r.prefix_len = (u_byte_t *) (trie + *node_index);
|
|
Packit |
d394d9 |
r.prefix = (char_t *) (r.prefix_len + 1);
|
|
Packit |
d394d9 |
if (strncmp_no_case( *input, r.prefix, *r.prefix_len ) != 0)
|
|
Packit |
d394d9 |
return FALSE;
|
|
Packit |
d394d9 |
(*input) += *r.prefix_len;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Get the rest of the node. */
|
|
Packit |
d394d9 |
r.subnode_count = (u_short_t *) ALIGN(r.prefix + *r.prefix_len, 2);
|
|
Packit |
d394d9 |
r.content_count = (u_short_t *) (r.subnode_count + 1);
|
|
Packit |
d394d9 |
r.subnode_keys = (u_short_t *) (r.content_count + 1);
|
|
Packit |
d394d9 |
r.content_keys = (u_short_t *) (r.subnode_keys + *r.subnode_count);
|
|
Packit |
d394d9 |
r.subnodes = (int_t *) ALIGN( r.content_keys + *r.content_count, 4 );
|
|
Packit |
d394d9 |
r.contents = (int_t *) (r.subnodes + *r.subnode_count);
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Perform binary search for subnode with given key. */
|
|
Packit |
d394d9 |
c = g_unichar_tolower( g_utf8_get_char( *input ) );
|
|
Packit |
d394d9 |
*node_index = -1;
|
|
Packit |
d394d9 |
lower = 0;
|
|
Packit |
d394d9 |
upper = *r.subnode_count - 1;
|
|
Packit |
d394d9 |
while (lower <= upper)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
middle = (lower + upper) / 2;
|
|
Packit |
d394d9 |
if (c < r.subnode_keys[ middle ])
|
|
Packit |
d394d9 |
upper = middle - 1;
|
|
Packit |
d394d9 |
else if (c > r.subnode_keys[ middle ])
|
|
Packit |
d394d9 |
lower = middle + 1;
|
|
Packit |
d394d9 |
else /* This entry matches. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
*node_index = r.subnodes[ middle ];
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Perform binary search for content with given key. */
|
|
Packit |
d394d9 |
*content = -1;
|
|
Packit |
d394d9 |
lower = 0;
|
|
Packit |
d394d9 |
upper = *r.content_count - 1;
|
|
Packit |
d394d9 |
while (lower <= upper)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
middle = (lower + upper) / 2;
|
|
Packit |
d394d9 |
if (c < r.content_keys[ middle ])
|
|
Packit |
d394d9 |
upper = middle - 1;
|
|
Packit |
d394d9 |
else if (c > r.content_keys[ middle ])
|
|
Packit |
d394d9 |
lower = middle + 1;
|
|
Packit |
d394d9 |
else /* This entry matches. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
*content = r.contents[ middle ];
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
*input = g_utf8_next_char( *input );
|
|
Packit |
d394d9 |
if (*content != -1)
|
|
Packit |
d394d9 |
return TRUE;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
return FALSE;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* End of file. =============================================================*/
|