Blob Blame History Raw
/* Copyright (C) 1995 Bjoern Beutel. */

/* Description. =============================================================*/

/* This module implements a static trie structure */

/* Includes. ================================================================*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "pools.h"
#include "tries.h"

/* Macros. ==================================================================*/

#define ALIGN(addr, n) (((ptr_t) (addr) + (ptr_t) (n - 1)) & ~ (ptr_t) (n - 1))
/* Align ADDR to next multiple of N. */

#define INTS(n) (((n) + (ptr_t ) 3) >> 2) 
/* Number of ints needed to store N bytes. */ 

/* Types. ===================================================================*/

typedef struct /* A node in a list of keys. */
{ 
  list_node_t *next;
  u_short_t key; /* Key for this node. */
  int_t subnode; /* Index of node for KEY in trie. */
  int_t content; /* Content associated with KEY. */
} key_node_t;

typedef struct /* A dynamic trie node. */
{ 
  string_t prefix; /* The prefix of this node. */
  int_t prefix_len; /* The length of PREFIX in bytes. */
  list_t key_list; /* List of keys. */
} trie_node_t;

/* The trie is a vector of int_t that contains compact trie nodes.
 * A compact trie node is int_t-aligned and looks as follows:
 *   u_byte_t prefix_len;
 *   char_t prefix[ prefix_len ];
 *   0 or 1 pad bytes of value 0;
 *   u_short_t subnode_count;
 *   u_short_t content_count;
 *   u_short_t subnode_keys[ subnode_count ];
 *   u_short_t content_keys[ content_count ];
 *   0 or 2 pad bytes of value 0;
 *   int_t subnodes[ subnode_count ];
 *   int_t contents[ content_count ]; */

typedef struct /* Pointers to the items of a compact trie node. */
{ 
  u_byte_t *prefix_len; /* Number of chars that precede the keys. */
  char_t *prefix; /* The chars that precede the keys. */ 
  u_short_t *subnode_count; /* The number of subnodes in this node. */
  u_short_t *content_count; /* The number of contents in this node. */
  u_short_t *subnode_keys; /* The keys for the subnodes in this node. */
  u_short_t *content_keys; /* The keys for the contents in this node. */
  int_t *subnodes; /* Indexes of the subnodes. */
  int_t *contents; /* Contents. */
} compact_node_t;

/*---------------------------------------------------------------------------*/

static int_t 
store_trie_node( pool_t trie_pool, trie_node_t *node )
/* Store NODE in TRIE_POOL and return its index. */
{
  int_t node_size; /* Size of the trie node in TRIE_POOL. */
  int_t node_index; /* Index of the trie node in TRIE_POOL. */
  int_t i, j, subnode_count, content_count;
  key_node_t *key_node;
  int_t *compact_node;
  compact_node_t n; /* Pointers to the items of the compactified NODE. */
  string_t s;
  char_t *d;

  /* Count the number of entries in NODE->KEY_LIST. */
  subnode_count = content_count = 0;
  FOREACH( key_node, node->key_list )
  {
    if (key_node->subnode != -1) 
      subnode_count++;
    if (key_node->content != -1) 
      content_count++;
  }

  /* Get pool space for the new node. */
  node_size = (INTS( sizeof( u_byte_t ) + sizeof( char_t ) * node->prefix_len 
                     + sizeof( u_short_t ) 
		     + sizeof( u_short_t ) * subnode_count
                     + sizeof( u_short_t ) 
		     + sizeof( u_short_t ) * content_count )
               + subnode_count + content_count);
  compact_node = get_pool_space( trie_pool, node_size, &node_index );

  /* Set up the pointers to the items of the node. */
  n.prefix_len = (u_byte_t *) compact_node;
  n.prefix = (char_t *) (n.prefix_len + 1);
  n.subnode_count = (u_short_t *) ALIGN(n.prefix + node->prefix_len, 2 );
  n.content_count = (u_short_t *) (n.subnode_count + 1);
  n.subnode_keys = (u_short_t *) (n.content_count + 1);
  n.content_keys = (u_short_t *) (n.subnode_keys + subnode_count);
  n.subnodes = (int_t *) ALIGN( n.content_keys + content_count, 4 );
  n.contents = (int_t *) n.subnodes + subnode_count;

  /* Copy the items. */
  *n.prefix_len = node->prefix_len;
  d = n.prefix;
  for (s = node->prefix; 
       s < node->prefix + node->prefix_len; 
       s = g_utf8_next_char( s ))
  {
    d += g_unichar_to_utf8( g_unichar_tolower( g_utf8_get_char( s ) ), d );
  }
  *n.subnode_count = subnode_count;
  *n.content_count = content_count;
  i = j = 0;
  FOREACH( key_node, node->key_list ) 
  { 
    if (key_node->subnode != -1)
    {
      n.subnode_keys[i] = key_node->key;
      n.subnodes[i] = key_node->subnode;
      i++;
    }
    if (key_node->content != -1)
    {
      n.content_keys[j] = key_node->key;
      n.contents[j] = key_node->content;
      j++;
    }
  }

  return node_index;
}

/*---------------------------------------------------------------------------*/

static int_t 
new_subtrie( pool_t trie_pool, trie_entry_t entries[], 
             int_t start, int_t end, int_t index )
/* Create a subtrie for ENTRIES[ START ]..ENTRIES[ END - 1 ], 
 * store it in TRIE_POOL and return the index of its root node.
 * The first INDEX bytes in the keys will be ignored, so they
 * must match for the keys in ENTRIES[ START ]..ENTRIES[ END - 1 ]. */
{
  trie_node_t node;
  int_t key_idx; /* The key index. */
  int_t node_index;
  int_t sub_start, sub_end; /* Start and end of a subtrie. */
  string_t first_key, last_key; /* Used as abbreviations. */
  string_t f, l, e;
  key_node_t *key_node;

  if (start >= end) 
    return -1;

  /* Look how many chars from INDEX onward are equal in the first and 
   * the last entry. This will be the prefix of the new node.
   * If the key of the first entry is as long as the prefix, shorten the prefix
   * by one character. */
  first_key = entries[ start ].key;
  last_key = entries[ end - 1 ].key;
  for (f = first_key + index, l = last_key + index; 
       *g_utf8_next_char( f ) != EOS; 
       f = g_utf8_next_char( f ), l = g_utf8_next_char( l ))
  {
    if (g_unichar_tolower( g_utf8_get_char( f ) ) 
	!= g_unichar_tolower( g_utf8_get_char( l ) ))
    {
      break;
    }
  }
  key_idx = f - first_key;
  node.prefix = first_key + index;
  node.prefix_len = key_idx - index;

  /* Scan list of entries for the first char behind the prefix. For each key,
   * find the corresponding lower and upper index in the list of entries. */
  clear_list( &node.key_list );
  for (sub_start = start; sub_start < end; sub_start = sub_end) 
  { 
    /* Create a new node for the current key. */
    key_node = new_node( &node.key_list, sizeof( key_node_t ), LIST_END );
    f = entries[ sub_start ].key + key_idx;
    key_node->key = g_unichar_tolower( g_utf8_get_char(f) );
    e = g_utf8_next_char(f);
    if (*e == EOS) 
    { 
      key_node->content = entries[ sub_start ].content;
      sub_start++;
    } 
    else 
      key_node->content = -1;

    /* Find last entry with same key. */
    for (sub_end = sub_start; sub_end < end; sub_end++) 
    {
      l = entries[ sub_end ].key + key_idx;
      if (g_unichar_tolower( g_utf8_get_char(l) ) != key_node->key)
	break;
    }
    key_node->subnode = new_subtrie( trie_pool, entries, sub_start, sub_end, 
                                     key_idx + e - f );
  }

  /* Store compact node and clear the key list. */
  node_index = store_trie_node( trie_pool, &node );
  while (node.key_list.first != NULL) 
    free_first_node( &node.key_list );

  return node_index;
}

/*---------------------------------------------------------------------------*/

void 
new_trie( int_t n, trie_entry_t *entries, 
          pool_t *trie_pool, int_t *root_node_index )
/* Take N ENTRIES and build a trie of them.
 * Return the trie in *TRIE_POOL 
 * and the index of its root node in *ROOT_NODE_INDEX.
 * The keys in *ENTRIES must be non-empty, unique, and sorted. */
{
  *trie_pool = new_pool( sizeof( int_t ) );
  *root_node_index = new_subtrie( *trie_pool, entries, 0, n, 0 );
}

/*---------------------------------------------------------------------------*/

bool_t
lookup_trie( int_t *trie, int_t *node_index, string_t *input, int_t *content )
/* Test if a prefix of *INPUT matches the node at *NODE_INDEX in TRIE.
 * If it does, return TRUE (else return FALSE) and:
 *   *CONTENT contains the associated content,
 *   *NODE contains the subnode for the matched input, and
 *   *INPUT points to the first char behind the prefix. */
{
  int_t lower, upper, middle;
  compact_node_t r; /* Pointers to the items of the root node; */
  gunichar c;

  while (*node_index != -1) 
  { 
    /* Test if node's prefix matches the given key. */
    r.prefix_len = (u_byte_t *) (trie + *node_index);
    r.prefix = (char_t *) (r.prefix_len + 1);
    if (strncmp_no_case( *input, r.prefix, *r.prefix_len ) != 0) 
      return FALSE;
    (*input) += *r.prefix_len;

    /* Get the rest of the node. */
    r.subnode_count = (u_short_t *) ALIGN(r.prefix + *r.prefix_len, 2);
    r.content_count = (u_short_t *) (r.subnode_count + 1);
    r.subnode_keys = (u_short_t *) (r.content_count + 1);
    r.content_keys = (u_short_t *) (r.subnode_keys + *r.subnode_count);
    r.subnodes = (int_t *) ALIGN( r.content_keys + *r.content_count, 4 );
    r.contents = (int_t *) (r.subnodes + *r.subnode_count);

    /* Perform binary search for subnode with given key. */
    c = g_unichar_tolower( g_utf8_get_char( *input ) );
    *node_index = -1;
    lower = 0;
    upper = *r.subnode_count - 1;
    while (lower <= upper) 
    { 
      middle = (lower + upper) / 2;
      if (c < r.subnode_keys[ middle ]) 
	upper = middle - 1;
      else if (c > r.subnode_keys[ middle ]) 
	lower = middle + 1;
      else /* This entry matches. */
      { 
	*node_index = r.subnodes[ middle ];
	break;
      }
    }

    /* Perform binary search for content with given key. */
    *content = -1;
    lower = 0;
    upper = *r.content_count - 1;
    while (lower <= upper) 
    { 
      middle = (lower + upper) / 2;
      if (c < r.content_keys[ middle ]) 
	upper = middle - 1;
      else if (c > r.content_keys[ middle ]) 
	lower = middle + 1;
      else /* This entry matches. */
      { 
        *content = r.contents[ middle ];
        break;
      }
    }

    *input = g_utf8_next_char( *input );
    if (*content != -1) 
      return TRUE;
  }
  return FALSE;
}

/* End of file. =============================================================*/