Blob Blame History Raw
/* dzl-fuzzy-mutable-index.c
 *
 * Copyright (C) 2014-2017 Christian Hergert <christian@hergert.me>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#include <ctype.h>
#include <string.h>

#include "search/dzl-fuzzy-mutable-index.h"

/**
 * SECTION:dzl-fuzzy-mutable-index
 * @title: DzlFuzzyMutableIndex Matching
 * @short_description: DzlFuzzyMutableIndex matching for GLib based programs.
 *
 * #DzlFuzzyMutableIndex provides a fulltext index that focuses around fuzzy
 * matching words. This version of the datastructure is focused around
 * in-memory storage. This makes mutability performance of adding or removing
 * items from the corpus simpler.
 *
 * If you need mostly read-only indexes, you might consider using
 * #DzlFuzzyIndex and #DzlFuzzyIndexBuilder which can create a disk-based file
 * and mmap() a read-only version of the data set.
 *
 * It is a programming error to modify #Fuzzy while holding onto an array
 * of #FuzzyMatch elements. The position of strings within the DzlFuzzyMutableIndexMatch
 * may no longer be valid.
 */

G_DEFINE_BOXED_TYPE (DzlFuzzyMutableIndex, dzl_fuzzy_mutable_index,
                     (GBoxedCopyFunc)dzl_fuzzy_mutable_index_ref,
                     (GBoxedFreeFunc)dzl_fuzzy_mutable_index_unref)

struct _DzlFuzzyMutableIndex
{
  volatile gint   ref_count;
  GByteArray     *heap;
  GArray         *id_to_text_offset;
  GPtrArray      *id_to_value;
  GHashTable     *char_tables;
  GHashTable     *removed;
  guint           in_bulk_insert : 1;
  guint           case_sensitive : 1;
};

#pragma pack(push, 1)
typedef struct
{
  guint id;
  guint pos : 16;
} DzlFuzzyMutableIndexItem;
#pragma pack(pop)

G_STATIC_ASSERT (sizeof(DzlFuzzyMutableIndexItem) == 6);

typedef struct
{
   DzlFuzzyMutableIndex        *fuzzy;
   GArray      **tables;
   gint         *state;
   guint         n_tables;
   gsize         max_matches;
   const gchar  *needle;
   GHashTable   *matches;
} DzlFuzzyMutableIndexLookup;

static gint
dzl_fuzzy_mutable_index_item_compare (gconstpointer a,
                                      gconstpointer b)
{
  gint ret;

  const DzlFuzzyMutableIndexItem *fa = a;
  const DzlFuzzyMutableIndexItem *fb = b;

  if ((ret = fa->id - fb->id) == 0)
    ret = fa->pos - fb->pos;

  return ret;
}

static gint
dzl_fuzzy_mutable_index_match_compare (gconstpointer a,
                                       gconstpointer b)
{
  const DzlFuzzyMutableIndexMatch *ma = a;
  const DzlFuzzyMutableIndexMatch *mb = b;

  if (ma->score < mb->score) {
    return 1;
  } else if (ma->score > mb->score) {
    return -1;
  }

  return strcmp (ma->key, mb->key);
}

DzlFuzzyMutableIndex *
dzl_fuzzy_mutable_index_ref (DzlFuzzyMutableIndex *fuzzy)
{
  g_return_val_if_fail (fuzzy, NULL);
  g_return_val_if_fail (fuzzy->ref_count > 0, NULL);

  g_atomic_int_inc (&fuzzy->ref_count);

  return fuzzy;
}

/**
 * dzl_fuzzy_mutable_index_new:
 * @case_sensitive: %TRUE if case should be preserved.
 *
 * Create a new #Fuzzy for fuzzy matching strings.
 *
 * Returns: A newly allocated #Fuzzy that should be freed with dzl_fuzzy_mutable_index_unref().
 */
DzlFuzzyMutableIndex *
dzl_fuzzy_mutable_index_new (gboolean case_sensitive)
{
  DzlFuzzyMutableIndex *fuzzy;

  fuzzy = g_slice_new0 (DzlFuzzyMutableIndex);
  fuzzy->ref_count = 1;
  fuzzy->heap = g_byte_array_new ();
  fuzzy->id_to_value = g_ptr_array_new ();
  fuzzy->id_to_text_offset = g_array_new (FALSE, FALSE, sizeof (gsize));
  fuzzy->char_tables = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)g_array_unref);
  fuzzy->case_sensitive = case_sensitive;
  fuzzy->removed = g_hash_table_new (g_direct_hash, g_direct_equal);

  return fuzzy;
}

DzlFuzzyMutableIndex *
dzl_fuzzy_mutable_index_new_with_free_func (gboolean       case_sensitive,
                                            GDestroyNotify free_func)
{
  DzlFuzzyMutableIndex *fuzzy;

  fuzzy = dzl_fuzzy_mutable_index_new (case_sensitive);
  dzl_fuzzy_mutable_index_set_free_func (fuzzy, free_func);

  return fuzzy;
}

void
dzl_fuzzy_mutable_index_set_free_func (DzlFuzzyMutableIndex *fuzzy,
                                       GDestroyNotify        free_func)
{
  g_return_if_fail (fuzzy);

  g_ptr_array_set_free_func (fuzzy->id_to_value, free_func);
}

static gsize
dzl_fuzzy_mutable_index_heap_insert (DzlFuzzyMutableIndex *fuzzy,
                                     const gchar          *text)
{
  gsize ret;

  g_assert (fuzzy != NULL);
  g_assert (text != NULL);

  ret = fuzzy->heap->len;

  g_byte_array_append (fuzzy->heap, (guint8 *)text, strlen (text) + 1);

  return ret;
}

/**
 * dzl_fuzzy_mutable_index_begin_bulk_insert:
 * @fuzzy: (in): A #Fuzzy.
 *
 * Start a bulk insertion. @fuzzy is not ready for searching until
 * dzl_fuzzy_mutable_index_end_bulk_insert() has been called.
 *
 * This allows for inserting large numbers of strings and deferring
 * the final sort until dzl_fuzzy_mutable_index_end_bulk_insert().
 */
void
dzl_fuzzy_mutable_index_begin_bulk_insert (DzlFuzzyMutableIndex *fuzzy)
{
   g_return_if_fail (fuzzy);
   g_return_if_fail (!fuzzy->in_bulk_insert);

   fuzzy->in_bulk_insert = TRUE;
}

/**
 * dzl_fuzzy_mutable_index_end_bulk_insert:
 * @fuzzy: (in): A #Fuzzy.
 *
 * Complete a bulk insert and resort the index.
 */
void
dzl_fuzzy_mutable_index_end_bulk_insert (DzlFuzzyMutableIndex *fuzzy)
{
   GHashTableIter iter;
   gpointer key;
   gpointer value;

   g_return_if_fail(fuzzy);
   g_return_if_fail(fuzzy->in_bulk_insert);

   fuzzy->in_bulk_insert = FALSE;

   g_hash_table_iter_init (&iter, fuzzy->char_tables);

   while (g_hash_table_iter_next (&iter, &key, &value)) {
      GArray *table = value;

      g_array_sort (table, dzl_fuzzy_mutable_index_item_compare);
   }
}

/**
 * dzl_fuzzy_mutable_index_insert:
 * @fuzzy: (in): A #Fuzzy.
 * @key: (in): A UTF-8 encoded string.
 * @value: (in): A value to associate with key.
 *
 * Inserts a string into the fuzzy matcher.
 */
void
dzl_fuzzy_mutable_index_insert (DzlFuzzyMutableIndex *fuzzy,
                                const gchar          *key,
                                gpointer              value)
{
  const gchar *tmp;
  gchar *downcase = NULL;
  gsize offset;
  guint id;

  if (G_UNLIKELY (!key || !*key || (fuzzy->id_to_text_offset->len == G_MAXUINT)))
    return;

  if (!fuzzy->case_sensitive)
    downcase = g_utf8_casefold (key, -1);

  offset = dzl_fuzzy_mutable_index_heap_insert (fuzzy, key);
  id = fuzzy->id_to_text_offset->len;
  g_array_append_val (fuzzy->id_to_text_offset, offset);
  g_ptr_array_add (fuzzy->id_to_value, value);

  if (!fuzzy->case_sensitive)
    key = downcase;

  for (tmp = key; *tmp; tmp = g_utf8_next_char (tmp))
    {
      gunichar ch = g_utf8_get_char (tmp);
      GArray *table;
      DzlFuzzyMutableIndexItem item;

      table = g_hash_table_lookup (fuzzy->char_tables, GINT_TO_POINTER (ch));

      if (G_UNLIKELY (table == NULL))
        {
          table = g_array_new (FALSE, FALSE, sizeof (DzlFuzzyMutableIndexItem));
          g_hash_table_insert (fuzzy->char_tables, GINT_TO_POINTER (ch), table);
        }

      item.id = id;
      item.pos = (guint)(gsize)(tmp - key);

      g_array_append_val (table, item);
    }

  if (G_UNLIKELY (!fuzzy->in_bulk_insert))
    {
      for (tmp = key; *tmp; tmp = g_utf8_next_char (tmp))
        {
          GArray *table;
          gunichar ch;

          ch = g_utf8_get_char (tmp);
          table = g_hash_table_lookup (fuzzy->char_tables, GINT_TO_POINTER (ch));
          g_array_sort (table, dzl_fuzzy_mutable_index_item_compare);
        }
    }

  g_free (downcase);
}

/**
 * dzl_fuzzy_mutable_index_unref:
 * @fuzzy: A #Fuzzy.
 *
 * Decrements the reference count of fuzzy by one. When the reference count
 * reaches zero, the structure will be freed.
 */
void
dzl_fuzzy_mutable_index_unref (DzlFuzzyMutableIndex *fuzzy)
{
  g_return_if_fail (fuzzy);
  g_return_if_fail (fuzzy->ref_count > 0);

  if (G_UNLIKELY (g_atomic_int_dec_and_test (&fuzzy->ref_count)))
    {
      g_byte_array_unref (fuzzy->heap);
      fuzzy->heap = NULL;

      g_array_unref (fuzzy->id_to_text_offset);
      fuzzy->id_to_text_offset = NULL;

      g_ptr_array_unref (fuzzy->id_to_value);
      fuzzy->id_to_value = NULL;

      g_hash_table_unref (fuzzy->char_tables);
      fuzzy->char_tables = NULL;

      g_hash_table_unref (fuzzy->removed);
      fuzzy->removed = NULL;

      g_slice_free (DzlFuzzyMutableIndex, fuzzy);
    }
}

static gboolean
dzl_fuzzy_mutable_index_do_match (DzlFuzzyMutableIndexLookup *lookup,
                                  DzlFuzzyMutableIndexItem   *item,
                                  guint                       table_index,
                                  gint                        score)
{
  DzlFuzzyMutableIndexItem *iter;
  gpointer key;
  GArray *table;
  gint *state;
  gint iter_score;

  table = lookup->tables [table_index];
  state = &lookup->state [table_index];

  for (; state [0] < (gint)table->len; state [0]++)
    {
      iter = &g_array_index (table, DzlFuzzyMutableIndexItem, state[0]);

      if ((iter->id < item->id) || ((iter->id == item->id) && (iter->pos <= item->pos)))
        continue;
      else if (iter->id > item->id)
        break;

      iter_score = score + (iter->pos - item->pos);

      if ((table_index + 1) < lookup->n_tables)
        {
          if (dzl_fuzzy_mutable_index_do_match (lookup, iter, table_index + 1, iter_score))
            return TRUE;
          continue;
        }

      key = GINT_TO_POINTER (iter->id);

      if (!g_hash_table_contains (lookup->matches, key) ||
          (iter_score < GPOINTER_TO_INT (g_hash_table_lookup (lookup->matches, key))))
        g_hash_table_insert (lookup->matches, key, GINT_TO_POINTER (iter_score));

      return TRUE;
    }

  return FALSE;
}

static inline const gchar *
dzl_fuzzy_mutable_index_get_string (DzlFuzzyMutableIndex *fuzzy,
                                    gint                  id)
{
  guint offset = g_array_index (fuzzy->id_to_text_offset, gsize, id);
  return (const gchar *)&fuzzy->heap->data [offset];
}

/**
 * dzl_fuzzy_mutable_index_match:
 * @fuzzy: (in): A #Fuzzy.
 * @needle: (in): The needle to fuzzy search for.
 * @max_matches: (in): The max number of matches to return.
 *
 * DzlFuzzyMutableIndex searches within @fuzzy for strings that fuzzy match @needle.
 * Only up to @max_matches will be returned.
 *
 * TODO: max_matches is not yet respected.
 *
 * Returns: (transfer full) (element-type DzlFuzzyMutableIndexMatch): A newly allocated
 *   #GArray containing #FuzzyMatch elements. This should be freed when
 *   the caller is done with it using g_array_unref().
 *   It is a programming error to keep the structure around longer than
 *   the @fuzzy instance.
 */
GArray *
dzl_fuzzy_mutable_index_match (DzlFuzzyMutableIndex *fuzzy,
                               const gchar          *needle,
                               gsize                 max_matches)
{
  DzlFuzzyMutableIndexLookup lookup = { 0 };
  DzlFuzzyMutableIndexMatch match;
  DzlFuzzyMutableIndexItem *item;
  GHashTableIter iter;
  gpointer key;
  gpointer value;
  const gchar *tmp;
  GArray *matches = NULL;
  GArray *root;
  gchar *downcase = NULL;
  guint i;

  g_return_val_if_fail (fuzzy, NULL);
  g_return_val_if_fail (!fuzzy->in_bulk_insert, NULL);
  g_return_val_if_fail (needle, NULL);

  matches = g_array_new (FALSE, FALSE, sizeof (DzlFuzzyMutableIndexMatch));

  if (!*needle)
    goto cleanup;

  if (!fuzzy->case_sensitive)
    {
      downcase = g_utf8_casefold (needle, -1);
      needle = downcase;
    }

  lookup.fuzzy = fuzzy;
  lookup.n_tables = g_utf8_strlen (needle, -1);
  lookup.state = g_new0 (gint, lookup.n_tables);
  lookup.tables = g_new0 (GArray*, lookup.n_tables);
  lookup.needle = needle;
  lookup.max_matches = max_matches;
  lookup.matches = g_hash_table_new (NULL, NULL);

  for (i = 0, tmp = needle; *tmp; tmp = g_utf8_next_char (tmp))
    {
      gunichar ch;
      GArray *table;

      ch = g_utf8_get_char (tmp);
      table = g_hash_table_lookup (fuzzy->char_tables, GINT_TO_POINTER (ch));

      if (table == NULL)
        goto cleanup;

      lookup.tables [i++] = table;
    }

  g_assert (lookup.n_tables == i);
  g_assert (lookup.tables [0] != NULL);

  root = lookup.tables [0];

  if (G_LIKELY (lookup.n_tables > 1))
    {
      for (i = 0; i < root->len; i++)
        {
          item = &g_array_index (root, DzlFuzzyMutableIndexItem, i);
          dzl_fuzzy_mutable_index_do_match (&lookup, item, 1, MIN (16, item->pos * 4));
        }
    }
  else
    {
      guint last_id = G_MAXUINT;

      for (i = 0; i < root->len; i++)
        {
          item = &g_array_index (root, DzlFuzzyMutableIndexItem, i);
          match.id = GPOINTER_TO_INT (item->id);
          if (match.id != last_id)
            {
              match.key = dzl_fuzzy_mutable_index_get_string (fuzzy, item->id);
              match.value = g_ptr_array_index (fuzzy->id_to_value, item->id);
              match.score = 1.0 / (strlen (match.key) + item->pos);
              g_array_append_val (matches, match);
              last_id = match.id;
            }
        }

      goto cleanup;
    }

  g_hash_table_iter_init (&iter, lookup.matches);

  while (g_hash_table_iter_next (&iter, &key, &value))
    {
      /* Ignore keys that have a tombstone record. */
      if (g_hash_table_contains (fuzzy->removed, key))
        continue;

      match.id = GPOINTER_TO_INT (key);
      match.key = dzl_fuzzy_mutable_index_get_string (fuzzy, match.id);
      match.score = 1.0 / (strlen (match.key) + GPOINTER_TO_INT (value));
      match.value = g_ptr_array_index (fuzzy->id_to_value, match.id);

      g_array_append_val (matches, match);
    }

  /*
   * TODO: We could be more clever here when inserting into the array
   *       only if it is a lower score than the end or < max items.
   */

  if (max_matches != 0)
    {
      g_array_sort (matches, dzl_fuzzy_mutable_index_match_compare);

      if (max_matches && (matches->len > max_matches))
        g_array_set_size (matches, max_matches);
    }

cleanup:
  g_free (downcase);
  g_free (lookup.state);
  g_free (lookup.tables);
  g_clear_pointer (&lookup.matches, g_hash_table_unref);

  return matches;
}

gboolean
dzl_fuzzy_mutable_index_contains (DzlFuzzyMutableIndex *fuzzy,
                                  const gchar          *key)
{
  GArray *ar;
  gboolean ret;

  g_return_val_if_fail (fuzzy != NULL, FALSE);

  ar = dzl_fuzzy_mutable_index_match (fuzzy, key, 1);
  ret = (ar != NULL) && (ar->len > 0);
  g_clear_pointer (&ar, g_array_unref);

  return ret;
}

void
dzl_fuzzy_mutable_index_remove (DzlFuzzyMutableIndex *fuzzy,
                                const gchar          *key)
{
  GArray *ar;

  g_return_if_fail (fuzzy != NULL);

  if (!key || !*key)
    return;

  ar = dzl_fuzzy_mutable_index_match (fuzzy, key, 1);

  if (ar != NULL && ar->len > 0)
    {
      for (guint i = 0; i < ar->len; i++)
        {
          const DzlFuzzyMutableIndexMatch *match = &g_array_index (ar, DzlFuzzyMutableIndexMatch, i);

          if (g_strcmp0 (match->key, key) == 0)
            g_hash_table_insert (fuzzy->removed, GINT_TO_POINTER (match->id), NULL);
        }
    }

  g_clear_pointer (&ar, g_array_unref);
}

gchar *
dzl_fuzzy_highlight (const gchar *str,
                     const gchar *match,
                     gboolean     case_sensitive)
{
  static const gchar *begin = "<b>";
  static const gchar *end = "</b>";
  GString *ret;
  gunichar str_ch;
  gunichar match_ch;
  gboolean element_open = FALSE;

  if (str == NULL || match == NULL)
    return g_strdup (str);

  ret = g_string_new (NULL);

  for (; *str; str = g_utf8_next_char (str))
    {
      str_ch = g_utf8_get_char (str);
      match_ch = g_utf8_get_char (match);

      if (str_ch == '&')
        {
          if (0 == strncmp (str, "&amp;", 5))
            {
              str += 4;
              g_string_append (ret, "&amp;");
              continue;
            }
          else if (0 == strncmp (str, "&apos;", 6))
            {
              str += 5;
              g_string_append (ret, "&apos;");
              continue;
            }
        }

      if ((str_ch == match_ch) ||
          (!case_sensitive && g_unichar_tolower (str_ch) == g_unichar_tolower (match_ch)))
        {
          if (!element_open)
            {
              g_string_append (ret, begin);
              element_open = TRUE;
            }

          g_string_append_unichar (ret, str_ch);

          /* TODO: We could seek to the next char and append in a batch. */
          match = g_utf8_next_char (match);
        }
      else
        {
          if (element_open)
            {
              g_string_append (ret, end);
              element_open = FALSE;
            }

          g_string_append_unichar (ret, str_ch);
        }
    }

  if (element_open)
    g_string_append (ret, end);

  return g_string_free (ret, FALSE);
}