Blob Blame History Raw
/* dzl-fuzzy-index-builder.c
 *
 * Copyright (C) 2016 Christian Hergert <christian@hergert.me>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#define G_LOG_DOMAIN    "dzl-fuzzy-index-builder"
#define MAX_KEY_ENTRIES (0x00FFFFFF)

#include "config.h"

#include <stdlib.h>
#include <string.h>

#include "search/dzl-fuzzy-index-builder.h"
#include "util/dzl-variant.h"

struct _DzlFuzzyIndexBuilder
{
  GObject       object;

  guint         case_sensitive : 1;

  /*
   * This hash table contains a mapping of GVariants so that we
   * deduplicate insertions of the same document. This helps when
   * we have indexes that contain multiple strings to the same
   * piece of data.
   */
  GHashTable *documents_hash;

  /*
   * This array contains a pointer to the individual GVariants
   * while building the index. When writing the index to disk,
   * we create a fixed array from this array of varians.
   */
  GPtrArray *documents;

  /*
   * Since we will need to keep a copy of a lot of strings, we
   * use a GString chunk to reduce the presure on the allocator.
   * It can certainly leave some gaps that are unused in the
   * sequence of pages, but it is generally better than using
   * a GByteArray or some other pow^2 growing array.
   */
  GStringChunk *strings;

  /*
   * This maps a pointer to a string that is found in the strings
   * string chunk to a key id (stored as a pointer). The input
   * string must exist within strings as we use a direct hash from
   * the input pointer to map to the string to save on the cost
   * of key equality checks.
   */
  GHashTable *key_ids;

  /*
   * An array of keys where the index of the key is the "key_id" used
   * in other structures. The pointer points to a key within the
   * strings GStringChunk.
   */
  GPtrArray *keys;

  /*
   * This array maps our document id to a key id. When building the
   * search index we use this to disambiguate between multiple
   * documents pointing to the same document.
   */
  GArray *kv_pairs;

  /*
   * Metadata for the search index, which is stored as the "metadata"
   * key in the final search index. You can use fuzzy_index_get_metadata()
   * to retrieve values stored here.
   *
   * This might be useful to store things like the mtime of the data
   * you are indexes so that you know if you need to reindex. You might
   * also store the version of your indexer here so that when you update
   * your indexer code, you can force a rebuild of the index.
   */
  GHashTable *metadata;
};

typedef struct
{
  /* The position within the keys array of the key. */
  guint key_id;

  /* The position within the documents array of the document */
  guint document_id;
} KVPair;

typedef struct
{
  /*
   * The character position within the string in terms of unicode
   * characters, not byte-position.
   */
  guint position;

  /* The index into the kvpairs */
  guint lookaside_id;
} IndexItem;

G_DEFINE_TYPE (DzlFuzzyIndexBuilder, dzl_fuzzy_index_builder, G_TYPE_OBJECT)

enum {
  PROP_0,
  PROP_CASE_SENSITIVE,
  N_PROPS
};

static GParamSpec *properties [N_PROPS];

static guint
mask_priority (guint key_id)
{
  return key_id & 0x00FFFFFF;
}

static void
dzl_fuzzy_index_builder_finalize (GObject *object)
{
  DzlFuzzyIndexBuilder *self = (DzlFuzzyIndexBuilder *)object;

  g_clear_pointer (&self->documents_hash, g_hash_table_unref);
  g_clear_pointer (&self->documents, g_ptr_array_unref);
  g_clear_pointer (&self->strings, g_string_chunk_free);
  g_clear_pointer (&self->kv_pairs, g_array_unref);
  g_clear_pointer (&self->metadata, g_hash_table_unref);
  g_clear_pointer (&self->key_ids, g_hash_table_unref);
  g_clear_pointer (&self->keys, g_ptr_array_unref);

  G_OBJECT_CLASS (dzl_fuzzy_index_builder_parent_class)->finalize (object);
}

static void
dzl_fuzzy_index_builder_get_property (GObject    *object,
                                      guint       prop_id,
                                      GValue     *value,
                                      GParamSpec *pspec)
{
  DzlFuzzyIndexBuilder *self = DZL_FUZZY_INDEX_BUILDER (object);

  switch (prop_id)
    {
    case PROP_CASE_SENSITIVE:
      g_value_set_boolean (value, dzl_fuzzy_index_builder_get_case_sensitive (self));
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    }
}

static void
dzl_fuzzy_index_builder_set_property (GObject      *object,
                                  guint         prop_id,
                                  const GValue *value,
                                  GParamSpec   *pspec)
{
  DzlFuzzyIndexBuilder *self = DZL_FUZZY_INDEX_BUILDER (object);

  switch (prop_id)
    {
    case PROP_CASE_SENSITIVE:
      dzl_fuzzy_index_builder_set_case_sensitive (self, g_value_get_boolean (value));
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    }
}

static void
dzl_fuzzy_index_builder_class_init (DzlFuzzyIndexBuilderClass *klass)
{
  GObjectClass *object_class = G_OBJECT_CLASS (klass);

  object_class->finalize = dzl_fuzzy_index_builder_finalize;
  object_class->get_property = dzl_fuzzy_index_builder_get_property;
  object_class->set_property = dzl_fuzzy_index_builder_set_property;

  properties [PROP_CASE_SENSITIVE] =
    g_param_spec_boolean ("case-sensitive",
                          "Case Sensitive",
                          "Case Sensitive",
                          FALSE,
                          (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));

  g_object_class_install_properties (object_class, N_PROPS, properties);
}

static void
dzl_fuzzy_index_builder_init (DzlFuzzyIndexBuilder *self)
{
  self->documents = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
  self->documents_hash = g_hash_table_new (dzl_g_variant_hash, g_variant_equal);
  self->kv_pairs = g_array_new (FALSE, FALSE, sizeof (KVPair));
  self->strings = g_string_chunk_new (4096);
  self->key_ids = g_hash_table_new (NULL, NULL);
  self->keys = g_ptr_array_new ();
}

DzlFuzzyIndexBuilder *
dzl_fuzzy_index_builder_new (void)
{
  return g_object_new (DZL_TYPE_FUZZY_INDEX_BUILDER, NULL);
}

/**
 * dzl_fuzzy_index_builder_insert:
 * @self: A #DzlFuzzyIndexBuilder
 * @key: The UTF-8 encoded key for the document
 * @document: The document to store
 * @priority: An optional priority for the keyword.
 *
 * Inserts @document into the index using @key as the lookup key.
 *
 * If a matching document (checked by hashing @document) has already
 * been inserted, only a single instance of the document will be stored.
 *
 * If @document is floating, it will be consumed.
 *
 * @priority may be used to group results by priority. Priority must be
 * less than 256.
 *
 * Returns: The document id registered for @document.
 */
guint64
dzl_fuzzy_index_builder_insert (DzlFuzzyIndexBuilder *self,
                                const gchar          *key,
                                GVariant             *document,
                                guint                 priority)
{
  g_autoptr(GVariant) sunk_variant = NULL;
  GVariant *real_document = NULL;
  gpointer document_id = NULL;
  gpointer key_id = NULL;
  KVPair pair;

  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self), 0L);
  g_return_val_if_fail (key != NULL, 0L);
  g_return_val_if_fail (document != NULL, 0L);
  g_return_val_if_fail (priority <= 0xFF, 0L);

  if (g_variant_is_floating (document))
    sunk_variant = g_variant_ref_sink (document);

  /* move the priority bits into the proper area */
  priority = (priority & 0xFF) << 24;

  if (self->keys->len > MAX_KEY_ENTRIES)
    {
      g_warning ("Index is full, cannot add more entries");
      return 0L;
    }

  key = g_string_chunk_insert_const (self->strings, key);

  /*
   * We try to deduplicate document entries here by hashing the document and
   * looking for another matching it. This way our generated index can stay
   * relatively small when it comes to documents.
   */
  if (!g_hash_table_lookup_extended (self->documents_hash,
                                     document,
                                     (gpointer *)&real_document,
                                     &document_id))
    {
      document_id = GUINT_TO_POINTER (self->documents->len);
      real_document = g_variant_ref (document);
      g_ptr_array_add (self->documents, real_document);
      g_hash_table_insert (self->documents_hash, real_document, document_id);
    }

  /*
   * If we already have the key then reuse its key index. If not, then add it.
   */
  if (!g_hash_table_lookup_extended (self->key_ids, key, NULL, &key_id))
    {
      key_id = GUINT_TO_POINTER (self->keys->len);
      g_ptr_array_add (self->keys, (gchar *)key);
      g_hash_table_insert (self->key_ids, (gpointer)key, key_id);
    }

  /*
   * A bit of slight-of-hand here. We share keys between all key<->document
   * pairs, but steal the high bits for the key priority in the kvpair entry.
   * This allows for both deduplication and different priorities based on
   * certain document pairs.
   */
  pair.key_id = GPOINTER_TO_UINT (key_id) | priority;
  pair.document_id = GPOINTER_TO_UINT (document_id);

  g_array_append_val (self->kv_pairs, pair);

  return pair.document_id;
}

static gint
pos_doc_pair_compare (gconstpointer a,
                      gconstpointer b)
{
  const IndexItem *paira = a;
  const IndexItem *pairb = b;
  gint ret;

  ret = paira->lookaside_id - pairb->lookaside_id;

  if (ret == 0)
    ret = paira->position - pairb->position;

  return ret;
}

static GVariant *
dzl_fuzzy_index_builder_build_keys (DzlFuzzyIndexBuilder *self)
{
  g_assert (DZL_IS_FUZZY_INDEX_BUILDER (self));

  return g_variant_new_strv ((const gchar * const *)self->keys->pdata,
                             self->keys->len);
}

static GVariant *
dzl_fuzzy_index_builder_build_lookaside (DzlFuzzyIndexBuilder *self)
{
  g_assert (DZL_IS_FUZZY_INDEX_BUILDER (self));

  return g_variant_new_fixed_array ((const GVariantType *)"(uu)",
                                    self->kv_pairs->data,
                                    self->kv_pairs->len,
                                    sizeof (KVPair));
}

static GVariant *
dzl_fuzzy_index_builder_build_index (DzlFuzzyIndexBuilder *self)
{
  g_autoptr(GHashTable) rows = NULL;
  GVariantDict dict;
  GHashTableIter iter;
  gpointer keyptr;
  GArray *row;
  guint i;

  g_assert (DZL_IS_FUZZY_INDEX_BUILDER (self));

  rows = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)g_array_unref);

  for (i = 0; i < self->kv_pairs->len; i++)
    {
      g_autofree gchar *lower = NULL;
      const gchar *key;
      const gchar *tmp;
      KVPair *kvpair;
      IndexItem item;
      guint position = 0;

      kvpair = &g_array_index (self->kv_pairs, KVPair, i);
      key = g_ptr_array_index (self->keys, mask_priority (kvpair->key_id));

      /* the priority for the key is stashed in the high 8 bits of
       * the kvpair.key_id. So we need to propagate that to the
       * entry in the index for resolution later.
       */
      item.lookaside_id = i | (kvpair->key_id & 0xFF000000);

      if (!self->case_sensitive)
        key = lower = g_utf8_casefold (key, -1);

      for (tmp = key; *tmp; tmp = g_utf8_next_char (tmp))
        {
          gunichar ch = g_utf8_get_char (tmp);

          row = g_hash_table_lookup (rows, GUINT_TO_POINTER (ch));

          if G_UNLIKELY (row == NULL)
            {
              row = g_array_new (FALSE, FALSE, sizeof (IndexItem));
              g_hash_table_insert (rows, GUINT_TO_POINTER (ch), row);
            }

          item.position = position++;
          g_array_append_val (row, item);
        }
    }

  g_variant_dict_init (&dict, NULL);

  g_hash_table_iter_init (&iter, rows);

  while (g_hash_table_iter_next (&iter, &keyptr, (gpointer *)&row))
    {
      gchar key[12];
      GVariant *variant;
      gunichar ch = GPOINTER_TO_UINT (keyptr);

      key [g_unichar_to_utf8 (ch, key)] = 0;

      g_array_sort (row, pos_doc_pair_compare);

      variant = g_variant_new_fixed_array ((const GVariantType *)"(uu)",
                                           row->data,
                                           row->len,
                                           sizeof (IndexItem));
      g_variant_dict_insert_value (&dict, key, variant);
    }

  return g_variant_dict_end (&dict);
}

static GVariant *
dzl_fuzzy_index_builder_build_metadata (DzlFuzzyIndexBuilder *self)
{
  GVariantDict dict;
  GHashTableIter iter;

  g_assert (DZL_IS_FUZZY_INDEX_BUILDER (self));

  g_variant_dict_init (&dict, NULL);

  if (self->metadata != NULL)
    {
      const gchar *key;
      GVariant *value;

      g_hash_table_iter_init (&iter, self->metadata);
      while (g_hash_table_iter_next (&iter, (gpointer *)&key, (gpointer *)&value))
        g_variant_dict_insert_value (&dict, key, value);
    }

  g_variant_dict_insert (&dict, "case-sensitive", "b", self->case_sensitive);

  return g_variant_dict_end (&dict);
}

static void
dzl_fuzzy_index_builder_write_worker (GTask        *task,
                                      gpointer      source_object,
                                      gpointer      task_data,
                                      GCancellable *cancellable)
{
  DzlFuzzyIndexBuilder *self = source_object;
  g_autoptr(GVariant) variant = NULL;
  g_autoptr(GVariant) documents = NULL;
  g_autoptr(GError) error = NULL;
  GVariantDict dict;
  GFile *file = task_data;

  g_assert (G_IS_TASK (task));
  g_assert (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_assert (G_IS_FILE (file));
  g_assert (!cancellable || G_IS_CANCELLABLE (cancellable));

  g_variant_dict_init (&dict, NULL);

  /* Set our version number for the document */
  g_variant_dict_insert (&dict, "version", "i", 1);

  /* Build our dicitionary of metadata */
  g_variant_dict_insert_value (&dict,
                               "metadata",
                               dzl_fuzzy_index_builder_build_metadata (self));

  /* Keys is an array of string keys where the index is the "key_id" */
  g_variant_dict_insert_value (&dict,
                               "keys",
                               dzl_fuzzy_index_builder_build_keys (self));

  /* The lookaside is a mapping of kvpair to the repsective keys and
   * documents. This allows the tables to use the kvpair id as the value
   * in the index so we can have both document deduplication as well as
   * the ability to disambiguate the keys which point to the same
   * document. The contents are "a{uu}".
   */
  g_variant_dict_insert_value (&dict,
                               "lookaside",
                               dzl_fuzzy_index_builder_build_lookaside (self));

  /* Build our dicitionary of character → [(pos,lookaside_id),..] tuples.
   * The position is the utf8 character position within the string.
   * The lookaside_id is the index within the lookaside buffer to locate
   * the document_id or key_id.
   */
  g_variant_dict_insert_value (&dict,
                               "tables",
                               dzl_fuzzy_index_builder_build_index (self));

  /*
   * The documents are stored as an array where the document identifier is
   * their index position. We then use a lookaside buffer to map the insertion
   * id to the document id. Otherwise, we can't disambiguate between two
   * keys that insert the same document (as we deduplicate documents inserted
   * into the index).
   */
  documents = g_variant_new_array (NULL,
                                   (GVariant * const *)self->documents->pdata,
                                   self->documents->len);
  g_variant_dict_insert_value (&dict, "documents", g_variant_ref_sink (documents));

  /* Now write the variant to disk */
  variant = g_variant_ref_sink (g_variant_dict_end (&dict));
  if (!g_file_replace_contents (file,
                                g_variant_get_data (variant),
                                g_variant_get_size (variant),
                                NULL,
                                FALSE,
                                G_FILE_CREATE_NONE,
                                NULL,
                                cancellable,
                                &error))
    g_task_return_error (task, g_steal_pointer (&error));
  else
    g_task_return_boolean (task, TRUE);
}

/**
 * dzl_fuzzy_index_builder_write_async:
 * @self: A #DzlFuzzyIndexBuilder
 * @file: A #GFile to write the index to
 * @io_priority: The priority for IO operations
 * @cancellable: (nullable): An optional #GCancellable or %NULL
 * @callback: A callback for completion or %NULL
 * @user_data: User data for @callback
 *
 * Builds and writes the index to @file. The file format is a
 * GVariant on disk and can be loaded and searched using
 * #FuzzyIndex.
 */
void
dzl_fuzzy_index_builder_write_async (DzlFuzzyIndexBuilder *self,
                                     GFile                *file,
                                     gint                  io_priority,
                                     GCancellable         *cancellable,
                                     GAsyncReadyCallback   callback,
                                     gpointer              user_data)
{
  g_autoptr(GTask) task = NULL;

  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_return_if_fail (G_IS_FILE (file));
  g_return_if_fail (!cancellable || G_IS_CANCELLABLE (cancellable));

  task = g_task_new (self, cancellable, callback, user_data);
  g_task_set_source_tag (task, dzl_fuzzy_index_builder_write_async);
  g_task_set_priority (task, io_priority);
  g_task_set_task_data (task, g_object_ref (file), g_object_unref);
  g_task_run_in_thread (task, dzl_fuzzy_index_builder_write_worker);
}

gboolean
dzl_fuzzy_index_builder_write_finish (DzlFuzzyIndexBuilder  *self,
                                      GAsyncResult          *result,
                                      GError               **error)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self), FALSE);
  g_return_val_if_fail (G_IS_TASK (result), FALSE);

  return g_task_propagate_boolean (G_TASK (result), error);
}

gboolean
dzl_fuzzy_index_builder_write (DzlFuzzyIndexBuilder  *self,
                               GFile                 *file,
                               gint                   io_priority,
                               GCancellable          *cancellable,
                               GError               **error)
{
  g_autoptr(GTask) task = NULL;

  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self), FALSE);
  g_return_val_if_fail (G_IS_FILE (file), FALSE);
  g_return_val_if_fail (!cancellable || G_IS_CANCELLABLE (cancellable), FALSE);

  task = g_task_new (self, cancellable, NULL, NULL);
  g_task_set_source_tag (task, dzl_fuzzy_index_builder_write);
  g_task_set_priority (task, io_priority);
  g_task_set_task_data (task, g_object_ref (file), g_object_unref);

  dzl_fuzzy_index_builder_write_worker (task, self, file, cancellable);

  return g_task_propagate_boolean (task, error);
}

/**
 * dzl_fuzzy_index_builder_get_document:
 *
 * Returns the document that was inserted in a previous call to
 * dzl_fuzzy_index_builder_insert().
 *
 * Returns: (transfer none): A #GVariant
 */
const GVariant *
dzl_fuzzy_index_builder_get_document (DzlFuzzyIndexBuilder *self,
                                      guint64               document_id)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self), NULL);
  g_return_val_if_fail ((guint)document_id < self->documents->len, NULL);

  return g_ptr_array_index (self->documents, (guint)document_id);
}

void
dzl_fuzzy_index_builder_set_metadata (DzlFuzzyIndexBuilder *self,
                                      const gchar          *key,
                                      GVariant             *value)
{
  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_return_if_fail (key != NULL);

  if (self->metadata == NULL)
    self->metadata = g_hash_table_new_full (g_str_hash,
                                            g_str_equal,
                                            g_free,
                                            (GDestroyNotify)g_variant_unref);

  if (value != NULL)
    g_hash_table_insert (self->metadata,
                         g_strdup (key),
                         g_variant_ref_sink (value));
  else
    g_hash_table_remove (self->metadata, key);
}

void
dzl_fuzzy_index_builder_set_metadata_string (DzlFuzzyIndexBuilder *self,
                                             const gchar          *key,
                                             const gchar          *value)
{
  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_return_if_fail (key != NULL);
  g_return_if_fail (value != NULL);

  dzl_fuzzy_index_builder_set_metadata (self, key, g_variant_new_string (value));
}

void
dzl_fuzzy_index_builder_set_metadata_uint32 (DzlFuzzyIndexBuilder *self,
                                             const gchar          *key,
                                             guint32               value)
{
  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_return_if_fail (key != NULL);

  dzl_fuzzy_index_builder_set_metadata (self, key, g_variant_new_uint32 (value));
}

void
dzl_fuzzy_index_builder_set_metadata_uint64 (DzlFuzzyIndexBuilder *self,
                                             const gchar          *key,
                                             guint64               value)
{
  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));
  g_return_if_fail (key != NULL);

  dzl_fuzzy_index_builder_set_metadata (self, key, g_variant_new_uint64 (value));
}

gboolean
dzl_fuzzy_index_builder_get_case_sensitive (DzlFuzzyIndexBuilder *self)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self), FALSE);

  return self->case_sensitive;
}

void
dzl_fuzzy_index_builder_set_case_sensitive (DzlFuzzyIndexBuilder *self,
                                            gboolean              case_sensitive)
{
  g_return_if_fail (DZL_IS_FUZZY_INDEX_BUILDER (self));

  case_sensitive = !!case_sensitive;

  if (self->case_sensitive != case_sensitive)
    {
      self->case_sensitive = case_sensitive;
      g_object_notify_by_pspec (G_OBJECT (self), properties [PROP_CASE_SENSITIVE]);
    }
}