Blob Blame History Raw
/* dzl-fuzzy-index.c
 *
 * Copyright (C) 2016 Christian Hergert <chergert@redhat.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#define G_LOG_DOMAIN "dzl-fuzzy-index"

#include "config.h"

#include <string.h>

#include "dzl-fuzzy-index.h"
#include "dzl-fuzzy-index-cursor.h"
#include "dzl-fuzzy-index-private.h"

typedef struct
{
  guint key_id;
  guint document_id;
} LookasideEntry;

struct _DzlFuzzyIndex
{
  GObject       object;

  guint         loaded : 1;
  guint         case_sensitive : 1;

  GMappedFile  *mapped_file;

  /*
   * Toplevel variant for the whole document. This is loaded from the entire
   * contents of @mapped_file. It contains a dictionary of "a{sv}"
   * containing all of our index data tables.
   */
  GVariant *variant;

  /*
   * This is a variant containing the array of documents. The index of the
   * document is the corresponding document_id used in other data structres.
   * This maps to the "documents" field in @variant.
   */
  GVariant *documents;

  /*
   * The keys found within the index. The index of the key is the "key_id"
   * used in other datastructures, such as the @lookaside array.
   */
  GVariant *keys;

  /*
   * The lookaside array is used to disambiguate between multiple keys
   * pointing to the same document. Each element in the array is of type
   * "(uu)" with the first field being the "key_id" and the second field
   * being the "document_id". Each of these are indexes into the
   * corresponding @documents and @keys arrays.
   *
   * This is a fixed array type and therefore can have the raw data
   * accessed with g_variant_get_fixed_array() to save on lookup
   * costs.
   */
  GVariant *lookaside;

  /*
   * Raw pointers for fast access to the lookaside buffer.
   */
  const LookasideEntry *lookaside_raw;
  gsize lookaside_len;

  /*
   * This vardict is used to get the fixed array containing the
   * (offset, lookaside_id) for each unicode character in the index.
   * These are accessed by the cursors to layout the fulltext search
   * index by each character in the input string. Doing so, is what
   * gives us the O(mn) worst-case running time.
   */
  GVariantDict *tables;

  /*
   * The metadata located within the search index. This contains
   * metadata set with dzl_fuzzy_index_builder_set_metadata() or one
   * of its typed variants.
   */
  GVariantDict *metadata;
};

G_DEFINE_TYPE (DzlFuzzyIndex, dzl_fuzzy_index, G_TYPE_OBJECT)

static void
dzl_fuzzy_index_finalize (GObject *object)
{
  DzlFuzzyIndex *self = (DzlFuzzyIndex *)object;

  g_clear_pointer (&self->mapped_file, g_mapped_file_unref);
  g_clear_pointer (&self->variant, g_variant_unref);
  g_clear_pointer (&self->documents, g_variant_unref);
  g_clear_pointer (&self->keys, g_variant_unref);
  g_clear_pointer (&self->tables, g_variant_dict_unref);
  g_clear_pointer (&self->lookaside, g_variant_unref);
  g_clear_pointer (&self->metadata, g_variant_dict_unref);

  G_OBJECT_CLASS (dzl_fuzzy_index_parent_class)->finalize (object);
}

static void
dzl_fuzzy_index_class_init (DzlFuzzyIndexClass *klass)
{
  GObjectClass *object_class = G_OBJECT_CLASS (klass);

  object_class->finalize = dzl_fuzzy_index_finalize;
}

static void
dzl_fuzzy_index_init (DzlFuzzyIndex *self)
{
}

DzlFuzzyIndex *
dzl_fuzzy_index_new (void)
{
  return g_object_new (DZL_TYPE_FUZZY_INDEX, NULL);
}

static void
dzl_fuzzy_index_load_file_worker (GTask        *task,
                                  gpointer      source_object,
                                  gpointer      task_data,
                                  GCancellable *cancellable)
{
  g_autofree gchar *path = NULL;
  g_autoptr(GMappedFile) mapped_file = NULL;
  g_autoptr(GVariant) variant = NULL;
  g_autoptr(GVariant) documents = NULL;
  g_autoptr(GVariant) lookaside = NULL;
  g_autoptr(GVariant) keys = NULL;
  g_autoptr(GVariant) tables = NULL;
  g_autoptr(GVariant) metadata = NULL;
  g_autoptr(GError) error = NULL;
  DzlFuzzyIndex *self = source_object;
  GFile *file = task_data;
  GVariantDict dict;
  gint version = 0;
  gboolean case_sensitive = FALSE;

  g_assert (DZL_IS_FUZZY_INDEX (self));
  g_assert (G_IS_FILE (file));
  g_assert (!cancellable || G_IS_CANCELLABLE (cancellable));

  if (self->loaded)
    {
      g_task_return_new_error (task,
                               G_IO_ERROR,
                               G_IO_ERROR_INVAL,
                               "Cannot load index multiple times");
      return;
    }

  self->loaded = TRUE;

  if (!g_file_is_native (file) || NULL == (path = g_file_get_path (file)))
    {
      g_task_return_new_error (task,
                               G_IO_ERROR,
                               G_IO_ERROR_INVALID_FILENAME,
                               "Index must be a local file");
      return;
    }

  if (NULL == (mapped_file = g_mapped_file_new (path, FALSE, &error)))
    {
      g_task_return_error (task, g_steal_pointer (&error));
      return;
    }

  variant = g_variant_new_from_data (G_VARIANT_TYPE_VARDICT,
                                     g_mapped_file_get_contents (mapped_file),
                                     g_mapped_file_get_length (mapped_file),
                                     FALSE, NULL, NULL);

  if (variant == NULL)
    {
      g_task_return_new_error (task,
                               G_IO_ERROR,
                               G_IO_ERROR_INVAL,
                               "Failed to parse GVariant");
      return;
    }

  g_variant_ref_sink (variant);

  g_variant_dict_init (&dict, variant);

  if (!g_variant_dict_lookup (&dict, "version", "i", &version) || version != 1)
    {
      g_variant_dict_clear (&dict);
      g_task_return_new_error (task,
                               G_IO_ERROR,
                               G_IO_ERROR_INVAL,
                               "Version mismatch in gvariant. Got %d, expected 1",
                               version);
      return;
    }

  documents = g_variant_dict_lookup_value (&dict, "documents", G_VARIANT_TYPE_ARRAY);
  keys = g_variant_dict_lookup_value (&dict, "keys", G_VARIANT_TYPE_STRING_ARRAY);
  lookaside = g_variant_dict_lookup_value (&dict, "lookaside", G_VARIANT_TYPE_ARRAY);
  tables = g_variant_dict_lookup_value (&dict, "tables", G_VARIANT_TYPE_VARDICT);
  metadata = g_variant_dict_lookup_value (&dict, "metadata", G_VARIANT_TYPE_VARDICT);
  g_variant_dict_clear (&dict);

  if (keys == NULL || documents == NULL || tables == NULL || metadata == NULL)
    {
      g_task_return_new_error (task,
                               G_IO_ERROR,
                               G_IO_ERROR_INVAL,
                               "Invalid gvariant index");
      return;
    }

  self->mapped_file = g_steal_pointer (&mapped_file);
  self->variant = g_steal_pointer (&variant);
  self->documents = g_steal_pointer (&documents);
  self->lookaside = g_steal_pointer (&lookaside);
  self->keys = g_steal_pointer (&keys);
  self->tables = g_variant_dict_new (tables);
  self->metadata = g_variant_dict_new (metadata);

  self->lookaside_raw = g_variant_get_fixed_array (self->lookaside,
                                                   &self->lookaside_len,
                                                   sizeof (LookasideEntry));

  if (g_variant_dict_lookup (self->metadata, "case-sensitive", "b", &case_sensitive))
    self->case_sensitive = !!case_sensitive;

  g_task_return_boolean (task, TRUE);
}

void
dzl_fuzzy_index_load_file_async (DzlFuzzyIndex       *self,
                                 GFile               *file,
                                 GCancellable        *cancellable,
                                 GAsyncReadyCallback  callback,
                                 gpointer             user_data)
{
  g_autoptr(GTask) task = NULL;

  g_return_if_fail (DZL_IS_FUZZY_INDEX (self));
  g_return_if_fail (G_IS_FILE (file));
  g_return_if_fail (!cancellable || G_IS_CANCELLABLE (cancellable));

  task = g_task_new (self, cancellable, callback, user_data);
  g_task_set_source_tag (task, dzl_fuzzy_index_load_file);
  g_task_set_task_data (task, g_object_ref (file), g_object_unref);
  g_task_set_check_cancellable (task, FALSE);
  g_task_run_in_thread (task, dzl_fuzzy_index_load_file_worker);
}

gboolean
dzl_fuzzy_index_load_file_finish (DzlFuzzyIndex  *self,
                                  GAsyncResult   *result,
                                  GError        **error)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX (self), FALSE);
  g_return_val_if_fail (G_IS_TASK (result), FALSE);

  return g_task_propagate_boolean (G_TASK (result), error);
}

gboolean
dzl_fuzzy_index_load_file (DzlFuzzyIndex  *self,
                           GFile          *file,
                           GCancellable   *cancellable,
                           GError        **error)
{
  g_autoptr(GTask) task = NULL;

  g_return_val_if_fail (DZL_IS_FUZZY_INDEX (self), FALSE);
  g_return_val_if_fail (G_IS_FILE (file), FALSE);
  g_return_val_if_fail (!cancellable || G_IS_CANCELLABLE (cancellable), FALSE);

  task = g_task_new (self, cancellable, NULL, NULL);
  g_task_set_source_tag (task, dzl_fuzzy_index_load_file);
  g_task_set_task_data (task, g_object_ref (file), g_object_unref);
  g_task_set_check_cancellable (task, FALSE);

  dzl_fuzzy_index_load_file_worker (task, self, file, cancellable);

  return g_task_propagate_boolean (task, error);
}

static void
dzl_fuzzy_index_query_cb (GObject      *object,
                          GAsyncResult *result,
                          gpointer      user_data)
{
  DzlFuzzyIndexCursor *cursor = (DzlFuzzyIndexCursor *)object;
  g_autoptr(GTask) task = user_data;
  g_autoptr(GError) error = NULL;

  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (cursor));
  g_assert (G_IS_ASYNC_RESULT (result));
  g_assert (G_IS_TASK (task));

  if (!g_async_initable_init_finish (G_ASYNC_INITABLE (cursor), result, &error))
    g_task_return_error (task, g_steal_pointer (&error));
  else
    g_task_return_pointer (task, g_object_ref (cursor), g_object_unref);
}

void
dzl_fuzzy_index_query_async (DzlFuzzyIndex       *self,
                             const gchar         *query,
                             guint                max_matches,
                             GCancellable        *cancellable,
                             GAsyncReadyCallback  callback,
                             gpointer             user_data)
{
  g_autoptr(GTask) task = NULL;
  g_autoptr(DzlFuzzyIndexCursor) cursor = NULL;

  g_return_if_fail (DZL_IS_FUZZY_INDEX (self));
  g_return_if_fail (query != NULL);
  g_return_if_fail (!cancellable || G_IS_CANCELLABLE (cancellable));

  task = g_task_new (self, cancellable, callback, user_data);
  g_task_set_priority (task, G_PRIORITY_LOW);
  g_task_set_source_tag (task, dzl_fuzzy_index_query_async);

  cursor = g_object_new (DZL_TYPE_FUZZY_INDEX_CURSOR,
                         "case-sensitive", self->case_sensitive,
                         "index", self,
                         "query", query,
                         "max-matches", max_matches,
                         "tables", self->tables,
                         NULL);

  g_async_initable_init_async (G_ASYNC_INITABLE (cursor),
                               G_PRIORITY_LOW,
                               cancellable,
                               dzl_fuzzy_index_query_cb,
                               g_steal_pointer (&task));
}

/**
 * dzl_fuzzy_index_query_finish:
 *
 * Completes an asynchronous request to dzl_fuzzy_index_query_async().
 *
 * Returns: (transfer full): A #GListModel of results.
 */
GListModel *
dzl_fuzzy_index_query_finish (DzlFuzzyIndex  *self,
                              GAsyncResult   *result,
                              GError        **error)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX (self), NULL);
  g_return_val_if_fail (G_IS_TASK (result), NULL);

  return g_task_propagate_pointer (G_TASK (result), error);
}

/**
 * dzl_fuzzy_index_get_metadata:
 *
 * Looks up the metadata for @key.
 *
 * Returns: (transfer full) (nullable): A #GVariant or %NULL.
 */
GVariant *
dzl_fuzzy_index_get_metadata (DzlFuzzyIndex *self,
                              const gchar   *key)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX (self), NULL);
  g_return_val_if_fail (key != NULL, NULL);

  if (self->metadata != NULL)
    return g_variant_dict_lookup_value (self->metadata, key, NULL);

  return NULL;
}

guint64
dzl_fuzzy_index_get_metadata_uint64 (DzlFuzzyIndex *self,
                                     const gchar   *key)
{
  g_autoptr(GVariant) ret = NULL;

  ret = dzl_fuzzy_index_get_metadata (self, key);

  if (ret != NULL)
    return g_variant_get_uint64 (ret);

  return G_GUINT64_CONSTANT (0);
}

guint32
dzl_fuzzy_index_get_metadata_uint32 (DzlFuzzyIndex *self,
                                     const gchar   *key)
{
  g_autoptr(GVariant) ret = NULL;

  ret = dzl_fuzzy_index_get_metadata (self, key);

  if (ret != NULL)
    return g_variant_get_uint32 (ret);

  return G_GUINT64_CONSTANT (0);
}

const gchar *
dzl_fuzzy_index_get_metadata_string (DzlFuzzyIndex *self,
                                     const gchar   *key)
{
  g_autoptr(GVariant) ret = NULL;

  ret = dzl_fuzzy_index_get_metadata (self, key);

  /*
   * This looks unsafe, but is safe because strings are \0 terminated
   * inside the variants and the result is a pointer to internal data.
   * Since that data exists on our mmap() region, we are all good.
   */
  if (ret != NULL)
    return g_variant_get_string (ret, NULL);

  return NULL;
}

/**
 * _dzl_fuzzy_index_lookup_document:
 * @self: A #DzlFuzzyIndex
 * @document_id: The identifier of the document
 *
 * This looks up the document found matching @document_id.
 *
 * This should be the document_id resolved through the lookaside
 * using _dzl_fuzzy_index_resolve().
 *
 * Returns: (transfer full) (nullable): A #GVariant if successful;
 *   otherwise %NULL.
 */
GVariant *
_dzl_fuzzy_index_lookup_document (DzlFuzzyIndex *self,
                                  guint          document_id)
{
  g_assert (DZL_IS_FUZZY_INDEX (self));

  return g_variant_get_child_value (self->documents, document_id);
}

gboolean
_dzl_fuzzy_index_resolve (DzlFuzzyIndex  *self,
                          guint           lookaside_id,
                          guint          *document_id,
                          const gchar   **key,
                          guint          *priority,
                          guint           in_score,
                          guint           last_offset,
                          gfloat         *out_score)
{
  const LookasideEntry *entry;
  const gchar *local_key = NULL;
  guint key_id;

  g_assert (DZL_IS_FUZZY_INDEX (self));
  g_assert (document_id != NULL);
  g_assert (out_score != NULL);
  g_assert (priority != NULL);

  /* Mask off the key priority */
  lookaside_id &= 0x00FFFFFF;

  if G_UNLIKELY (lookaside_id >= self->lookaside_len)
    return FALSE;

  entry = &self->lookaside_raw [lookaside_id];

  /* The key_id has a mask with the priority as well */
  key_id = entry->key_id & 0x00FFFFFF;
  if G_UNLIKELY (key_id >= g_variant_n_children (self->keys))
    return FALSE;

  g_variant_get_child (self->keys, key_id, "&s", &local_key);

  if (key != NULL)
    *key = local_key;

  if (document_id != NULL)
    *document_id = entry->document_id;

  *priority = (entry->key_id & 0xFF000000) >> 24;
  *out_score = ((1.0 / 256.0) / (1 + last_offset + in_score)) + ((255.0 - *priority) / 256.0);

  return TRUE;
}