Blob Blame History Raw
/* dzl-fuzzy-index-cursor.c
 *
 * Copyright (C) 2016 Christian Hergert <christian@hergert.me>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#define G_LOG_DOMAIN "dzl-fuzzy-index-cursor"

#include "config.h"

#include <string.h>

#include "search/dzl-fuzzy-index-cursor.h"
#include "search/dzl-fuzzy-index-match.h"
#include "search/dzl-fuzzy-index-private.h"
#include "util/dzl-int-pair.h"

struct _DzlFuzzyIndexCursor
{
  GObject          object;

  DzlFuzzyIndex   *index;
  gchar           *query;
  GVariantDict    *tables;
  GArray          *matches;
  guint            max_matches;
  guint            case_sensitive : 1;
};

typedef struct
{
  guint position;
  guint lookaside_id;
} DzlFuzzyIndexItem;

typedef struct
{
  const gchar *key;
  guint        document_id;
  gfloat       score;
  guint        priority;
} DzlFuzzyMatch;

typedef struct
{
  DzlFuzzyIndex                   *index;
  const DzlFuzzyIndexItem * const *tables;
  const gsize                     *tables_n_elements;
  gint                            *tables_state;
  guint                            n_tables;
  guint                            max_matches;
  const gchar                     *needle;
  GHashTable                      *matches;
} DzlFuzzyLookup;

enum {
  PROP_0,
  PROP_CASE_SENSITIVE,
  PROP_INDEX,
  PROP_TABLES,
  PROP_MAX_MATCHES,
  PROP_QUERY,
  N_PROPS
};

static void async_initable_iface_init (GAsyncInitableIface *iface);
static void list_model_iface_init     (GListModelInterface *iface);

static GParamSpec *properties [N_PROPS];

G_DEFINE_TYPE_WITH_CODE (DzlFuzzyIndexCursor, dzl_fuzzy_index_cursor, G_TYPE_OBJECT,
                         G_IMPLEMENT_INTERFACE (G_TYPE_ASYNC_INITABLE, async_initable_iface_init)
                         G_IMPLEMENT_INTERFACE (G_TYPE_LIST_MODEL, list_model_iface_init))

static inline gfloat
pointer_to_float (gpointer ptr)
{
  union {
    gpointer ptr;
#if GLIB_SIZEOF_VOID_P == 8
    gdouble fval;
#else
    gfloat fval;
#endif
  } convert;
  convert.ptr = ptr;
  return (gfloat)convert.fval;
}

static inline gpointer
float_to_pointer (gfloat fval)
{
  union {
    gpointer ptr;
#if GLIB_SIZEOF_VOID_P == 8
    gdouble fval;
#else
    gfloat fval;
#endif
  } convert;
  convert.fval = fval;
  return convert.ptr;
}

/* Not guaranteed, so assert to be sure */
G_STATIC_ASSERT (sizeof(gdouble) == 8);
G_STATIC_ASSERT (sizeof(gfloat) == 4);

static void
dzl_fuzzy_index_cursor_finalize (GObject *object)
{
  DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)object;

  g_clear_object (&self->index);
  g_clear_pointer (&self->query, g_free);
  g_clear_pointer (&self->matches, g_array_unref);
  g_clear_pointer (&self->tables, g_variant_dict_unref);

  G_OBJECT_CLASS (dzl_fuzzy_index_cursor_parent_class)->finalize (object);
}

static void
dzl_fuzzy_index_cursor_get_property (GObject    *object,
                                 guint       prop_id,
                                 GValue     *value,
                                 GParamSpec *pspec)
{
  DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);

  switch (prop_id)
    {
    case PROP_CASE_SENSITIVE:
      g_value_set_boolean (value, self->case_sensitive);
      break;

    case PROP_INDEX:
      g_value_set_object (value, self->index);
      break;

    case PROP_MAX_MATCHES:
      g_value_set_uint (value, self->max_matches);
      break;

    case PROP_QUERY:
      g_value_set_string (value, self->query);
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    }
}

static void
dzl_fuzzy_index_cursor_set_property (GObject      *object,
                                 guint         prop_id,
                                 const GValue *value,
                                 GParamSpec   *pspec)
{
  DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);

  switch (prop_id)
    {
    case PROP_CASE_SENSITIVE:
      self->case_sensitive = g_value_get_boolean (value);
      break;

    case PROP_INDEX:
      self->index = g_value_dup_object (value);
      break;

    case PROP_TABLES:
      self->tables = g_value_dup_boxed (value);
      break;

    case PROP_MAX_MATCHES:
      self->max_matches = g_value_get_uint (value);
      break;

    case PROP_QUERY:
      self->query = g_value_dup_string (value);
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    }
}

static void
dzl_fuzzy_index_cursor_class_init (DzlFuzzyIndexCursorClass *klass)
{
  GObjectClass *object_class = G_OBJECT_CLASS (klass);

  object_class->finalize = dzl_fuzzy_index_cursor_finalize;
  object_class->get_property = dzl_fuzzy_index_cursor_get_property;
  object_class->set_property = dzl_fuzzy_index_cursor_set_property;

  properties [PROP_CASE_SENSITIVE] =
    g_param_spec_boolean ("case-sensitive",
                          "Case Sensitive",
                          "Case Sensitive",
                          FALSE,
                          (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));

  properties [PROP_INDEX] =
    g_param_spec_object ("index",
                         "Index",
                         "The index this cursor is iterating",
                         DZL_TYPE_FUZZY_INDEX,
                         (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));

  properties [PROP_TABLES] =
    g_param_spec_boxed ("tables",
                        "Tables",
                        "The dictionary of character indexes",
                        G_TYPE_VARIANT_DICT,
                        (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));

  properties [PROP_QUERY] =
    g_param_spec_string ("query",
                         "Query",
                         "The query for the index",
                         NULL,
                         (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));

  properties [PROP_MAX_MATCHES] =
    g_param_spec_uint ("max-matches",
                       "Max Matches",
                       "The max number of matches to display",
                       0,
                       G_MAXUINT,
                       0,
                       (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));

  g_object_class_install_properties (object_class, N_PROPS, properties);
}

static void
dzl_fuzzy_index_cursor_init (DzlFuzzyIndexCursor *self)
{
  self->matches = g_array_new (FALSE, FALSE, sizeof (DzlFuzzyMatch));
}

static gint
fuzzy_match_compare (gconstpointer a,
                     gconstpointer b)
{
  const DzlFuzzyMatch *ma = a;
  const DzlFuzzyMatch *mb = b;

  if (ma->score < mb->score)
    return 1;
  else if (ma->score > mb->score)
    return -1;

  return strcmp (ma->key, mb->key);
}

static gboolean
fuzzy_do_match (const DzlFuzzyLookup    *lookup,
                const DzlFuzzyIndexItem *item,
                guint                    table_index,
                gint                     score)
{
  const DzlFuzzyIndexItem *table;
  const DzlFuzzyIndexItem *iter;
  gssize n_elements;
  gint *state;
  gint iter_score;

  g_assert (lookup != NULL);
  g_assert (item != NULL);
  g_assert (score >= 0);

  table = lookup->tables [table_index];
  n_elements = (gssize)lookup->tables_n_elements [table_index];
  state = &lookup->tables_state [table_index];

  for (; state [0] < n_elements; state [0]++)
    {
      DzlIntPair *lookup_pair;
      gboolean contains_document;

      iter = &table [state [0]];

      if ((iter->lookaside_id < item->lookaside_id) ||
          ((iter->lookaside_id == item->lookaside_id) &&
           (iter->position <= item->position)))
        continue;
      else if (iter->lookaside_id > item->lookaside_id)
        break;

      iter_score = score + (iter->position - item->position);

      if (table_index + 1 < lookup->n_tables)
        {
          if (fuzzy_do_match (lookup, iter, table_index + 1, iter_score))
            return TRUE;
          continue;
        }

      contains_document = g_hash_table_lookup_extended (lookup->matches,
                                                        GUINT_TO_POINTER (item->lookaside_id),
                                                        NULL,
                                                        (gpointer *)&lookup_pair);

      if (!contains_document || iter_score < dzl_int_pair_first (lookup_pair))
        g_hash_table_insert (lookup->matches,
                             GUINT_TO_POINTER (item->lookaside_id),
                             dzl_int_pair_new (iter_score, iter->position));

      return TRUE;
    }

  return FALSE;
}

static void
dzl_fuzzy_index_cursor_worker (GTask        *task,
                               gpointer      source_object,
                               gpointer      task_data,
                               GCancellable *cancellable)
{
  DzlFuzzyIndexCursor *self = source_object;
  g_autoptr(GHashTable) matches = NULL;
  g_autoptr(GHashTable) by_document = NULL;
  g_autoptr(GPtrArray) tables = NULL;
  g_autoptr(GArray) tables_n_elements = NULL;
  g_autofree gint *tables_state = NULL;
  g_autofree gchar *freeme = NULL;
  const gchar *query;
  DzlFuzzyLookup lookup = { 0 };
  GHashTableIter iter;
  const gchar *str;
  gpointer key, value;
  guint i;

  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
  g_assert (G_IS_TASK (task));

  if (g_task_return_error_if_cancelled (task))
    return;

  /* No matches with empty query */
  if (self->query == NULL || *self->query == '\0')
    goto cleanup;

  /* If we are not case-sensitive, we need to downcase the query string */
  query = self->query;
  if (!self->case_sensitive)
    query = freeme = g_utf8_casefold (query, -1);

  tables = g_ptr_array_new ();
  tables_n_elements = g_array_new (FALSE, FALSE, sizeof (gsize));
  matches = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)dzl_int_pair_free);

  for (str = query; *str; str = g_utf8_next_char (str))
    {
      gunichar ch = g_utf8_get_char (str);
      g_autoptr(GVariant) table = NULL;
      gconstpointer fixed;
      gsize n_elements;
      gchar char_key[8];

      if (g_unichar_isspace (ch))
        continue;

      char_key [g_unichar_to_utf8 (ch, char_key)] = '\0';
      table = g_variant_dict_lookup_value (self->tables,
                                           char_key,
                                           (const GVariantType *)"a(uu)");

      /* No possible matches, missing table for character */
      if (table == NULL)
        goto cleanup;

      fixed = g_variant_get_fixed_array (table, &n_elements, sizeof (DzlFuzzyIndexItem));
      g_array_append_val (tables_n_elements, n_elements);
      g_ptr_array_add (tables, (gpointer)fixed);
    }

  if (tables->len == 0)
    goto cleanup;

  g_assert (tables->len > 0);
  g_assert (tables->len == tables_n_elements->len);

  tables_state = g_new0 (gint, tables->len);

  lookup.index = self->index;
  lookup.matches = matches;
  lookup.tables = (const DzlFuzzyIndexItem * const *)tables->pdata;
  lookup.tables_n_elements = (const gsize *)(gpointer)tables_n_elements->data;
  lookup.tables_state = tables_state;
  lookup.n_tables = tables->len;
  lookup.needle = query;
  lookup.max_matches = self->max_matches;

  if G_LIKELY (lookup.n_tables > 1)
    {
      for (i = 0; i < lookup.tables_n_elements[0]; i++)
        {
          const DzlFuzzyIndexItem *item = &lookup.tables[0][i];

          fuzzy_do_match (&lookup, item, 1, MIN (16, item->position * 2));
        }
    }
  else
    {
      guint last_id = G_MAXUINT;

      for (i = 0; i < lookup.tables_n_elements[0]; i++)
        {
          const DzlFuzzyIndexItem *item = &lookup.tables[0][i];
          DzlFuzzyMatch match;

          if (item->lookaside_id != last_id)
            {
              last_id = item->lookaside_id;

              if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
                                                        item->lookaside_id,
                                                        &match.document_id,
                                                        &match.key,
                                                        &match.priority,
                                                        item->position,
                                                        item->position,
                                                        &match.score))
                continue;

              g_array_append_val (self->matches, match);
            }
        }

      goto cleanup;
    }

  if (g_task_return_error_if_cancelled (task))
    return;

  by_document = g_hash_table_new (NULL, NULL);

  g_hash_table_iter_init (&iter, matches);

  while (g_hash_table_iter_next (&iter, &key, &value))
    {
      DzlIntPair *pair = value;
      guint score = dzl_int_pair_first (pair);
      guint last_offset = dzl_int_pair_second (pair);
      gpointer other_score;
      DzlFuzzyMatch match;
      guint lookaside_id = GPOINTER_TO_UINT (key);

      if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
                                                lookaside_id,
                                                &match.document_id,
                                                &match.key,
                                                &match.priority,
                                                score,
                                                last_offset,
                                                &match.score))
        continue;

      if (g_hash_table_lookup_extended (by_document,
                                        GUINT_TO_POINTER (match.document_id),
                                        NULL,
                                        &other_score) &&
          match.score <= pointer_to_float (other_score))
        continue;

      g_hash_table_insert (by_document,
                           GUINT_TO_POINTER (match.document_id),
                           float_to_pointer (match.score));

      g_array_append_val (self->matches, match);
    }

  /*
   * Because we have to do the deduplication of documents after
   * searching all the potential matches, we could have duplicates
   * in the array. So we need to filter them out. Not that big
   * of a deal, since we can do remove_fast on the index and
   * we have to sort them afterwards anyway. Potentially, we could
   * do both at once if we felt like doing our own sort.
   */
  for (i = 0; i < self->matches->len; i++)
    {
      DzlFuzzyMatch *match;
      gpointer other_score;

    next:
      match = &g_array_index (self->matches, DzlFuzzyMatch, i);
      other_score = g_hash_table_lookup (by_document, GUINT_TO_POINTER (match->document_id));

      /*
       * This item should have been discarded, but we didn't have enough
       * information at the time we built the array.
       */
      if (pointer_to_float (other_score) < match->score)
        {
          g_array_remove_index_fast (self->matches, i);
          if (i < self->matches->len - 1)
            goto next;
        }
    }

  if (g_task_return_error_if_cancelled (task))
    return;

cleanup:
  if (self->matches != NULL)
    {
      g_array_sort (self->matches, fuzzy_match_compare);
      if (lookup.max_matches > 0 && lookup.max_matches < self->matches->len)
        g_array_set_size (self->matches, lookup.max_matches);
    }

  g_task_return_boolean (task, TRUE);
}

static void
dzl_fuzzy_index_cursor_init_async (GAsyncInitable      *initable,
                                   gint                 io_priority,
                                   GCancellable        *cancellable,
                                   GAsyncReadyCallback  callback,
                                   gpointer             user_data)
{
  DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)initable;
  g_autoptr(GTask) task = NULL;

  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
  g_assert (!cancellable || G_IS_CANCELLABLE (cancellable));

  task = g_task_new (self, cancellable, callback, user_data);
  g_task_set_source_tag (task, dzl_fuzzy_index_cursor_init_async);
  g_task_set_priority (task, io_priority);
  g_task_set_check_cancellable (task, FALSE);
  g_task_run_in_thread (task, dzl_fuzzy_index_cursor_worker);
}

static gboolean
dzl_fuzzy_index_cursor_init_finish (GAsyncInitable  *initiable,
                                    GAsyncResult    *result,
                                    GError         **error)
{
  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (initiable));
  g_assert (G_IS_TASK (result));

  return g_task_propagate_boolean (G_TASK (result), error);
}

static void
async_initable_iface_init (GAsyncInitableIface *iface)
{
  iface->init_async = dzl_fuzzy_index_cursor_init_async;
  iface->init_finish = dzl_fuzzy_index_cursor_init_finish;
}

static GType
dzl_fuzzy_index_cursor_get_item_type (GListModel *model)
{
  return DZL_TYPE_FUZZY_INDEX_MATCH;
}

static guint
dzl_fuzzy_index_cursor_get_n_items (GListModel *model)
{
  DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;

  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));

  return self->matches->len;
}

static gpointer
dzl_fuzzy_index_cursor_get_item (GListModel *model,
                                 guint       position)
{
  DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;
  g_autoptr(GVariant) document = NULL;
  DzlFuzzyMatch *match;

  g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
  g_assert (position < self->matches->len);

  match = &g_array_index (self->matches, DzlFuzzyMatch, position);

  document = _dzl_fuzzy_index_lookup_document (self->index, match->document_id);

  return g_object_new (DZL_TYPE_FUZZY_INDEX_MATCH,
                       "document", document,
                       "key", match->key,
                       "score", match->score,
                       "priority", match->priority,
                       NULL);
}

static void
list_model_iface_init (GListModelInterface *iface)
{
  iface->get_item_type = dzl_fuzzy_index_cursor_get_item_type;
  iface->get_n_items = dzl_fuzzy_index_cursor_get_n_items;
  iface->get_item = dzl_fuzzy_index_cursor_get_item;
}

/**
 * dzl_fuzzy_index_cursor_get_index:
 * @self: A #DzlFuzzyIndexCursor
 *
 * Gets the index the cursor is iterating.
 *
 * Returns: (transfer none): A #DzlFuzzyIndex.
 */
DzlFuzzyIndex *
dzl_fuzzy_index_cursor_get_index (DzlFuzzyIndexCursor *self)
{
  g_return_val_if_fail (DZL_IS_FUZZY_INDEX_CURSOR (self), NULL);

  return self->index;
}