Blame gettext-tools/src/msgl-fsearch.c

Packit Bot 06c835
/* Fast fuzzy searching among messages.
Packit Bot 06c835
   Copyright (C) 2006, 2008, 2011, 2015 Free Software Foundation, Inc.
Packit Bot 06c835
   Written by Bruno Haible <bruno@clisp.org>, 2006.
Packit Bot 06c835
Packit Bot 06c835
   This program is free software: you can redistribute it and/or modify
Packit Bot 06c835
   it under the terms of the GNU General Public License as published by
Packit Bot 06c835
   the Free Software Foundation; either version 3 of the License, or
Packit Bot 06c835
   (at your option) any later version.
Packit Bot 06c835
Packit Bot 06c835
   This program is distributed in the hope that it will be useful,
Packit Bot 06c835
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit Bot 06c835
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit Bot 06c835
   GNU General Public License for more details.
Packit Bot 06c835
Packit Bot 06c835
   You should have received a copy of the GNU General Public License
Packit Bot 06c835
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit Bot 06c835
Packit Bot 06c835
#ifdef HAVE_CONFIG_H
Packit Bot 06c835
# include <config.h>
Packit Bot 06c835
#endif
Packit Bot 06c835
Packit Bot 06c835
/* Specification.  */
Packit Bot 06c835
#include "msgl-fsearch.h"
Packit Bot 06c835
Packit Bot 06c835
#include <math.h>
Packit Bot 06c835
#include <stdlib.h>
Packit Bot 06c835
Packit Bot 06c835
#include "xalloc.h"
Packit Bot 06c835
#include "po-charset.h"
Packit Bot 06c835
Packit Bot 06c835
Packit Bot 06c835
/* Fuzzy searching of L strings in a large set of N messages (assuming
Packit Bot 06c835
   they have all the same small size) takes O(L * N) when using repeated
Packit Bot 06c835
   fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
Packit Bot 06c835
Packit Bot 06c835
   So we preprocess the set of N messages, yielding a data structure
Packit Bot 06c835
   that allows to select the similar messages for any given string, and
Packit Bot 06c835
   apply fstrcmp() only to the search result.  This allows to reduce the
Packit Bot 06c835
   time to something between O(L * 1) and O(L * N) - depending on the
Packit Bot 06c835
   structure of the strings.  In the example with L = 800 and N = 69000,
Packit Bot 06c835
   memory use increases by a factor of 2 and the time decreases from
Packit Bot 06c835
   9068 s to 230 s.
Packit Bot 06c835
Packit Bot 06c835
   The data structure is a hash table mapping each n-gram (here with n=4)
Packit Bot 06c835
   occurring in the message strings to the list of messages that contains
Packit Bot 06c835
   it.  For example, if the message list is
Packit Bot 06c835
      [0] "close"
Packit Bot 06c835
      [1] "losers"
Packit Bot 06c835
   the index is a hash table mapping
Packit Bot 06c835
      "clos" -> { 0 }
Packit Bot 06c835
      "lose" -> { 0, 1 }
Packit Bot 06c835
      "oser" -> { 1 }
Packit Bot 06c835
      "sers" -> { 1 }
Packit Bot 06c835
   Searching the similar messages to, say, "closest", is done by looking up
Packit Bot 06c835
   all its 4-grams in the hash table and summing up the results:
Packit Bot 06c835
      "clos" -> { 0 }
Packit Bot 06c835
      "lose" -> { 0, 1 }
Packit Bot 06c835
      "oses" -> {}
Packit Bot 06c835
      "sest" -> {}
Packit Bot 06c835
   => total: { 2x0, 1x1 }
Packit Bot 06c835
   The list is sorted according to decreasing frequency: { 0, 1, ... }
Packit Bot 06c835
   and only the first few messages from this frequency list are passed to
Packit Bot 06c835
   fstrcmp().
Packit Bot 06c835
Packit Bot 06c835
   The n-gram similarity and the fstrcmp() similarity are quite different
Packit Bot 06c835
   metrics.  For example, "close" and "c l o s e" have no n-grams in common
Packit Bot 06c835
   (even for n as small as n = 2); however, fstrcmp() will find that they
Packit Bot 06c835
   are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
Packit Bot 06c835
   "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
Packit Bot 06c835
   only 0.02.  Also, repeated n-grams don't have the same effect on the
Packit Bot 06c835
   two similarity measures.  But all this doesn't matter much in practice.
Packit Bot 06c835
Packit Bot 06c835
   We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
Packit Bot 06c835
   lists are likely too long.  (Well, with ideographic languages like Chinese,
Packit Bot 06c835
   n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
Packit Bot 06c835
   either.)
Packit Bot 06c835
Packit Bot 06c835
   The units are characters in the current encoding.  Not just bytes,
Packit Bot 06c835
   because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
Packit Bot 06c835
Packit Bot 06c835
/* Each message is represented by its index in the message list.  */
Packit Bot 06c835
typedef unsigned int index_ty;
Packit Bot 06c835
Packit Bot 06c835
/* An index list has its allocated size and length tacked at the front.
Packit Bot 06c835
   The indices are sorted in ascending order.  */
Packit Bot 06c835
typedef index_ty *index_list_ty;
Packit Bot 06c835
#define IL_ALLOCATED 0
Packit Bot 06c835
#define IL_LENGTH 1
Packit Bot 06c835
Packit Bot 06c835
/* Create a new index list containing only a given index.  */
Packit Bot 06c835
static inline index_list_ty
Packit Bot 06c835
new_index (index_ty idx)
Packit Bot 06c835
{
Packit Bot 06c835
  index_ty *list = XNMALLOC (2 + 1, index_ty);
Packit Bot 06c835
  list[IL_ALLOCATED] = 1;
Packit Bot 06c835
  list[IL_LENGTH] = 1;
Packit Bot 06c835
  list[2] = idx;
Packit Bot 06c835
Packit Bot 06c835
  return list;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Add a given index, greater or equal than all previous indices, to an
Packit Bot 06c835
   index list.
Packit Bot 06c835
   Return the new index list, if it had to be reallocated, or NULL if it
Packit Bot 06c835
   didn't change.  */
Packit Bot 06c835
static inline index_list_ty
Packit Bot 06c835
addlast_index (index_list_ty list, index_ty idx)
Packit Bot 06c835
{
Packit Bot 06c835
  index_list_ty result;
Packit Bot 06c835
  size_t length = list[IL_LENGTH];
Packit Bot 06c835
Packit Bot 06c835
  /* Look whether it should be inserted.  */
Packit Bot 06c835
  if (length > 0 && list[2 + (length - 1)] == idx)
Packit Bot 06c835
    return NULL;
Packit Bot 06c835
Packit Bot 06c835
  /* Now make room for one more list element.  */
Packit Bot 06c835
  result = NULL;
Packit Bot 06c835
  if (length == list[IL_ALLOCATED])
Packit Bot 06c835
    {
Packit Bot 06c835
      size_t new_allocated = 2 * length - (length >> 6);
Packit Bot 06c835
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
Packit Bot 06c835
      list[IL_ALLOCATED] = new_allocated;
Packit Bot 06c835
      result = list;
Packit Bot 06c835
    }
Packit Bot 06c835
  list[2 + length] = idx;
Packit Bot 06c835
  list[IL_LENGTH] = length + 1;
Packit Bot 06c835
  return result;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Add a given index to an index list.
Packit Bot 06c835
   Return the new index list, if it had to be reallocated, or NULL if it
Packit Bot 06c835
   didn't change.  */
Packit Bot 06c835
static inline index_list_ty
Packit Bot 06c835
add_index (index_list_ty list, index_ty idx)
Packit Bot 06c835
{
Packit Bot 06c835
  index_list_ty result;
Packit Bot 06c835
  size_t length = list[IL_LENGTH];
Packit Bot 06c835
Packit Bot 06c835
  /* Look where it should be inserted.  */
Packit Bot 06c835
  size_t lo = 0;
Packit Bot 06c835
  size_t hi = length;
Packit Bot 06c835
  while (lo < hi)
Packit Bot 06c835
    {
Packit Bot 06c835
      /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
Packit Bot 06c835
      size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
Packit Bot 06c835
      index_ty val = list[2 + mid];
Packit Bot 06c835
      if (val < idx)
Packit Bot 06c835
        lo = mid + 1;
Packit Bot 06c835
      else if (val > idx)
Packit Bot 06c835
        hi = mid;
Packit Bot 06c835
      else
Packit Bot 06c835
        return NULL;
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* Now make room for one more list element.  */
Packit Bot 06c835
  result = NULL;
Packit Bot 06c835
  if (length == list[IL_ALLOCATED])
Packit Bot 06c835
    {
Packit Bot 06c835
      size_t new_allocated = 2 * length - (length >> 6);
Packit Bot 06c835
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
Packit Bot 06c835
      list[IL_ALLOCATED] = new_allocated;
Packit Bot 06c835
      result = list;
Packit Bot 06c835
    }
Packit Bot 06c835
  list[IL_LENGTH] = length + 1;
Packit Bot 06c835
  for (; length > hi; length--)
Packit Bot 06c835
    list[2 + length] = list[1 + length];
Packit Bot 06c835
  list[2 + length] = idx;
Packit Bot 06c835
  return result;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* We use 4-grams, therefore strings with less than 4 characters cannot be
Packit Bot 06c835
   handled through the 4-grams table and need to be handled specially.
Packit Bot 06c835
   Since every character occupies at most 4 bytes (see po-charset.c),
Packit Bot 06c835
   this means the size of such short strings is bounded by:  */
Packit Bot 06c835
#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
Packit Bot 06c835
#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
Packit Bot 06c835
Packit Bot 06c835
/* Such short strings are handled by direct comparison with all messages
Packit Bot 06c835
   of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
Packit Bot 06c835
   fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
Packit Bot 06c835
   fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
Packit Bot 06c835
   limit the search to lengths l' in the range
Packit Bot 06c835
     l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
Packit Bot 06c835
   Thus we need the list of the short strings up to length:  */
Packit Bot 06c835
#if !defined __SUNPRO_C
Packit Bot 06c835
# define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
Packit Bot 06c835
#else
Packit Bot 06c835
/* Sun C on Solaris 8 cannot compute this constant expression.  */
Packit Bot 06c835
# define SHORT_MSG_MAX 28
Packit Bot 06c835
#endif
Packit Bot 06c835
Packit Bot 06c835
/* A fuzzy index contains a hash table mapping all n-grams to their
Packit Bot 06c835
   occurrences list.  */
Packit Bot 06c835
struct message_fuzzy_index_ty
Packit Bot 06c835
{
Packit Bot 06c835
  message_ty **messages;
Packit Bot 06c835
  character_iterator_t iterator;
Packit Bot 06c835
  hash_table gram4;
Packit Bot 06c835
  size_t firstfew;
Packit Bot 06c835
  message_list_ty **short_messages;
Packit Bot 06c835
};
Packit Bot 06c835
Packit Bot 06c835
/* Allocate a fuzzy index corresponding to a given list of messages.
Packit Bot 06c835
   The list of messages and the msgctxt and msgid fields of the messages
Packit Bot 06c835
   inside it must not be modified while the returned fuzzy index is in use.  */
Packit Bot 06c835
message_fuzzy_index_ty *
Packit Bot 06c835
message_fuzzy_index_alloc (const message_list_ty *mlp,
Packit Bot 06c835
                           const char *canon_charset)
Packit Bot 06c835
{
Packit Bot 06c835
  message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
Packit Bot 06c835
  size_t count = mlp->nitems;
Packit Bot 06c835
  size_t j;
Packit Bot 06c835
  size_t l;
Packit Bot 06c835
Packit Bot 06c835
  findex->messages = mlp->item;
Packit Bot 06c835
  findex->iterator = po_charset_character_iterator (canon_charset);
Packit Bot 06c835
Packit Bot 06c835
  /* Setup hash table.  */
Packit Bot 06c835
  if (hash_init (&findex->gram4, 10 * count) < 0)
Packit Bot 06c835
    xalloc_die ();
Packit Bot 06c835
  for (j = 0; j < count; j++)
Packit Bot 06c835
    {
Packit Bot 06c835
      message_ty *mp = mlp->item[j];
Packit Bot 06c835
Packit Bot 06c835
      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
Packit Bot 06c835
        {
Packit Bot 06c835
          const char *str = mp->msgid;
Packit Bot 06c835
Packit Bot 06c835
          /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
Packit Bot 06c835
          const char *p0 = str;
Packit Bot 06c835
          if (*p0 != '\0')
Packit Bot 06c835
            {
Packit Bot 06c835
              const char *p1 = p0 + findex->iterator (p0);
Packit Bot 06c835
              if (*p1 != '\0')
Packit Bot 06c835
                {
Packit Bot 06c835
                  const char *p2 = p1 + findex->iterator (p1);
Packit Bot 06c835
                  if (*p2 != '\0')
Packit Bot 06c835
                    {
Packit Bot 06c835
                      const char *p3 = p2 + findex->iterator (p2);
Packit Bot 06c835
                      if (*p3 != '\0')
Packit Bot 06c835
                        {
Packit Bot 06c835
                          const char *p4 = p3 + findex->iterator (p3);
Packit Bot 06c835
                          for (;;)
Packit Bot 06c835
                            {
Packit Bot 06c835
                              /* The segment from p0 to p4 is a 4-gram of
Packit Bot 06c835
                                 characters.  Add a hash table entry that maps
Packit Bot 06c835
                                 it to the index j, or extend the existing
Packit Bot 06c835
                                 hash table entry accordingly.  */
Packit Bot 06c835
                              void *found;
Packit Bot 06c835
Packit Bot 06c835
                              if (hash_find_entry (&findex->gram4, p0, p4 - p0,
Packit Bot 06c835
                                                   &found) == 0)
Packit Bot 06c835
                                {
Packit Bot 06c835
                                  index_list_ty list = (index_list_ty) found;
Packit Bot 06c835
                                  list = addlast_index (list, j);
Packit Bot 06c835
                                  if (list != NULL)
Packit Bot 06c835
                                    hash_set_value (&findex->gram4, p0, p4 - p0,
Packit Bot 06c835
                                                    list);
Packit Bot 06c835
                                }
Packit Bot 06c835
                              else
Packit Bot 06c835
                                hash_insert_entry (&findex->gram4, p0, p4 - p0,
Packit Bot 06c835
                                                   new_index (j));
Packit Bot 06c835
Packit Bot 06c835
                              /* Advance.  */
Packit Bot 06c835
                              if (*p4 == '\0')
Packit Bot 06c835
                                break;
Packit Bot 06c835
                              p0 = p1;
Packit Bot 06c835
                              p1 = p2;
Packit Bot 06c835
                              p2 = p3;
Packit Bot 06c835
                              p3 = p4;
Packit Bot 06c835
                              p4 = p4 + findex->iterator (p4);
Packit Bot 06c835
                            }
Packit Bot 06c835
                        }
Packit Bot 06c835
                    }
Packit Bot 06c835
                }
Packit Bot 06c835
            }
Packit Bot 06c835
        }
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* Shrink memory used by the hash table.  */
Packit Bot 06c835
  {
Packit Bot 06c835
    void *iter;
Packit Bot 06c835
    const void *key;
Packit Bot 06c835
    size_t keylen;
Packit Bot 06c835
    void **valuep;
Packit Bot 06c835
Packit Bot 06c835
    iter = NULL;
Packit Bot 06c835
    while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
Packit Bot 06c835
           == 0)
Packit Bot 06c835
      {
Packit Bot 06c835
        index_list_ty list = (index_list_ty) *valuep;
Packit Bot 06c835
        index_ty length = list[IL_LENGTH];
Packit Bot 06c835
Packit Bot 06c835
        if (length < list[IL_ALLOCATED])
Packit Bot 06c835
          {
Packit Bot 06c835
            list[IL_ALLOCATED] = length;
Packit Bot 06c835
            *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
Packit Bot 06c835
          }
Packit Bot 06c835
      }
Packit Bot 06c835
  }
Packit Bot 06c835
Packit Bot 06c835
  findex->firstfew = (int) sqrt ((double) count);
Packit Bot 06c835
  if (findex->firstfew < 10)
Packit Bot 06c835
    findex->firstfew = 10;
Packit Bot 06c835
Packit Bot 06c835
  /* Setup lists of short messages.  */
Packit Bot 06c835
  findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
Packit Bot 06c835
  for (l = 0; l <= SHORT_MSG_MAX; l++)
Packit Bot 06c835
    findex->short_messages[l] = message_list_alloc (false);
Packit Bot 06c835
  for (j = 0; j < count; j++)
Packit Bot 06c835
    {
Packit Bot 06c835
      message_ty *mp = mlp->item[j];
Packit Bot 06c835
Packit Bot 06c835
      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
Packit Bot 06c835
        {
Packit Bot 06c835
          const char *str = mp->msgid;
Packit Bot 06c835
          size_t len = strlen (str);
Packit Bot 06c835
Packit Bot 06c835
          if (len <= SHORT_MSG_MAX)
Packit Bot 06c835
            message_list_append (findex->short_messages[len], mp);
Packit Bot 06c835
        }
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* Shrink memory used by the lists of short messages.  */
Packit Bot 06c835
  for (l = 0; l <= SHORT_MSG_MAX; l++)
Packit Bot 06c835
    {
Packit Bot 06c835
      message_list_ty *mlp = findex->short_messages[l];
Packit Bot 06c835
Packit Bot 06c835
      if (mlp->nitems < mlp->nitems_max)
Packit Bot 06c835
        {
Packit Bot 06c835
          mlp->nitems_max = mlp->nitems;
Packit Bot 06c835
          mlp->item =
Packit Bot 06c835
            (message_ty **)
Packit Bot 06c835
            xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
Packit Bot 06c835
        }
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  return findex;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* An index with multiplicity.  */
Packit Bot 06c835
struct mult_index
Packit Bot 06c835
{
Packit Bot 06c835
  index_ty index;
Packit Bot 06c835
  unsigned int count;
Packit Bot 06c835
};
Packit Bot 06c835
Packit Bot 06c835
/* A list of indices with multiplicity.
Packit Bot 06c835
   The indices are sorted in ascending order.  */
Packit Bot 06c835
struct mult_index_list
Packit Bot 06c835
{
Packit Bot 06c835
  struct mult_index *item;
Packit Bot 06c835
  size_t nitems;
Packit Bot 06c835
  size_t nitems_max;
Packit Bot 06c835
  /* Work area.  */
Packit Bot 06c835
  struct mult_index *item2;
Packit Bot 06c835
  size_t nitems2_max;
Packit Bot 06c835
};
Packit Bot 06c835
Packit Bot 06c835
/* Initialize an empty list of indices with multiplicity.  */
Packit Bot 06c835
static inline void
Packit Bot 06c835
mult_index_list_init (struct mult_index_list *accu)
Packit Bot 06c835
{
Packit Bot 06c835
  accu->nitems = 0;
Packit Bot 06c835
  accu->nitems_max = 0;
Packit Bot 06c835
  accu->item = NULL;
Packit Bot 06c835
  accu->nitems2_max = 0;
Packit Bot 06c835
  accu->item2 = NULL;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Add an index list to a list of indices with multiplicity.  */
Packit Bot 06c835
static inline void
Packit Bot 06c835
mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
Packit Bot 06c835
{
Packit Bot 06c835
  size_t len1 = accu->nitems;
Packit Bot 06c835
  size_t len2 = list[IL_LENGTH];
Packit Bot 06c835
  size_t need = len1 + len2;
Packit Bot 06c835
  struct mult_index *ptr1;
Packit Bot 06c835
  struct mult_index *ptr1_end;
Packit Bot 06c835
  index_ty *ptr2;
Packit Bot 06c835
  index_ty *ptr2_end;
Packit Bot 06c835
  struct mult_index *destptr;
Packit Bot 06c835
Packit Bot 06c835
  /* Make the work area large enough.  */
Packit Bot 06c835
  if (accu->nitems2_max < need)
Packit Bot 06c835
    {
Packit Bot 06c835
      size_t new_max = 2 * accu->nitems2_max + 1;
Packit Bot 06c835
Packit Bot 06c835
      if (new_max < need)
Packit Bot 06c835
        new_max = need;
Packit Bot 06c835
      if (accu->item2 != NULL)
Packit Bot 06c835
        free (accu->item2);
Packit Bot 06c835
      accu->item2 = XNMALLOC (new_max, struct mult_index);
Packit Bot 06c835
      accu->nitems2_max = new_max;
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* Make a linear pass through accu and list simultaneously.  */
Packit Bot 06c835
  ptr1 = accu->item;
Packit Bot 06c835
  ptr1_end = ptr1 + len1;
Packit Bot 06c835
  ptr2 = list + 2;
Packit Bot 06c835
  ptr2_end = ptr2 + len2;
Packit Bot 06c835
  destptr = accu->item2;
Packit Bot 06c835
  while (ptr1 < ptr1_end && ptr2 < ptr2_end)
Packit Bot 06c835
    {
Packit Bot 06c835
      if (ptr1->index < *ptr2)
Packit Bot 06c835
        {
Packit Bot 06c835
          *destptr = *ptr1;
Packit Bot 06c835
          ptr1++;
Packit Bot 06c835
        }
Packit Bot 06c835
      else if (ptr1->index > *ptr2)
Packit Bot 06c835
        {
Packit Bot 06c835
          destptr->index = *ptr2;
Packit Bot 06c835
          destptr->count = 1;
Packit Bot 06c835
          ptr2++;
Packit Bot 06c835
        }
Packit Bot 06c835
      else /* ptr1->index == list[2 + i2] */
Packit Bot 06c835
        {
Packit Bot 06c835
          destptr->index = ptr1->index;
Packit Bot 06c835
          destptr->count = ptr1->count + 1;
Packit Bot 06c835
          ptr1++;
Packit Bot 06c835
          ptr2++;
Packit Bot 06c835
        }
Packit Bot 06c835
      destptr++;
Packit Bot 06c835
    }
Packit Bot 06c835
  while (ptr1 < ptr1_end)
Packit Bot 06c835
    {
Packit Bot 06c835
      *destptr = *ptr1;
Packit Bot 06c835
      ptr1++;
Packit Bot 06c835
      destptr++;
Packit Bot 06c835
    }
Packit Bot 06c835
  while (ptr2 < ptr2_end)
Packit Bot 06c835
    {
Packit Bot 06c835
      destptr->index = *ptr2;
Packit Bot 06c835
      destptr->count = 1;
Packit Bot 06c835
      ptr2++;
Packit Bot 06c835
      destptr++;
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* Swap accu->item and accu->item2.  */
Packit Bot 06c835
  {
Packit Bot 06c835
    struct mult_index *dest = accu->item2;
Packit Bot 06c835
    size_t dest_max = accu->nitems2_max;
Packit Bot 06c835
Packit Bot 06c835
    accu->item2 = accu->item;
Packit Bot 06c835
    accu->nitems2_max = accu->nitems_max;
Packit Bot 06c835
Packit Bot 06c835
    accu->item = dest;
Packit Bot 06c835
    accu->nitems = destptr - dest;
Packit Bot 06c835
    accu->nitems_max = dest_max;
Packit Bot 06c835
  }
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Compares two indices with multiplicity, according to their multiplicity.  */
Packit Bot 06c835
static int
Packit Bot 06c835
mult_index_compare (const void *p1, const void *p2)
Packit Bot 06c835
{
Packit Bot 06c835
  const struct mult_index *ptr1 = (const struct mult_index *) p1;
Packit Bot 06c835
  const struct mult_index *ptr2 = (const struct mult_index *) p2;
Packit Bot 06c835
Packit Bot 06c835
  if (ptr1->count < ptr2->count)
Packit Bot 06c835
    return 1;
Packit Bot 06c835
  if (ptr1->count > ptr2->count)
Packit Bot 06c835
    return -1;
Packit Bot 06c835
  /* For reproduceable results, also take into account the indices.  */
Packit Bot 06c835
  if (ptr1->index > ptr2->index)
Packit Bot 06c835
    return 1;
Packit Bot 06c835
  if (ptr1->index < ptr2->index)
Packit Bot 06c835
    return -1;
Packit Bot 06c835
  return 0;
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Sort a list of indices with multiplicity according to decreasing
Packit Bot 06c835
   multiplicity.  */
Packit Bot 06c835
static inline void
Packit Bot 06c835
mult_index_list_sort (struct mult_index_list *accu)
Packit Bot 06c835
{
Packit Bot 06c835
  if (accu->nitems > 1)
Packit Bot 06c835
    qsort (accu->item, accu->nitems, sizeof (struct mult_index),
Packit Bot 06c835
           mult_index_compare);
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Frees a list of indices with multiplicity.  */
Packit Bot 06c835
static inline void
Packit Bot 06c835
mult_index_list_free (struct mult_index_list *accu)
Packit Bot 06c835
{
Packit Bot 06c835
  if (accu->item != NULL)
Packit Bot 06c835
    free (accu->item);
Packit Bot 06c835
  if (accu->item2 != NULL)
Packit Bot 06c835
    free (accu->item2);
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
Packit Bot 06c835
   The match does not need to be optimal.
Packit Bot 06c835
   Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
Packit Bot 06c835
   LOWER_BOUND must be >= FUZZY_THRESHOLD.
Packit Bot 06c835
   If HEURISTIC is true, only the few best messages among the list - according
Packit Bot 06c835
   to a certain heuristic - are considered.  If HEURISTIC is false, all
Packit Bot 06c835
   messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
Packit Bot 06c835
   like in message_list_search_fuzzy (except that in ambiguous cases where
Packit Bot 06c835
   several best matches exist, message_list_search_fuzzy chooses the one with
Packit Bot 06c835
   the smallest index whereas message_fuzzy_index_search makes a better
Packit Bot 06c835
   choice).  */
Packit Bot 06c835
message_ty *
Packit Bot 06c835
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
Packit Bot 06c835
                            const char *msgctxt, const char *msgid,
Packit Bot 06c835
                            double lower_bound,
Packit Bot 06c835
                            bool heuristic)
Packit Bot 06c835
{
Packit Bot 06c835
  const char *str = msgid;
Packit Bot 06c835
Packit Bot 06c835
  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
Packit Bot 06c835
  const char *p0 = str;
Packit Bot 06c835
  if (*p0 != '\0')
Packit Bot 06c835
    {
Packit Bot 06c835
      const char *p1 = p0 + findex->iterator (p0);
Packit Bot 06c835
      if (*p1 != '\0')
Packit Bot 06c835
        {
Packit Bot 06c835
          const char *p2 = p1 + findex->iterator (p1);
Packit Bot 06c835
          if (*p2 != '\0')
Packit Bot 06c835
            {
Packit Bot 06c835
              const char *p3 = p2 + findex->iterator (p2);
Packit Bot 06c835
              if (*p3 != '\0')
Packit Bot 06c835
                {
Packit Bot 06c835
                  const char *p4 = p3 + findex->iterator (p3);
Packit Bot 06c835
                  struct mult_index_list accu;
Packit Bot 06c835
Packit Bot 06c835
                  mult_index_list_init (&accu);
Packit Bot 06c835
                  for (;;)
Packit Bot 06c835
                    {
Packit Bot 06c835
                      /* The segment from p0 to p4 is a 4-gram of
Packit Bot 06c835
                         characters.  Get the hash table entry containing
Packit Bot 06c835
                         a list of indices, and add it to the accu.  */
Packit Bot 06c835
                      void *found;
Packit Bot 06c835
Packit Bot 06c835
                      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
Packit Bot 06c835
                                           &found) == 0)
Packit Bot 06c835
                        {
Packit Bot 06c835
                          index_list_ty list = (index_list_ty) found;
Packit Bot 06c835
                          mult_index_list_accumulate (&accu, list);
Packit Bot 06c835
                        }
Packit Bot 06c835
Packit Bot 06c835
                      /* Advance.  */
Packit Bot 06c835
                      if (*p4 == '\0')
Packit Bot 06c835
                        break;
Packit Bot 06c835
                      p0 = p1;
Packit Bot 06c835
                      p1 = p2;
Packit Bot 06c835
                      p2 = p3;
Packit Bot 06c835
                      p3 = p4;
Packit Bot 06c835
                      p4 = p4 + findex->iterator (p4);
Packit Bot 06c835
                    }
Packit Bot 06c835
Packit Bot 06c835
                  /* Sort in decreasing count order.  */
Packit Bot 06c835
                  mult_index_list_sort (&accu);
Packit Bot 06c835
Packit Bot 06c835
                  /* Iterate over this sorted list, and maximize the
Packit Bot 06c835
                     fuzzy_search_goal_function() result.
Packit Bot 06c835
                     If HEURISTIC is true, take only the first few messages.
Packit Bot 06c835
                     If HEURISTIC is false, consider all messages - to match
Packit Bot 06c835
                     the behaviour of message_list_search_fuzzy -, but process
Packit Bot 06c835
                     them in the order of the sorted list.  This increases
Packit Bot 06c835
                     the chances that the later calls to fstrcmp_bounded() (via
Packit Bot 06c835
                     fuzzy_search_goal_function()) terminate quickly, thanks
Packit Bot 06c835
                     to the best_weight which will be quite high already after
Packit Bot 06c835
                     the first few messages.  */
Packit Bot 06c835
                  {
Packit Bot 06c835
                    size_t count;
Packit Bot 06c835
                    struct mult_index *ptr;
Packit Bot 06c835
                    message_ty *best_mp;
Packit Bot 06c835
                    double best_weight;
Packit Bot 06c835
Packit Bot 06c835
                    count = accu.nitems;
Packit Bot 06c835
                    if (heuristic)
Packit Bot 06c835
                      {
Packit Bot 06c835
                        if (count > findex->firstfew)
Packit Bot 06c835
                          count = findex->firstfew;
Packit Bot 06c835
                      }
Packit Bot 06c835
Packit Bot 06c835
                    best_weight = lower_bound;
Packit Bot 06c835
                    best_mp = NULL;
Packit Bot 06c835
                    for (ptr = accu.item; count > 0; ptr++, count--)
Packit Bot 06c835
                      {
Packit Bot 06c835
                        message_ty *mp = findex->messages[ptr->index];
Packit Bot 06c835
                        double weight =
Packit Bot 06c835
                          fuzzy_search_goal_function (mp, msgctxt, msgid,
Packit Bot 06c835
                                                      best_weight);
Packit Bot 06c835
Packit Bot 06c835
                        if (weight > best_weight)
Packit Bot 06c835
                          {
Packit Bot 06c835
                            best_weight = weight;
Packit Bot 06c835
                            best_mp = mp;
Packit Bot 06c835
                          }
Packit Bot 06c835
                      }
Packit Bot 06c835
Packit Bot 06c835
                    mult_index_list_free (&accu);
Packit Bot 06c835
Packit Bot 06c835
                    return best_mp;
Packit Bot 06c835
                  }
Packit Bot 06c835
                }
Packit Bot 06c835
            }
Packit Bot 06c835
        }
Packit Bot 06c835
    }
Packit Bot 06c835
Packit Bot 06c835
  /* The string had less than 4 characters.  */
Packit Bot 06c835
  {
Packit Bot 06c835
    size_t l = strlen (str);
Packit Bot 06c835
    size_t lmin, lmax;
Packit Bot 06c835
    message_ty *best_mp;
Packit Bot 06c835
    double best_weight;
Packit Bot 06c835
Packit Bot 06c835
    if (!(l <= SHORT_STRING_MAX_BYTES))
Packit Bot 06c835
      abort ();
Packit Bot 06c835
Packit Bot 06c835
    /* Walk through those short messages which have an appropriate length.
Packit Bot 06c835
       See the comment before SHORT_MSG_MAX.  */
Packit Bot 06c835
    lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
Packit Bot 06c835
    lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
Packit Bot 06c835
    if (!(lmax <= SHORT_MSG_MAX))
Packit Bot 06c835
      abort ();
Packit Bot 06c835
Packit Bot 06c835
    best_weight = lower_bound;
Packit Bot 06c835
    best_mp = NULL;
Packit Bot 06c835
    for (l = lmin; l <= lmax; l++)
Packit Bot 06c835
      {
Packit Bot 06c835
        message_list_ty *mlp = findex->short_messages[l];
Packit Bot 06c835
        size_t j;
Packit Bot 06c835
Packit Bot 06c835
        for (j = 0; j < mlp->nitems; j++)
Packit Bot 06c835
          {
Packit Bot 06c835
            message_ty *mp = mlp->item[j];
Packit Bot 06c835
            double weight =
Packit Bot 06c835
              fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
Packit Bot 06c835
Packit Bot 06c835
            if (weight > best_weight)
Packit Bot 06c835
              {
Packit Bot 06c835
                best_weight = weight;
Packit Bot 06c835
                best_mp = mp;
Packit Bot 06c835
              }
Packit Bot 06c835
          }
Packit Bot 06c835
      }
Packit Bot 06c835
Packit Bot 06c835
    return best_mp;
Packit Bot 06c835
  }
Packit Bot 06c835
}
Packit Bot 06c835
Packit Bot 06c835
/* Free a fuzzy index.  */
Packit Bot 06c835
void
Packit Bot 06c835
message_fuzzy_index_free (message_fuzzy_index_ty *findex)
Packit Bot 06c835
{
Packit Bot 06c835
  size_t l;
Packit Bot 06c835
  void *iter;
Packit Bot 06c835
  const void *key;
Packit Bot 06c835
  size_t keylen;
Packit Bot 06c835
  void *data;
Packit Bot 06c835
Packit Bot 06c835
  /* Free the short lists.  */
Packit Bot 06c835
  for (l = 0; l <= SHORT_MSG_MAX; l++)
Packit Bot 06c835
    message_list_free (findex->short_messages[l], 1);
Packit Bot 06c835
  free (findex->short_messages);
Packit Bot 06c835
Packit Bot 06c835
  /* Free the index lists occurring as values in the hash tables.  */
Packit Bot 06c835
  iter = NULL;
Packit Bot 06c835
  while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
Packit Bot 06c835
    free ((index_list_ty *) data);
Packit Bot 06c835
  /* Free the hash table itself.  */
Packit Bot 06c835
  hash_destroy (&findex->gram4);
Packit Bot 06c835
Packit Bot 06c835
  free (findex);
Packit Bot 06c835
}