|
Packit Bot |
06c835 |
/* Fast fuzzy searching among messages.
|
|
Packit Bot |
06c835 |
Copyright (C) 2006, 2008, 2011, 2015 Free Software Foundation, Inc.
|
|
Packit Bot |
06c835 |
Written by Bruno Haible <bruno@clisp.org>, 2006.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
This program is free software: you can redistribute it and/or modify
|
|
Packit Bot |
06c835 |
it under the terms of the GNU General Public License as published by
|
|
Packit Bot |
06c835 |
the Free Software Foundation; either version 3 of the License, or
|
|
Packit Bot |
06c835 |
(at your option) any later version.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
This program is distributed in the hope that it will be useful,
|
|
Packit Bot |
06c835 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit Bot |
06c835 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit Bot |
06c835 |
GNU General Public License for more details.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
You should have received a copy of the GNU General Public License
|
|
Packit Bot |
06c835 |
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
#ifdef HAVE_CONFIG_H
|
|
Packit Bot |
06c835 |
# include <config.h>
|
|
Packit Bot |
06c835 |
#endif
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Specification. */
|
|
Packit Bot |
06c835 |
#include "msgl-fsearch.h"
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
#include <math.h>
|
|
Packit Bot |
06c835 |
#include <stdlib.h>
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
#include "xalloc.h"
|
|
Packit Bot |
06c835 |
#include "po-charset.h"
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Fuzzy searching of L strings in a large set of N messages (assuming
|
|
Packit Bot |
06c835 |
they have all the same small size) takes O(L * N) when using repeated
|
|
Packit Bot |
06c835 |
fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
So we preprocess the set of N messages, yielding a data structure
|
|
Packit Bot |
06c835 |
that allows to select the similar messages for any given string, and
|
|
Packit Bot |
06c835 |
apply fstrcmp() only to the search result. This allows to reduce the
|
|
Packit Bot |
06c835 |
time to something between O(L * 1) and O(L * N) - depending on the
|
|
Packit Bot |
06c835 |
structure of the strings. In the example with L = 800 and N = 69000,
|
|
Packit Bot |
06c835 |
memory use increases by a factor of 2 and the time decreases from
|
|
Packit Bot |
06c835 |
9068 s to 230 s.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
The data structure is a hash table mapping each n-gram (here with n=4)
|
|
Packit Bot |
06c835 |
occurring in the message strings to the list of messages that contains
|
|
Packit Bot |
06c835 |
it. For example, if the message list is
|
|
Packit Bot |
06c835 |
[0] "close"
|
|
Packit Bot |
06c835 |
[1] "losers"
|
|
Packit Bot |
06c835 |
the index is a hash table mapping
|
|
Packit Bot |
06c835 |
"clos" -> { 0 }
|
|
Packit Bot |
06c835 |
"lose" -> { 0, 1 }
|
|
Packit Bot |
06c835 |
"oser" -> { 1 }
|
|
Packit Bot |
06c835 |
"sers" -> { 1 }
|
|
Packit Bot |
06c835 |
Searching the similar messages to, say, "closest", is done by looking up
|
|
Packit Bot |
06c835 |
all its 4-grams in the hash table and summing up the results:
|
|
Packit Bot |
06c835 |
"clos" -> { 0 }
|
|
Packit Bot |
06c835 |
"lose" -> { 0, 1 }
|
|
Packit Bot |
06c835 |
"oses" -> {}
|
|
Packit Bot |
06c835 |
"sest" -> {}
|
|
Packit Bot |
06c835 |
=> total: { 2x0, 1x1 }
|
|
Packit Bot |
06c835 |
The list is sorted according to decreasing frequency: { 0, 1, ... }
|
|
Packit Bot |
06c835 |
and only the first few messages from this frequency list are passed to
|
|
Packit Bot |
06c835 |
fstrcmp().
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
The n-gram similarity and the fstrcmp() similarity are quite different
|
|
Packit Bot |
06c835 |
metrics. For example, "close" and "c l o s e" have no n-grams in common
|
|
Packit Bot |
06c835 |
(even for n as small as n = 2); however, fstrcmp() will find that they
|
|
Packit Bot |
06c835 |
are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
|
|
Packit Bot |
06c835 |
"ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
|
|
Packit Bot |
06c835 |
only 0.02. Also, repeated n-grams don't have the same effect on the
|
|
Packit Bot |
06c835 |
two similarity measures. But all this doesn't matter much in practice.
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
|
|
Packit Bot |
06c835 |
lists are likely too long. (Well, with ideographic languages like Chinese,
|
|
Packit Bot |
06c835 |
n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
|
|
Packit Bot |
06c835 |
either.)
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
The units are characters in the current encoding. Not just bytes,
|
|
Packit Bot |
06c835 |
because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Each message is represented by its index in the message list. */
|
|
Packit Bot |
06c835 |
typedef unsigned int index_ty;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* An index list has its allocated size and length tacked at the front.
|
|
Packit Bot |
06c835 |
The indices are sorted in ascending order. */
|
|
Packit Bot |
06c835 |
typedef index_ty *index_list_ty;
|
|
Packit Bot |
06c835 |
#define IL_ALLOCATED 0
|
|
Packit Bot |
06c835 |
#define IL_LENGTH 1
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Create a new index list containing only a given index. */
|
|
Packit Bot |
06c835 |
static inline index_list_ty
|
|
Packit Bot |
06c835 |
new_index (index_ty idx)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_ty *list = XNMALLOC (2 + 1, index_ty);
|
|
Packit Bot |
06c835 |
list[IL_ALLOCATED] = 1;
|
|
Packit Bot |
06c835 |
list[IL_LENGTH] = 1;
|
|
Packit Bot |
06c835 |
list[2] = idx;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
return list;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Add a given index, greater or equal than all previous indices, to an
|
|
Packit Bot |
06c835 |
index list.
|
|
Packit Bot |
06c835 |
Return the new index list, if it had to be reallocated, or NULL if it
|
|
Packit Bot |
06c835 |
didn't change. */
|
|
Packit Bot |
06c835 |
static inline index_list_ty
|
|
Packit Bot |
06c835 |
addlast_index (index_list_ty list, index_ty idx)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_list_ty result;
|
|
Packit Bot |
06c835 |
size_t length = list[IL_LENGTH];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Look whether it should be inserted. */
|
|
Packit Bot |
06c835 |
if (length > 0 && list[2 + (length - 1)] == idx)
|
|
Packit Bot |
06c835 |
return NULL;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Now make room for one more list element. */
|
|
Packit Bot |
06c835 |
result = NULL;
|
|
Packit Bot |
06c835 |
if (length == list[IL_ALLOCATED])
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t new_allocated = 2 * length - (length >> 6);
|
|
Packit Bot |
06c835 |
list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
|
|
Packit Bot |
06c835 |
list[IL_ALLOCATED] = new_allocated;
|
|
Packit Bot |
06c835 |
result = list;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
list[2 + length] = idx;
|
|
Packit Bot |
06c835 |
list[IL_LENGTH] = length + 1;
|
|
Packit Bot |
06c835 |
return result;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Add a given index to an index list.
|
|
Packit Bot |
06c835 |
Return the new index list, if it had to be reallocated, or NULL if it
|
|
Packit Bot |
06c835 |
didn't change. */
|
|
Packit Bot |
06c835 |
static inline index_list_ty
|
|
Packit Bot |
06c835 |
add_index (index_list_ty list, index_ty idx)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_list_ty result;
|
|
Packit Bot |
06c835 |
size_t length = list[IL_LENGTH];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Look where it should be inserted. */
|
|
Packit Bot |
06c835 |
size_t lo = 0;
|
|
Packit Bot |
06c835 |
size_t hi = length;
|
|
Packit Bot |
06c835 |
while (lo < hi)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
/* Here we know that idx must be inserted at a position >= lo, <= hi. */
|
|
Packit Bot |
06c835 |
size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
|
|
Packit Bot |
06c835 |
index_ty val = list[2 + mid];
|
|
Packit Bot |
06c835 |
if (val < idx)
|
|
Packit Bot |
06c835 |
lo = mid + 1;
|
|
Packit Bot |
06c835 |
else if (val > idx)
|
|
Packit Bot |
06c835 |
hi = mid;
|
|
Packit Bot |
06c835 |
else
|
|
Packit Bot |
06c835 |
return NULL;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Now make room for one more list element. */
|
|
Packit Bot |
06c835 |
result = NULL;
|
|
Packit Bot |
06c835 |
if (length == list[IL_ALLOCATED])
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t new_allocated = 2 * length - (length >> 6);
|
|
Packit Bot |
06c835 |
list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
|
|
Packit Bot |
06c835 |
list[IL_ALLOCATED] = new_allocated;
|
|
Packit Bot |
06c835 |
result = list;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
list[IL_LENGTH] = length + 1;
|
|
Packit Bot |
06c835 |
for (; length > hi; length--)
|
|
Packit Bot |
06c835 |
list[2 + length] = list[1 + length];
|
|
Packit Bot |
06c835 |
list[2 + length] = idx;
|
|
Packit Bot |
06c835 |
return result;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* We use 4-grams, therefore strings with less than 4 characters cannot be
|
|
Packit Bot |
06c835 |
handled through the 4-grams table and need to be handled specially.
|
|
Packit Bot |
06c835 |
Since every character occupies at most 4 bytes (see po-charset.c),
|
|
Packit Bot |
06c835 |
this means the size of such short strings is bounded by: */
|
|
Packit Bot |
06c835 |
#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
|
|
Packit Bot |
06c835 |
#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Such short strings are handled by direct comparison with all messages
|
|
Packit Bot |
06c835 |
of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
|
|
Packit Bot |
06c835 |
fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
|
|
Packit Bot |
06c835 |
fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
|
|
Packit Bot |
06c835 |
limit the search to lengths l' in the range
|
|
Packit Bot |
06c835 |
l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
|
|
Packit Bot |
06c835 |
Thus we need the list of the short strings up to length: */
|
|
Packit Bot |
06c835 |
#if !defined __SUNPRO_C
|
|
Packit Bot |
06c835 |
# define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
|
|
Packit Bot |
06c835 |
#else
|
|
Packit Bot |
06c835 |
/* Sun C on Solaris 8 cannot compute this constant expression. */
|
|
Packit Bot |
06c835 |
# define SHORT_MSG_MAX 28
|
|
Packit Bot |
06c835 |
#endif
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* A fuzzy index contains a hash table mapping all n-grams to their
|
|
Packit Bot |
06c835 |
occurrences list. */
|
|
Packit Bot |
06c835 |
struct message_fuzzy_index_ty
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_ty **messages;
|
|
Packit Bot |
06c835 |
character_iterator_t iterator;
|
|
Packit Bot |
06c835 |
hash_table gram4;
|
|
Packit Bot |
06c835 |
size_t firstfew;
|
|
Packit Bot |
06c835 |
message_list_ty **short_messages;
|
|
Packit Bot |
06c835 |
};
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Allocate a fuzzy index corresponding to a given list of messages.
|
|
Packit Bot |
06c835 |
The list of messages and the msgctxt and msgid fields of the messages
|
|
Packit Bot |
06c835 |
inside it must not be modified while the returned fuzzy index is in use. */
|
|
Packit Bot |
06c835 |
message_fuzzy_index_ty *
|
|
Packit Bot |
06c835 |
message_fuzzy_index_alloc (const message_list_ty *mlp,
|
|
Packit Bot |
06c835 |
const char *canon_charset)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
|
|
Packit Bot |
06c835 |
size_t count = mlp->nitems;
|
|
Packit Bot |
06c835 |
size_t j;
|
|
Packit Bot |
06c835 |
size_t l;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
findex->messages = mlp->item;
|
|
Packit Bot |
06c835 |
findex->iterator = po_charset_character_iterator (canon_charset);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Setup hash table. */
|
|
Packit Bot |
06c835 |
if (hash_init (&findex->gram4, 10 * count) < 0)
|
|
Packit Bot |
06c835 |
xalloc_die ();
|
|
Packit Bot |
06c835 |
for (j = 0; j < count; j++)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_ty *mp = mlp->item[j];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *str = mp->msgid;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
|
|
Packit Bot |
06c835 |
const char *p0 = str;
|
|
Packit Bot |
06c835 |
if (*p0 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p1 = p0 + findex->iterator (p0);
|
|
Packit Bot |
06c835 |
if (*p1 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p2 = p1 + findex->iterator (p1);
|
|
Packit Bot |
06c835 |
if (*p2 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p3 = p2 + findex->iterator (p2);
|
|
Packit Bot |
06c835 |
if (*p3 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p4 = p3 + findex->iterator (p3);
|
|
Packit Bot |
06c835 |
for (;;)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
/* The segment from p0 to p4 is a 4-gram of
|
|
Packit Bot |
06c835 |
characters. Add a hash table entry that maps
|
|
Packit Bot |
06c835 |
it to the index j, or extend the existing
|
|
Packit Bot |
06c835 |
hash table entry accordingly. */
|
|
Packit Bot |
06c835 |
void *found;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (hash_find_entry (&findex->gram4, p0, p4 - p0,
|
|
Packit Bot |
06c835 |
&found) == 0)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_list_ty list = (index_list_ty) found;
|
|
Packit Bot |
06c835 |
list = addlast_index (list, j);
|
|
Packit Bot |
06c835 |
if (list != NULL)
|
|
Packit Bot |
06c835 |
hash_set_value (&findex->gram4, p0, p4 - p0,
|
|
Packit Bot |
06c835 |
list);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
else
|
|
Packit Bot |
06c835 |
hash_insert_entry (&findex->gram4, p0, p4 - p0,
|
|
Packit Bot |
06c835 |
new_index (j));
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Advance. */
|
|
Packit Bot |
06c835 |
if (*p4 == '\0')
|
|
Packit Bot |
06c835 |
break;
|
|
Packit Bot |
06c835 |
p0 = p1;
|
|
Packit Bot |
06c835 |
p1 = p2;
|
|
Packit Bot |
06c835 |
p2 = p3;
|
|
Packit Bot |
06c835 |
p3 = p4;
|
|
Packit Bot |
06c835 |
p4 = p4 + findex->iterator (p4);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Shrink memory used by the hash table. */
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
void *iter;
|
|
Packit Bot |
06c835 |
const void *key;
|
|
Packit Bot |
06c835 |
size_t keylen;
|
|
Packit Bot |
06c835 |
void **valuep;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
iter = NULL;
|
|
Packit Bot |
06c835 |
while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
|
|
Packit Bot |
06c835 |
== 0)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_list_ty list = (index_list_ty) *valuep;
|
|
Packit Bot |
06c835 |
index_ty length = list[IL_LENGTH];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (length < list[IL_ALLOCATED])
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
list[IL_ALLOCATED] = length;
|
|
Packit Bot |
06c835 |
*valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
findex->firstfew = (int) sqrt ((double) count);
|
|
Packit Bot |
06c835 |
if (findex->firstfew < 10)
|
|
Packit Bot |
06c835 |
findex->firstfew = 10;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Setup lists of short messages. */
|
|
Packit Bot |
06c835 |
findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
|
|
Packit Bot |
06c835 |
for (l = 0; l <= SHORT_MSG_MAX; l++)
|
|
Packit Bot |
06c835 |
findex->short_messages[l] = message_list_alloc (false);
|
|
Packit Bot |
06c835 |
for (j = 0; j < count; j++)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_ty *mp = mlp->item[j];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *str = mp->msgid;
|
|
Packit Bot |
06c835 |
size_t len = strlen (str);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (len <= SHORT_MSG_MAX)
|
|
Packit Bot |
06c835 |
message_list_append (findex->short_messages[len], mp);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Shrink memory used by the lists of short messages. */
|
|
Packit Bot |
06c835 |
for (l = 0; l <= SHORT_MSG_MAX; l++)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_list_ty *mlp = findex->short_messages[l];
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (mlp->nitems < mlp->nitems_max)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
mlp->nitems_max = mlp->nitems;
|
|
Packit Bot |
06c835 |
mlp->item =
|
|
Packit Bot |
06c835 |
(message_ty **)
|
|
Packit Bot |
06c835 |
xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
return findex;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* An index with multiplicity. */
|
|
Packit Bot |
06c835 |
struct mult_index
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_ty index;
|
|
Packit Bot |
06c835 |
unsigned int count;
|
|
Packit Bot |
06c835 |
};
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* A list of indices with multiplicity.
|
|
Packit Bot |
06c835 |
The indices are sorted in ascending order. */
|
|
Packit Bot |
06c835 |
struct mult_index_list
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
struct mult_index *item;
|
|
Packit Bot |
06c835 |
size_t nitems;
|
|
Packit Bot |
06c835 |
size_t nitems_max;
|
|
Packit Bot |
06c835 |
/* Work area. */
|
|
Packit Bot |
06c835 |
struct mult_index *item2;
|
|
Packit Bot |
06c835 |
size_t nitems2_max;
|
|
Packit Bot |
06c835 |
};
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Initialize an empty list of indices with multiplicity. */
|
|
Packit Bot |
06c835 |
static inline void
|
|
Packit Bot |
06c835 |
mult_index_list_init (struct mult_index_list *accu)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
accu->nitems = 0;
|
|
Packit Bot |
06c835 |
accu->nitems_max = 0;
|
|
Packit Bot |
06c835 |
accu->item = NULL;
|
|
Packit Bot |
06c835 |
accu->nitems2_max = 0;
|
|
Packit Bot |
06c835 |
accu->item2 = NULL;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Add an index list to a list of indices with multiplicity. */
|
|
Packit Bot |
06c835 |
static inline void
|
|
Packit Bot |
06c835 |
mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t len1 = accu->nitems;
|
|
Packit Bot |
06c835 |
size_t len2 = list[IL_LENGTH];
|
|
Packit Bot |
06c835 |
size_t need = len1 + len2;
|
|
Packit Bot |
06c835 |
struct mult_index *ptr1;
|
|
Packit Bot |
06c835 |
struct mult_index *ptr1_end;
|
|
Packit Bot |
06c835 |
index_ty *ptr2;
|
|
Packit Bot |
06c835 |
index_ty *ptr2_end;
|
|
Packit Bot |
06c835 |
struct mult_index *destptr;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Make the work area large enough. */
|
|
Packit Bot |
06c835 |
if (accu->nitems2_max < need)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t new_max = 2 * accu->nitems2_max + 1;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (new_max < need)
|
|
Packit Bot |
06c835 |
new_max = need;
|
|
Packit Bot |
06c835 |
if (accu->item2 != NULL)
|
|
Packit Bot |
06c835 |
free (accu->item2);
|
|
Packit Bot |
06c835 |
accu->item2 = XNMALLOC (new_max, struct mult_index);
|
|
Packit Bot |
06c835 |
accu->nitems2_max = new_max;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Make a linear pass through accu and list simultaneously. */
|
|
Packit Bot |
06c835 |
ptr1 = accu->item;
|
|
Packit Bot |
06c835 |
ptr1_end = ptr1 + len1;
|
|
Packit Bot |
06c835 |
ptr2 = list + 2;
|
|
Packit Bot |
06c835 |
ptr2_end = ptr2 + len2;
|
|
Packit Bot |
06c835 |
destptr = accu->item2;
|
|
Packit Bot |
06c835 |
while (ptr1 < ptr1_end && ptr2 < ptr2_end)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
if (ptr1->index < *ptr2)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
*destptr = *ptr1;
|
|
Packit Bot |
06c835 |
ptr1++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
else if (ptr1->index > *ptr2)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
destptr->index = *ptr2;
|
|
Packit Bot |
06c835 |
destptr->count = 1;
|
|
Packit Bot |
06c835 |
ptr2++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
else /* ptr1->index == list[2 + i2] */
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
destptr->index = ptr1->index;
|
|
Packit Bot |
06c835 |
destptr->count = ptr1->count + 1;
|
|
Packit Bot |
06c835 |
ptr1++;
|
|
Packit Bot |
06c835 |
ptr2++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
destptr++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
while (ptr1 < ptr1_end)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
*destptr = *ptr1;
|
|
Packit Bot |
06c835 |
ptr1++;
|
|
Packit Bot |
06c835 |
destptr++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
while (ptr2 < ptr2_end)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
destptr->index = *ptr2;
|
|
Packit Bot |
06c835 |
destptr->count = 1;
|
|
Packit Bot |
06c835 |
ptr2++;
|
|
Packit Bot |
06c835 |
destptr++;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Swap accu->item and accu->item2. */
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
struct mult_index *dest = accu->item2;
|
|
Packit Bot |
06c835 |
size_t dest_max = accu->nitems2_max;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
accu->item2 = accu->item;
|
|
Packit Bot |
06c835 |
accu->nitems2_max = accu->nitems_max;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
accu->item = dest;
|
|
Packit Bot |
06c835 |
accu->nitems = destptr - dest;
|
|
Packit Bot |
06c835 |
accu->nitems_max = dest_max;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Compares two indices with multiplicity, according to their multiplicity. */
|
|
Packit Bot |
06c835 |
static int
|
|
Packit Bot |
06c835 |
mult_index_compare (const void *p1, const void *p2)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const struct mult_index *ptr1 = (const struct mult_index *) p1;
|
|
Packit Bot |
06c835 |
const struct mult_index *ptr2 = (const struct mult_index *) p2;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (ptr1->count < ptr2->count)
|
|
Packit Bot |
06c835 |
return 1;
|
|
Packit Bot |
06c835 |
if (ptr1->count > ptr2->count)
|
|
Packit Bot |
06c835 |
return -1;
|
|
Packit Bot |
06c835 |
/* For reproduceable results, also take into account the indices. */
|
|
Packit Bot |
06c835 |
if (ptr1->index > ptr2->index)
|
|
Packit Bot |
06c835 |
return 1;
|
|
Packit Bot |
06c835 |
if (ptr1->index < ptr2->index)
|
|
Packit Bot |
06c835 |
return -1;
|
|
Packit Bot |
06c835 |
return 0;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Sort a list of indices with multiplicity according to decreasing
|
|
Packit Bot |
06c835 |
multiplicity. */
|
|
Packit Bot |
06c835 |
static inline void
|
|
Packit Bot |
06c835 |
mult_index_list_sort (struct mult_index_list *accu)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
if (accu->nitems > 1)
|
|
Packit Bot |
06c835 |
qsort (accu->item, accu->nitems, sizeof (struct mult_index),
|
|
Packit Bot |
06c835 |
mult_index_compare);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Frees a list of indices with multiplicity. */
|
|
Packit Bot |
06c835 |
static inline void
|
|
Packit Bot |
06c835 |
mult_index_list_free (struct mult_index_list *accu)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
if (accu->item != NULL)
|
|
Packit Bot |
06c835 |
free (accu->item);
|
|
Packit Bot |
06c835 |
if (accu->item2 != NULL)
|
|
Packit Bot |
06c835 |
free (accu->item2);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
|
|
Packit Bot |
06c835 |
The match does not need to be optimal.
|
|
Packit Bot |
06c835 |
Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
|
|
Packit Bot |
06c835 |
LOWER_BOUND must be >= FUZZY_THRESHOLD.
|
|
Packit Bot |
06c835 |
If HEURISTIC is true, only the few best messages among the list - according
|
|
Packit Bot |
06c835 |
to a certain heuristic - are considered. If HEURISTIC is false, all
|
|
Packit Bot |
06c835 |
messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
|
|
Packit Bot |
06c835 |
like in message_list_search_fuzzy (except that in ambiguous cases where
|
|
Packit Bot |
06c835 |
several best matches exist, message_list_search_fuzzy chooses the one with
|
|
Packit Bot |
06c835 |
the smallest index whereas message_fuzzy_index_search makes a better
|
|
Packit Bot |
06c835 |
choice). */
|
|
Packit Bot |
06c835 |
message_ty *
|
|
Packit Bot |
06c835 |
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
|
|
Packit Bot |
06c835 |
const char *msgctxt, const char *msgid,
|
|
Packit Bot |
06c835 |
double lower_bound,
|
|
Packit Bot |
06c835 |
bool heuristic)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *str = msgid;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
|
|
Packit Bot |
06c835 |
const char *p0 = str;
|
|
Packit Bot |
06c835 |
if (*p0 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p1 = p0 + findex->iterator (p0);
|
|
Packit Bot |
06c835 |
if (*p1 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p2 = p1 + findex->iterator (p1);
|
|
Packit Bot |
06c835 |
if (*p2 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p3 = p2 + findex->iterator (p2);
|
|
Packit Bot |
06c835 |
if (*p3 != '\0')
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
const char *p4 = p3 + findex->iterator (p3);
|
|
Packit Bot |
06c835 |
struct mult_index_list accu;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
mult_index_list_init (&accu);
|
|
Packit Bot |
06c835 |
for (;;)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
/* The segment from p0 to p4 is a 4-gram of
|
|
Packit Bot |
06c835 |
characters. Get the hash table entry containing
|
|
Packit Bot |
06c835 |
a list of indices, and add it to the accu. */
|
|
Packit Bot |
06c835 |
void *found;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (hash_find_entry (&findex->gram4, p0, p4 - p0,
|
|
Packit Bot |
06c835 |
&found) == 0)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
index_list_ty list = (index_list_ty) found;
|
|
Packit Bot |
06c835 |
mult_index_list_accumulate (&accu, list);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Advance. */
|
|
Packit Bot |
06c835 |
if (*p4 == '\0')
|
|
Packit Bot |
06c835 |
break;
|
|
Packit Bot |
06c835 |
p0 = p1;
|
|
Packit Bot |
06c835 |
p1 = p2;
|
|
Packit Bot |
06c835 |
p2 = p3;
|
|
Packit Bot |
06c835 |
p3 = p4;
|
|
Packit Bot |
06c835 |
p4 = p4 + findex->iterator (p4);
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Sort in decreasing count order. */
|
|
Packit Bot |
06c835 |
mult_index_list_sort (&accu);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Iterate over this sorted list, and maximize the
|
|
Packit Bot |
06c835 |
fuzzy_search_goal_function() result.
|
|
Packit Bot |
06c835 |
If HEURISTIC is true, take only the first few messages.
|
|
Packit Bot |
06c835 |
If HEURISTIC is false, consider all messages - to match
|
|
Packit Bot |
06c835 |
the behaviour of message_list_search_fuzzy -, but process
|
|
Packit Bot |
06c835 |
them in the order of the sorted list. This increases
|
|
Packit Bot |
06c835 |
the chances that the later calls to fstrcmp_bounded() (via
|
|
Packit Bot |
06c835 |
fuzzy_search_goal_function()) terminate quickly, thanks
|
|
Packit Bot |
06c835 |
to the best_weight which will be quite high already after
|
|
Packit Bot |
06c835 |
the first few messages. */
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t count;
|
|
Packit Bot |
06c835 |
struct mult_index *ptr;
|
|
Packit Bot |
06c835 |
message_ty *best_mp;
|
|
Packit Bot |
06c835 |
double best_weight;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
count = accu.nitems;
|
|
Packit Bot |
06c835 |
if (heuristic)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
if (count > findex->firstfew)
|
|
Packit Bot |
06c835 |
count = findex->firstfew;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
best_weight = lower_bound;
|
|
Packit Bot |
06c835 |
best_mp = NULL;
|
|
Packit Bot |
06c835 |
for (ptr = accu.item; count > 0; ptr++, count--)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_ty *mp = findex->messages[ptr->index];
|
|
Packit Bot |
06c835 |
double weight =
|
|
Packit Bot |
06c835 |
fuzzy_search_goal_function (mp, msgctxt, msgid,
|
|
Packit Bot |
06c835 |
best_weight);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (weight > best_weight)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
best_weight = weight;
|
|
Packit Bot |
06c835 |
best_mp = mp;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
mult_index_list_free (&accu);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
return best_mp;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* The string had less than 4 characters. */
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t l = strlen (str);
|
|
Packit Bot |
06c835 |
size_t lmin, lmax;
|
|
Packit Bot |
06c835 |
message_ty *best_mp;
|
|
Packit Bot |
06c835 |
double best_weight;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (!(l <= SHORT_STRING_MAX_BYTES))
|
|
Packit Bot |
06c835 |
abort ();
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Walk through those short messages which have an appropriate length.
|
|
Packit Bot |
06c835 |
See the comment before SHORT_MSG_MAX. */
|
|
Packit Bot |
06c835 |
lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
|
|
Packit Bot |
06c835 |
lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
|
|
Packit Bot |
06c835 |
if (!(lmax <= SHORT_MSG_MAX))
|
|
Packit Bot |
06c835 |
abort ();
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
best_weight = lower_bound;
|
|
Packit Bot |
06c835 |
best_mp = NULL;
|
|
Packit Bot |
06c835 |
for (l = lmin; l <= lmax; l++)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_list_ty *mlp = findex->short_messages[l];
|
|
Packit Bot |
06c835 |
size_t j;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
for (j = 0; j < mlp->nitems; j++)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
message_ty *mp = mlp->item[j];
|
|
Packit Bot |
06c835 |
double weight =
|
|
Packit Bot |
06c835 |
fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
if (weight > best_weight)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
best_weight = weight;
|
|
Packit Bot |
06c835 |
best_mp = mp;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
return best_mp;
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
}
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Free a fuzzy index. */
|
|
Packit Bot |
06c835 |
void
|
|
Packit Bot |
06c835 |
message_fuzzy_index_free (message_fuzzy_index_ty *findex)
|
|
Packit Bot |
06c835 |
{
|
|
Packit Bot |
06c835 |
size_t l;
|
|
Packit Bot |
06c835 |
void *iter;
|
|
Packit Bot |
06c835 |
const void *key;
|
|
Packit Bot |
06c835 |
size_t keylen;
|
|
Packit Bot |
06c835 |
void *data;
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Free the short lists. */
|
|
Packit Bot |
06c835 |
for (l = 0; l <= SHORT_MSG_MAX; l++)
|
|
Packit Bot |
06c835 |
message_list_free (findex->short_messages[l], 1);
|
|
Packit Bot |
06c835 |
free (findex->short_messages);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
/* Free the index lists occurring as values in the hash tables. */
|
|
Packit Bot |
06c835 |
iter = NULL;
|
|
Packit Bot |
06c835 |
while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
|
|
Packit Bot |
06c835 |
free ((index_list_ty *) data);
|
|
Packit Bot |
06c835 |
/* Free the hash table itself. */
|
|
Packit Bot |
06c835 |
hash_destroy (&findex->gram4);
|
|
Packit Bot |
06c835 |
|
|
Packit Bot |
06c835 |
free (findex);
|
|
Packit Bot |
06c835 |
}
|