|
Packit |
c5a612 |
#include <stdlib.h>
|
|
Packit |
c5a612 |
#include <string.h>
|
|
Packit |
c5a612 |
#include <limits.h>
|
|
Packit |
c5a612 |
#include <utils.h>
|
|
Packit |
c5a612 |
#include <misspell.h>
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
enum string_distance_function {
|
|
Packit |
c5a612 |
DELETION = 0, /* m1 */
|
|
Packit |
c5a612 |
INSERTION, /* m2 */
|
|
Packit |
c5a612 |
TRANSFORMATION, /* m3 */
|
|
Packit |
c5a612 |
};
|
|
Packit |
c5a612 |
#define DISTANCE_MAX (TRANSFORMATION + 1)
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
static unsigned int min_distance(unsigned int *cost)
|
|
Packit |
c5a612 |
{
|
|
Packit |
c5a612 |
unsigned int min = UINT_MAX;
|
|
Packit |
c5a612 |
int k;
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
for (k = 0; k < DISTANCE_MAX; k++) {
|
|
Packit |
c5a612 |
if (cost[k] < min)
|
|
Packit |
c5a612 |
min = cost[k];
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
return min;
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
/* A simple implementation of "The string-to-string correction problem (1974)"
|
|
Packit |
c5a612 |
* by Robert A. Wagner.
|
|
Packit |
c5a612 |
*/
|
|
Packit |
c5a612 |
static unsigned int string_distance(const char *a, const char *b)
|
|
Packit |
c5a612 |
{
|
|
Packit |
c5a612 |
unsigned int len_a = strlen(a);
|
|
Packit |
c5a612 |
unsigned int len_b = strlen(b);
|
|
Packit |
c5a612 |
unsigned int *distance;
|
|
Packit |
c5a612 |
unsigned int i, j, ret;
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
distance = xzalloc((len_a + 1) * (len_b + 1) * sizeof(unsigned int));
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
#define DISTANCE(__i, __j) distance[(__i) * len_b + (__j)]
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
for (i = 0; i <= len_a; i++)
|
|
Packit |
c5a612 |
DISTANCE(i, 0) = i;
|
|
Packit |
c5a612 |
for (j = 0; j <= len_b; j++)
|
|
Packit |
c5a612 |
DISTANCE(0, j) = j;
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
for (i = 1; i <= len_a; i++) {
|
|
Packit |
c5a612 |
for (j = 1; j <= len_b; j++) {
|
|
Packit |
c5a612 |
unsigned int subcost = (a[i] == b[j]) ? 0 : 1;
|
|
Packit |
c5a612 |
unsigned int cost[3];
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
cost[DELETION] = DISTANCE(i - 1, j) + 1;
|
|
Packit |
c5a612 |
cost[INSERTION] = DISTANCE(i, j - 1) + 1;
|
|
Packit |
c5a612 |
cost[TRANSFORMATION] = DISTANCE(i - 1, j - 1) + subcost;
|
|
Packit |
c5a612 |
DISTANCE(i, j) = min_distance(cost);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
if (i > 1 && j > 1 &&
|
|
Packit |
c5a612 |
a[i] == b[j - 1] &&
|
|
Packit |
c5a612 |
a[i - 1] == b[j])
|
|
Packit |
c5a612 |
DISTANCE(i, j) =
|
|
Packit |
c5a612 |
min(DISTANCE(i, j),
|
|
Packit |
c5a612 |
DISTANCE(i - 2, j - 2) + subcost);
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
ret = DISTANCE(len_a, len_b);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
xfree(distance);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
return ret;
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
void string_misspell_init(struct string_misspell_state *st)
|
|
Packit |
c5a612 |
{
|
|
Packit |
c5a612 |
st->obj = NULL;
|
|
Packit |
c5a612 |
st->min_distance = UINT_MAX;
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
int string_misspell_update(const char *a, const char *b,
|
|
Packit |
c5a612 |
void *obj, struct string_misspell_state *st)
|
|
Packit |
c5a612 |
{
|
|
Packit |
c5a612 |
unsigned int len_a, len_b, max_len, min_len, distance, threshold;
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
len_a = strlen(a);
|
|
Packit |
c5a612 |
len_b = strlen(b);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
max_len = max(len_a, len_b);
|
|
Packit |
c5a612 |
min_len = min(len_a, len_b);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
if (max_len <= 1)
|
|
Packit |
c5a612 |
return 0;
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
if (max_len - min_len <= 1)
|
|
Packit |
c5a612 |
threshold = max(div_round_up(max_len, 3), 1);
|
|
Packit |
c5a612 |
else
|
|
Packit |
c5a612 |
threshold = div_round_up(max_len + 2, 3);
|
|
Packit |
c5a612 |
|
|
Packit |
c5a612 |
distance = string_distance(a, b);
|
|
Packit |
c5a612 |
if (distance > threshold)
|
|
Packit |
c5a612 |
return 0;
|
|
Packit |
c5a612 |
else if (distance < st->min_distance) {
|
|
Packit |
c5a612 |
st->min_distance = distance;
|
|
Packit |
c5a612 |
st->obj = obj;
|
|
Packit |
c5a612 |
return 1;
|
|
Packit |
c5a612 |
}
|
|
Packit |
c5a612 |
return 0;
|
|
Packit |
c5a612 |
}
|