/* enchant
* Copyright (C) 2003, 2004 Dom Lachowicz
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
* In addition, as a special exception, Dom Lachowicz
* gives permission to link the code of this program with
* non-LGPL Spelling Provider libraries (eg: a MSFT Office
* spell checker backend) and distribute linked combinations including
* the two. You must obey the GNU Lesser General Public License in all
* respects for all of the code used other than said providers. If you modify
* this file, you may extend this exception to your version of the
* file, but you are not obligated to do so. If you do not wish to
* do so, delete this exception statement from your version.
*/
/**
*
* This file implements personal word list (PWL) dictionaries in the
* type EnchantPWL.
*
* Under the hood, a PWL is stored as a Trie. Checking strings for
* correctness and making suggestions is done by traversing the Trie
* while allowing a certain number of miss-steps. Due to the prefix
* compression of the Trie, this allows all strings in the PWL within
* a given edit distance of the target word to be enumerated quite
* efficiently.
*
* Ideas for the future:
*
* - Order the suggestions first by edit distance, then by
* soundex key, to put the most similar sounding suggestions
* at the front of the list. Would need a "soundex" that is
* general enough to handle languages other than English.
*
* - iterative deepening to find suggestions, rather than a straight
* search to depth three.
*
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <fcntl.h>
#include <glib.h>
#include <glib/gstdio.h>
#include "enchant-provider.h"
#include "unused-parameter.h"
#include "pwl.h"
#define ENCHANT_PWL_MAX_ERRORS 3
#define ENCHANT_PWL_MAX_SUGGS 15
static const gunichar BOM = 0xfeff;
/* A PWL dictionary is stored as a Trie-like data structure EnchantTrie.
* The EnchantTrie datatype is completely recursive - all child nodes
* are simply EnchantTrie pointers. This means that all functions
* that potentially modify a trie need to return the modified trie,
* as additional memory may have been allocated.
*
* The empty trie is simply the null pointer. If a trie contains
* a single string, it is recorded in the "value" attribute and
* "subtries" is set to NULL. When two or more strings are contained,
* "value" is NULL and "subtries" is a GHashTable mapping the first
* character of each string to the subtrie containing the remainder of
* that string.
*
* All strings stored in the Trie are assumed to be in UTF format.
* Branching is done on unicode characters, not individual bytes.
*/
typedef struct str_enchant_trie EnchantTrie;
struct str_enchant_trie
{
char* value; /* final string found under this node */
GHashTable* subtries; /* Maps characters to subtries */
};
struct str_enchant_pwl
{
EnchantTrie* trie;
char * filename;
time_t file_changed;
GHashTable *words_in_trie;
};
/* Special Trie node indicating the end of a string */
static EnchantTrie n_EOSTrie;
static EnchantTrie* EOSTrie = &n_EOSTrie;
/* mode for searching trie */
typedef enum enum_matcher_mode EnchantTrieMatcherMode;
enum enum_matcher_mode
{
case_sensitive,
case_insensitive
};
/* The EnchantTrieMatcher structure maintains the state necessary to
* search for matching strings within an EnchantTrie. It includes a
* callback function which will be called with each matching string
* as it is found. The arguments to this function are:
*
* - a freshly-allocated copy of the matching string, which must
* be freed by external code
* - the EnchantTrieMatcher object, giving the context of the match
* (e.g. number of errors)
*/
typedef struct str_enchant_trie_matcher EnchantTrieMatcher;
struct str_enchant_trie_matcher
{
int num_errors; /* Num errors encountered so far. */
int max_errors; /* Max errors before search should terminate */
char* word; /* Word being searched for */
ssize_t word_pos; /* Current position in the word */
char* path; /* Path taken through the trie so far */
ssize_t path_len; /* Length of allocated path string */
ssize_t path_pos; /* Current end pos of path string */
EnchantTrieMatcherMode mode;
void (*cbfunc)(char*,EnchantTrieMatcher*); /* callback func */
void* cbdata; /* Private data for use by callback func */
};
/* To allow the list of suggestions to be built up an item at a time,
* its state is maintained in an EnchantSuggList object.
*/
typedef struct str_enchant_sugg_list
{
char** suggs;
int* sugg_errs;
size_t n_suggs;
} EnchantSuggList;
/*
* Function Prototypes
*/
static void enchant_pwl_add_to_trie(EnchantPWL *pwl,
const char *const word, size_t len);
static void enchant_pwl_refresh_from_file(EnchantPWL* pwl);
static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher);
static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher);
static void enchant_trie_free(EnchantTrie* trie);
static void enchant_trie_free_cb(void*,void*,void*);
static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word);
static void enchant_trie_remove(EnchantTrie* trie,const char *const word);
static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher);
static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV);
static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, size_t len,
int maxerrs,
EnchantTrieMatcherMode mode,
void(*cbfunc)(char*,EnchantTrieMatcher*),
void* cbdata);
static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher);
static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars);
static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num);
static int edit_dist(const char* word1, const char* word2);
#define enchant_lock_file(f) flock (fileno (f), LOCK_EX)
#define enchant_unlock_file(f) flock (fileno (f), LOCK_UN)
/**
* enchant_pwl_init
*
* Returns: a new PWL object used to store/check/suggest words.
*/
EnchantPWL* enchant_pwl_init(void)
{
EnchantPWL *pwl = g_new0(EnchantPWL, 1);
pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
return pwl;
}
/**
* enchant_pwl_init_with_file
*
* Returns: a new PWL object used to store/check/suggest words
* or NULL if the file cannot be opened or created
*/
EnchantPWL* enchant_pwl_init_with_file(const char * file)
{
g_return_val_if_fail (file != NULL, NULL);
FILE* fd = g_fopen(file, "a+b");
if(fd == NULL)
return NULL;
fclose(fd);
EnchantPWL *pwl = enchant_pwl_init();
pwl->filename = g_strdup(file);
pwl->file_changed = 0;
enchant_pwl_refresh_from_file(pwl);
return pwl;
}
static void enchant_pwl_refresh_from_file(EnchantPWL* pwl)
{
GStatBuf stats;
if(!pwl->filename ||
g_stat(pwl->filename, &stats) != 0 || /* presumably I won't be able to open the file either */
pwl->file_changed == stats.st_mtime) /* nothing changed since last read */
return;
enchant_trie_free(pwl->trie);
pwl->trie = NULL;
g_hash_table_destroy (pwl->words_in_trie);
pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
FILE *f = g_fopen(pwl->filename, "rb");
if (!f)
return;
pwl->file_changed = stats.st_mtime;
enchant_lock_file (f);
char buffer[BUFSIZ + 1];
size_t line_number = 1;
for (; NULL != (fgets (buffer, sizeof (buffer), f)); ++line_number)
{
char *line = buffer;
if(line_number == 1 && BOM == g_utf8_get_char(line))
line = g_utf8_next_char(line);
size_t l = strlen(line)-1;
if (line[l]=='\n')
line[l] = '\0';
else if(!feof(f)) /* ignore lines longer than BUFSIZ. */
{
g_warning ("Line too long (ignored) in %s at line:%zu\n", pwl->filename, line_number);
while (NULL != (fgets (buffer, sizeof (buffer), f)))
{
if (line[strlen(buffer)-1]=='\n')
break;
}
continue;
}
if( line[0] && line[0] != '#')
{
if(g_utf8_validate(line, -1, NULL))
enchant_pwl_add_to_trie(pwl, line, strlen(line));
else
g_warning ("Bad UTF-8 sequence in %s at line:%zu\n", pwl->filename, line_number);
}
}
enchant_unlock_file (f);
fclose (f);
}
void enchant_pwl_free(EnchantPWL *pwl)
{
enchant_trie_free(pwl->trie);
g_free(pwl->filename);
g_hash_table_destroy (pwl->words_in_trie);
g_free(pwl);
}
static void enchant_pwl_add_to_trie(EnchantPWL *pwl,
const char *const word, size_t len)
{
char * normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD);
if(NULL != g_hash_table_lookup (pwl->words_in_trie, normalized_word)) {
g_free (normalized_word);
return;
}
g_hash_table_insert (pwl->words_in_trie, normalized_word, g_strndup(word,len));
pwl->trie = enchant_trie_insert(pwl->trie, normalized_word);
}
static void enchant_pwl_remove_from_trie(EnchantPWL *pwl,
const char *const word, size_t len)
{
char * normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD);
if( g_hash_table_remove (pwl->words_in_trie, normalized_word) )
{
enchant_trie_remove(pwl->trie, normalized_word);
if(pwl->trie && pwl->trie->subtries == NULL && pwl->trie->value == NULL) {
enchant_trie_free (pwl->trie);
pwl->trie = NULL; /* make trie empty if has no content */
}
}
g_free(normalized_word);
}
void enchant_pwl_add(EnchantPWL *pwl,
const char *const word, size_t len)
{
enchant_pwl_refresh_from_file(pwl);
enchant_pwl_add_to_trie(pwl, word, len);
if (pwl->filename != NULL)
{
FILE *f = g_fopen(pwl->filename, "a+b");
if (f)
{
/* Since this function does not signal I/O
errors, only use return values to avoid
doing things that seem futile. */
enchant_lock_file (f);
GStatBuf stats;
if(g_stat(pwl->filename, &stats)==0)
pwl->file_changed = stats.st_mtime;
/* Add a newline if the file doesn't end with one. */
if (fseek (f, -1, SEEK_END) == 0)
{
int c = getc (f);
fseek (f, 0L, SEEK_CUR); /* ISO C requires positioning between read and write. */
if (c != '\n')
putc ('\n', f);
}
if (fwrite (word, sizeof(char), len, f) == len)
{
putc ('\n', f);
}
enchant_unlock_file (f);
fclose (f);
}
}
}
void enchant_pwl_remove(EnchantPWL *pwl,
const char *const word, size_t len)
{
if(enchant_pwl_check(pwl, word, len) == 1)
return;
enchant_pwl_refresh_from_file(pwl);
enchant_pwl_remove_from_trie(pwl, word, len);
if (pwl->filename)
{
char * contents;
size_t length;
if(!g_file_get_contents(pwl->filename, &contents, &length, NULL))
return;
FILE *f = g_fopen(pwl->filename, "wb"); /*binary because g_file_get_contents reads binary*/
if (f)
{
enchant_lock_file (f);
char *key = g_strndup(word, len);
char *filestart;
if(BOM == g_utf8_get_char(contents))
{
filestart = g_utf8_next_char(contents);
fwrite (contents, sizeof(char), filestart-contents, f);
}
else
filestart = contents;
char *searchstart = filestart;
for(;;)
{
/*find word*/
char *needle = strstr(searchstart, key);
if(needle == NULL)
{
fwrite (searchstart, sizeof(char), length - (searchstart - contents), f);
break;
}
else
{
char* foundend = needle+len;
if((needle == filestart || contents[needle-contents-1] == '\n' || contents[needle-contents-1] == '\r') &&
(foundend == contents + length || *foundend == '\n' || *foundend == '\r'))
{
fwrite (searchstart, sizeof(char), needle - searchstart, f);
searchstart = foundend;
while (*searchstart == '\n' || *searchstart == '\r')
++searchstart;
}
else {
fwrite (searchstart, sizeof(char), needle - searchstart+1, f);
searchstart = needle+1;
}
}
}
g_free(key);
GStatBuf stats;
if(g_stat(pwl->filename, &stats)==0)
pwl->file_changed = stats.st_mtime;
enchant_unlock_file (f);
fclose (f);
}
g_free(contents);
}
}
static int enchant_pwl_contains(EnchantPWL *pwl, const char *const word, size_t len)
{
int count = 0;
EnchantTrieMatcher *matcher = enchant_trie_matcher_init(word, len, 0, case_sensitive, enchant_pwl_check_cb,
&count);
enchant_trie_find_matches(pwl->trie,matcher);
enchant_trie_matcher_free(matcher);
return count != 0;
}
static int enchant_is_all_caps(const char*const word, size_t len)
{
g_return_val_if_fail (word && *word, 0);
int hasCap = 0;
for (const char *it = word; it < word + len; it = g_utf8_next_char(it))
{
GUnicodeType type = g_unichar_type(g_utf8_get_char(it));
switch(type)
{
case G_UNICODE_UPPERCASE_LETTER:
hasCap = 1;
break;
case G_UNICODE_TITLECASE_LETTER:
case G_UNICODE_LOWERCASE_LETTER:
return 0;
case G_UNICODE_CONTROL:
case G_UNICODE_FORMAT:
case G_UNICODE_UNASSIGNED:
case G_UNICODE_PRIVATE_USE:
case G_UNICODE_SURROGATE:
case G_UNICODE_MODIFIER_LETTER:
case G_UNICODE_OTHER_LETTER:
case G_UNICODE_COMBINING_MARK:
case G_UNICODE_ENCLOSING_MARK:
case G_UNICODE_NON_SPACING_MARK:
case G_UNICODE_DECIMAL_NUMBER:
case G_UNICODE_LETTER_NUMBER:
case G_UNICODE_OTHER_NUMBER:
case G_UNICODE_CONNECT_PUNCTUATION:
case G_UNICODE_DASH_PUNCTUATION:
case G_UNICODE_CLOSE_PUNCTUATION:
case G_UNICODE_FINAL_PUNCTUATION:
case G_UNICODE_INITIAL_PUNCTUATION:
case G_UNICODE_OTHER_PUNCTUATION:
case G_UNICODE_OPEN_PUNCTUATION:
case G_UNICODE_CURRENCY_SYMBOL:
case G_UNICODE_MODIFIER_SYMBOL:
case G_UNICODE_MATH_SYMBOL:
case G_UNICODE_OTHER_SYMBOL:
case G_UNICODE_LINE_SEPARATOR:
case G_UNICODE_PARAGRAPH_SEPARATOR:
case G_UNICODE_SPACE_SEPARATOR:
default:
break;
}
}
return hasCap;
}
static _GL_ATTRIBUTE_PURE int enchant_is_title_case(const char * const word, size_t len)
{
g_return_val_if_fail (word && *word, 0);
gunichar ch = g_utf8_get_char(word);
GUnicodeType type = g_unichar_type(ch);
if ((type != G_UNICODE_UPPERCASE_LETTER && type != G_UNICODE_TITLECASE_LETTER) ||
ch != g_unichar_totitle(ch))
return 0;
for (const char* it = g_utf8_next_char(word); it < word + len; it = g_utf8_next_char(it))
{
type = g_unichar_type(g_utf8_get_char(it));
if (type == G_UNICODE_UPPERCASE_LETTER || type == G_UNICODE_TITLECASE_LETTER)
return 0;
}
return 1;
}
static gchar* enchant_utf8_strtitle(const gchar*str, gssize len)
{
gchar *upperStr = g_utf8_strup(str, len); /* for locale-sensitive casing */
gunichar title_case_char = g_unichar_totitle(g_utf8_get_char(upperStr));
gchar title_case_utf8[7];
gint utf8len = g_unichar_to_utf8(title_case_char, title_case_utf8);
title_case_utf8[utf8len] = '\0';
gchar *lowerTail = g_utf8_strdown(g_utf8_next_char(upperStr), -1);
gchar *result = g_strconcat(title_case_utf8, lowerTail, NULL);
g_free(upperStr);
g_free(lowerTail);
return result;
}
int enchant_pwl_check(EnchantPWL *pwl, const char *const word, size_t len)
{
enchant_pwl_refresh_from_file(pwl);
int exists = enchant_pwl_contains(pwl, word, len);
if(exists)
return 0;
int isAllCaps = 0;
if(enchant_is_title_case(word, len) || (isAllCaps = enchant_is_all_caps(word, len)))
{
char * lower_case_word = g_utf8_strdown(word, len);
exists = enchant_pwl_contains(pwl, lower_case_word, strlen(lower_case_word));
g_free(lower_case_word);
if(exists)
return 0;
if(isAllCaps)
{
char * title_case_word = enchant_utf8_strtitle(word, len);
exists = enchant_pwl_contains(pwl, title_case_word, strlen(title_case_word));
g_free(title_case_word);
if(exists)
return 0;
}
}
return 1; /* not found */
}
/* matcher callback when a match is found*/
static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher)
{
g_free(match);
(*((int*)(matcher->cbdata)))++;
}
static void enchant_pwl_case_and_denormalize_suggestions(EnchantPWL *pwl,
const char *const word, size_t len,
EnchantSuggList* suggs_list)
{
gchar* (*utf8_case_convert_function)(const gchar*str, gssize len) = NULL;
if(enchant_is_title_case(word, len))
utf8_case_convert_function = enchant_utf8_strtitle;
else if (enchant_is_all_caps(word, len))
utf8_case_convert_function = g_utf8_strup;
for (size_t i = 0; i < suggs_list->n_suggs; ++i)
{
gchar* suggestion = g_hash_table_lookup (pwl->words_in_trie, suggs_list->suggs[i]);
size_t suggestion_len = strlen(suggestion);
gchar* cased_suggestion;
if (utf8_case_convert_function && !enchant_is_all_caps(suggestion, suggestion_len))
cased_suggestion = utf8_case_convert_function(suggestion, suggestion_len);
else
cased_suggestion = g_strndup(suggestion, suggestion_len);
g_free(suggs_list->suggs[i]);
suggs_list->suggs[i] = cased_suggestion;
}
}
static int best_distance(char** suggs, const char *const word, size_t len)
{
char *normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD);
int best_dist = g_utf8_strlen(normalized_word, -1);
for (char **sugg_it = suggs; *sugg_it; ++sugg_it)
{
char* normalized_sugg = g_utf8_normalize (*sugg_it, -1, G_NORMALIZE_NFD);
int dist = edit_dist(normalized_word, normalized_sugg);
g_free(normalized_sugg);
best_dist = MIN (dist, best_dist);
}
g_free(normalized_word);
return best_dist;
}
/* gives the best set of suggestions from pwl that are at least as good as the
* given suggs (if suggs == NULL just best from pwl) */
char** enchant_pwl_suggest(EnchantPWL *pwl, const char *const word,
size_t len, char** suggs, size_t* out_n_suggs)
{
int max_dist = suggs ? best_distance(suggs, word, len) : ENCHANT_PWL_MAX_ERRORS;
max_dist = MIN (max_dist, ENCHANT_PWL_MAX_ERRORS);
enchant_pwl_refresh_from_file(pwl);
EnchantSuggList sugg_list;
sugg_list.suggs = g_new0(char*,ENCHANT_PWL_MAX_SUGGS+1);
sugg_list.sugg_errs = g_new0(int,ENCHANT_PWL_MAX_SUGGS);
sugg_list.n_suggs = 0;
EnchantTrieMatcher *matcher = enchant_trie_matcher_init(word, len, max_dist,
case_insensitive,
enchant_pwl_suggest_cb,
&sugg_list);
enchant_trie_find_matches(pwl->trie,matcher);
enchant_trie_matcher_free(matcher);
g_free(sugg_list.sugg_errs);
sugg_list.suggs[sugg_list.n_suggs] = NULL;
(*out_n_suggs) = sugg_list.n_suggs;
enchant_pwl_case_and_denormalize_suggestions(pwl, word, len, &sugg_list);
return sugg_list.suggs;
}
/* matcher callback when a match is found*/
static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher)
{
EnchantSuggList *sugg_list = (EnchantSuggList*)(matcher->cbdata);
/* only get best errors so adapt */
if(matcher->num_errors < matcher->max_errors)
matcher->max_errors = matcher->num_errors;
/* Find appropriate location in the array, if any */
/* In future, this could be done using binary search... */
size_t loc;
for(loc=0; loc < sugg_list->n_suggs; loc++) {
/* Better than an existing suggestion, so stop */
if(sugg_list->sugg_errs[loc] > matcher->num_errors) {
break;
}
/* Already in the list with better score, just return */
if(strcmp(match,sugg_list->suggs[loc])==0) {
g_free(match);
return;
}
}
/* If it's not going to fit, just throw it away */
if(loc >= ENCHANT_PWL_MAX_SUGGS) {
g_free(match);
return;
}
int changes = 1; /* num words added to list */
/* Remove all elements with worse score */
for(size_t i=loc; i < sugg_list->n_suggs; i++){
g_free(sugg_list->suggs[i]);
changes--;
}
sugg_list->suggs[loc] = match;
sugg_list->sugg_errs[loc] = matcher->num_errors;
sugg_list->n_suggs = sugg_list->n_suggs + changes;
}
static void enchant_trie_free(EnchantTrie* trie)
{
/* Don't try to free NULL or the EOSTrie pointer */
if(trie == NULL || trie == EOSTrie) {
return;
}
/* Because we have not set a destroy function for the hashtable
* (to make code cleaner below), we need to explicitly free all
* subtries with a callback function.
*/
if (trie->subtries != NULL) {
g_hash_table_foreach(trie->subtries,enchant_trie_free_cb,NULL);
g_hash_table_destroy(trie->subtries);
}
g_free(trie->value);
g_free(trie);
}
static void enchant_trie_free_cb(void* key _GL_UNUSED_PARAMETER,
void* value,
void* data _GL_UNUSED_PARAMETER)
{
enchant_trie_free((EnchantTrie*) value);
}
static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word)
{
if (trie == NULL) {
trie = g_new0(EnchantTrie, 1);
}
if (trie->value == NULL) {
if (trie->subtries == NULL) {
/* When single word, store in trie->value */
trie->value = g_strdup(word);
} else {
/* Store multiple words in subtries */
if (word[0] == '\0') {
/* Mark end-of-string with special node */
char *tmpWord = g_strdup("");
g_hash_table_insert(trie->subtries, tmpWord, EOSTrie);
} else {
ssize_t nxtCh = (ssize_t)(g_utf8_next_char(word)-word);
char *tmpWord = g_strndup(word,nxtCh);
EnchantTrie* subtrie = g_hash_table_lookup(trie->subtries, tmpWord);
subtrie = enchant_trie_insert(subtrie, word + nxtCh);
g_hash_table_insert(trie->subtries, tmpWord, subtrie);
}
}
} else {
/* Create new hash table for subtries, and reinsert */
trie->subtries = g_hash_table_new_full(g_str_hash,
g_str_equal, g_free, NULL);
char *tmpWord = trie->value;
trie->value = NULL;
enchant_trie_insert(trie, tmpWord);
enchant_trie_insert(trie, word);
g_free(tmpWord);
}
return trie;
}
static void enchant_trie_remove(EnchantTrie* trie,const char *const word)
{
if (trie == NULL)
return;
if (trie->value == NULL) {
if (trie->subtries != NULL) {
/* Store multiple words in subtries */
if (word[0] == '\0') {
/* End-of-string is marked with special node */
g_hash_table_remove(trie->subtries, "");
} else {
ssize_t nxtCh = (ssize_t)(g_utf8_next_char(word) - word);
char *tmpWord = g_strndup(word, nxtCh);
EnchantTrie *subtrie = g_hash_table_lookup(trie->subtries, tmpWord);
enchant_trie_remove(subtrie, word + nxtCh);
if(subtrie->subtries == NULL && subtrie->value == NULL) {
g_hash_table_remove(trie->subtries, tmpWord);
enchant_trie_free (subtrie);
}
g_free(tmpWord);
}
if(g_hash_table_size(trie->subtries) == 1)
{
GList* keys = g_hash_table_get_keys(trie->subtries);
char *key = (char*) keys->data;
EnchantTrie *subtrie = g_hash_table_lookup(trie->subtries, key);
/* only remove trie nodes that have values by propagating these up */
if(subtrie->value)
{
trie->value = g_strconcat(key, subtrie->value, NULL);
enchant_trie_free(subtrie);
g_hash_table_destroy(trie->subtries);
trie->subtries = NULL;
}
g_list_free(keys);
}
}
} else {
if(strcmp(trie->value, word) == 0)
{
g_free(trie->value);
trie->value = NULL;
}
}
}
static EnchantTrie* enchant_trie_get_subtrie(EnchantTrie* trie,
EnchantTrieMatcher* matcher,
char** nxtChS)
{
if(trie->subtries == NULL || *nxtChS == NULL)
return NULL;
EnchantTrie *subtrie = g_hash_table_lookup(trie->subtries,*nxtChS);
if(subtrie == NULL && matcher->mode == case_insensitive) {
char* nxtChSUp = g_utf8_strup(*nxtChS, -1); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/
g_free(*nxtChS);
*nxtChS = nxtChSUp;
subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp);
}
return subtrie;
}
static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher)
{
g_return_if_fail(matcher);
/* Can't match in the empty trie */
if(trie == NULL) {
return;
}
/* Bail out if over the error limits */
if(matcher->num_errors > matcher->max_errors){
return;
}
/* If the end of a string has been reached, no point recursing */
if (trie == EOSTrie) {
size_t word_len = strlen(matcher->word);
int errs = matcher->num_errors;
if((ssize_t)word_len > matcher->word_pos) {
matcher->num_errors = errs + word_len - matcher->word_pos;
}
if (matcher->num_errors <= matcher->max_errors) {
matcher->cbfunc(g_strdup(matcher->path),matcher);
}
matcher->num_errors = errs;
return;
}
/* If there is a value, just check it, no recursion */
if (trie->value != NULL) {
gchar* value;
int errs = matcher->num_errors;
value = trie->value;
if(matcher->mode == case_insensitive)
{
value = g_utf8_strdown(value, -1);
}
matcher->num_errors = errs + edit_dist(value,
&(matcher->word[matcher->word_pos]));
if(matcher->mode == case_insensitive)
{
g_free(value);
}
if (matcher->num_errors <= matcher->max_errors) {
matcher->cbfunc(g_strconcat(matcher->path,
trie->value,NULL),
matcher);
}
matcher->num_errors = errs;
return;
}
ssize_t nxtChI = (ssize_t)(g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
char *nxtChS = g_strndup(&matcher->word[matcher->word_pos],
(nxtChI - matcher->word_pos));
/* Precisely match the first character, and recurse */
EnchantTrie* subtrie = enchant_trie_get_subtrie(trie, matcher, &nxtChS);
if (subtrie != NULL) {
enchant_trie_matcher_pushpath(matcher,nxtChS);
ssize_t oldPos = matcher->word_pos;
matcher->word_pos = nxtChI;
enchant_trie_find_matches(subtrie,matcher);
matcher->word_pos = oldPos;
enchant_trie_matcher_poppath(matcher,strlen(nxtChS));
}
g_free(nxtChS);
matcher->num_errors++;
if (matcher->word[matcher->word_pos] != '\0') {
/* Match on inserting word[0] */
ssize_t oldPos = matcher->word_pos;
matcher->word_pos = nxtChI;
enchant_trie_find_matches(trie,matcher);
matcher->word_pos = oldPos;
}
/* for each subtrie, match on delete or substitute word[0] or transpose word[0] and word[1] */
g_hash_table_foreach(trie->subtries,
enchant_trie_find_matches_cb,
matcher);
matcher->num_errors--;
}
static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV)
{
char *key = (char*) keyV;
EnchantTrie *subtrie = (EnchantTrie*) subtrieV;
EnchantTrieMatcher *matcher = (EnchantTrieMatcher*) matcherV;
ssize_t nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
/* Dont handle actual matches, that's already done */
if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) {
return;
}
enchant_trie_matcher_pushpath(matcher,key);
/* Match on deleting word[0] */
enchant_trie_find_matches(subtrie,matcher);
/* Match on substituting word[0] */
ssize_t oldPos = matcher->word_pos;
matcher->word_pos = nxtChI;
enchant_trie_find_matches(subtrie,matcher);
enchant_trie_matcher_poppath(matcher,strlen(key));
/* Match on transposing word[0] and word[1] */
char *key2 = g_strndup(&matcher->word[oldPos],nxtChI-oldPos);
EnchantTrie *subtrie2 = enchant_trie_get_subtrie(subtrie, matcher, &key2);
if(subtrie2 != NULL) {
nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) {
matcher->word_pos = nxtChI;
enchant_trie_matcher_pushpath(matcher,key);
enchant_trie_matcher_pushpath(matcher,key2);
enchant_trie_find_matches(subtrie2,matcher);
enchant_trie_matcher_poppath(matcher,strlen(key2));
enchant_trie_matcher_poppath(matcher,strlen(key));
}
}
g_free(key2);
matcher->word_pos = oldPos;
}
static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word,
size_t len,
int maxerrs,
EnchantTrieMatcherMode mode,
void(*cbfunc)(char*,EnchantTrieMatcher*),
void* cbdata)
{
char * normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD);
len = strlen(normalized_word);
char * pattern = normalized_word;
if (mode == case_insensitive)
{
pattern = g_utf8_strdown (normalized_word, len);
g_free(normalized_word);
}
EnchantTrieMatcher* matcher = g_new(EnchantTrieMatcher,1);
matcher->num_errors = 0;
matcher->max_errors = maxerrs;
matcher->word = g_new0(char,len+maxerrs+1); // Ensure matcher does not overrun buffer
strcpy(matcher->word, pattern);
g_free(pattern);
matcher->word_pos = 0;
matcher->path = g_new0(char,len+maxerrs+1);
matcher->path[0] = '\0';
matcher->path_len = len+maxerrs+1;
matcher->path_pos = 0;
matcher->mode = mode;
matcher->cbfunc = cbfunc;
matcher->cbdata = cbdata;
return matcher;
}
static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher)
{
g_free(matcher->word);
g_free(matcher->path);
g_free(matcher);
}
static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars)
{
ssize_t len = strlen(newchars);
if(matcher->path_pos + len >= matcher->path_len) {
matcher->path_len = matcher->path_len + len + 10;
matcher->path = g_renew(char,matcher->path,matcher->path_len);
}
for (ssize_t i = 0; i < len; i++) {
matcher->path[matcher->path_pos + i] = newchars[i];
}
matcher->path_pos = matcher->path_pos + len;
matcher->path[matcher->path_pos] = '\0';
}
static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num)
{
g_return_if_fail(matcher->path_pos >= 0);
matcher->path_pos = matcher->path_pos - num;
if(matcher->path_pos < 0) {
matcher->path_pos = 0;
}
matcher->path[matcher->path_pos] = '\0';
}
static int edit_dist(const char* utf8word1, const char* utf8word2)
{
glong len1, len2;
gunichar * word1 = g_utf8_to_ucs4_fast(utf8word1, -1, &len1);
gunichar * word2 = g_utf8_to_ucs4_fast(utf8word2, -1, &len2);
int * table = g_new0(int, (len1+1)*(len2+1));
/* Initialise outer rows of table */
for (glong i = 0; i < len1 + 1; i++)
table[i*(len2+1)] = i;
for (glong i = 0; i < len2 + 1; i++)
table[i] = i;
/* Fill in table in dynamic programming style */
for (glong i = 1; i < len1+1; i++){
for (glong j = 1; j < len2+1; j++) {
int cost = word1[i-1] != word2[j-1];
int v1 = table[(i-1)*(len2+1)+j] + 1;
if (i > 1 && j > 1 && word1[i-1] == word2[j-2] && word1[i-2] == word2[j-1]) {
v1 = MIN (v1, table[(i-2)*(len2+1)+(j-2)] + cost);
}
int v2 = table[i*(len2+1)+(j-1)] + 1;
int v3 = table[(i-1)*(len2+1)+(j-1)] + cost;
table[i*(len2+1)+j] = MIN (MIN (v1, v2), v3);
}
}
int cost = table[len1*(len2+1) + len2];
g_free(word1);
g_free(word2);
g_free(table);
return cost;
}