/*
uniform interface to particular languages
Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include "enca.h"
#include "internal.h"
/**
 * Language `none'.
 *
 * Placeholder language entry with zero charsets and no data tables or hooks.
 * This language has no regular charsets, so only multibyte encodings are
 * tested.
 **/
static const EncaLanguageInfo ENCA_LANGUAGE___ = {
"__", /* name: the code find_language() compares against */
"none", /* human name: what enca_language_english_name() returns */
0, /* number of charsets: zero, so no charset id table is ever built */
NULL, /* charset names */
NULL, /* character weights */
NULL, /* significancy data */
NULL, /* letter data */
NULL, /* pair data */
0, /* sum of weights */
NULL, /* hook function */
NULL, /* eolhook function */
NULL, /* lcuchook function */
NULL, /* ratinghook function */
};
/* All languages.  find_language() scans this table linearly by name and
 * enca_get_languages() reports the entries in this order. */
static const EncaLanguageInfo *const LANGUAGE_LIST[] = {
&ENCA_LANGUAGE_BE, /* Belarusian. */
&ENCA_LANGUAGE_BG, /* Bulgarian. */
&ENCA_LANGUAGE_CS, /* Czech. */
&ENCA_LANGUAGE_ET, /* Estonian. */
&ENCA_LANGUAGE_HR, /* Croatian. */
&ENCA_LANGUAGE_HU, /* Hungarian. */
&ENCA_LANGUAGE_LT, /* Lithuanian. */
&ENCA_LANGUAGE_LV, /* Latvian. */
&ENCA_LANGUAGE_PL, /* Polish. */
&ENCA_LANGUAGE_RU, /* Russian. */
&ENCA_LANGUAGE_SK, /* Slovak. */
&ENCA_LANGUAGE_SL, /* Slovene. */
&ENCA_LANGUAGE_UK, /* Ukrainian. */
&ENCA_LANGUAGE_ZH, /* Chinese. */
&ENCA_LANGUAGE___, /* None. */
};
/* Loop bound used for every walk over LANGUAGE_LIST; presumably ELEMENTS()
 * (defined elsewhere) yields the array's element count. */
#define NLANGUAGES (ELEMENTS(LANGUAGE_LIST))
/* Local prototypes: file-private helpers defined further down in this file. */
/* Builds the charset id table of a language; NULL for a charset-less one. */
static int* language_charsets_ids(const EncaLanguageInfo *lang);
/* Looks a language up in LANGUAGE_LIST by name; NULL when not found. */
static const EncaLanguageInfo* find_language(const char *langname);
/**
 * enca_language_init:
 * @analyser: Analyser state to be initialized for this language.
 * @langname: Two-letter ISO-639 language code.
 *
 * Initializes the language part of @analyser for language @langname.
 *
 * The language fields of @analyser are reset unconditionally before the
 * lookup, so @analyser is assumed to be uninitialized; calling this with an
 * already initialized @analyser leads to a memory leak.
 *
 * Returns: Nonzero on success, zero when @langname is not a known language.
 **/
int
enca_language_init(EncaAnalyserState *analyser,
                   const char *langname)
{
  const EncaLanguageInfo *info;

  assert(langname != NULL);

  /* Reset everything first so a failed lookup leaves a consistent state. */
  analyser->lang = NULL;
  analyser->ncharsets = 0;
  analyser->charsets = NULL;
  analyser->lcbits = NULL;
  analyser->ucbits = NULL;

  info = find_language(langname);
  if (info == NULL)
    return 0;

  analyser->lang = info;

  /* A language without charsets keeps ncharsets == 0 and charsets == NULL. */
  if (info->ncharsets != 0) {
    analyser->ncharsets = info->ncharsets;
    analyser->charsets = language_charsets_ids(info);
  }

  return 1;
}
/**
 * enca_language_destroy:
 * @analyser: Analyser state whose language part should be destroyed.
 *
 * Destroys the language part of analyser state @analyser: the language
 * pointer and charset count are cleared and the charsets, lcbits and ucbits
 * tables are released with enca_free().
 **/
void
enca_language_destroy(EncaAnalyserState *analyser)
{
  /* Detach the language description; it is static data and is not freed. */
  analyser->lang = NULL;
  analyser->ncharsets = 0;

  /* Release the per-language tables. */
  enca_free(analyser->ucbits);
  enca_free(analyser->lcbits);
  enca_free(analyser->charsets);
}
/**
 * enca_get_languages:
 * @n: The number of languages will be stored here.
 *
 * Returns list of known languages.
 *
 * The returned strings are two-letter ISO-639 language codes, the same as
 * enca_analyser_alloc() accepts.  They are listed in LANGUAGE_LIST order.
 *
 * The list itself is newly allocated and has to be freed by caller; the
 * strings point into the static language tables, so they must be considered
 * constant and must NOT be freed.
 *
 * Returns: The list of languages, storing their number into *@n.
 **/
const char**
enca_get_languages(size_t *n)
{
  const size_t count = NLANGUAGES;
  const char **names;
  size_t idx;

  names = NEW(const char*, count);
  for (idx = 0; idx < count; idx++)
    names[idx] = LANGUAGE_LIST[idx]->name;

  *n = count;
  return names;
}
/**
 * enca_analyser_language:
 * @analyser: An analyser.
 *
 * Returns name of language which @analyser was initialized for.
 *
 * The name is read through @analyser's language pointer, which is not
 * checked here, so @analyser must have a language set.
 *
 * The returned string must be considered constant and must NOT be freed.
 *
 * Returns: The language name.
 **/
const char*
enca_analyser_language(EncaAnalyser analyser)
{
  const char *code;

  assert(analyser != NULL);

  code = analyser->lang->name;
  return code;
}
/**
 * enca_language_english_name:
 * @lang: A two-letter language code, such as obtained from
 *        enca_analyser_language() or enca_get_languages().
 *
 * Returns an English name of a language given its ISO-639 code.
 *
 * The returned string must be considered constant and must NOT be freed.
 *
 * Returns: The English language name; #NULL when @lang is #NULL or is not a
 *          known language code.
 **/
const char*
enca_language_english_name(const char *lang)
{
  const EncaLanguageInfo *info = find_language(lang);

  return (info != NULL) ? info->humanname : NULL;
}
/**
 * enca_get_language_charsets:
 * @langname: Two-letter ISO-639 language code.
 * @n: The number of charsets will be stored here.
 *
 * Returns list of identifiers of charsets supported for language @langname.
 *
 * The list of charset identifiers has to be freed by caller.
 *
 * Returns: The list of charsets, storing their number into *@n.  When language
 *          contains no charsets or @langname is invalid, #NULL is returned
 *          and zero stored into *@n.
 **/
int*
enca_get_language_charsets(const char *langname,
                           size_t *n)
{
  const EncaLanguageInfo *info;

  assert(langname != NULL);

  info = find_language(langname);
  if (info != NULL) {
    /* For a charset-less language this stores 0 and the helper yields NULL. */
    *n = info->ncharsets;
    return language_charsets_ids(info);
  }

  /* Unknown language. */
  *n = 0;
  return NULL;
}
/**
 * language_charsets_ids:
 * @lang: A language.
 *
 * Creates and fills table of charset identifiers of charsets supported for
 * language @lang, translating each entry of @lang->csnames with
 * enca_name_to_charset().
 *
 * The size of the table is determined by @lang->ncharsets.  Every charset
 * name is asserted to resolve to something other than ENCA_CS_UNKNOWN.
 *
 * Returns: The newly allocated charsets id table; #NULL when @lang has no
 *          charsets.
 **/
static int*
language_charsets_ids(const EncaLanguageInfo *lang)
{
  int *ids;
  size_t count, k;

  assert(lang != NULL);

  count = lang->ncharsets;
  if (count == 0)
    return NULL;

  ids = NEW(int, count);
  for (k = 0; k < count; k++) {
    ids[k] = enca_name_to_charset(lang->csnames[k]);
    assert(ids[k] != ENCA_CS_UNKNOWN);
  }

  return ids;
}
/**
* find_language:
* @langname: Language (i.e. locale) name.
*
* Finds language @langname.
*
* Returns: Pointer to its language information data; #NULL if not found.
**/
static const EncaLanguageInfo*
find_language(const char *langname)
{
const EncaLanguageInfo *lang = NULL;
size_t i;
if (langname == NULL)
return NULL;
for (i = 0; i < NLANGUAGES; i++) {
if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) {
lang = LANGUAGE_LIST[i];
break;
}
}
return lang;
}
/**
 * enca_get_charset_similarity_matrix:
 * @lang: A language; must not be #NULL.
 *
 * Computes character weight similarity matrix for language @lang.
 *
 * The matrix is stored row by row, element [i,j] at index i*n + j, where
 * n is @lang->ncharsets.  Before normalization, sim[i,j] is the sum over all
 * 0x100 character codes c of weights[i][c]*weights[j][c]/(significant[c]
 * + EPSILON), which is symmetric in i and j.
 *
 * sim[i,j] is then normalized to sim[i,i] (each row is divided by its
 * diagonal element), thus:
 * - a row i contains `probabilities' different charsets will look like the
 *   i-th one
 * - a column i contains `probabilities' the i-th charset will look like
 *   the other charsets.
 *
 * For all practical applications, the higher one of sim[i,j] and sim[j,i]
 * is important.
 *
 * Note: this is not used anywhere, only by simtable.
 *
 * Returns: The newly allocated matrix, its size is determined by
 *          @lang->ncharsets; #NULL for language with no charsets.
 **/
double*
enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang)
{
  const unsigned short int *const *w;
  const unsigned short int *s;
  double *smat;
  size_t n, i, j, c;

  /* Check the argument BEFORE reading any of its members; reading them in
   * the declarations' initializers would make this check worthless. */
  assert(lang != NULL);

  n = lang->ncharsets;
  if (n == 0)
    return NULL;

  w = lang->weights;
  s = lang->significant;

  /* Below diagonal (including the diagonal itself). */
  smat = NEW(double, n*n);
  for (i = 0; i < n; i++) {
    for (j = 0; j <= i; j++) {
      smat[i*n + j] = 0.0;
      /* EPSILON is added to the denominator, presumably to avoid a division
       * by zero for characters whose significance is 0. */
      for (c = 0; c < 0x100; c++)
        smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON);
    }
  }

  /* Above diagonal: mirror the lower triangle, the raw sums are symmetric. */
  for (i = 0; i < n; i++) {
    for (j = i+1; j < n; j++)
      smat[i*n + j] = smat[j*n + i];
  }

  /* Normalize each row by its diagonal element.  The divisor is copied out
   * first because the diagonal element itself gets overwritten (with 1.0)
   * during the row pass.  NOTE: a zero diagonal is not guarded against. */
  for (i = 0; i < n; i++) {
    double wmax = smat[i*n + i];

    for (j = 0; j < n; j++)
      smat[i*n + j] /= wmax;
  }

  return smat;
}
/* vim: ts=2
*/