/*
checks for doubly-encoded utf-8
Copyright (C) 2000-2002 David Necas (Yeti) <yeti@physics.muni.cz>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include <stdlib.h>
#include <math.h>
#include "enca.h"
#include "internal.h"
/* Local prototypes: forward declarations of the file-private (static)
 * helpers that are defined further down in this file, so that the public
 * functions placed before them can call them. */
static void compute_double_utf8_weights (EncaAnalyserState *analyser);
static void create_ucs2_weight_table (EncaUTFCheckData *amap,
size_t size,
int *wbuf);
static void mark_scratch_buffer (EncaAnalyserState *analyser);
/**
 * enca_double_utf8_init:
 * @analyser: Analyser state whose double-UTF-8 check part is to be
 *            initialized.
 *
 * Prepares the double-UTF-8 check part of @analyser.
 *
 * Nothing is allocated here: the per-charset check data and the scratch
 * buffer are only set to #NULL.  enca_double_utf8_check() builds them the
 * first time it finds them unset.
 **/
void
enca_double_utf8_init(EncaAnalyserState *analyser)
{
  analyser->utfbuf = NULL;
  analyser->utfch = NULL;
}
/**
 * enca_double_utf8_destroy:
 * @analyser: Analyser state whose double-UTF-8 check part should be
 *            destroyed.
 *
 * Releases everything the double-UTF-8 check has allocated in @analyser:
 * the scratch buffer, the per-charset character and weight tables, and the
 * array of per-charset check data itself.
 *
 * Does nothing when the check data were never created.
 **/
void
enca_double_utf8_destroy(EncaAnalyserState *analyser)
{
  size_t idx;

  /* utfch stays NULL until compute_double_utf8_weights() has run, in which
   * case there is nothing to release. */
  if (analyser->utfch != NULL) {
    for (idx = 0; idx < analyser->ncharsets; idx++) {
      enca_free(analyser->utfch[idx].weights);
      enca_free(analyser->utfch[idx].ucs2);
    }
    enca_free(analyser->utfbuf);
    enca_free(analyser->utfch);
  }
}
/**
 * enca_double_utf8_check:
 * @analyser: Analyser state determining the language for the double-UTF-8
 *            check.
 * @buffer: The buffer to be checked [@size].
 * @size: The size of @buffer.
 *
 * Checks @buffer for double-UTF-8 encoding.
 *
 * Double-UTF-8 encoding is the result of an [erroneous] conversion of UTF-8
 * text to UTF-8 once more, as if it was in some 8bit charset.  This is quite
 * hard to recover from.
 *
 * The analyser mostly only determines what language will be assumed, the
 * rest of this test is independent of the main guessing routines.  When
 * @buffer doesn't contain UTF-8 text, the result is undefined (namely, false
 * positives are possible).
 *
 * When the analyser has no charsets or its language has no weights (language
 * `none'), the function returns 0 without doing anything.
 *
 * Returns: Nonzero, when @buffer probably contains doubly-UTF-8 encoded
 *          text.  More precisely, it returns the number of charsets which
 *          are possible candidates for the source charset.  You can then use
 *          enca_double_utf8_get_candidates() to retrieve the charsets.
 **/
int
enca_double_utf8_check(EncaAnalyser analyser,
                       const unsigned char *buffer,
                       size_t size)
{
  long int codepoint = 0;  /* Character currently being assembled. */
  int pending = 0;         /* Continuation bytes still expected. */
  size_t pos;
  size_t cs;
  size_t worst = 0;        /* Index of the charset with the lowest rating. */
  size_t best = 0;         /* Index of the charset with the highest rating. */
  size_t total = 0;
  double cutoff;

  if (analyser->ncharsets == 0 || analyser->lang->weights == 0)
    return 0;

  /* The weight tables are built lazily, on the very first call. */
  if (analyser->utfch == NULL)
    compute_double_utf8_weights(analyser);

  mark_scratch_buffer(analyser);

  /* Decode the buffer as UTF-8 and count the significant characters. */
  for (pos = 0; pos < size; pos++) {
    const unsigned char byte = buffer[pos];

    if (pending != 0) {
      /* Inside a multibyte sequence, only 10xxxxxx may follow; anything
       * else aborts the sequence (and the offending byte is dropped). */
      if ((byte & 0xc0) != 0x80) {
        pending = 0;
        continue;
      }

      codepoint = (codepoint << 6) | (byte & 0x3f);
      pending--;

      /* Only complete characters that fit into 16 bits are of interest. */
      if (pending != 0 || codepoint >= 0x10000)
        continue;

      /* Zero marks an insignificant character; such positions must never
       * be modified. */
      if (analyser->utfbuf[codepoint] == 0)
        continue;

      /* A negative value is the `not seen yet' mark, so the first
       * occurrence sets the counter to one. */
      if (analyser->utfbuf[codepoint] < 0)
        analyser->utfbuf[codepoint] = 1;
      else
        analyser->utfbuf[codepoint]++;
      continue;
    }

    /* Not inside a sequence: classify the byte by its leading bits.
     * Sequences longer than three bytes are not 16-bit Unicode, but they
     * are parsed nevertheless to keep in sync with the input. */
    if ((byte & 0x80) == 0) {
      /* 0xxxxxxx: 7bit character, nothing to do. */
    }
    else if ((byte & 0xe0) == 0xc0) {  /* 110xxxxx + 1 x 10xxxxxx */
      codepoint = byte & 0x1f;
      pending = 1;
    }
    else if ((byte & 0xf0) == 0xe0) {  /* 1110xxxx + 2 x 10xxxxxx */
      codepoint = byte & 0x0f;
      pending = 2;
    }
    else if ((byte & 0xf8) == 0xf0) {  /* 11110xxx + 3 x 10xxxxxx */
      codepoint = byte & 0x07;
      pending = 3;
    }
    else if ((byte & 0xfc) == 0xf8) {  /* 111110xx + 4 x 10xxxxxx */
      codepoint = byte & 0x03;
      pending = 4;
    }
    else if ((byte & 0xfe) == 0xfc) {  /* 1111110x + 5 x 10xxxxxx */
      codepoint = byte & 0x01;
      pending = 5;
    }
    else {
      /* Invalid input: a stray 10xxxxxx byte, or 0xfe/0xff.  Skip it. */
      pending = 0;
    }
  }

  /* Compute the rating of each charset from the character counts. */
  for (cs = 0; cs < analyser->ncharsets; cs++) {
    EncaUTFCheckData *const data = &analyser->utfch[cs];
    size_t k;

    data->result = 0;
    data->rating = 0.0;
    for (k = 0; k < data->size; k++)
      data->rating += analyser->utfbuf[data->ucs2[k]] * data->weights[k];
  }

  /* Find the lowest and the highest rating. */
  for (cs = 1; cs < analyser->ncharsets; cs++) {
    if (analyser->utfch[cs].rating < analyser->utfch[worst].rating)
      worst = cs;
    if (analyser->utfch[cs].rating > analyser->utfch[best].rating)
      best = cs;
  }

  /* The text is considered doubly encoded only when there is a negative
   * rating whose magnitude exceeds half of the highest rating. */
  if (!(analyser->utfch[worst].rating < 0.0
        && -analyser->utfch[worst].rating
           > 0.5*analyser->utfch[best].rating))
    return 0;

  /* Every charset rated below the threshold-dependent cutoff becomes
   * a candidate. */
  cutoff = analyser->utfch[worst].rating
           * (1.0 - 45.0*exp(-4.5*analyser->options.threshold));
  for (cs = 0; cs < analyser->ncharsets; cs++) {
    if (analyser->utfch[cs].rating < cutoff) {
      analyser->utfch[cs].result = 1;
      total++;
    }
  }

  return (int)total;
}
/**
 * enca_double_utf8_get_candidates:
 * @analyser: Analyser state for which double-UTF-8 candidates are to be
 *            returned.
 *
 * Returns the array of double-UTF-8 source charset candidates from the last
 * check.
 *
 * The returned array is newly allocated and should be freed by the caller
 * when no longer needed.  Its size is the return value of the preceding
 * enca_double_utf8_check() call.
 *
 * When called before any double-UTF-8 test has been performed, or after an
 * unsuccessful double-UTF-8 test, it returns #NULL; nevertheless the result
 * after an unsuccessful check should be considered undefined.
 *
 * Returns: An array containing charset ids of possible source charsets from
 *          which the sample was doubly-UTF-8 encoded.  The array may contain
 *          only one value, but usually enca is not able to decide between
 *          e.g. ISO-8859-2 and Win1250, thus more candidates are returned.
 **/
int*
enca_double_utf8_get_candidates(EncaAnalyser analyser)
{
  size_t ncandidates = 0;
  size_t filled = 0;
  size_t cs;
  int *candidates;

  assert(analyser);

  /* No check data means no check has been run yet. */
  if (analyser->utfch == NULL)
    return NULL;

  /* First pass: find out how many charsets were flagged by the check. */
  for (cs = 0; cs < analyser->ncharsets; cs++) {
    if (analyser->utfch[cs].result)
      ncandidates++;
  }

  if (ncandidates == 0)
    return NULL;

  /* Second pass: collect the ids of the flagged charsets. */
  candidates = NEW(int, ncandidates);
  for (cs = 0; cs < analyser->ncharsets; cs++) {
    if (!analyser->utfch[cs].result)
      continue;
    candidates[filled] = analyser->charsets[cs];
    filled++;
  }

  return candidates;
}
/**
 * compute_double_utf8_weights:
 * @analyser: Analyser state whose double-UTF-8 check weights should be
 *            computed.
 *
 * Computes UCS-2 character weights used in the double-UTF-8 check.
 *
 * Allocates analyser->utfch (one entry per charset) and the scratch buffer
 * analyser->utfbuf (0x10000 ints), and for each charset builds its table of
 * significant characters with create_ucs2_weight_table().  The scratch
 * buffer is left filled with zeroes.
 *
 * Must be called at most once for a given analyser (both fields are asserted
 * to be #NULL on entry).  Does nothing when the analyser has no charsets.
 **/
static void
compute_double_utf8_weights(EncaAnalyserState *analyser)
{
  int *buf;
  unsigned int ucs2map[0x100];
  size_t i, j;

  assert(analyser != NULL);
  assert(analyser->lang != NULL);
  assert(analyser->utfch == NULL);
  assert(analyser->utfbuf == NULL);

  if (analyser->ncharsets == 0)
    return;

  analyser->utfch = NEW(EncaUTFCheckData, analyser->ncharsets);
  analyser->utfbuf = NEW(int, 0x10000);

  buf = analyser->utfbuf;
  for (i = 0; i < 0x10000; i++)
    buf[i] = 0;

  /* For all charsets compute UTF-8 prefix byte occurrence tables and select
   * those characters having the highest difference between occurrences when
   * counted as UTF-8 prefix and when counted as a regular character. */
  for (j = 0; j < analyser->ncharsets; j++) {
    const unsigned short int *const w = analyser->lang->weights[j];
    size_t table_size = 0;

    assert(enca_charset_has_ucs2_map(analyser->charsets[j]));
    enca_charset_ucs2_map(analyser->charsets[j], ucs2map);

    /* Go through all characters, some maps may map even 7bits to something
     * else.  Compute required table size meanwhile.
     *
     * Invariant: table_size is the number of nonzero entries in buf.  It is
     * incremented whenever an entry leaves zero, therefore an entry must
     * never be allowed to return to zero once it has been counted. */
    for (i = 0; i < 0x100; i++) {
      unsigned int ucs2c = ucs2map[i];

      assert(ucs2c < 0x10000);

      if (w[i] == 0)
        continue;

      /* Count the character weight as positive. */
      if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
        continue;

      if (buf[ucs2c] == 0)
        table_size++;
      buf[ucs2c] += w[i];
      /* An earlier negative (prefix) contribution to this very character
       * may cancel exactly with w[i].  Keep the entry nonzero, the same way
       * the negative branch below does, otherwise the table would end up
       * with fewer entries than table_size promises. */
      if (buf[ucs2c] == 0)
        buf[ucs2c] = 1;

      /* Transform the character and count UTF-8 transformed first byte
       * weight as negative.  The first byte of the UTF-8 form is 110xxxxx
       * for characters below 0x800 and 1110xxxx otherwise; it is looked up
       * in the same charset map, as if it was a regular 8bit character. */
      if (ucs2c < 0x800)
        ucs2c = ucs2map[0xc0 | (ucs2c >> 6)];
      else
        ucs2c = ucs2map[0xe0 | (ucs2c >> 12)];

      if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
        continue;

      if (buf[ucs2c] == 0)
        table_size++;
      buf[ucs2c] -= w[i];
      /* Exact cancellation: keep the entry nonzero (see the invariant). */
      if (buf[ucs2c] == 0)
        buf[ucs2c] = 1;
    }

    /* Build the table of significant UCS-2 characters, i.e. characters
     * having nonzero weight.  This also zeroes buf again for the next
     * charset. */
    create_ucs2_weight_table(analyser->utfch + j, table_size, buf);
  }
}
/**
 * create_ucs2_weight_table:
 * @amap: A pointer to double-UTF-8 check data to be filled.
 * @size: The number of UCS-2 characters with nonzero weight in @wbuf.
 * @wbuf: Weights indexed by UCS-2 character [0x10000].
 *
 * Creates a `compressed' UCS-2 weight table.
 *
 * Allocates @amap->ucs2 and @amap->weights with @size items each and stores
 * there, in ascending character order, every character of @wbuf that has
 * a nonzero weight together with that weight.  Each copied entry of @wbuf
 * is reset to zero, so @wbuf contains only zeroes on return.
 **/
static void
create_ucs2_weight_table(EncaUTFCheckData *amap,
                         size_t size,
                         int *wbuf)
{
  size_t filled = 0;
  unsigned int code;

  amap->size = size;
  amap->ucs2 = NEW(int, size);
  amap->weights = NEW(int, size);

  for (code = 0; code < 0x10000; code++) {
    const int weight = wbuf[code];

    if (weight == 0)
      continue;

    assert(filled < size);
    amap->ucs2[filled] = code;
    amap->weights[filled] = weight;
    filled++;

    /* Leave the buffer clean for its next user. */
    wbuf[code] = 0;
  }

  /* The caller's count must match what was actually found. */
  assert(filled == size);
}
/**
 * mark_scratch_buffer:
 * @analyser: Analyser whose significant UCS-2 characters are to be marked in
 *            @analyser->utfbuf.
 *
 * Marks significant characters in @analyser->utfbuf with -1.
 *
 * The @analyser->utfbuf buffer is magic.  Once the significant characters
 * have been found in compute_double_utf8_weights(), positions of
 * insignificant characters always hold zeroes.  This way the whole buffer
 * never has to be scanned, not even to reset it -- only the positions listed
 * in the per-charset tables are ever written.
 *
 * -1 is used to mark significant characters before counting, because it is
 * not zero and cannot be confused with a real (positive) count.
 **/
static void
mark_scratch_buffer(EncaAnalyserState *analyser)
{
  size_t cs;

  for (cs = 0; cs < analyser->ncharsets; cs++) {
    const EncaUTFCheckData *const table = &analyser->utfch[cs];
    size_t left = table->size;

    /* Every listed position gets the same mark, so the order in which the
     * table is walked does not matter. */
    while (left > 0) {
      left--;
      analyser->utfbuf[table->ucs2[left]] = -1;
    }
  }
}