Tree - source-git/enca - CentOS Git server

source-git / enca

Blame lib/utf8_double.c

Blob History Raw

Packit	57a33d	`/*`
Packit	57a33d	`checks for doubly-encoded utf-8`
Packit	57a33d
Packit	57a33d	`Copyright (C) 2000-2002 David Necas (Yeti) <yeti@physics.muni.cz>`
Packit	57a33d
Packit	57a33d	`This program is free software; you can redistribute it and/or modify it`
Packit	57a33d	`under the terms of version 2 of the GNU General Public License as published`
Packit	57a33d	`by the Free Software Foundation.`
Packit	57a33d
Packit	57a33d	`This program is distributed in the hope that it will be useful, but WITHOUT`
Packit	57a33d	`ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
Packit	57a33d	`FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for`
Packit	57a33d	`more details.`
Packit	57a33d
Packit	57a33d	`You should have received a copy of the GNU General Public License along`
Packit	57a33d	`with this program; if not, write to the Free Software Foundation, Inc.,`
Packit	57a33d	`59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.`
Packit	57a33d	`*/`
Packit	57a33d	`#ifdef HAVE_CONFIG_H`
Packit	57a33d	`# include "config.h"`
Packit	57a33d	`#endif /* HAVE_CONFIG_H */`
Packit	57a33d
Packit	57a33d	`#include <stdlib.h>`
Packit	57a33d	`#include <math.h>`
Packit	57a33d
Packit	57a33d	`#include "enca.h"`
Packit	57a33d	`#include "internal.h"`
Packit	57a33d
Packit	57a33d	`/* Local prototypes. */`
Packit	57a33d	`static void compute_double_utf8_weights (EncaAnalyserState *analyser);`
Packit	57a33d	`static void create_ucs2_weight_table (EncaUTFCheckData *amap,`
Packit	57a33d	`size_t size,`
Packit	57a33d	`int *wbuf);`
Packit	57a33d	`static void mark_scratch_buffer (EncaAnalyserState *analyser);`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* enca_double_utf8_init:`
Packit	57a33d	`* @analyser: Analyzer state to be initialized.`
Packit	57a33d	`*`
Packit	57a33d	`* Initializes double-UTF-8 check.`
Packit	57a33d	`*`
Packit	57a33d	`* In fact it initializes the fields to #NULL's, they are actually initialized`
Packit	57a33d	`* only when needed.`
Packit	57a33d	`**/`
Packit	57a33d	`void`
Packit	57a33d	`enca_double_utf8_init(EncaAnalyserState *analyser)`
Packit	57a33d	`{`
Packit	57a33d	`analyser->utfch = NULL;`
Packit	57a33d	`analyser->utfbuf = NULL;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* enca_double_utf8_destroy:`
Packit	57a33d	`* @analyser: Analyzer state whose double-UTF-8 check part should be destroyed.`
Packit	57a33d	`*`
Packit	57a33d	`* Destroys the double-UTF-8 check part of analyser state @analyser.`
Packit	57a33d	`**/`
Packit	57a33d	`void`
Packit	57a33d	`enca_double_utf8_destroy(EncaAnalyserState *analyser)`
Packit	57a33d	`{`
Packit	57a33d	`size_t i;`
Packit	57a33d
Packit	57a33d	`if (analyser->utfch == NULL)`
Packit	57a33d	`return;`
Packit	57a33d
Packit	57a33d	`enca_free(analyser->utfbuf);`
Packit	57a33d
Packit	57a33d	`for (i = 0; i < analyser->ncharsets; i++) {`
Packit	57a33d	`enca_free(analyser->utfch[i].ucs2);`
Packit	57a33d	`enca_free(analyser->utfch[i].weights);`
Packit	57a33d	`}`
Packit	57a33d	`enca_free(analyser->utfch);`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* enca_double_utf8_check:`
Packit	57a33d	`* @analyser: Analyzer state determinig the language for double-UTF-8 check.`
Packit	57a33d	`* @buffer: The buffer to be checked [@size].`
Packit	57a33d	`* @size: The size of @buffer.`
Packit	57a33d	`*`
Packit	57a33d	`* Checks buffer for double-UTF-8 encoding.`
Packit	57a33d	`*`
Packit	57a33d	`* Double-UTF-8 encoding is the result of [errorneous] conversion of UTF-8 text`
Packit	57a33d	`* to UTF-8 again, as if it was in some 8bit charset. This is quite hard to`
Packit	57a33d	`* recover from.`
Packit	57a33d	`*`
Packit	57a33d	`* The analayser mostly only determines what language will be assumed,`
Packit	57a33d	`* the rest of this test is independent on the main guessing routines.`
Packit	57a33d	`* When @buffer doesn't containing UTF-8 text, the result is undefined`
Packit	57a33d	`* (namely, false positives are possible).`
Packit	57a33d	`*`
Packit	57a33d	* Calling this function when language is `none' has currently no effect.
Packit	57a33d	`*`
Packit	57a33d	`* Returns: Nonzero, when @buffer probably contains doubly-UTF-8 encoded text.`
Packit	57a33d	`* More precisely, it returns the number of charsets which are`
Packit	57a33d	`* possible candidates for source charset. You can then use`
Packit	57a33d	`* enca_double_utf8_get_candidates() to retrieve the charsets.`
Packit	57a33d	`**/`
Packit	57a33d	`int`
Packit	57a33d	`enca_double_utf8_check(EncaAnalyser analyser,`
Packit	57a33d	`const unsigned char *buffer,`
Packit	57a33d	`size_t size)`
Packit	57a33d	`{`
Packit	57a33d	`long int ucs4char = 0;`
Packit	57a33d	`int remains_10xxxxxx = 0;`
Packit	57a33d	`size_t i;`
Packit	57a33d
Packit	57a33d	`if (analyser->ncharsets == 0 \|\| analyser->lang->weights == 0)`
Packit	57a33d	`return 0;`
Packit	57a33d
Packit	57a33d	`/* Compute weights when we are called the first time. */`
Packit	57a33d	`if (analyser->utfch == NULL)`
Packit	57a33d	`compute_double_utf8_weights(analyser);`
Packit	57a33d
Packit	57a33d	`mark_scratch_buffer(analyser);`
Packit	57a33d
Packit	57a33d	`/* Parse. */`
Packit	57a33d	`for (i = 0; i < size; i++) {`
Packit	57a33d	`unsigned char b = buffer[i];`
Packit	57a33d
Packit	57a33d	`if (!remains_10xxxxxx) {`
Packit	57a33d	`if ((b & 0x80) == 0) /* 7bit characters */`
Packit	57a33d	`continue;`
Packit	57a33d	`if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */`
Packit	57a33d	`ucs4char = b & 0x1f;`
Packit	57a33d	`remains_10xxxxxx = 1;`
Packit	57a33d	`continue;`
Packit	57a33d	`}`
Packit	57a33d	`if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */`
Packit	57a33d	`ucs4char = b & 0x0f;`
Packit	57a33d	`remains_10xxxxxx = 2;`
Packit	57a33d	`continue;`
Packit	57a33d	`}`
Packit	57a33d	`/* Following are valid 32-bit UCS characters, but not 16-bit Unicode,`
Packit	57a33d	`nevertheless we accept them. */`
Packit	57a33d	`if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */`
Packit	57a33d	`ucs4char = b & 0x07;`
Packit	57a33d	`remains_10xxxxxx = 3;`
Packit	57a33d	`continue;`
Packit	57a33d	`}`
Packit	57a33d	`if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */`
Packit	57a33d	`ucs4char = b & 0x03;`
Packit	57a33d	`remains_10xxxxxx = 4;`
Packit	57a33d	`continue;`
Packit	57a33d	`}`
Packit	57a33d	`if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */`
Packit	57a33d	`ucs4char = b & 0x01;`
Packit	57a33d	`remains_10xxxxxx = 5;`
Packit	57a33d	`continue;`
Packit	57a33d	`}`
Packit	57a33d	`/* We can get here only when input is invalid: (b & 0xc0) == 0x80. */`
Packit	57a33d	`remains_10xxxxxx = 0;`
Packit	57a33d	`}`
Packit	57a33d	`else {`
Packit	57a33d	`/* Broken 10xxxxxx sequence? */`
Packit	57a33d	`if ((b & 0xc0) != 0x80) {`
Packit	57a33d	`remains_10xxxxxx = 0;`
Packit	57a33d	`}`
Packit	57a33d	`else {`
Packit	57a33d	`/* Good 10xxxxxx continuation. */`
Packit	57a33d	`ucs4char <<= 6;`
Packit	57a33d	`ucs4char \|= b & 0x3f;`
Packit	57a33d	`remains_10xxxxxx--;`
Packit	57a33d
Packit	57a33d	`/* Do we have a whole character?`
Packit	57a33d	`* (We must not touch positions in utfbuf containing zeroes.) */`
Packit	57a33d	`if (remains_10xxxxxx == 0`
Packit	57a33d	`&& ucs4char < 0x10000`
Packit	57a33d	`&& analyser->utfbuf[ucs4char] != 0) {`
Packit	57a33d	`if (analyser->utfbuf[ucs4char] < 0)`
Packit	57a33d	`analyser->utfbuf[ucs4char] = 1;`
Packit	57a33d	`else`
Packit	57a33d	`analyser->utfbuf[ucs4char]++;`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/* Compute the ratings. */`
Packit	57a33d	`for (i = 0; i < analyser->ncharsets; i++) {`
Packit	57a33d	`EncaUTFCheckData *amap = analyser->utfch + i;`
Packit	57a33d	`size_t j;`
Packit	57a33d
Packit	57a33d	`amap->rating = 0.0;`
Packit	57a33d	`amap->result = 0;`
Packit	57a33d	`for (j = 0; j < amap->size; j++)`
Packit	57a33d	`amap->rating += analyser->utfbuf[amap->ucs2[j]] * amap->weights[j];`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/* Now check whether we've found some negative ratings. */`
Packit	57a33d	`{`
Packit	57a33d	`size_t min = 0;`
Packit	57a33d	`size_t max = 0;`
Packit	57a33d
Packit	57a33d	`for (i = 1; i < analyser->ncharsets; i++) {`
Packit	57a33d	`if (analyser->utfch[i].rating < analyser->utfch[min].rating)`
Packit	57a33d	`min = i;`
Packit	57a33d	`if (analyser->utfch[i].rating > analyser->utfch[max].rating)`
Packit	57a33d	`max = i;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`if (analyser->utfch[min].rating < 0.0`
Packit	57a33d	`&& -analyser->utfch[min].rating > 0.5*analyser->utfch[max].rating) {`
Packit	57a33d	`size_t total = 0;`
Packit	57a33d	`double q = analyser->utfch[min].rating`
Packit	57a33d	`* (1.0 - 45.0exp(-4.5analyser->options.threshold));`
Packit	57a33d
Packit	57a33d	`for (i = 0; i < analyser->ncharsets; i++) {`
Packit	57a33d	`if (analyser->utfch[i].rating < q) {`
Packit	57a33d	`analyser->utfch[i].result = 1;`
Packit	57a33d	`total++;`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d	`return total;`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`return 0;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* enca_double_utf8_get_candidates:`
Packit	57a33d	`* @analyser: Analyzer state for which double-UTF-8 candidates are to be`
Packit	57a33d	`* returned.`
Packit	57a33d	`*`
Packit	57a33d	`* Returns array of double-UTF-8 source charset candidates from the last check.`
Packit	57a33d	`*`
Packit	57a33d	`* The returned array should be freed by caller then no longer needed. Its`
Packit	57a33d	`* is the return value of the preceding enca_double_utf8_check() call.`
Packit	57a33d	`*`
Packit	57a33d	`* When called before any double-UTF-8 test has been performed yet or after`
Packit	57a33d	`* and unsuccessfull double-UTF-8 test, it returns NULL, but the result after`
Packit	57a33d	`* an unsuccessfull check should be considered undefined.`
Packit	57a33d	`*`
Packit	57a33d	`* Returns: An array containing charset id's of possible source charsets from`
Packit	57a33d	`* which the sample was doubly-UTF-8 encoded. The array may contain`
Packit	57a33d	`* only one value, but usually enca is not able to decide between`
Packit	57a33d	`* e.g. ISO-8859-2 and Win1250, thus more candidates are returned.`
Packit	57a33d	`**/`
Packit	57a33d	`int*`
Packit	57a33d	`enca_double_utf8_get_candidates(EncaAnalyser analyser)`
Packit	57a33d	`{`
Packit	57a33d	`size_t j = 0;`
Packit	57a33d	`size_t i;`
Packit	57a33d	`int *candidates;`
Packit	57a33d
Packit	57a33d	`assert(analyser);`
Packit	57a33d	`if (analyser->utfch == NULL)`
Packit	57a33d	`return NULL;`
Packit	57a33d
Packit	57a33d	`for (i = 0; i < analyser->ncharsets; i++) {`
Packit	57a33d	`if (analyser->utfch[i].result)`
Packit	57a33d	`j++;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`if (j == 0)`
Packit	57a33d	`return NULL;`
Packit	57a33d
Packit	57a33d	`candidates = NEW(int, j);`
Packit	57a33d	`j = 0;`
Packit	57a33d	`for (i = 0; i < analyser->ncharsets; i++) {`
Packit	57a33d	`if (analyser->utfch[i].result) {`
Packit	57a33d	`candidates[j] = analyser->charsets[i];`
Packit	57a33d	`j++;`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`return candidates;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* compute_double_utf8_weights:`
Packit	57a33d	`* @analyser: Analyzer state whose double-UTF-8 check weigths should be`
Packit	57a33d	`* computed.`
Packit	57a33d	`*`
Packit	57a33d	`* Computes UCS-2 character weights used in double-UTF-8 check. Must be`
Packit	57a33d	`* called at most once for a given analyser. It also allocates the scratch`
Packit	57a33d	`* buffer analyser->utfbuf and leaves it filled with zeroes.`
Packit	57a33d	`**/`
Packit	57a33d	`static void`
Packit	57a33d	`compute_double_utf8_weights(EncaAnalyserState *analyser)`
Packit	57a33d	`{`
Packit	57a33d	`int *buf;`
Packit	57a33d	`unsigned int ucs2map[0x100];`
Packit	57a33d	`size_t i, j;`
Packit	57a33d
Packit	57a33d	`assert(analyser != NULL);`
Packit	57a33d	`assert(analyser->lang != NULL);`
Packit	57a33d	`assert(analyser->utfch == NULL);`
Packit	57a33d	`assert(analyser->utfbuf == NULL);`
Packit	57a33d	`if (analyser->ncharsets == 0)`
Packit	57a33d	`return;`
Packit	57a33d
Packit	57a33d	`analyser->utfch = NEW(EncaUTFCheckData, analyser->ncharsets);`
Packit	57a33d	`analyser->utfbuf = NEW(int, 0x10000);`
Packit	57a33d	`buf = analyser->utfbuf;`
Packit	57a33d
Packit	57a33d	`for (i = 0; i < 0x10000; i++)`
Packit	57a33d	`buf[i] = 0;`
Packit	57a33d
Packit	57a33d	`/* For all charsets compute UTF-8 prefix byte occurence tables and select`
Packit	57a33d	`* those characters having the highest difference between occurences when`
Packit	57a33d	`* counted as UTF-8 prefix and when counted as a regular character. */`
Packit	57a33d	`for (j = 0; j < analyser->ncharsets; j++) {`
Packit	57a33d	`const unsigned short int *const w = analyser->lang->weights[j];`
Packit	57a33d	`size_t table_size = 0;`
Packit	57a33d
Packit	57a33d	`assert(enca_charset_has_ucs2_map(analyser->charsets[j]));`
Packit	57a33d	`enca_charset_ucs2_map(analyser->charsets[j], ucs2map);`
Packit	57a33d
Packit	57a33d	`/* Go through all characters, some maps may map even 7bits to something`
Packit	57a33d	`* else. Compute required table size meanwhile. */`
Packit	57a33d	`for (i = 0; i < 0x100; i++) {`
Packit	57a33d	`unsigned int ucs2c = ucs2map[i];`
Packit	57a33d	`assert(ucs2c < 0x10000);`
Packit	57a33d
Packit	57a33d	`if (w[i] == 0)`
Packit	57a33d	`continue;`
Packit	57a33d
Packit	57a33d	`/* Count the character weight as positive. */`
Packit	57a33d	`if (ucs2c < 0x80 \|\| ucs2c == ENCA_NOT_A_CHAR)`
Packit	57a33d	`continue;`
Packit	57a33d
Packit	57a33d	`if (buf[ucs2c] == 0)`
Packit	57a33d	`table_size++;`
Packit	57a33d	`buf[ucs2c] += w[i];`
Packit	57a33d
Packit	57a33d	`/* Transform the character and count UTF-8 transformed first byte weight`
Packit	57a33d	`* as negative. */`
Packit	57a33d	`if (ucs2c < 0x800)`
Packit	57a33d	`ucs2c = ucs2map[0xc0 \| (ucs2c >> 6)];`
Packit	57a33d	`else`
Packit	57a33d	`ucs2c = ucs2map[0xe0 \| (ucs2c >> 12)];`
Packit	57a33d
Packit	57a33d	`if (ucs2c < 0x80 \|\| ucs2c == ENCA_NOT_A_CHAR)`
Packit	57a33d	`continue;`
Packit	57a33d
Packit	57a33d	`if (buf[ucs2c] == 0)`
Packit	57a33d	`table_size++;`
Packit	57a33d	`buf[ucs2c] -= w[i];`
Packit	57a33d	`if (buf[ucs2c] == 0)`
Packit	57a33d	`buf[ucs2c] = 1;`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/* Build the table of significant UCS-2 characters, i.e. characters`
Packit	57a33d	`* having nonzero weight. */`
Packit	57a33d	`create_ucs2_weight_table(analyser->utfch + j, table_size, buf);`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* create_ucs2_weight_table:`
Packit	57a33d	`* @amap: A pointer to Double-UTF8-check data to be filled.`
Packit	57a33d	`* @size: The number of UCS-2 characters with nonzero weight in @wbuf.`
Packit	57a33d	`* @wbuf: UCS-2 character weights [@size].`
Packit	57a33d	`*`
Packit	57a33d	* Creates `compressed' UCS-2 weight table.
Packit	57a33d	`**/`
Packit	57a33d	`static void`
Packit	57a33d	`create_ucs2_weight_table(EncaUTFCheckData *amap,`
Packit	57a33d	`size_t size,`
Packit	57a33d	`int *wbuf)`
Packit	57a33d	`{`
Packit	57a33d	`unsigned int ucs2c;`
Packit	57a33d	`size_t i;`
Packit	57a33d
Packit	57a33d	`amap->size = size;`
Packit	57a33d	`amap->ucs2 = NEW(int, size);`
Packit	57a33d	`amap->weights = NEW(int, size);`
Packit	57a33d
Packit	57a33d	`i = 0;`
Packit	57a33d	`for (ucs2c = 0; ucs2c < 0x10000; ucs2c++) {`
Packit	57a33d	`if (wbuf[ucs2c] != 0) {`
Packit	57a33d	`assert(i < size);`
Packit	57a33d
Packit	57a33d	`amap->ucs2[i] = ucs2c;`
Packit	57a33d	`amap->weights[i] = wbuf[ucs2c];`
Packit	57a33d	`wbuf[ucs2c] = 0; /* Fill the buffer with zeroes. */`
Packit	57a33d	`i++;`
Packit	57a33d	`}`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`assert(i == size);`
Packit	57a33d	`}`
Packit	57a33d
Packit	57a33d	`/**`
Packit	57a33d	`* mark_scratch_buffer:`
Packit	57a33d	`* @analyser: Analyzer whose significant ucs2 characters are to be marked in`
Packit	57a33d	`* @analyser->utfbuf.`
Packit	57a33d	`*`
Packit	57a33d	`* Marks significant characters in @analyser->utfbuf with -1.`
Packit	57a33d	`*`
Packit	57a33d	`* The @analyser->utfbuf buffer is magic. Once we found the significant`
Packit	57a33d	`* characters in compute_double_utf8_weights(), we always keep zeroes at`
Packit	57a33d	`* positions of nonsiginifant characters. This way we never have to scan`
Packit	57a33d	`* through the whole buffer, not even to fill it wit zeroes -- we put zeroes`
Packit	57a33d	`* only where we know we changed it.`
Packit	57a33d	`*`
Packit	57a33d	`* -1 is used to mark significant characters before counting, because it's not`
Packit	57a33d	`* zero.`
Packit	57a33d	`**/`
Packit	57a33d	`static void`
Packit	57a33d	`mark_scratch_buffer(EncaAnalyserState *analyser)`
Packit	57a33d	`{`
Packit	57a33d	`size_t i, j;`
Packit	57a33d
Packit	57a33d	`for (j = 0; j < analyser->ncharsets; j++) {`
Packit	57a33d	`EncaUTFCheckData *amap = analyser->utfch + j;`
Packit	57a33d
Packit	57a33d	`for (i = 0; i < amap->size; i++)`
Packit	57a33d	`analyser->utfbuf[amap->ucs2[i]] = -1;`
Packit	57a33d	`}`
Packit	57a33d	`}`

source-git / enca

Source Code

Blame lib/utf8_double.c