|
Packit |
57a33d |
/*
|
|
Packit |
57a33d |
checks for doubly-encoded utf-8
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
Copyright (C) 2000-2002 David Necas (Yeti) <yeti@physics.muni.cz>
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
This program is free software; you can redistribute it and/or modify it
|
|
Packit |
57a33d |
under the terms of version 2 of the GNU General Public License as published
|
|
Packit |
57a33d |
by the Free Software Foundation.
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
Packit |
57a33d |
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
Packit |
57a33d |
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
Packit |
57a33d |
more details.
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
You should have received a copy of the GNU General Public License along
|
|
Packit |
57a33d |
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
Packit |
57a33d |
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
|
|
Packit |
57a33d |
*/
|
|
Packit |
57a33d |
#ifdef HAVE_CONFIG_H
|
|
Packit |
57a33d |
# include "config.h"
|
|
Packit |
57a33d |
#endif /* HAVE_CONFIG_H */
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
#include <stdlib.h>
|
|
Packit |
57a33d |
#include <math.h>
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
#include "enca.h"
|
|
Packit |
57a33d |
#include "internal.h"
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Local prototypes. */
|
|
Packit |
57a33d |
static void compute_double_utf8_weights (EncaAnalyserState *analyser);
|
|
Packit |
57a33d |
static void create_ucs2_weight_table (EncaUTFCheckData *amap,
|
|
Packit |
57a33d |
size_t size,
|
|
Packit |
57a33d |
int *wbuf);
|
|
Packit |
57a33d |
static void mark_scratch_buffer (EncaAnalyserState *analyser);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* enca_double_utf8_init:
|
|
Packit |
57a33d |
* @analyser: Analyzer state to be initialized.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Initializes double-UTF-8 check.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* In fact it initializes the fields to #NULL's, they are actually initialized
|
|
Packit |
57a33d |
* only when needed.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
void
|
|
Packit |
57a33d |
enca_double_utf8_init(EncaAnalyserState *analyser)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
analyser->utfch = NULL;
|
|
Packit |
57a33d |
analyser->utfbuf = NULL;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* enca_double_utf8_destroy:
|
|
Packit |
57a33d |
* @analyser: Analyzer state whose double-UTF-8 check part should be destroyed.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Destroys the double-UTF-8 check part of analyser state @analyser.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
void
|
|
Packit |
57a33d |
enca_double_utf8_destroy(EncaAnalyserState *analyser)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
size_t i;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (analyser->utfch == NULL)
|
|
Packit |
57a33d |
return;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
enca_free(analyser->utfbuf);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 0; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
enca_free(analyser->utfch[i].ucs2);
|
|
Packit |
57a33d |
enca_free(analyser->utfch[i].weights);
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
enca_free(analyser->utfch);
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* enca_double_utf8_check:
|
|
Packit |
57a33d |
* @analyser: Analyzer state determinig the language for double-UTF-8 check.
|
|
Packit |
57a33d |
* @buffer: The buffer to be checked [@size].
|
|
Packit |
57a33d |
* @size: The size of @buffer.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Checks buffer for double-UTF-8 encoding.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Double-UTF-8 encoding is the result of [errorneous] conversion of UTF-8 text
|
|
Packit |
57a33d |
* to UTF-8 again, as if it was in some 8bit charset. This is quite hard to
|
|
Packit |
57a33d |
* recover from.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* The analayser mostly only determines what language will be assumed,
|
|
Packit |
57a33d |
* the rest of this test is independent on the main guessing routines.
|
|
Packit |
57a33d |
* When @buffer doesn't containing UTF-8 text, the result is undefined
|
|
Packit |
57a33d |
* (namely, false positives are possible).
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Calling this function when language is `none' has currently no effect.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Returns: Nonzero, when @buffer probably contains doubly-UTF-8 encoded text.
|
|
Packit |
57a33d |
* More precisely, it returns the number of charsets which are
|
|
Packit |
57a33d |
* possible candidates for source charset. You can then use
|
|
Packit |
57a33d |
* enca_double_utf8_get_candidates() to retrieve the charsets.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
int
|
|
Packit |
57a33d |
enca_double_utf8_check(EncaAnalyser analyser,
|
|
Packit |
57a33d |
const unsigned char *buffer,
|
|
Packit |
57a33d |
size_t size)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
long int ucs4char = 0;
|
|
Packit |
57a33d |
int remains_10xxxxxx = 0;
|
|
Packit |
57a33d |
size_t i;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (analyser->ncharsets == 0 || analyser->lang->weights == 0)
|
|
Packit |
57a33d |
return 0;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Compute weights when we are called the first time. */
|
|
Packit |
57a33d |
if (analyser->utfch == NULL)
|
|
Packit |
57a33d |
compute_double_utf8_weights(analyser);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
mark_scratch_buffer(analyser);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Parse. */
|
|
Packit |
57a33d |
for (i = 0; i < size; i++) {
|
|
Packit |
57a33d |
unsigned char b = buffer[i];
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (!remains_10xxxxxx) {
|
|
Packit |
57a33d |
if ((b & 0x80) == 0) /* 7bit characters */
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
|
|
Packit |
57a33d |
ucs4char = b & 0x1f;
|
|
Packit |
57a33d |
remains_10xxxxxx = 1;
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
|
|
Packit |
57a33d |
ucs4char = b & 0x0f;
|
|
Packit |
57a33d |
remains_10xxxxxx = 2;
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
/* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
|
|
Packit |
57a33d |
nevertheless we accept them. */
|
|
Packit |
57a33d |
if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
|
|
Packit |
57a33d |
ucs4char = b & 0x07;
|
|
Packit |
57a33d |
remains_10xxxxxx = 3;
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
|
|
Packit |
57a33d |
ucs4char = b & 0x03;
|
|
Packit |
57a33d |
remains_10xxxxxx = 4;
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
|
|
Packit |
57a33d |
ucs4char = b & 0x01;
|
|
Packit |
57a33d |
remains_10xxxxxx = 5;
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
/* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
|
|
Packit |
57a33d |
remains_10xxxxxx = 0;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
else {
|
|
Packit |
57a33d |
/* Broken 10xxxxxx sequence? */
|
|
Packit |
57a33d |
if ((b & 0xc0) != 0x80) {
|
|
Packit |
57a33d |
remains_10xxxxxx = 0;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
else {
|
|
Packit |
57a33d |
/* Good 10xxxxxx continuation. */
|
|
Packit |
57a33d |
ucs4char <<= 6;
|
|
Packit |
57a33d |
ucs4char |= b & 0x3f;
|
|
Packit |
57a33d |
remains_10xxxxxx--;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Do we have a whole character?
|
|
Packit |
57a33d |
* (We must not touch positions in utfbuf containing zeroes.) */
|
|
Packit |
57a33d |
if (remains_10xxxxxx == 0
|
|
Packit |
57a33d |
&& ucs4char < 0x10000
|
|
Packit |
57a33d |
&& analyser->utfbuf[ucs4char] != 0) {
|
|
Packit |
57a33d |
if (analyser->utfbuf[ucs4char] < 0)
|
|
Packit |
57a33d |
analyser->utfbuf[ucs4char] = 1;
|
|
Packit |
57a33d |
else
|
|
Packit |
57a33d |
analyser->utfbuf[ucs4char]++;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Compute the ratings. */
|
|
Packit |
57a33d |
for (i = 0; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
EncaUTFCheckData *amap = analyser->utfch + i;
|
|
Packit |
57a33d |
size_t j;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
amap->rating = 0.0;
|
|
Packit |
57a33d |
amap->result = 0;
|
|
Packit |
57a33d |
for (j = 0; j < amap->size; j++)
|
|
Packit |
57a33d |
amap->rating += analyser->utfbuf[amap->ucs2[j]] * amap->weights[j];
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Now check whether we've found some negative ratings. */
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
size_t min = 0;
|
|
Packit |
57a33d |
size_t max = 0;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 1; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
if (analyser->utfch[i].rating < analyser->utfch[min].rating)
|
|
Packit |
57a33d |
min = i;
|
|
Packit |
57a33d |
if (analyser->utfch[i].rating > analyser->utfch[max].rating)
|
|
Packit |
57a33d |
max = i;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (analyser->utfch[min].rating < 0.0
|
|
Packit |
57a33d |
&& -analyser->utfch[min].rating > 0.5*analyser->utfch[max].rating) {
|
|
Packit |
57a33d |
size_t total = 0;
|
|
Packit |
57a33d |
double q = analyser->utfch[min].rating
|
|
Packit |
57a33d |
* (1.0 - 45.0*exp(-4.5*analyser->options.threshold));
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 0; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
if (analyser->utfch[i].rating < q) {
|
|
Packit |
57a33d |
analyser->utfch[i].result = 1;
|
|
Packit |
57a33d |
total++;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
return total;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
return 0;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* enca_double_utf8_get_candidates:
|
|
Packit |
57a33d |
* @analyser: Analyzer state for which double-UTF-8 candidates are to be
|
|
Packit |
57a33d |
* returned.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Returns array of double-UTF-8 source charset candidates from the last check.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* The returned array should be freed by caller then no longer needed. Its
|
|
Packit |
57a33d |
* is the return value of the preceding enca_double_utf8_check() call.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* When called before any double-UTF-8 test has been performed yet or after
|
|
Packit |
57a33d |
* and unsuccessfull double-UTF-8 test, it returns NULL, but the result after
|
|
Packit |
57a33d |
* an unsuccessfull check should be considered undefined.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Returns: An array containing charset id's of possible source charsets from
|
|
Packit |
57a33d |
* which the sample was doubly-UTF-8 encoded. The array may contain
|
|
Packit |
57a33d |
* only one value, but usually enca is not able to decide between
|
|
Packit |
57a33d |
* e.g. ISO-8859-2 and Win1250, thus more candidates are returned.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
int*
|
|
Packit |
57a33d |
enca_double_utf8_get_candidates(EncaAnalyser analyser)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
size_t j = 0;
|
|
Packit |
57a33d |
size_t i;
|
|
Packit |
57a33d |
int *candidates;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
assert(analyser);
|
|
Packit |
57a33d |
if (analyser->utfch == NULL)
|
|
Packit |
57a33d |
return NULL;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 0; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
if (analyser->utfch[i].result)
|
|
Packit |
57a33d |
j++;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (j == 0)
|
|
Packit |
57a33d |
return NULL;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
candidates = NEW(int, j);
|
|
Packit |
57a33d |
j = 0;
|
|
Packit |
57a33d |
for (i = 0; i < analyser->ncharsets; i++) {
|
|
Packit |
57a33d |
if (analyser->utfch[i].result) {
|
|
Packit |
57a33d |
candidates[j] = analyser->charsets[i];
|
|
Packit |
57a33d |
j++;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
return candidates;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* compute_double_utf8_weights:
|
|
Packit |
57a33d |
* @analyser: Analyzer state whose double-UTF-8 check weigths should be
|
|
Packit |
57a33d |
* computed.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Computes UCS-2 character weights used in double-UTF-8 check. Must be
|
|
Packit |
57a33d |
* called at most once for a given analyser. It also allocates the scratch
|
|
Packit |
57a33d |
* buffer analyser->utfbuf and leaves it filled with zeroes.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
static void
|
|
Packit |
57a33d |
compute_double_utf8_weights(EncaAnalyserState *analyser)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
int *buf;
|
|
Packit |
57a33d |
unsigned int ucs2map[0x100];
|
|
Packit |
57a33d |
size_t i, j;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
assert(analyser != NULL);
|
|
Packit |
57a33d |
assert(analyser->lang != NULL);
|
|
Packit |
57a33d |
assert(analyser->utfch == NULL);
|
|
Packit |
57a33d |
assert(analyser->utfbuf == NULL);
|
|
Packit |
57a33d |
if (analyser->ncharsets == 0)
|
|
Packit |
57a33d |
return;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
analyser->utfch = NEW(EncaUTFCheckData, analyser->ncharsets);
|
|
Packit |
57a33d |
analyser->utfbuf = NEW(int, 0x10000);
|
|
Packit |
57a33d |
buf = analyser->utfbuf;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 0; i < 0x10000; i++)
|
|
Packit |
57a33d |
buf[i] = 0;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* For all charsets compute UTF-8 prefix byte occurence tables and select
|
|
Packit |
57a33d |
* those characters having the highest difference between occurences when
|
|
Packit |
57a33d |
* counted as UTF-8 prefix and when counted as a regular character. */
|
|
Packit |
57a33d |
for (j = 0; j < analyser->ncharsets; j++) {
|
|
Packit |
57a33d |
const unsigned short int *const w = analyser->lang->weights[j];
|
|
Packit |
57a33d |
size_t table_size = 0;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
assert(enca_charset_has_ucs2_map(analyser->charsets[j]));
|
|
Packit |
57a33d |
enca_charset_ucs2_map(analyser->charsets[j], ucs2map);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Go through all characters, some maps may map even 7bits to something
|
|
Packit |
57a33d |
* else. Compute required table size meanwhile. */
|
|
Packit |
57a33d |
for (i = 0; i < 0x100; i++) {
|
|
Packit |
57a33d |
unsigned int ucs2c = ucs2map[i];
|
|
Packit |
57a33d |
assert(ucs2c < 0x10000);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (w[i] == 0)
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Count the character weight as positive. */
|
|
Packit |
57a33d |
if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (buf[ucs2c] == 0)
|
|
Packit |
57a33d |
table_size++;
|
|
Packit |
57a33d |
buf[ucs2c] += w[i];
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Transform the character and count UTF-8 transformed first byte weight
|
|
Packit |
57a33d |
* as negative. */
|
|
Packit |
57a33d |
if (ucs2c < 0x800)
|
|
Packit |
57a33d |
ucs2c = ucs2map[0xc0 | (ucs2c >> 6)];
|
|
Packit |
57a33d |
else
|
|
Packit |
57a33d |
ucs2c = ucs2map[0xe0 | (ucs2c >> 12)];
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
|
|
Packit |
57a33d |
continue;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
if (buf[ucs2c] == 0)
|
|
Packit |
57a33d |
table_size++;
|
|
Packit |
57a33d |
buf[ucs2c] -= w[i];
|
|
Packit |
57a33d |
if (buf[ucs2c] == 0)
|
|
Packit |
57a33d |
buf[ucs2c] = 1;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/* Build the table of significant UCS-2 characters, i.e. characters
|
|
Packit |
57a33d |
* having nonzero weight. */
|
|
Packit |
57a33d |
create_ucs2_weight_table(analyser->utfch + j, table_size, buf);
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* create_ucs2_weight_table:
|
|
Packit |
57a33d |
* @amap: A pointer to Double-UTF8-check data to be filled.
|
|
Packit |
57a33d |
* @size: The number of UCS-2 characters with nonzero weight in @wbuf.
|
|
Packit |
57a33d |
* @wbuf: UCS-2 character weights [@size].
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Creates `compressed' UCS-2 weight table.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
static void
|
|
Packit |
57a33d |
create_ucs2_weight_table(EncaUTFCheckData *amap,
|
|
Packit |
57a33d |
size_t size,
|
|
Packit |
57a33d |
int *wbuf)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
unsigned int ucs2c;
|
|
Packit |
57a33d |
size_t i;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
amap->size = size;
|
|
Packit |
57a33d |
amap->ucs2 = NEW(int, size);
|
|
Packit |
57a33d |
amap->weights = NEW(int, size);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
i = 0;
|
|
Packit |
57a33d |
for (ucs2c = 0; ucs2c < 0x10000; ucs2c++) {
|
|
Packit |
57a33d |
if (wbuf[ucs2c] != 0) {
|
|
Packit |
57a33d |
assert(i < size);
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
amap->ucs2[i] = ucs2c;
|
|
Packit |
57a33d |
amap->weights[i] = wbuf[ucs2c];
|
|
Packit |
57a33d |
wbuf[ucs2c] = 0; /* Fill the buffer with zeroes. */
|
|
Packit |
57a33d |
i++;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
assert(i == size);
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
/**
|
|
Packit |
57a33d |
* mark_scratch_buffer:
|
|
Packit |
57a33d |
* @analyser: Analyzer whose significant ucs2 characters are to be marked in
|
|
Packit |
57a33d |
* @analyser->utfbuf.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* Marks significant characters in @analyser->utfbuf with -1.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* The @analyser->utfbuf buffer is magic. Once we found the significant
|
|
Packit |
57a33d |
* characters in compute_double_utf8_weights(), we always keep zeroes at
|
|
Packit |
57a33d |
* positions of nonsiginifant characters. This way we never have to scan
|
|
Packit |
57a33d |
* through the whole buffer, not even to fill it wit zeroes -- we put zeroes
|
|
Packit |
57a33d |
* only where we know we changed it.
|
|
Packit |
57a33d |
*
|
|
Packit |
57a33d |
* -1 is used to mark significant characters before counting, because it's not
|
|
Packit |
57a33d |
* zero.
|
|
Packit |
57a33d |
**/
|
|
Packit |
57a33d |
static void
|
|
Packit |
57a33d |
mark_scratch_buffer(EncaAnalyserState *analyser)
|
|
Packit |
57a33d |
{
|
|
Packit |
57a33d |
size_t i, j;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (j = 0; j < analyser->ncharsets; j++) {
|
|
Packit |
57a33d |
EncaUTFCheckData *amap = analyser->utfch + j;
|
|
Packit |
57a33d |
|
|
Packit |
57a33d |
for (i = 0; i < amap->size; i++)
|
|
Packit |
57a33d |
analyser->utfbuf[amap->ucs2[i]] = -1;
|
|
Packit |
57a33d |
}
|
|
Packit |
57a33d |
}
|