Blame lib/utf8_double.c

Packit 57a33d
/*
Packit 57a33d
  checks for doubly-encoded utf-8
Packit 57a33d
Packit 57a33d
  Copyright (C) 2000-2002 David Necas (Yeti) <yeti@physics.muni.cz>
Packit 57a33d
Packit 57a33d
  This program is free software; you can redistribute it and/or modify it
Packit 57a33d
  under the terms of version 2 of the GNU General Public License as published
Packit 57a33d
  by the Free Software Foundation.
Packit 57a33d
Packit 57a33d
  This program is distributed in the hope that it will be useful, but WITHOUT
Packit 57a33d
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
Packit 57a33d
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
Packit 57a33d
  more details.
Packit 57a33d
Packit 57a33d
  You should have received a copy of the GNU General Public License along
Packit 57a33d
  with this program; if not, write to the Free Software Foundation, Inc.,
Packit 57a33d
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
Packit 57a33d
*/
Packit 57a33d
#ifdef HAVE_CONFIG_H
Packit 57a33d
#  include "config.h"
Packit 57a33d
#endif /* HAVE_CONFIG_H */
Packit 57a33d
Packit 57a33d
#include <stdlib.h>
Packit 57a33d
#include <math.h>
Packit 57a33d
Packit 57a33d
#include "enca.h"
Packit 57a33d
#include "internal.h"
Packit 57a33d
Packit 57a33d
/* Local prototypes. */
Packit 57a33d
static void compute_double_utf8_weights (EncaAnalyserState *analyser);
Packit 57a33d
static void create_ucs2_weight_table    (EncaUTFCheckData *amap,
Packit 57a33d
                                         size_t size,
Packit 57a33d
                                         int *wbuf);
Packit 57a33d
static void mark_scratch_buffer         (EncaAnalyserState *analyser);
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * enca_double_utf8_init:
Packit 57a33d
 * @analyser: Analyzer state to be initialized.
Packit 57a33d
 *
Packit 57a33d
 * Initializes double-UTF-8 check.
Packit 57a33d
 *
Packit 57a33d
 * In fact it initializes the fields to #NULL's, they are actually initialized
Packit 57a33d
 * only when needed.
Packit 57a33d
 **/
Packit 57a33d
void
Packit 57a33d
enca_double_utf8_init(EncaAnalyserState *analyser)
Packit 57a33d
{
Packit 57a33d
  analyser->utfch = NULL;
Packit 57a33d
  analyser->utfbuf = NULL;
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * enca_double_utf8_destroy:
Packit 57a33d
 * @analyser: Analyzer state whose double-UTF-8 check part should be destroyed.
Packit 57a33d
 *
Packit 57a33d
 * Destroys the double-UTF-8 check part of analyser state @analyser.
Packit 57a33d
 **/
Packit 57a33d
void
Packit 57a33d
enca_double_utf8_destroy(EncaAnalyserState *analyser)
Packit 57a33d
{
Packit 57a33d
  size_t i;
Packit 57a33d
Packit 57a33d
  if (analyser->utfch == NULL)
Packit 57a33d
    return;
Packit 57a33d
Packit 57a33d
  enca_free(analyser->utfbuf);
Packit 57a33d
Packit 57a33d
  for (i = 0; i < analyser->ncharsets; i++) {
Packit 57a33d
    enca_free(analyser->utfch[i].ucs2);
Packit 57a33d
    enca_free(analyser->utfch[i].weights);
Packit 57a33d
  }
Packit 57a33d
  enca_free(analyser->utfch);
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * enca_double_utf8_check:
Packit 57a33d
 * @analyser: Analyzer state determinig the language for double-UTF-8 check.
Packit 57a33d
 * @buffer: The buffer to be checked [@size].
Packit 57a33d
 * @size: The size of @buffer.
Packit 57a33d
 *
Packit 57a33d
 * Checks buffer for double-UTF-8 encoding.
Packit 57a33d
 *
Packit 57a33d
 * Double-UTF-8 encoding is the result of [errorneous] conversion of UTF-8 text
Packit 57a33d
 * to UTF-8 again, as if it was in some 8bit charset.  This is quite hard to
Packit 57a33d
 * recover from.
Packit 57a33d
 *
Packit 57a33d
 * The analayser mostly only determines what language will be assumed,
Packit 57a33d
 * the rest of this test is independent on the main guessing routines.
Packit 57a33d
 * When @buffer doesn't containing UTF-8 text, the result is undefined
Packit 57a33d
 * (namely, false positives are possible).
Packit 57a33d
 *
Packit 57a33d
 * Calling this function when language is `none' has currently no effect.
Packit 57a33d
 *
Packit 57a33d
 * Returns: Nonzero, when @buffer probably contains doubly-UTF-8 encoded text.
Packit 57a33d
 *          More precisely, it returns the number of charsets which are
Packit 57a33d
 *          possible candidates for source charset.  You can then use
Packit 57a33d
 *          enca_double_utf8_get_candidates() to retrieve the charsets.
Packit 57a33d
 **/
Packit 57a33d
int
Packit 57a33d
enca_double_utf8_check(EncaAnalyser analyser,
Packit 57a33d
                       const unsigned char *buffer,
Packit 57a33d
                       size_t size)
Packit 57a33d
{
Packit 57a33d
  long int ucs4char = 0;
Packit 57a33d
  int remains_10xxxxxx = 0;
Packit 57a33d
  size_t i;
Packit 57a33d
Packit 57a33d
  if (analyser->ncharsets == 0 || analyser->lang->weights == 0)
Packit 57a33d
    return 0;
Packit 57a33d
Packit 57a33d
  /* Compute weights when we are called the first time. */
Packit 57a33d
  if (analyser->utfch == NULL)
Packit 57a33d
    compute_double_utf8_weights(analyser);
Packit 57a33d
Packit 57a33d
  mark_scratch_buffer(analyser);
Packit 57a33d
Packit 57a33d
  /* Parse. */
Packit 57a33d
  for (i = 0; i < size; i++) {
Packit 57a33d
    unsigned char b = buffer[i];
Packit 57a33d
Packit 57a33d
    if (!remains_10xxxxxx) {
Packit 57a33d
      if ((b & 0x80) == 0) /* 7bit characters */
Packit 57a33d
        continue;
Packit 57a33d
      if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
Packit 57a33d
        ucs4char = b & 0x1f;
Packit 57a33d
        remains_10xxxxxx = 1;
Packit 57a33d
        continue;
Packit 57a33d
      }
Packit 57a33d
      if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
Packit 57a33d
        ucs4char = b & 0x0f;
Packit 57a33d
        remains_10xxxxxx = 2;
Packit 57a33d
        continue;
Packit 57a33d
      }
Packit 57a33d
      /* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
Packit 57a33d
         nevertheless we accept them. */
Packit 57a33d
      if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
Packit 57a33d
        ucs4char = b & 0x07;
Packit 57a33d
        remains_10xxxxxx = 3;
Packit 57a33d
        continue;
Packit 57a33d
      }
Packit 57a33d
      if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
Packit 57a33d
        ucs4char = b & 0x03;
Packit 57a33d
        remains_10xxxxxx = 4;
Packit 57a33d
        continue;
Packit 57a33d
      }
Packit 57a33d
      if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
Packit 57a33d
        ucs4char = b & 0x01;
Packit 57a33d
        remains_10xxxxxx = 5;
Packit 57a33d
        continue;
Packit 57a33d
      }
Packit 57a33d
      /* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
Packit 57a33d
      remains_10xxxxxx = 0;
Packit 57a33d
    }
Packit 57a33d
    else {
Packit 57a33d
      /* Broken 10xxxxxx sequence? */
Packit 57a33d
      if ((b & 0xc0) != 0x80) {
Packit 57a33d
        remains_10xxxxxx = 0;
Packit 57a33d
      }
Packit 57a33d
      else {
Packit 57a33d
        /* Good 10xxxxxx continuation. */
Packit 57a33d
        ucs4char <<= 6;
Packit 57a33d
        ucs4char |= b & 0x3f;
Packit 57a33d
        remains_10xxxxxx--;
Packit 57a33d
Packit 57a33d
        /* Do we have a whole character?
Packit 57a33d
         * (We must not touch positions in utfbuf containing zeroes.) */
Packit 57a33d
        if (remains_10xxxxxx == 0
Packit 57a33d
            && ucs4char < 0x10000
Packit 57a33d
            && analyser->utfbuf[ucs4char] != 0) {
Packit 57a33d
          if (analyser->utfbuf[ucs4char] < 0)
Packit 57a33d
            analyser->utfbuf[ucs4char] = 1;
Packit 57a33d
          else
Packit 57a33d
            analyser->utfbuf[ucs4char]++;
Packit 57a33d
        }
Packit 57a33d
      }
Packit 57a33d
    }
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  /* Compute the ratings. */
Packit 57a33d
  for (i = 0; i < analyser->ncharsets; i++) {
Packit 57a33d
    EncaUTFCheckData *amap = analyser->utfch + i;
Packit 57a33d
    size_t j;
Packit 57a33d
Packit 57a33d
    amap->rating = 0.0;
Packit 57a33d
    amap->result = 0;
Packit 57a33d
    for (j = 0; j < amap->size; j++)
Packit 57a33d
      amap->rating += analyser->utfbuf[amap->ucs2[j]] * amap->weights[j];
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  /* Now check whether we've found some negative ratings. */
Packit 57a33d
  {
Packit 57a33d
    size_t min = 0;
Packit 57a33d
    size_t max = 0;
Packit 57a33d
Packit 57a33d
    for (i = 1; i < analyser->ncharsets; i++) {
Packit 57a33d
      if (analyser->utfch[i].rating < analyser->utfch[min].rating)
Packit 57a33d
        min = i;
Packit 57a33d
      if (analyser->utfch[i].rating > analyser->utfch[max].rating)
Packit 57a33d
        max = i;
Packit 57a33d
    }
Packit 57a33d
Packit 57a33d
    if (analyser->utfch[min].rating < 0.0
Packit 57a33d
        && -analyser->utfch[min].rating > 0.5*analyser->utfch[max].rating) {
Packit 57a33d
      size_t total = 0;
Packit 57a33d
      double q = analyser->utfch[min].rating
Packit 57a33d
                 * (1.0 - 45.0*exp(-4.5*analyser->options.threshold));
Packit 57a33d
Packit 57a33d
      for (i = 0; i < analyser->ncharsets; i++) {
Packit 57a33d
        if (analyser->utfch[i].rating < q) {
Packit 57a33d
          analyser->utfch[i].result = 1;
Packit 57a33d
          total++;
Packit 57a33d
        }
Packit 57a33d
      }
Packit 57a33d
      return total;
Packit 57a33d
    }
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  return 0;
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * enca_double_utf8_get_candidates:
Packit 57a33d
 * @analyser: Analyzer state for which double-UTF-8 candidates are to be
Packit 57a33d
 *            returned.
Packit 57a33d
 *
Packit 57a33d
 * Returns array of double-UTF-8 source charset candidates from the last check.
Packit 57a33d
 *
Packit 57a33d
 * The returned array should be freed by caller then no longer needed. Its
Packit 57a33d
 * is the return value of the preceding enca_double_utf8_check() call.
Packit 57a33d
 *
Packit 57a33d
 * When called before any double-UTF-8 test has been performed yet or after
Packit 57a33d
 * and unsuccessfull double-UTF-8 test, it returns NULL, but the result after
Packit 57a33d
 * an unsuccessfull check should be considered undefined.
Packit 57a33d
 *
Packit 57a33d
 * Returns: An array containing charset id's of possible source charsets from
Packit 57a33d
 *          which the sample was doubly-UTF-8 encoded.  The array may contain
Packit 57a33d
 *          only one value, but usually enca is not able to decide between
Packit 57a33d
 *          e.g. ISO-8859-2 and Win1250, thus more candidates are returned.
Packit 57a33d
 **/
Packit 57a33d
int*
Packit 57a33d
enca_double_utf8_get_candidates(EncaAnalyser analyser)
Packit 57a33d
{
Packit 57a33d
  size_t j = 0;
Packit 57a33d
  size_t i;
Packit 57a33d
  int *candidates;
Packit 57a33d
Packit 57a33d
  assert(analyser);
Packit 57a33d
  if (analyser->utfch == NULL)
Packit 57a33d
    return NULL;
Packit 57a33d
Packit 57a33d
  for (i = 0; i < analyser->ncharsets; i++) {
Packit 57a33d
    if (analyser->utfch[i].result)
Packit 57a33d
      j++;
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  if (j == 0)
Packit 57a33d
    return NULL;
Packit 57a33d
Packit 57a33d
  candidates = NEW(int, j);
Packit 57a33d
  j = 0;
Packit 57a33d
  for (i = 0; i < analyser->ncharsets; i++) {
Packit 57a33d
    if (analyser->utfch[i].result) {
Packit 57a33d
      candidates[j] = analyser->charsets[i];
Packit 57a33d
      j++;
Packit 57a33d
    }
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  return candidates;
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * compute_double_utf8_weights:
Packit 57a33d
 * @analyser: Analyzer state whose double-UTF-8 check weigths should be
Packit 57a33d
 *            computed.
Packit 57a33d
 *
Packit 57a33d
 * Computes UCS-2 character weights used in double-UTF-8 check.  Must be
Packit 57a33d
 * called at most once for a given analyser.  It also allocates the scratch
Packit 57a33d
 * buffer analyser->utfbuf and leaves it filled with zeroes.
Packit 57a33d
 **/
Packit 57a33d
static void
Packit 57a33d
compute_double_utf8_weights(EncaAnalyserState *analyser)
Packit 57a33d
{
Packit 57a33d
  int *buf;
Packit 57a33d
  unsigned int ucs2map[0x100];
Packit 57a33d
  size_t i, j;
Packit 57a33d
Packit 57a33d
  assert(analyser != NULL);
Packit 57a33d
  assert(analyser->lang != NULL);
Packit 57a33d
  assert(analyser->utfch == NULL);
Packit 57a33d
  assert(analyser->utfbuf == NULL);
Packit 57a33d
  if (analyser->ncharsets == 0)
Packit 57a33d
    return;
Packit 57a33d
Packit 57a33d
  analyser->utfch = NEW(EncaUTFCheckData, analyser->ncharsets);
Packit 57a33d
  analyser->utfbuf = NEW(int, 0x10000);
Packit 57a33d
  buf = analyser->utfbuf;
Packit 57a33d
Packit 57a33d
  for (i = 0; i < 0x10000; i++)
Packit 57a33d
    buf[i] = 0;
Packit 57a33d
Packit 57a33d
  /* For all charsets compute UTF-8 prefix byte occurence tables and select
Packit 57a33d
   * those characters having the highest difference between occurences when
Packit 57a33d
   * counted as UTF-8 prefix and when counted as a regular character. */
Packit 57a33d
  for (j = 0; j < analyser->ncharsets; j++) {
Packit 57a33d
    const unsigned short int *const w = analyser->lang->weights[j];
Packit 57a33d
    size_t table_size = 0;
Packit 57a33d
Packit 57a33d
    assert(enca_charset_has_ucs2_map(analyser->charsets[j]));
Packit 57a33d
    enca_charset_ucs2_map(analyser->charsets[j], ucs2map);
Packit 57a33d
Packit 57a33d
    /* Go through all characters, some maps may map even 7bits to something
Packit 57a33d
     * else. Compute required table size meanwhile. */
Packit 57a33d
    for (i = 0; i < 0x100; i++) {
Packit 57a33d
      unsigned int ucs2c = ucs2map[i];
Packit 57a33d
      assert(ucs2c < 0x10000);
Packit 57a33d
Packit 57a33d
      if (w[i] == 0)
Packit 57a33d
        continue;
Packit 57a33d
Packit 57a33d
      /* Count the character weight as positive. */
Packit 57a33d
      if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
Packit 57a33d
        continue;
Packit 57a33d
Packit 57a33d
      if (buf[ucs2c] == 0)
Packit 57a33d
        table_size++;
Packit 57a33d
      buf[ucs2c] += w[i];
Packit 57a33d
Packit 57a33d
      /* Transform the character and count UTF-8 transformed first byte weight
Packit 57a33d
       * as negative. */
Packit 57a33d
      if (ucs2c < 0x800)
Packit 57a33d
        ucs2c = ucs2map[0xc0 | (ucs2c >> 6)];
Packit 57a33d
      else
Packit 57a33d
        ucs2c = ucs2map[0xe0 | (ucs2c >> 12)];
Packit 57a33d
Packit 57a33d
      if (ucs2c < 0x80 || ucs2c == ENCA_NOT_A_CHAR)
Packit 57a33d
        continue;
Packit 57a33d
Packit 57a33d
      if (buf[ucs2c] == 0)
Packit 57a33d
        table_size++;
Packit 57a33d
      buf[ucs2c] -= w[i];
Packit 57a33d
      if (buf[ucs2c] == 0)
Packit 57a33d
        buf[ucs2c] = 1;
Packit 57a33d
    }
Packit 57a33d
Packit 57a33d
    /* Build the table of significant UCS-2 characters, i.e. characters
Packit 57a33d
     * having nonzero weight. */
Packit 57a33d
    create_ucs2_weight_table(analyser->utfch + j, table_size, buf);
Packit 57a33d
  }
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * create_ucs2_weight_table:
Packit 57a33d
 * @amap: A pointer to Double-UTF8-check data to be filled.
Packit 57a33d
 * @size: The number of UCS-2 characters with nonzero weight in @wbuf.
Packit 57a33d
 * @wbuf: UCS-2 character weights [@size].
Packit 57a33d
 *
Packit 57a33d
 * Creates `compressed' UCS-2 weight table.
Packit 57a33d
 **/
Packit 57a33d
static void
Packit 57a33d
create_ucs2_weight_table(EncaUTFCheckData *amap,
Packit 57a33d
                         size_t size,
Packit 57a33d
                         int *wbuf)
Packit 57a33d
{
Packit 57a33d
  unsigned int ucs2c;
Packit 57a33d
  size_t i;
Packit 57a33d
Packit 57a33d
  amap->size = size;
Packit 57a33d
  amap->ucs2 = NEW(int, size);
Packit 57a33d
  amap->weights = NEW(int, size);
Packit 57a33d
Packit 57a33d
  i = 0;
Packit 57a33d
  for (ucs2c = 0; ucs2c < 0x10000; ucs2c++) {
Packit 57a33d
    if (wbuf[ucs2c] != 0) {
Packit 57a33d
      assert(i < size);
Packit 57a33d
Packit 57a33d
      amap->ucs2[i] = ucs2c;
Packit 57a33d
      amap->weights[i] = wbuf[ucs2c];
Packit 57a33d
      wbuf[ucs2c] = 0;  /* Fill the buffer with zeroes. */
Packit 57a33d
      i++;
Packit 57a33d
    }
Packit 57a33d
  }
Packit 57a33d
Packit 57a33d
  assert(i == size);
Packit 57a33d
}
Packit 57a33d
Packit 57a33d
/**
Packit 57a33d
 * mark_scratch_buffer:
Packit 57a33d
 * @analyser: Analyzer whose significant ucs2 characters are to be marked in
Packit 57a33d
 *            @analyser->utfbuf.
Packit 57a33d
 *
Packit 57a33d
 * Marks significant characters in @analyser->utfbuf with -1.
Packit 57a33d
 *
Packit 57a33d
 * The @analyser->utfbuf buffer is magic.  Once we found the significant
Packit 57a33d
 * characters in compute_double_utf8_weights(), we always keep zeroes at
Packit 57a33d
 * positions of nonsiginifant characters.  This way we never have to scan
Packit 57a33d
 * through the whole buffer, not even to fill it wit zeroes -- we put zeroes
Packit 57a33d
 * only where we know we changed it.
Packit 57a33d
 *
Packit 57a33d
 * -1 is used to mark significant characters before counting, because it's not
Packit 57a33d
 * zero.
Packit 57a33d
 **/
Packit 57a33d
static void
Packit 57a33d
mark_scratch_buffer(EncaAnalyserState *analyser)
Packit 57a33d
{
Packit 57a33d
  size_t i, j;
Packit 57a33d
Packit 57a33d
  for (j = 0; j < analyser->ncharsets; j++) {
Packit 57a33d
    EncaUTFCheckData *amap = analyser->utfch + j;
Packit 57a33d
Packit 57a33d
    for (i = 0; i < amap->size; i++)
Packit 57a33d
      analyser->utfbuf[amap->ucs2[i]] = -1;
Packit 57a33d
  }
Packit 57a33d
}