Blob Blame History Raw
/*
  pair-frequency based tests (used for 8bit-dense languages)

  Copyright (C) 2003 David Necas (Yeti) <yeti@physics.muni.cz>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <math.h>

#include "enca.h"
#include "internal.h"

/* Local prototypes. */
static size_t count_all_8bit_pairs(EncaAnalyserState *analyser);
static void compute_pair2bits(EncaAnalyserState *analyser);
static void count_good_pairs(EncaAnalyserState *analyser);

/**
 * enca_pair_init:
 * @analyser: Analyzer state to be initialized.
 *
 * Initializes pair statistics data.
 *
 * In fact it just sets everything to #NULL, to be initialized when needed.
 **/
void
enca_pair_init(EncaAnalyserState *analyser)
{
  analyser->bitcounts = NULL;
  analyser->pairratings = NULL;
  analyser->pair2bits = NULL;
}

/**
 * enca_pair_destroy:
 * @analyser: Analyzer state whose pair statistics part should be destroyed.
 *
 * Destroys the pair statistics part of analyser state @analyser.
 **/
void
enca_pair_destroy(EncaAnalyserState *analyser)
{
  enca_free(analyser->pair2bits);
  enca_free(analyser->bitcounts);
  enca_free(analyser->pairratings);
}

/**
 * enca_pair_analyse:
 * @analyser: Analysed containing the sample for pair frequency analysis.
 *
 * Performs pair-frequency based analysis, provided that the language supports
 * it (does nothing otherwise).
 *
 * Returns: Nonzero when the character set was succesfully determined,
 *          @analyser->@result.@charset is then directly modified.
 **/
int
enca_pair_analyse(EncaAnalyserState *analyser)
{
  const unsigned char *const *letters = analyser->lang->letters;
  const unsigned char **const *pairs = analyser->lang->pairs;
  size_t ncharsets = analyser->ncharsets;
  size_t i, best, all8bitpairs;
  double q;

  if (!letters || !pairs)
    return 0;

  if (!analyser->pairratings)
    analyser->pairratings = NEW(size_t, ncharsets);

  /* count the good pairs and find winner
   * initialize when we are called the first time */
  if (!analyser->pair2bits) {
    compute_pair2bits(analyser);
    analyser->bitcounts = NEW(size_t, 1 << ncharsets);
  }
  memset(analyser->pairratings, 0, ncharsets*sizeof(size_t));
  all8bitpairs = count_all_8bit_pairs(analyser);
  count_good_pairs(analyser);

  best = 0;
  for (i = 1; i < ncharsets; i++) {
    if (analyser->pairratings[i] > analyser->pairratings[best])
      best = i;
  }

  /* Just a Right Value */
  q = 1.0 - exp(3.0*(1.0 - analyser->options.threshold));
  if (analyser->pairratings[best] >= analyser->options.min_chars
      && analyser->pairratings[best] >= q*all8bitpairs) {
    analyser->result.charset = analyser->charsets[best];
    return 1;
  }

  /* I don't like saying it, but the sample seems to be garbage... */
  return 0;
}

/**
 * count_all_8bit_pairs:
 * @analyser: An analyser.
 *
 * Count all pairs containing at least one 8bit characters.
 *
 * Returns: The number of such pairs.
 **/
static size_t
count_all_8bit_pairs(EncaAnalyserState *analyser)
{
  unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;
  size_t i, c, sum8bits;

  sum8bits = 0;
  c = FILL_NONLETTER;
  for (i = size; i; i--) {
    if ((c | *buffer) & 0x80)
      sum8bits++;
    c = *(buffer++);
  }
  if (size && (c & 0x80))
    sum8bits++;

  return sum8bits;
}

/**
 * count_good_pairs:
 * @analyser: An analyser.
 *
 * Count `good' pairs for each charset.
 *
 * Makes use of @analyser->pair2bits.  See compute_pair2bits() comment for
 * description of how it works.
 **/
static void
count_good_pairs(EncaAnalyserState *analyser)
{
  size_t *ratings = analyser->pairratings;
  unsigned char *pair2bits = analyser->pair2bits;
  size_t *bitcounts = analyser->bitcounts;
  size_t ncharsets = analyser->ncharsets;
  const unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;
  size_t i, j, c, cs;

  assert(ncharsets <= 8);
  assert(pair2bits);
  assert(bitcounts);
  assert(ratings);

  memset(bitcounts, 0, (1 << ncharsets)*sizeof(size_t));
  c = FILL_NONLETTER << 8;
  for (i = size; i; i--) {
    bitcounts[pair2bits[c | *buffer]]++;
    c = *(buffer++) << 8;
  }
  if (size)
    bitcounts[pair2bits[c | FILL_NONLETTER]]++;

  memset(ratings, 0, ncharsets*sizeof(size_t));
  for (cs = 0; cs < ncharsets; cs++) {
    size_t bit = 1 << cs;
    size_t rating = 0;

    for (i = 0; i < (1U << ncharsets); i += 2*bit) {
      for (j = i+bit; j < i+2*bit; j++)
        rating += bitcounts[j];
    }
    ratings[cs] = rating;
  }
}

/**
 * compute_pair2bits:
 * @analyser: An analyser.
 *
 * Allocate and fill @analyser->@pair2bits.
 *
 * The meaning of pair2bits is following: it's indexed by pairs (0x100*first
 * + second) and each @pair2bits element has set a bit corresponding to a
 * charset when the pair is `good' for the charset.
 *
 * To determine `good' pair counts for all charsets we don't have to count
 * the pairs, we only have to count the bit combinations (@bitcounts) and the
 * per charset pair counts can be easily constructed from them -- by summing
 * those with the particular bit set.
 **/
static void
compute_pair2bits(EncaAnalyserState *analyser)
{
  size_t ncharsets = analyser->ncharsets;
  size_t cs, c;

  assert(analyser->pair2bits == NULL);
  assert(analyser->ncharsets <= 8);

  analyser->pair2bits = NEW(unsigned char, 0x10000);
  memset(analyser->pair2bits, 0, 0x10000);
  for (cs = 0; cs < ncharsets; cs++) {
    const unsigned char *letters = analyser->lang->letters[cs];
    const unsigned char *const *pairs = analyser->lang->pairs[cs];
    size_t bit = 1 << cs;

    for (c = 0; c < 0x100; c++) {
      size_t j = letters[c];

      if (j != 255) {
        const unsigned char *s = pairs[j];
        unsigned char *p = analyser->pair2bits + (c << 8);

        /* set the corresponding bit for all pairs */
        do {
          p[(size_t)*(s++)] |= bit;
        } while (*s);
      }
    }
  }
}