/*
language info: chinese
Copyright (C) 2005 Meng Jie (Zuxy) <zuxy.meng@gmail.com>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include "enca.h"
#include "internal.h"
#include "data/chinese/chinese.h"
/* Forward declarations: both functions are referenced by the ENCA_LANGUAGE_ZH
 * initializer before their definitions appear further down in this file. */
static int hook(EncaAnalyserState *analyser);
static int calc_rating(EncaAnalyserState *analyser);
/*
 * Validity check occupying the "hz" slot of validity_check_table.
 *
 * calc_rating() only hands over byte pairs whose lead byte has the high bit
 * set, i.e. data that is not 8-bit clean, so it can't be HZ here.  The
 * argument is therefore ignored and every candidate is rejected.
 *
 * Returns: Always 0.
 */
static int
is_hz(const unsigned char *str)
{
  (void)str; /* intentionally unused */
  return 0;
}
/*
 * Names of the charsets rated for language "zh".
 *
 * The position of a name is significant: validity_check_table,
 * rate_calc_table and the ratings filled in by calc_rating() are all indexed
 * in this same order.
 */
static const char *const CHARSET_NAMES[] = {
  "gbk",   /* index 0 */
  "big5",  /* index 1 */
  "hz",    /* index 2 */
};
/*
 * Per-charset validity checks, indexed in the same order as CHARSET_NAMES.
 *
 * calc_rating() calls entry j on every two-byte candidate and stops rating
 * charset j as soon as the check returns zero.  Nothing in this file writes
 * to the table, so the pointers are const to keep it immutable.
 */
static ValidityFunc *const validity_check_table[] = {
  is_gbk,
  is_big5,
  is_hz
};
/*
 * Per-charset rating lookups, indexed in the same order as CHARSET_NAMES.
 *
 * calc_rating() consults entry j only after the matching validity check
 * accepted the candidate.  The "hz" slot is NULL: is_hz() rejects every
 * candidate, so no lookup is ever attempted for that charset.  Nothing in this
 * file writes to the table, so the pointers are const to keep it immutable.
 */
static RateFunc *const rate_calc_table[] = {
  in_gbk,
  in_big5,
  NULL
};
/* Number of charsets of this language, i.e. the element count of CHARSET_NAMES. */
#define NCHARSETS (sizeof(CHARSET_NAMES) / sizeof(CHARSET_NAMES[0]))
/**
 * ENCA_LANGUAGE_ZH:
 *
 * Chinese language.
 *
 * Everything the world out there needs to know about this language.
 *
 * The initializer is positional, so its entries must follow the member order
 * of EncaLanguageInfo, which is declared outside this file.
 * NOTE(review): the per-entry remarks below describe only what is visible
 * here; the meaning of each member should be confirmed against the
 * EncaLanguageInfo declaration.
 **/
const EncaLanguageInfo ENCA_LANGUAGE_ZH = {
/* Short and long language names (presumably locale code and human-readable
 * name -- confirm). */
"zh",
"chinese",
/* Number of charsets and the table of their names. */
NCHARSETS,
CHARSET_NAMES,
/* Five members left zero; presumably the data tables this language does not
 * provide because ratings come from calc_rating() instead -- confirm. */
0,
0,
0,
0,
0,
/* First function slot: hook(), which adjusts the ratings after calc_rating(). */
&hook,
/* Two function slots left unset. */
NULL,
NULL,
/* Last function slot: calc_rating(), which computes the ratings itself. */
&calc_rating
};
/**
 * hook:
 * @analyser: Analyser state whose charset ratings are to be modified.
 *
 * Adjust ratings for language "zh", see calc_rating below.
 *
 * Only the two charsets listed at order[0] and order[1] are examined.  When
 * the rating of the second one is negative it is reset to zero, and the
 * rating of the first one becomes zero if it was negative as well, or one
 * otherwise so that the first charset clearly wins.
 *
 * Returns: Nonzero if charset ratings have been actually modified, zero
 *          otherwise.
 **/
static int
hook(EncaAnalyserState *analyser)
{
  double *const ratings = analyser->ratings;
  const size_t first = analyser->order[0];
  const size_t second = analyser->order[1];
  int modified = 0;

  if (ratings[second] < 0) {
    ratings[second] = 0.;
    /* Make sure that the first won, unless it was marked invalid too. */
    ratings[first] = (ratings[first] < 0) ? 0. : 1.;
    modified = 1;
  }

  return modified;
}
/**
 * calc_rating:
 * @analyser: An analyser.
 *
 * Calculates the ratings for GBK and Big5 (and the always-rejected HZ slot).
 *
 * The buffer is scanned byte by byte.  A byte with the high bit set is taken
 * as the first byte of a two-byte character and the byte after it as the
 * second one.  Each such pair is offered to every charset that is still
 * considered valid: if the validity check fails, the rating of that charset is
 * set to -1 and the charset is not examined again; otherwise the frequency
 * found by the rating lookup (if any) is added to its rating.
 *
 * Ratings may thus be negative when an invalid character for a charset was
 * encountered.  This should not affect the result of enca_find_max_sec, but
 * must be adjusted to a non-negative value by hook for the final comparison.
 *
 * If the buffer ends in the middle of a two-byte character and
 * termination_strictness is positive, all ratings are reset to zero.
 *
 * Returns: Always 1.
 **/
static int
calc_rating(EncaAnalyserState *analyser)
{
  int islowbyte = 0;
  /* size_t indices: size is a size_t, a narrower counter would wrap around
   * and never terminate on buffers larger than UINT_MAX bytes. */
  size_t i, j;
  const size_t size = analyser->size;
  const unsigned char *buffer = analyser->buffer;
  double *ratings = analyser->ratings;
  int continue_check[NCHARSETS];
  const struct zh_weight *pweight;

  /* All per-charset tables must have one entry per charset. */
  assert(analyser->ncharsets == NCHARSETS
         && sizeof(rate_calc_table)/sizeof(rate_calc_table[0]) == NCHARSETS
         && sizeof(validity_check_table)/sizeof(validity_check_table[0])
            == NCHARSETS);

  for (i = 0; i < NCHARSETS; i++) {
    continue_check[i] = 1;
    ratings[i] = 0.;
  }

  for (i = 0; i < size; i++) {
    const unsigned char *hanzi;

    if (!islowbyte) {
      /* High bit set: this byte opens a two-byte character. */
      if (buffer[i] & 0x80)
        islowbyte = 1;
      continue;
    }

    /* low byte: the character starts at the previous position */
    assert(i > 0);
    hanzi = buffer + i - 1;

    for (j = 0; j < NCHARSETS; j++) {
      if (!continue_check[j])
        continue;

      continue_check[j] = validity_check_table[j](hanzi);
      if (!continue_check[j]) {
        /* Invalid for this charset: mark it and never look at it again. */
        ratings[j] = -1.;
        continue;
      }

      /* A charset may have no rating lookup (NULL table entry). */
      if (!rate_calc_table[j])
        continue;

      pweight = rate_calc_table[j](hanzi);
      if (pweight)
        ratings[j] += pweight->freq;
    }
    islowbyte = 0;
  }

#ifdef DEBUG
  printf("GBK: %f, BIG5: %f\n", ratings[0], ratings[1]);
#endif

  /* Unfinished DBCS. */
  if (islowbyte && analyser->options.termination_strictness > 0) {
    for (i = 0; i < NCHARSETS; i++)
      ratings[i] = 0.;
  }

  return 1;
}
/* vim: ts=2
*/