/*
filters and hooks that various languages can use
Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include <math.h>
#include "enca.h"
#include "internal.h"
/**
 * EncaBoxDraw:
 * @csname: Charset name.
 * @isvbox: Lookup table indexed by byte value; a nonzero entry marks the
 *          byte as one of the other (non-horizontal-line) box-drawing
 *          characters.
 * @h1: Horizontal line character (light).
 * @h2: Horizontal line character (heavy).
 *
 * Describes the box-drawing characters of one charset.  Member order
 * matters: the BOXDRAW table initializes these structures positionally.
 **/
typedef struct _EncaBoxDraw {
  const char *csname;            /* name of the charset */
  const unsigned char *isvbox;   /* per-byte flags, see above */
  unsigned char h1;              /* light horizontal line byte */
  unsigned char h2;              /* heavy horizontal line byte */
} EncaBoxDraw;
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
/* Per-byte lookup table (16 rows of 16 entries, one entry per byte value)
 * used as the isvbox member of the ibm852 and ibm775 rows of BOXDRAW.
 * A nonzero entry marks the byte as a vertical/mixed box-drawing character.
 * Nonzero entries occur only in the 0xB0, 0xC0 and 0xD0 rows; the horizontal
 * line bytes 196 (0xC4) and 205 (0xCD) are zero here. */
static const unsigned char BOXVERT_IBM852[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* ibm775 shares the ibm852 table, so it is only an alias. */
#define BOXVERT_IBM775 BOXVERT_IBM852
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
/* Per-byte lookup table (16 rows of 16 entries, one entry per byte value)
 * used as the isvbox member of the keybcs2, ibm866 and cp1125 rows of
 * BOXDRAW.  A nonzero entry marks the byte as a vertical/mixed box-drawing
 * character.  Every byte from 0xB3 to 0xDA is flagged except the horizontal
 * line bytes 196 (0xC4) and 205 (0xCD). */
static const unsigned char BOXVERT_KEYBCS2[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* ibm866 and cp1125 share the keybcs2 table, so they are only aliases. */
#define BOXVERT_IBM866 BOXVERT_KEYBCS2
#define BOXVERT_CP1125 BOXVERT_KEYBCS2
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
/* Per-byte lookup table (16 rows of 16 entries, one entry per byte value)
 * used as the isvbox member of the koi8r row of BOXDRAW.  A nonzero entry
 * marks the byte as a vertical/mixed box-drawing character.  Flagged bytes
 * are 0x81-0x8A and 0xA1-0xBE except 0xA3 and 0xB3; the horizontal line
 * bytes 128 (0x80) and 160 (0xA0) are zero here. */
static const unsigned char BOXVERT_KOI8R[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#if 0
/* UNUSED */
/* Compiled out: the matching koi8ru row of BOXDRAW is under #if 0 as well.
 * Same layout as the other BOXVERT_* tables: one entry per byte value,
 * nonzero for vertical/mixed box-drawing characters. */
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
static const unsigned char BOXVERT_KOI8RU[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#endif
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl
 *
 * Per-byte lookup table (16 rows of 16 entries, one entry per byte value)
 * used as the isvbox member of the koi8u row of BOXDRAW.  A nonzero entry
 * marks the byte as a vertical/mixed box-drawing character.
 *
 * Entry 0x80 must stay zero: byte 128 is the koi8u horizontal line byte h1
 * in BOXDRAW and is handled by the "at least two in a row" stage of the
 * filter.  Flagging it here as well would let a single 0x80 between
 * whitespace be filtered.  NOTE(review): if the generator input carries the
 * same flag, fix it there too so a regeneration does not reintroduce it. */
static const unsigned char BOXVERT_KOI8U[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x30 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x50 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x70 */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x80 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
  0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,   /* 0xA0 */
  1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,   /* 0xB0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xC0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xD0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xE0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xF0 */
};
/* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
/* Per-byte lookup table (16 rows of 16 entries, one entry per byte value)
 * used as the isvbox member of the koi8uni row of BOXDRAW.  A nonzero entry
 * marks the byte as a vertical/mixed box-drawing character.  Only bytes
 * 0x81-0x8A are flagged; the horizontal line byte 128 (0x80) is zero here. */
static const unsigned char BOXVERT_KOI8UNI[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* Box-drawing descriptions, one row per charset that has a filter.
 * Each row is { charset name, vertical/mixed lookup table, h1, h2 } where
 * h1 and h2 are the two horizontal line bytes.  filter_boxdraw_out() resolves
 * every name with enca_name_to_charset() and asserts that it is known. */
static const EncaBoxDraw BOXDRAW[] = {
{ "cp1125", BOXVERT_CP1125, 196, 205 },
{ "ibm775", BOXVERT_IBM775, 196, 205 },
{ "ibm852", BOXVERT_IBM852, 196, 205 },
{ "ibm866", BOXVERT_IBM866, 196, 205 },
{ "keybcs2", BOXVERT_KEYBCS2, 196, 205 },
{ "koi8r", BOXVERT_KOI8R, 128, 160 },
{ "koi8u", BOXVERT_KOI8U, 128, 160 },
{ "koi8uni", BOXVERT_KOI8UNI, 128, 128 }, /* only one horizontal line byte, so h1 == h2 */
#if 0
/* Disabled together with the BOXVERT_KOI8RU table. */
{ "koi8ru", BOXVERT_KOI8RU, 128, 160 },
#endif
};
/* Local prototypes. */
/* Per-charset worker called by enca_filter_boxdraw(); it is declared here
 * because its definition follows its caller.  Returns the number of bytes
 * it replaced in buffer. */
static size_t filter_boxdraw_out(int charset,
unsigned char *buffer,
size_t size,
unsigned char fill_char);
/**
 * enca_filter_boxdraw:
 * @analyser: Analyser whose charsets should be considered for filtration.
 * @fill_char: Replacement character for filtered bytes.
 *
 * Runs the box-drawing character filter on the analyser's buffer once for
 * each charset listed in @analyser.  The buffer is modified in place, so
 * later passes see the replacements made by earlier ones.
 *
 * Returns: Number of characters filtered out, summed over all charsets.
 **/
size_t
enca_filter_boxdraw(EncaAnalyserState *analyser,
                    unsigned char fill_char)
{
  size_t total = 0;
  size_t idx = 0;

  while (idx < analyser->ncharsets) {
    total += filter_boxdraw_out(analyser->charsets[idx],
                                analyser->buffer, analyser->size,
                                fill_char);
    idx++;
  }

  return total;
}
/**
 * filter_boxdraw_out:
 * @charset: Charset whose associated filter should be applied.
 * @buffer: Buffer to be filtered; it is modified in place.
 * @size: Size of @buffer in bytes.
 * @fill_char: Replacement character for filtered bytes.
 *
 * Replaces box-drawing characters in @buffer with @fill_char.
 *
 * Not all possibly box-drawing characters are replaced, only those meeting
 * certain conditions to reduce false filtering:
 * horizontal line bytes must form a run of at least two identical bytes,
 * and vertical/mixed bytes must be delimited by whitespace (or by the
 * buffer start/end on one side).  It's assumed isspace(@fill_char) is true
 * (an assertion fails when it isn't).
 *
 * It's OK to call with @charset which has no filter associated, it just
 * returns zero then.  Buffers shorter than two bytes are never modified.
 *
 * Returns: The number of characters filtered.
 **/
static size_t
filter_boxdraw_out(int charset,
                   unsigned char *buffer,
                   size_t size,
                   unsigned char fill_char)
{
  /* Charset ids matching the rows of BOXDRAW, resolved on the first call. */
  static int charset_id[ELEMENTS(BOXDRAW)];
  static int charset_id_initialized = 0;
  const EncaBoxDraw *bd;
  size_t i, n, xout;

  assert(enca_isspace(fill_char));

  if (!charset_id_initialized) {
    for (i = 0; i < ELEMENTS(BOXDRAW); i++) {
      charset_id[i] = enca_name_to_charset(BOXDRAW[i].csname);
      assert(charset_id[i] != ENCA_CS_UNKNOWN);
    }
    charset_id_initialized = 1;
  }

  /* Find whether we have any filter associated with this charset. */
  bd = NULL;
  for (i = 0; i < ELEMENTS(BOXDRAW); i++) {
    if (charset_id[i] == charset) {
      bd = BOXDRAW + i;
      break;
    }
  }
  if (bd == NULL)
    return 0;

  /* Neither stage can match in fewer than two bytes.  Returning here also
   * keeps the unsigned expression size-1 below from wrapping around when
   * size is zero. */
  if (size < 2)
    return 0;

  xout = 0;

  /* First stage:
   * Horizontal lines, they must occur at least two in a row. */
  i = 0;
  while (i < size-1) {
    if (buffer[i] == bd->h1 || buffer[i] == bd->h2) {
      /* Find the end of the run; the bound is tested before the byte is
       * read so that buffer[size] is never touched. */
      for (n = i+1; n < size && buffer[n] == buffer[i]; n++)
        ;

      if (n > i+1) {
        memset(buffer + i, fill_char, n - i);
        xout += n - i;
      }
      i = n;
    }
    else
      i++;
  }

  /* Second stage:
   * Vertical/mixed, they must occur separated by whitespace.
   * We assume isspace(fill_char) is true, so a byte replaced here counts
   * as whitespace for its right-hand neighbour. */

  /* First byte: only a right-hand neighbour exists. */
  if (bd->isvbox[buffer[0]]
      && enca_isspace(buffer[1])) {
    buffer[0] = fill_char;
    xout++;
  }

  /* Interior bytes: whitespace is required on both sides. */
  for (i = 1; i < size-1; i++) {
    if (bd->isvbox[buffer[i]]
        && enca_isspace(buffer[i-1])
        && enca_isspace(buffer[i+1])) {
      buffer[i] = fill_char;
      xout++;
    }
  }

  /* Last byte: only a left-hand neighbour exists. */
  if (bd->isvbox[buffer[size-1]]
      && enca_isspace(buffer[size-2])) {
    buffer[size-1] = fill_char;
    xout++;
  }

  return xout;
}
/**
 * enca_language_hook_ncs:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets (items in @hookdata).
 * @hookdata: What characters of which charsets should be given the extra
 *            weight.
 *
 * Decide between charsets differing only in a few characters.
 *
 * Nothing happens unless every @hookdata charset is among the @ncs
 * best-rated ones.  When they are, the occurrences of each charset's listed
 * characters are summed, and every charset is penalized in proportion to
 * how far its sum falls below the largest one; the charset with the largest
 * sum keeps its rating.  The penalty is scaled so that a charset with no
 * occurrences at all loses about half of the best rating.
 *
 * Charset indices not yet resolved in @hookdata are looked up by name and
 * stored back into it.
 *
 * It also recomputes the analyser's order when something changes.
 *
 * Returns: Nonzero when ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_ncs(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookData1CS *hookdata)
{
  const int *const ids = analyser->charsets;
  const size_t ncharsets = analyser->ncharsets;
  const size_t *counts = analyser->counts;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t best = 0;
  size_t idx, pos, c;
  double scale;

  assert(ncharsets > 0);
  assert(ncs <= ncharsets);
  if (ncs < 2)
    return 0;

  /* Resolve charset indices and require each one to be among the leaders. */
  for (idx = 0; idx < ncs; idx++) {
    EncaLanguageHookData1CS *hook = &hookdata[idx];

    /* (size_t)-1 marks an index that has not been looked up yet. */
    if (hook->cs == (size_t)-1) {
      const int id = enca_name_to_charset(hook->name);

      assert(id != ENCA_CS_UNKNOWN);
      for (pos = 0; pos < ncharsets; pos++) {
        if (ids[pos] == id)
          break;
      }
      assert(pos < ncharsets);
      hook->cs = pos;
    }

    /* A charset outside the first ncs positions of order ends the hook. */
    for (pos = 0; pos < ncs; pos++) {
      if (order[pos] == hook->cs)
        break;
    }
    if (pos == ncs)
      return 0;
  }

  /* Largest per-charset sum of the extra-important character counts. */
  for (idx = 0; idx < ncs; idx++) {
    const EncaLanguageHookData1CS *hook = &hookdata[idx];
    size_t sum = 0;

    for (c = 0; c < hook->size; c++)
      sum += counts[hook->list[c]];
    if (sum > best)
      best = sum;
  }
  if (best == 0)
    return 0;

  /* Subtract something from charsets that have less than the maximum. */
  scale = 0.5 * ratings[order[0]]/(best + EPSILON);
  for (idx = 0; idx < ncs; idx++) {
    const EncaLanguageHookData1CS *hook = &hookdata[idx];
    size_t deficit = best;

    for (c = 0; c < hook->size; c++)
      deficit -= counts[hook->list[c]];
    ratings[hook->cs] -= scale*deficit;
  }

  enca_find_max_sec(analyser);
  return 1;
}
/**
 * enca_language_hook_eol:
 * @analyser: Analyser whose charset ratings are to be modified.
 * @ncs: The number of charsets (items in @hookdata).
 * @hookdata: What characters of which charsets should be decided with based
 *            on the EOL type.
 *
 * Decide between charsets differing only in EOL type or other surface.
 *
 * The (surface mask, charset) pairs are scanned in order.  For the first
 * pair whose mask shares a bit with the detected surface, the ratings of
 * all other charsets in the list are zeroed.  So you can place a surface
 * mask of all 1s at the end to match when nothing else matches.
 *
 * The @ncs best-rated charsets have to have the same rating (neighbours
 * within EPSILON) and have to be exactly the @hookdata charsets, or nothing
 * happens.
 *
 * Charset indices not yet resolved in @hookdata are looked up by name and
 * stored back into it.
 *
 * It also recomputes the analyser's order when something changes.
 *
 * Returns: Nonzero when ratings were actually modified, zero otherwise.
 **/
int
enca_language_hook_eol(EncaAnalyserState *analyser,
                       size_t ncs,
                       EncaLanguageHookDataEOL *hookdata)
{
  const int *const ids = analyser->charsets;
  const size_t ncharsets = analyser->ncharsets;
  const size_t *const order = analyser->order;
  double *const ratings = analyser->ratings;
  size_t idx, pos, match;
  int changed = 0;

  assert(ncharsets > 0);
  assert(ncs <= ncharsets);
  if (ncs < 2)
    return 0;

  /* Neighbouring leaders must have equal ratings. */
  for (idx = 0; idx + 1 < ncs; idx++) {
    if (fabs(ratings[order[idx]] - ratings[order[idx+1]]) > EPSILON)
      return 0;
  }

  /* Resolve charset indices and require each one to be among the leaders. */
  for (idx = 0; idx < ncs; idx++) {
    EncaLanguageHookDataEOL *hook = &hookdata[idx];

    /* (size_t)-1 marks an index that has not been looked up yet. */
    if (hook->cs == (size_t)-1) {
      const int id = enca_name_to_charset(hook->name);

      assert(id != ENCA_CS_UNKNOWN);
      for (pos = 0; pos < ncharsets; pos++) {
        if (ids[pos] == id)
          break;
      }
      assert(pos < ncharsets);
      hook->cs = pos;
    }

    /* A charset outside the first ncs positions of order ends the hook. */
    for (pos = 0; pos < ncs; pos++) {
      if (order[pos] == hook->cs)
        break;
    }
    if (pos == ncs)
      return 0;
  }

  /* Find the first pair whose mask matches the detected surface. */
  match = ncs;
  for (idx = 0; idx < ncs; idx++) {
    if (hookdata[idx].eol & analyser->result.surface) {
      match = idx;
      break;
    }
  }
  if (match == ncs)
    return 0;

  /* Zero the still-positive rating of every charset except the match. */
  for (idx = 0; idx < ncs; idx++) {
    if (idx == match)
      continue;
    pos = hookdata[idx].cs;
    if (ratings[pos] > 0.0) {
      ratings[pos] = 0.0;
      changed = 1;
    }
  }

  if (changed)
    enca_find_max_sec(analyser);
  return changed;
}
/* vim: ts=2
*/