/* Blob Blame History Raw -- web-viewer residue, not part of the header */
#ifndef LIBENCA_H
#define LIBENCA_H
/***************************************************************************
 *
 *  Do not use anything from this file in applications.
 *  Or else don't be surprised when they mysteriously crash.
 *  Changes in internal interfaces DON'T count as interface
 *  changes and DON'T cause library API version changes.
 *
 ***************************************************************************/

#include <assert.h>

#include "enca.h"

/* str- and mem- functions, theoretically they are all in string.h */
#ifdef HAVE_STRING_H
#  include <string.h>
#else /* HAVE_STRING_H */
#  ifdef HAVE_STRINGS_H
#    include <strings.h>
#  endif /* HAVE_STRINGS_H */
#endif /* HAVE_STRING_H */

#ifdef HAVE_MEMORY_H
#  include <memory.h>
#endif /* HAVE_MEMORY_H */

#ifdef DEBUG
#  include <stdio.h>
#endif /* DEBUG */

/* Bit flags stored in the character type table, one bit per class.
 * Bits 0-10 are the standard classes, bits 11-13 are Enca-specific. */
enum {
  ENCA_CTYPE_ALNUM  = 0x0001,
  ENCA_CTYPE_ALPHA  = 0x0002,
  ENCA_CTYPE_CNTRL  = 0x0004,
  ENCA_CTYPE_DIGIT  = 0x0008,
  ENCA_CTYPE_GRAPH  = 0x0010,
  ENCA_CTYPE_LOWER  = 0x0020,
  ENCA_CTYPE_PRINT  = 0x0040,
  ENCA_CTYPE_PUNCT  = 0x0080,
  ENCA_CTYPE_SPACE  = 0x0100,
  ENCA_CTYPE_UPPER  = 0x0200,
  ENCA_CTYPE_XDIGIT = 0x0400,
  ENCA_CTYPE_NAME   = 0x0800,
  ENCA_CTYPE_BINARY = 0x1000,
  ENCA_CTYPE_TEXT   = 0x2000
};

/* Forward declarations of structured Enca types.
 * The struct bodies are defined further down in this header. */
typedef struct _EncaAnalyserOptions EncaAnalyserOptions;
typedef struct _EncaAnalyserState EncaAnalyserState;
typedef struct _EncaCharsetInfo EncaCharsetInfo;
typedef struct _EncaLanguageInfo EncaLanguageInfo;
typedef struct _EncaLanguageHookData1CS EncaLanguageHookData1CS;
typedef struct _EncaLanguageHookDataEOL EncaLanguageHookDataEOL;
typedef struct _EncaUTFCheckData EncaUTFCheckData;

/**
 * EncaCharsetInfo:
 * @enca: Default, implicit name in enca.
 * @rfc1345: RFC1345 charset name.
 *          (For charsets not in RFC1345, some canonical name is invented.)
 * @cstocs: Cstocs charset name or -1.
 * @iconv: Iconv charset name or -1.
 * @mime: Preferred MIME charset name or -1.
 * @human: Human comprehensible description.
 * @flags: Charset properties (7bit, 8bit, multibyte, ...).
 * @nsurface: Natural surface (`implied' in recode).
 *
 * General charset information.
 *
 * All the #int fields are indices in #ALIAS_LIST[]; the ones documented
 * with `or -1' use -1 to say that no such name exists.
 **/
struct _EncaCharsetInfo {
  int enca;
  int rfc1345;
  int cstocs;
  int iconv;
  int mime;
  const char *human;
  unsigned int flags;
  unsigned int nsurface;
};

/**
 * EncaHookFunc:
 * @analyser: Analyser state whose charset ratings are to be modified.
 *
 * Language hook function type.
 *
 * Launches language specific hooks for a particular language.
 *
 * Returns: Nonzero if charset ratings have been actually modified, zero
 * otherwise.
 **/
typedef int (* EncaHookFunc)(EncaAnalyserState *analyser);

/**
 * EncaGuessFunc:
 * @analyser: Analyser state whose buffer should be checked.
 *
 * Special (multibyte) encoding check function type.
 *
 * Pointers of this type are the elements of the NULL-terminated
 * ENCA_MULTIBYTE_TESTS_* arrays declared near the end of this header.
 *
 * Returns: Nonzero if analyser->result has been set, zero otherwise.
 **/
typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser);

/**
 * EncaLanguageInfo:
 * @name: Language name, or more precisely, locale name.
 * @humanname: Normal human-readable [English] language name.
 * @ncharsets: Number of charsets in this language.
 * @csnames: Charset names [@ncharsets].
 * @weights: Character weights for charsets [@ncharsets][0x100].
 * @significant: Character significance data [0x100].
 * @letters: Characters considered letters (255's have no entry in @pairs,
 *           zeroes are non-letters aka FILL_NONLETTERs)
 * @pairs: Frequent pair table [max number in @letters].
 * @weight_sum: Sum of all @weights (is the same for all charsets).
 * @hook: Hook function (deciding hard cases).
 * @eolhook: EOL hook function (deciding ambiguous cases based on EOL type).
 * @lcuchook: LCUC hook function.  NOTE(review): its purpose is not
 *            documented in this header, and the LCUC data in
 *            #EncaAnalyserState is marked as unused -- confirm before use.
 * @ratinghook: Helper to calculate ratings for weightingless languages.
 *
 * Language specific data.
 **/
struct _EncaLanguageInfo {
  const char *name;
  const char *humanname;
  size_t ncharsets;
  const char *const *csnames;
  const unsigned short int *const *weights;
  const unsigned short int *significant;
  const unsigned char *const *letters;
  const unsigned char **const *pairs;
  long int weight_sum;
  EncaHookFunc hook;
  EncaHookFunc eolhook;
  EncaHookFunc lcuchook;
  EncaHookFunc ratinghook;
};

/**
 * EncaAnalyserOptions:
 * @const_buffer: Treat buffer as const?  Otherwise its content can be,
 *                and probably will be, modified.
 * @min_chars: Minimal number of significant characters.
 * @threshold: Minimal ratio between winner and the second.
 * @multibyte_enabled: Check for multibyte encodings?
 * @interpreted_surfaces: Allow surfaces causing fundamental reinterpretation?
 * @ambiguous_mode: Ambiguous mode?
 * @filtering: Allow binary and box-drawing filters?
 * @test_garbageness: Do test garbageness?
 * @termination_strictness: Disallow broken multibyte sequences at buffer end?
 *
 * Analyser options, a part of analyser state.
 *
 * The #int fields documented as questions are yes/no flags.
 **/
struct _EncaAnalyserOptions {
  int const_buffer;
  size_t min_chars;
  double threshold;
  int multibyte_enabled;
  int interpreted_surfaces;
  int ambiguous_mode;
  int filtering;
  int test_garbageness;
  int termination_strictness;
};

/**
 * EncaAnalyserState:
 * @lang: Language information.
 * @ncharsets: Number of 8bit charsets in this language.
 *             (Equal to @lang->ncharsets.)
 * @charsets: 8bit charset id's [@ncharsets].
 * @gerrno: Guessing error code (an #EncaErrno value).
 * @size: Size of buffer.
 * @buffer: Buffer whose encoding is to be detected [@size].
 *         (Owned by outer world.)
 * @result: Result returned to caller.
 * @counts: Character counts [0x100].
 * @bin: Number of `binary' characters.
 * @up: Number of 8bit characters.
 * @ratings: 8bit charset ratings [@ncharsets].
 * @order: Charset indices (not id's) sorted by ratings in descending order
 *         [@ncharsets].
 * @size2: Size of buffer2.
 * @buffer2: A temporary secondary buffer [@size2].
 * @utfch: Double-UTF-8 test data [@ncharsets].
 * @utfbuf: Double-UTF-8 buffer for various UCS-2 character counting [0x10000].
 *          (Magic: see mark_scratch_buffer() for description.)
 * @pair2bits: Character pair map to charsets [0x100000] (indexed
 *             0x100*first + second).  Each bit corresponds to one charset,
 *             when set, the pair is `good' for the given charset.  The
 *             type is char, so it breaks for @ncharsets > 8, but it should
 *             not be accessed from outer world, so it can be easily enlarged
 *             to more bits.
 *             NOTE(review): if first and second are byte values, the index
 *             0x100*first + second never exceeds 0xffff, i.e. 0x10000
 *             entries -- confirm the size stated above against pair.c.
 * @bitcounts: Counts for each possible bit combinations in @pair2bits
 *             [0x1 << @ncharsets].
 * @pairratings: Counts of `good' pairs per charset [@ncharsets].
 * @lcbits: If a character is lowercase in some charset, corresponding bit
 *          is set [0x100].
 * @ucbits: If a character is uppercase in some charset, corresponding bit
 *          is set [0x100].
 * @options: Analyser options.
 *
 * The internal analyser state.
 *
 * Passed as an opaque object (`this') to analyser calls.
 **/
struct _EncaAnalyserState {
  /* Language data. */
  const EncaLanguageInfo *lang;
  size_t ncharsets;
  int *charsets;
  /* Analyser state. */
  EncaErrno gerrno;
  size_t size;
  unsigned char *buffer;
  EncaEncoding result;
  size_t *counts;
  size_t bin;
  size_t up;
  double *ratings;
  size_t *order;
  size_t size2;
  unsigned char *buffer2;
  /* Double-UTF-8 data. */
  EncaUTFCheckData *utfch;
  int *utfbuf;
  /* Pair frequency data */
  unsigned char *pair2bits;
  size_t *bitcounts;
  size_t *pairratings;
  /* LCUC data XXX: unused (yet) */
  size_t *lcbits;
  size_t *ucbits;
  /* Options. */
  EncaAnalyserOptions options;
};

/**
 * EncaLanguageHookData1CS:
 * @name: Charset name.
 * @size: Number of characters in @list.
 * @list: Extra-important character list for the charset.
 * @cs: Charset number.  This is an index in @analyser arrays (like @charsets),
 *      NOT a charset id.
 *
 * Container for data needed by enca_language_hook_ncs().
 *
 * The MAKE_HOOK_LINE() macro builds an initializer for this struct.
 **/
struct _EncaLanguageHookData1CS {
  const char *name;
  size_t size;
  const unsigned char *list;
  size_t cs;
};

/**
 * EncaLanguageHookDataEOL:
 * @name: Charset name.
 * @eol: The corresponding #EncaSurface bit.
 * @cs: Charset number.  This is an index in @analyser arrays (like @charsets),
 *      NOT a charset id.
 *
 * Container for data needed by enca_language_hook_eol().
 **/
struct _EncaLanguageHookDataEOL {
  const char *name;
  EncaSurface eol;
  size_t cs;
};

/**
 * EncaUTFCheckData:
 * @rating: Total rating for this charset.
 * @size: Number of UCS-2 characters.
 * @result: Nonzero when the sample is probably Doubly-UTF-8 encoded from
 *          this charset.
 * @ucs2: List of significant UCS-2 characters, in order [@size].
 * @weights: Weights for double-UTF-8 check [@size].  Positive means normal
 *           UTF-8, negative doubly-encoded.
 *
 * Data needed by double-UTF-8 check, per language charset.
 *
 * @ucs2 and @weights are parallel arrays with @size entries each.
 **/
struct _EncaUTFCheckData {
  double rating;
  size_t size;
  int result;
  int *ucs2;
  int *weights;
};

/**
 * FILL_NONLETTER:
 *
 * Replacement character for non-letters in pair frequencies.
 **/
#define FILL_NONLETTER '.'

/**
 * EPSILON:
 *
 * `Zero' for float comparison (and to prevent division by zero, etc.).
 **/
#define EPSILON 0.000001

/**
 * LF:
 *
 * Line feed character (End-of-line on Unix).
 *
 * Cast to unsigned char, the element type of the analysed buffers.
 **/
#define LF ((unsigned char)'\n')

/**
 * CR:
 *
 * Carriage return character (End-of-line on Macintosh).
 *
 * Cast to unsigned char, the element type of the analysed buffers.
 **/
#define CR ((unsigned char)'\r')

/* Character type macros.
 *
 * The `text' and `binary' flags mark characters that can cause switch to
 * binary/text mode in filter_binary().  The view of what is text and what
 * is binary is quite simplistic, as we don't know the charset...
 *
 * The `name' flag marks characters acceptable in charset identifiers.
 *
 * enca_ctype_test(c, t) is nonzero when any flag of mask @t is set for
 * character @c in enca_ctype_data[].  @c is converted to unsigned char
 * before indexing, so plain (possibly signed) char values are safe to pass.
 * Both arguments are parenthesized in the expansion so that expression
 * arguments (e.g. a mask like `A | B') are evaluated as a whole.
 **/
#define enca_ctype_test(c, t) \
  ((enca_ctype_data[(unsigned char)(c)] & (t)) != 0)

#define enca_isalnum(c)  enca_ctype_test((c), ENCA_CTYPE_ALNUM)
#define enca_isalpha(c)  enca_ctype_test((c), ENCA_CTYPE_ALPHA)
#define enca_iscntrl(c)  enca_ctype_test((c), ENCA_CTYPE_CNTRL)
#define enca_isdigit(c)  enca_ctype_test((c), ENCA_CTYPE_DIGIT)
#define enca_isgraph(c)  enca_ctype_test((c), ENCA_CTYPE_GRAPH)
#define enca_islower(c)  enca_ctype_test((c), ENCA_CTYPE_LOWER)
#define enca_isprint(c)  enca_ctype_test((c), ENCA_CTYPE_PRINT)
#define enca_ispunct(c)  enca_ctype_test((c), ENCA_CTYPE_PUNCT)
#define enca_isspace(c)  enca_ctype_test((c), ENCA_CTYPE_SPACE)
#define enca_isupper(c)  enca_ctype_test((c), ENCA_CTYPE_UPPER)
#define enca_isxdigit(c) enca_ctype_test((c), ENCA_CTYPE_XDIGIT)
#define enca_isname(c)   enca_ctype_test((c), ENCA_CTYPE_NAME)
#define enca_isbinary(c) enca_ctype_test((c), ENCA_CTYPE_BINARY)
#define enca_istext(c)   enca_ctype_test((c), ENCA_CTYPE_TEXT)

/**
 * ELEMENTS:
 * @array: A true array (not a pointer) whose length is wanted.
 *
 * Yields the element count of a statically sized array by dividing the
 * size of the whole array by the size of its first element.
 *
 * Returns: the number of elements.
 **/
#define ELEMENTS(array) (sizeof(array) / sizeof(*(array)))

/* Allocation wrappers used by the NEW() and RENEW() macros; defined
 * outside this header.
 * NOTE(review): behaviour on allocation failure (NULL return vs. abort) is
 * not visible here -- confirm in the implementation before relying on it. */
void*  enca_malloc  (size_t size);
void*  enca_realloc (void *ptr,
                     size_t size);

/**
 * enca_free:
 * @ptr: Pointer to memory to free.
 *
 * Frees memory pointed by @ptr with free() hack and assigns it a safe value,
 * thus may be called more than once.
 *
 * @ptr MUST be l-value.  It is evaluated more than once, so it must not
 * have side effects.
 *
 * The body is wrapped in do { } while (0) so that `enca_free(p);' is a
 * single statement and can be used in an unbraced if/else branch.  The NULL
 * test before free() is kept deliberately (the `free() hack').
 **/
#define enca_free(ptr) \
  do { if (ptr) free(ptr); (ptr) = NULL; } while (0)

/**
 * NEW:
 * @type: Element type of the array to allocate.
 * @n: How many elements of @type to make room for.
 *
 * Typed convenience wrapper around enca_malloc(): requests space for @n
 * objects of @type and casts the result to a pointer to @type.
 *
 * Returns: Pointer to the newly allocated memory.
 **/
#define NEW(type,n) ((type *)enca_malloc(sizeof(type) * (n)))

/**
 * RENEW:
 * @ptr: Pointer to already allocated memory or #NULL.
 * @type: Element type of the array.
 * @n: New number of elements of @type the memory should hold.
 *
 * Typed convenience wrapper around enca_realloc(): asks for @ptr to be
 * resized to @n objects of @type and casts the result to a pointer to @type.
 *
 * Returns: Pointer to the reallocated memory (or pointer safe to call free()
 * on when @n is zero).
 **/
#define RENEW(ptr,type,n) ((type *)enca_realloc((ptr), sizeof(type) * (n)))

/**
 * MAKE_HOOK_LINE:
 * @name: A charset name in C-style identifier suitable form.
 *
 * Ugly code `beautifier' macro for language hooks.
 *
 * Expands to a brace-enclosed initializer consisting of the stringified
 * @name, the element count of the array list_@name, that array itself and
 * (size_t)-1.  This is the field order of #EncaLanguageHookData1CS (name,
 * size, list, cs).  An array called list_@name must be visible where the
 * macro is used.
 **/
#define MAKE_HOOK_LINE(name) \
  { #name, ELEMENTS(list_##name), list_##name, (size_t)-1 }

/* Always use ours, since we rely on enca_strdup(NULL) -> NULL */
char* enca_strdup(const char *s);

/* enca_strstr: own replacement is declared only when the platform lacks
 * strstr(); otherwise the name is just an alias for the libc function.
 * Note the replacement returns const char* while libc strstr() returns
 * char*, so callers should treat the result as const to work with both. */
#ifndef HAVE_STRSTR
const char* enca_strstr(const char *haystack,
                        const char* needle);
#else/* not HAVE_STRSTR */
# define enca_strstr strstr
#endif /* not HAVE_STRSTR */

/* enca_stpcpy: own replacement is declared only when the platform lacks
 * stpcpy(); otherwise the name is just an alias for the libc function. */
#ifndef HAVE_STPCPY
char* enca_stpcpy(char *dest,
                  const char *src);
#else /* not HAVE_STPCPY */
# define enca_stpcpy stpcpy
#endif /* not HAVE_STPCPY */

/**
 * enca_csname:
 * @cs: A charset id.
 *
 * A shorthand for printing names with #ENCA_NAME_STYLE_ENCA.
 *
 * Expands to an enca_charset_name() call, so the value is whatever that
 * function returns for @cs in this naming style.
 **/
#define enca_csname(cs) enca_charset_name((cs), ENCA_NAME_STYLE_ENCA)

/* common.c */
/* Variadic string helpers.
 * NOTE(review): how the variable argument list is terminated is not visible
 * in this header (presumably a trailing NULL) -- confirm in common.c.
 * NOTE(review): enca_strappend() takes a non-const @str; whether it
 * reallocates or takes ownership of it is not visible here -- confirm. */
char* enca_strconcat (const char *str,
                      ...);
char* enca_strappend (char *str,
                      ...);

/* encnames.c */
/* Presumably look up a charset id / surface by its textual name.
 * NOTE(review): the result for an unknown name is not visible here --
 * confirm in encnames.c. */
int         enca_name_to_charset  (const char *csname);
EncaSurface enca_name_to_surface  (const char *sname);

/* enca.c */
/* Language setup/teardown on an analyser state.
 * NOTE(review): the meaning of enca_language_init()'s int result and the
 * ownership of the array returned by enca_get_charset_similarity_matrix()
 * are not visible in this header -- confirm in the implementation. */
int         enca_language_init    (EncaAnalyserState *analyser,
                                   const char *langname);
void        enca_language_destroy (EncaAnalyserState *analyser);
double*     enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang);

/* unicodemap.c */
/* NOTE(review): @counts is presumably a per-character count table like
 * EncaAnalyserState.counts [0x100] -- confirm in unicodemap.c. */
int         enca_charsets_subset_identical (int charset1,
                                            int charset2,
                                            const size_t *counts);

/* filters.c */
/* The two language hook helpers take the data containers
 * EncaLanguageHookData1CS and EncaLanguageHookDataEOL respectively.
 * NOTE(review): @ncs is presumably the number of entries in @hookdata --
 * confirm in filters.c. */
size_t      enca_filter_boxdraw    (EncaAnalyserState *analyser,
                                    unsigned char fill_char);
int         enca_language_hook_ncs (EncaAnalyserState *analyser,
                                    size_t ncs,
                                    EncaLanguageHookData1CS *hookdata);
int         enca_language_hook_eol (EncaAnalyserState *analyser,
                                    size_t ncs,
                                    EncaLanguageHookDataEOL *hookdata);

/* guess.c */
/* NOTE(review): enca_eol_surface()'s @counts is presumably a per-character
 * count table like EncaAnalyserState.counts [0x100] -- confirm in guess.c. */
void        enca_guess_init    (EncaAnalyserState *analyser);
void        enca_guess_destroy (EncaAnalyserState *analyser);
EncaSurface enca_eol_surface   (const unsigned char *buffer,
                                size_t size,
                                const size_t *counts);
void        enca_find_max_sec  (EncaAnalyserState *analyser);

/* utf8_double.c */
/* Presumably set up and release the double-UTF-8 members of the analyser
 * state (utfch, utfbuf) -- confirm in utf8_double.c. */
void        enca_double_utf8_init    (EncaAnalyserState *analyser);
void        enca_double_utf8_destroy (EncaAnalyserState *analyser);

/* pair.c */
/* Presumably operate on the pair frequency members of the analyser state
 * (pair2bits, bitcounts, pairratings).
 * NOTE(review): the meaning of enca_pair_analyse()'s int result is not
 * visible here -- confirm in pair.c. */
void        enca_pair_init    (EncaAnalyserState *analyser);
void        enca_pair_destroy (EncaAnalyserState *analyser);
int         enca_pair_analyse (EncaAnalyserState *analyser);

/* Languages.
 * One constant EncaLanguageInfo table per language, defined elsewhere.
 * The two-letter suffixes look like language codes -- presumably matching
 * the table's `name' field; confirm in the language sources. */
extern const EncaLanguageInfo ENCA_LANGUAGE_BE;
extern const EncaLanguageInfo ENCA_LANGUAGE_BG;
extern const EncaLanguageInfo ENCA_LANGUAGE_CS;
extern const EncaLanguageInfo ENCA_LANGUAGE_ET;
extern const EncaLanguageInfo ENCA_LANGUAGE_HR;
extern const EncaLanguageInfo ENCA_LANGUAGE_HU;
extern const EncaLanguageInfo ENCA_LANGUAGE_LT;
extern const EncaLanguageInfo ENCA_LANGUAGE_LV;
extern const EncaLanguageInfo ENCA_LANGUAGE_PL;
extern const EncaLanguageInfo ENCA_LANGUAGE_RU;
extern const EncaLanguageInfo ENCA_LANGUAGE_SK;
extern const EncaLanguageInfo ENCA_LANGUAGE_SL;
extern const EncaLanguageInfo ENCA_LANGUAGE_UK;
extern const EncaLanguageInfo ENCA_LANGUAGE_ZH;

/* Multibyte test lists.
 * Each array holds EncaGuessFunc pointers and is defined elsewhere.
 * These arrays must be NULL-terminated. */
extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_ASCII[];
extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT[];
extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_BINARY[];
extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[];

/* Locale-independent character type table.
 * Indexed by unsigned char value; enca_ctype_test() masks each entry with
 * ENCA_CTYPE_* flags. */
extern const short int enca_ctype_data[0x100];

#endif /* not LIBENCA_H */