/*
Extremely Naive Charset Analyser. main module
Copyright (C) 2000-2002 David Necas (Yeti) <yeti@physics.muni.cz>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include <math.h>
#include "common.h"
/* Local prototypes. */
/* Analyses (or converts) one file and returns the exit status for it;
   main() calls it with a NULL fname to process standard input and with a
   NULL analyser to release the persistent i/o buffer. */
static int process_file (EncaAnalyser an,
const char *fname);
/* Sets the libenca tuning parameters according to the amount of data that
   was read from the file. */
static void dwim_libenca_options(EncaAnalyser an,
const File *file);
/* Prints the detected encoding in the format selected by
   options.output_type. */
static void print_results (const char *fname,
EncaAnalyser an,
EncaEncoding result,
int gerrno);
/* Prints newline-terminated surface names, each with a leading indent. */
static void indent_surface (const char *s);
/* Reports possible doubly-encoded UTF-8 (human and details output only). */
static void double_utf8_chk (EncaAnalyser an,
const unsigned char *sample,
size_t size);
/*
 * Program entry point.
 *
 * Processes the command line options, creates an analyser for the selected
 * language and then runs process_file() on every file named on the command
 * line, or once on standard input when there is no file list.
 *
 * Returns 0 on success, 1 on failure, 2 on troubles.  The statuses of the
 * individual files are OR-ed together and troubles take precedence.
 */
int
main(int argc, char *argv[])
{
  char **file_list;      /* start of the filename list, kept for freeing */
  char **cursor;         /* filename currently being processed */
  long int status = 0;   /* nonzero if process_file() ever returned nonzero */
  EncaAnalyser an;

  /* Process command line arguments. */
  file_list = process_opt(argc, argv);

  /* Initialization. */
  if (options.verbosity_level > 2)
    fprintf(stderr, "Initializing language %s\n", options.language);

  an = enca_analyser_alloc(options.language);
  if (!an) {
    fprintf(stderr, "%s: Language `%s' is unknown or not supported.\n"
                    "Run `%s --list languages' to get list "
                    "of supported languages.\n"
                    "Run `%s -L none' to test only language independent, "
                    "multibyte encodings.\n",
            program_name, options.language,
            program_name,
            program_name);
    exit(EXIT_TROUBLE);
  }

  enca_set_threshold(an, 1.38);
  enca_set_multibyte(an, 1);
  enca_set_ambiguity(an, 1);
  enca_set_garbage_test(an, 1);

  if (file_list != NULL) {
    /* Walk the NULL-terminated file list, accumulating the worst error and
       releasing each name once it has been processed. */
    for (cursor = file_list; *cursor != NULL; cursor++) {
      status |= process_file(an, *cursor);
      enca_free(*cursor);
    }
  }
  else {
    /* No files given on the command line => read stdin. */
    status = process_file(an, NULL);
  }

  /* Free the buffer kept inside process_file(). */
  process_file(NULL, NULL);

  enca_analyser_free(an);
  enca_free(options.language);
  enca_free(options.target_enc_str);
  enca_free(file_list);

  /* Failure and troubles bits may both be set; report troubles then. */
  return (status & EXIT_TROUBLE) ? EXIT_TROUBLE : status;
}
/*
 * Process the file named fname; this is the `boss' function.
 *
 * an     analyser to use; when NULL, the call only releases the persistent
 *        i/o buffer and returns 0
 * fname  name of the file to process (main() passes NULL for standard input)
 *
 * Depending on options.output_type the file is either converted in place
 * or its guessed encoding is printed.
 *
 * Returns 0 on success, 1 on failure, 2 on troubles.
 */
static int
process_file(EncaAnalyser an,
             const char *fname)
{
  static int utf8 = ENCA_CS_UNKNOWN; /* charset id of UTF-8, looked up once */
  static Buffer *buffer = NULL; /* persistent i/o buffer */
  int ot_is_convert = (options.output_type == OTYPE_CONVERT);
  EncaEncoding result; /* the guessed encoding */
  File *file; /* the processed file */

  if (!an) {
    buffer_free(buffer);
    /* Forget the released buffer: the pointer is static, so leaving it set
       would make any later call reuse (or free again) freed memory. */
    buffer = NULL;
    return 0;
  }

  /* Initialize when we are called the first time. */
  if (buffer == NULL)
    buffer = buffer_new(buffer_size);
  if (!enca_charset_is_known(utf8)) {
    utf8 = enca_name_to_charset("utf8");
    assert(enca_charset_is_known(utf8));
  }

  /* Read sample.  For conversion the file is opened read-write and is kept
     open after the sample has been read. */
  file = file_new(fname, buffer);
  if (file_open(file, ot_is_convert ? "r+b" : "rb") != 0) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  if (file_read(file) == -1) {
    file_free(file);
    return EXIT_TROUBLE;
  }
  if (!ot_is_convert)
    file_close(file);

  /* Guess encoding.
     NOTE(review): enca_analyse_const() is presumably used for conversion
     because it leaves the sample unmodified -- confirm against libenca. */
  dwim_libenca_options(an, file);
  if (ot_is_convert)
    result = enca_analyse_const(an, buffer->data, buffer->pos);
  else
    result = enca_analyse(an, buffer->data, buffer->pos);

  /* Is conversion required? */
  if (ot_is_convert) {
    int err = 0;

    if (enca_charset_is_known(result.charset))
      err = convert(file, result);
    else {
      /* An empty input is not worth a complaint. */
      if (enca_errno(an) != ENCA_EEMPTY) {
        fprintf(stderr, "%s: Cannot convert `%s' from unknown encoding\n",
                program_name,
                ffname_r(file->name));
      }
      /* Copy stdin to stdout unchanged. */
      if (file->name == NULL)
        err = copy_and_convert(file, file, NULL);
    }
    file_free(file);

    /* Failure: the encoding was not recognized (and the input was not
       merely empty), or the converter reported ERR_CANNOT.  Any other
       converter error counts as troubles. */
    if ((err == ERR_OK && !enca_charset_is_known(result.charset)
         && enca_errno(an) != ENCA_EEMPTY)
        || err == ERR_CANNOT)
      return EXIT_FAILURE;
    return (err == ERR_OK) ? EXIT_SUCCESS : EXIT_TROUBLE;
  }

  /* Print results. */
  print_results(file->name, an, result, enca_errno(an));
  if (result.charset == utf8)
    double_utf8_chk(an, buffer->data, buffer->pos);
  file_free(file);

  return enca_charset_is_known(result.charset) ? EXIT_SUCCESS : EXIT_FAILURE;
}
/*
 * DWIM
 *
 * Picks values for the libenca tuning parameters (number of significant
 * characters, termination strictness and filtering) from the amount of
 * data held in the file's buffer.
 */
static void
dwim_libenca_options(EncaAnalyser an, const File *file)
{
  const double slope = 0.005; /* derivative of the formula at zero size */
  const double limit = 15.0;  /* value approached for infinite size */
  ssize_t nbytes = file->buffer->pos;
  size_t significant = 1;     /* what an empty sample gets */

  /* The number of significant characters grows with the sample size and
     saturates at `limit'. */
  if (nbytes)
    significant = ceil((double)nbytes/(nbytes/limit + 1.0/slope));
  enca_set_significant(an, significant);

  /* When buffer contains whole file, require correct termination. */
  enca_set_termination_strictness(an, file->size == nbytes);

  enca_set_filtering(an, significant > 2);
}
/**
 * Prints the analysis result for one file.
 *
 * The output format is chosen by options.output_type; the file name is
 * printed first when options.prefix_filename is set.  Only surfaces that
 * are not natural for the detected charset are reported.  With details
 * output, a nonzero @gerrno is explained on an extra line.
 **/
static void
print_results(const char *fname,
              EncaAnalyser an,
              EncaEncoding result,
              int gerrno)
{
  const char *name;
  char *surface_name;
  EncaSurface extra = result.surface
                      & ~enca_charset_natural_surface(result.charset);

  if (options.prefix_filename)
    printf("%s: ", ffname_r(fname));

  switch (options.output_type) {
    case OTYPE_ALIASES:
      print_aliases(result.charset);
      break;

    case OTYPE_CANON:
      if (!extra) {
        puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA));
        break;
      }
      /* Charset name immediately followed by the surface name. */
      surface_name = enca_get_surface_name(extra, ENCA_NAME_STYLE_ENCA);
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_ENCA);
      fputs(name, stdout);
      puts(surface_name);
      enca_free(surface_name);
      break;

    case OTYPE_HUMAN:
    case OTYPE_DETAILS:
      /* Charset name on its own line, surfaces indented below it. */
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_HUMAN));
      if (extra) {
        surface_name = enca_get_surface_name(extra, ENCA_NAME_STYLE_HUMAN);
        indent_surface(surface_name);
        enca_free(surface_name);
      }
      break;

    case OTYPE_RFC1345:
      puts(enca_charset_name(result.charset, ENCA_NAME_STYLE_RFC1345));
      break;

    /* The remaining styles may have no name for a charset; the name of the
       unknown charset is printed then. */
    case OTYPE_CS2CS:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_CSTOCS);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_CSTOCS);
      puts(name);
      break;

    case OTYPE_ICONV:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_ICONV);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_ICONV);
      puts(name);
      break;

    case OTYPE_MIME:
      name = enca_charset_name(result.charset, ENCA_NAME_STYLE_MIME);
      if (name == NULL)
        name = enca_charset_name(ENCA_CS_UNKNOWN, ENCA_NAME_STYLE_MIME);
      puts(name);
      break;

    default:
      abort();
      break;
  }

  if (options.output_type == OTYPE_DETAILS && gerrno != 0) {
    printf(" Failure reason: %s.\n", enca_strerror(an, gerrno));
  }
}
/**
 * Prints the surface names in @s, which are expected one per line as
 * returned from enca_get_surface_name(), putting an indent in front of
 * each line.
 *
 * Only newline-terminated lines are printed; anything after the last
 * newline is not output.
 **/
static void
indent_surface(const char *s)
{
  const char *line = s;

  for (;;) {
    const char *newline = strchr(line, '\n');

    if (newline == NULL)
      break;
    /* The printed piece includes the terminating newline itself. */
    printf(" %.*s", (int)(newline + 1 - line), line);
    line = newline + 1;
  }
}
/**
 * Checks @sample for doubly-encoded UTF-8 and prints a line listing the
 * candidate source charsets when it looks so.
 *
 * Does nothing unless the output type is human or details.
 **/
static void
double_utf8_chk(EncaAnalyser an,
                const unsigned char *sample,
                size_t size)
{
  int *charsets;
  size_t count, k;

  if (!(options.output_type == OTYPE_DETAILS
        || options.output_type == OTYPE_HUMAN))
    return;

  /* The check result is used as the number of candidates below. */
  count = enca_double_utf8_check(an, sample, size);
  if (count == 0)
    return;

  charsets = enca_double_utf8_get_candidates(an);
  if (!charsets)
    return;

  fputs(count == 1 ? " Doubly-encoded to UTF-8 from"
                   : " Doubly-encoded to UTF-8 from one of:",
        stdout);
  k = 0;
  while (k < count) {
    printf(" %s", enca_charset_name(charsets[k], ENCA_NAME_STYLE_ENCA));
    k++;
  }
  putchar('\n');

  enca_free(charsets);
}
/* vim: ts=2
*/