/*
conversion to other encodings
Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include "common.h"
#ifdef HAVE_SYS_WAIT_H
# include <sys/wait.h>
#else
pid_t waitpid(pid_t pid, int *status, int options);
#endif
/* We can't go on w/o this, defining struct stat manually is braindamaged. */
#include <sys/types.h>
#include <sys/stat.h>
/* converter flags (bit mask kept in ConverterData.flags)
CONV_EXTERN marks the external converter: when a converter carrying this
flag fails, convert() warns that the file was probably destroyed */
#define CONV_EXTERN 0x0001
/* converter function type: takes the file to convert and its current
(source) encoding; there is no target-encoding parameter, the built-in
converter reads the target from options.target_enc
convert() treats a return value of ERR_OK as success, ERR_CANNOT as `try the
next converter' and anything else as a fatal failure */
typedef int (* ConverterFunc)(File*, EncaEncoding);
/* struct converter data
one instance describes one converter; entries of the CONVERTERS table point
to these */
typedef struct _ConverterData ConverterData;
struct _ConverterData {
unsigned long int flags; /* bit mask of converter flags (CONV_EXTERN) */
ConverterFunc convfunc; /* pointer to converter function */
};
/* struct converter list
node of the singly linked list built by add_converter() and walked by
convert() */
typedef struct _Converter Converter;
struct _Converter {
const Abbreviation *conv; /* the converter (an abbreviation table entry) */
Converter *next; /* next in the list, NULL for the last node */
};
/* converter list
head of the list of converters convert() tries, in the order add_converter()
appended them; NULL when no converter is listed */
static Converter *converters = NULL;
/* data for xtable
filled in by xtable(), released by xdata_free()
the table for xdata.charsets[i] occupies the 0x100 bytes starting at
xdata.tables + 0x100*i */
static struct {
size_t ncharsets; /* number of charsets */
int *charsets; /* charset id's for active language [ncharsets] */
byte *tables; /* tables from charsets to target_charset [ncharsets * 0x100] */
int *have_table; /* whether particular table is already cached [ncharsets] */
unsigned int *ucs2_map; /* temporary space for map computation [0x10000] */
unsigned int target_map[0x100]; /* map of the target charset as filled in by enca_charset_ucs2_map() */
}
xdata = { 0, NULL, NULL, NULL, NULL, { 0 } };
/* Local prototypes. */
/* the built-in converter; needed before its definition because the
cdata_builtin descriptor takes its address */
static int convert_builtin (File *file,
EncaEncoding from_enc);
/* returns the byte translation table from from_charset to the target charset */
static const byte* xtable (int from_charset);
/* releases the arrays held in xdata; registered with atexit() by xtable() */
static void xdata_free (void);
/* Converter descriptors, one per compiled-in converter.  Only the external
converter carries CONV_EXTERN.
NOTE(review): convert_recode, convert_iconv and convert_external are not
declared in this file; presumably they come from common.h -- confirm. */
static const ConverterData cdata_builtin = { 0, &convert_builtin };
#ifdef HAVE_LIBRECODE
static const ConverterData cdata_librecode = { 0, &convert_recode };
#endif /* HAVE_LIBRECODE */
#ifdef HAVE_GOOD_ICONV
static const ConverterData cdata_iconv = { 0, &convert_iconv };
#endif /* HAVE_GOOD_ICONV */
#ifdef ENABLE_EXTERNAL
static const ConverterData cdata_extern = { CONV_EXTERN, &convert_external };
#endif /* ENABLE_EXTERNAL */
/* Table of converter names and their descriptors.  add_converter() resolves
user-supplied names against it with expand_abbreviation() and
print_converter_list() prints the names in this order.  Which entries exist
depends on the build configuration; "built-in" is always present. */
static const Abbreviation CONVERTERS[] = {
{ "built-in", &cdata_builtin },
#ifdef HAVE_LIBRECODE
{ "librecode", &cdata_librecode },
#endif /* HAVE_LIBRECODE */
#ifdef HAVE_GOOD_ICONV
{ "iconv", &cdata_iconv },
#endif /* HAVE_GOOD_ICONV */
#ifdef ENABLE_EXTERNAL
{ "extern", &cdata_extern }
#endif /* ENABLE_EXTERNAL */
};
/* run the listed converters on `file' until one of them succeeds

   file      the file to convert; it is neither opened nor closed here
   from_enc  current encoding of the file; the requested encoding is taken
             from options.target_enc

   when charset and surface already match the target nothing is converted:
   a named file is left alone, stdin is merely copied to stdout

   returns ERR_OK when some converter succeeded (or nothing had to be done),
   ERR_IOFAIL as soon as a converter fails with anything but ERR_CANNOT,
   ERR_CANNOT when every listed converter declined (stdin is still copied
   to stdout in that case) */
int
convert(File *file,
        EncaEncoding from_enc)
{
  Converter *node;

  if (options.verbosity_level) {
    fprintf(stderr, "%s: converting `%s': %s\n",
            program_name, ffname_r(file->name),
            format_request_string(from_enc, options.target_enc, 0));
  }

  /* source equals target: no conversion, but stdin still has to reach
     stdout */
  if (from_enc.charset == options.target_enc.charset
      && from_enc.surface == options.target_enc.surface)
    return file->name != NULL ? ERR_OK : copy_and_convert(file, file, NULL);

  /* walk the list in the order the converters were added */
  for (node = converters; node != NULL; node = node->next) {
    const ConverterData *cdata = (const ConverterData *)node->conv->data;
    int status;

    if (options.verbosity_level > 1) {
      fprintf(stderr, " trying to convert `%s' using %s\n",
              ffname_r(file->name), node->conv->name);
    }

    status = cdata->convfunc(file, from_enc);
    if (status == ERR_OK)
      return ERR_OK;

    /* any failure of the external converter is reported, even ERR_CANNOT */
    if ((cdata->flags & CONV_EXTERN) != 0) {
      fprintf(stderr, "%s: external converter failed on `%s', "
                      "probably destroying it\n",
              program_name, ffname_w(file->name));
    }

    /* only `cannot do this conversion' lets the next converter try; after
       an i/o or other serious problem we give up immediately */
    if (status != ERR_CANNOT)
      return ERR_IOFAIL;
  }

  /* the list is exhausted (or empty) */
  fprintf(stderr, "%s: no converter is able/allowed to perform "
                  "conversion %s on file `%s'\n",
          program_name,
          format_request_string(from_enc, options.target_enc, 0),
          ffname_r(file->name));

  /* stdin is passed through unconverted anyway; the result of the copy is
     deliberately not inspected, the call fails with ERR_CANNOT regardless */
  if (file->name == NULL)
    copy_and_convert(file, file, NULL);

  return ERR_CANNOT;
}
/* built-in converter

   translates the file byte by byte through a table obtained from xtable();
   a named file is rewritten in place, stdin is piped to stdout through
   copy_and_convert()

   file      the file to convert
   from_enc  its current encoding; the target is options.target_enc

   returns ERR_OK on success, ERR_CANNOT when the conversion is not supported
   (surface change other than EOL type, explicitly requested surface that
   differs from the current one, no translation table), ERR_IOFAIL on
   seek/read/write failure */
static int
convert_builtin(File *file,
                EncaEncoding from_enc)
{
  static int ascii = ENCA_CS_UNKNOWN;  /* looked up once, then reused */
  const byte *table;
  Buffer *buffer;

  if (!enca_charset_is_known(ascii)) {
    ascii = enca_name_to_charset("ascii");
    assert(enca_charset_is_known(ascii));
  }

  /* surfaces: refuse when the user asked for a specific surface that is not
     the current one, or when the two differ in anything but EOL type */
  {
    EncaSurface diff = options.target_enc.surface ^ from_enc.surface;
    int requested_other = options.target_enc.surface
                          && from_enc.surface != options.target_enc.surface;
    int differs_beyond_eol = diff != (diff & ENCA_SURFACE_MASK_EOL);

    if (requested_other || differs_beyond_eol) {
      if (options.verbosity_level > 2)
        fprintf(stderr, "%s: built-in: cannot convert between "
                        "different surfaces\n",
                program_name);
      return ERR_CANNOT;
    }
  }

  /* trivial conversions: same charset, or ascii into any 8bit non-binary
     charset, need no translation at all */
  if (from_enc.charset == options.target_enc.charset
      || (from_enc.charset == ascii
          && enca_charset_is_8bit(options.target_enc.charset)
          && !enca_charset_is_binary(options.target_enc.charset))) {
    if (file->name != NULL)
      return ERR_OK;
    return copy_and_convert(file, file, NULL);
  }

  table = xtable(from_enc.charset);
  if (table == NULL)
    return ERR_CANNOT;

  /* stdin -> stdout */
  if (file->name == NULL)
    return copy_and_convert(file, file, table);

  /* named file: read a chunk, translate it, seek back over it, write it,
     and repeat until a read returns nothing */
  buffer = file->buffer;
  buffer->pos = 0;
  if (file_seek(file, 0, SEEK_SET) == -1)
    return ERR_IOFAIL;

  for (;;) {
    size_t remaining;
    byte *cursor;

    if (file_read(file) == -1)
      return ERR_IOFAIL;
    if (buffer->pos == 0)
      break;

    cursor = buffer->data;
    for (remaining = buffer->pos; remaining != 0; remaining--) {
      *cursor = table[*cursor];
      cursor++;
    }

    /* NOTE(review): the negation relies on buffer->pos having a signed
       type -- confirm against the Buffer declaration */
    if (file_seek(file, -(buffer->pos), SEEK_CUR) == -1)
      return ERR_IOFAIL;
    if (file_write(file) == -1)
      return ERR_IOFAIL;

    /* looks like a no-op, but ISO C demands a positioning call between
       a write and a following read on the same stream, otherwise the read
       may return nonsense (it does with glibc-2.2); see fopen(3) */
    if (file_seek(file, 0, SEEK_CUR) == -1)
      return ERR_IOFAIL;
  }

  return ERR_OK;
}
/* copy file_from to file_to, translating every byte through xlat unless
   xlat is NULL

   file_from  source, already opened for reading
   file_to    destination, already opened for writing; may be the same File
   xlat       translation table indexed by byte value, or NULL for plain copy

   both files must share one buffer (asserted); whatever the buffer already
   holds is processed and written before anything more is read

   returns ERR_OK on success, ERR_IOFAIL when a read or write fails */
int
copy_and_convert(File *file_from, File *file_to, const byte *xlat)
{
  Buffer *shared;  /* the buffer both files use */
  size_t left;
  byte *cur;

  if (xlat == NULL && options.verbosity_level > 3)
    fprintf(stderr, " copying `%s' to `%s'\n",
            ffname_r(file_from->name),
            ffname_w(file_to->name));

  assert(file_from->buffer == file_to->buffer);
  shared = file_from->buffer;

  /* data already sitting in the buffer goes out first */
  if (shared->pos != 0) {
    if (xlat != NULL) {
      cur = shared->data;
      for (left = shared->pos; left != 0; left--) {
        *cur = xlat[*cur];
        cur++;
      }
    }
    if (file_write(file_to) == -1)
      return ERR_IOFAIL;
  }

  /* then everything that can still be read */
  for (;;) {
    if (file_read(file_from) == -1)
      return ERR_IOFAIL;
    if (shared->pos == 0)
      break;

    if (xlat != NULL) {
      cur = shared->data;
      for (left = shared->pos; left != 0; left--) {
        *cur = xlat[*cur];
        cur++;
      }
    }
    if (file_write(file_to) == -1)
      return ERR_IOFAIL;
  }

  /* NOTE(review): the result of fflush() is not checked, so a failure to
     flush the last data is not reported to the caller */
  fflush(file_to->stream);

  return ERR_OK;
}
/* add converter to list of converters
(note `none' adds nothing and causes removing of all converters instead)
returns zero if everything went ok, nonzero otherwise */
int
add_converter(const char *cname)
{
/* no converters symbolic name */
static const char *CONVERTER_NAME_NONE = "none";
const Abbreviation *data;
Converter *conv = NULL, *conv1;
/* remove everything when we got `none' */
if (strcmp(CONVERTER_NAME_NONE, cname) == 0) {
if (options.verbosity_level > 3)
fprintf(stderr, "Removing all converters\n");
while (converters != NULL) {
conv = converters->next;
enca_free(converters);
converters = conv;
}
return 0;
}
/* find converter data */
data = expand_abbreviation(cname, CONVERTERS, ELEMENTS(CONVERTERS),
"converter");
if (data == NULL)
return 1;
/* add it to the end of converter list */
if (options.verbosity_level > 3)
fprintf(stderr, "Adding converter `%s'\n", data->name);
if (converters == NULL)
converters = conv = NEW(Converter, 1);
else {
for (conv1 = converters; conv1 != NULL; conv1 = conv1->next) {
/* reject duplicities */
if (data == conv1->conv->data) {
fprintf(stderr, "%s: converter %s specified more than once\n",
program_name,
conv1->conv->name);
return 1;
}
conv = conv1;
}
conv->next = NEW(Converter, 1);
conv = conv->next;
}
conv->next = NULL;
conv->conv = data;
return 0;
}
/* return nonzero if the list contains external converter */
int
external_converter_listed(void)
{
Converter *conv;
for (conv = converters; conv; conv = conv->next) {
if (((ConverterData*)conv->conv->data)->flags & CONV_EXTERN)
return 1;
}
return 0;
}
/* print the names of all valid (compiled-in) converters to stdout, one name
   per line, in the order of the CONVERTERS table */
void
print_converter_list(void)
{
  const size_t count = sizeof(CONVERTERS)/sizeof(CONVERTERS[0]);
  size_t idx = 0;

  while (idx < count) {
    printf("%s\n", CONVERTERS[idx].name);
    idx++;
  }
}
/* create the request string describing conversion from e1 to e2, i.e.
   <e1 charset><e1 surface>..<e2 charset><e2 surface>

   e1    source encoding
   e2    target encoding; when its charset is not known, the string
         options.target_enc_str is used in its place, with no surface
   mask  surfaces to leave out of the string, in addition to the natural
         surface of each charset

   is NOT thread-safe: the result lives in a static pointer that is freed
   and replaced by every call, so the returned string must NOT be freed by
   the caller and has to be considered volatile */
const char*
format_request_string(EncaEncoding e1,
                      EncaEncoding e2,
                      EncaSurface mask)
{
  static char *request = NULL;  /* string returned by the previous call */
  const char *src_name, *dst_name;
  char *src_surface, *dst_surface;

  enca_free(request);

  /* source part */
  src_name = enca_charset_name(e1.charset, ENCA_NAME_STYLE_ENCA);
  src_surface
    = enca_get_surface_name(e1.surface
                            & ~(enca_charset_natural_surface(e1.charset)
                                | mask),
                            ENCA_NAME_STYLE_ENCA);

  /* target part */
  if (enca_charset_is_known(e2.charset)) {
    dst_surface
      = enca_get_surface_name(e2.surface
                              & ~(enca_charset_natural_surface(e2.charset)
                                  | mask),
                              ENCA_NAME_STYLE_ENCA);
    dst_name = enca_charset_name(e2.charset, ENCA_NAME_STYLE_ENCA);
  }
  else {
    /* an empty but allocated surface keeps the cleanup below uniform */
    dst_surface = enca_strdup("");
    dst_name = options.target_enc_str;
  }

  request = enca_strconcat(src_name, src_surface, "..",
                           dst_name, dst_surface, NULL);

  /* the surface names were only needed to build the result */
  enca_free(src_surface);
  enca_free(dst_surface);

  return request;
}
/**
 * xtable:
 * @from_charset: Charset id for which the conversion table should be returned.
 *
 * Returns translation table from charset @from_charset to (global) target
 * charset.
 *
 * The returned table must be considered constant and must NOT be freed.
 *
 * Only conversion between charsets of one language is supported.  We assume
 * a language contains all known charsets usable for representation of texts,
 * so other charsets are taken as incompatible.
 *
 * The shared state in xdata is set up on the first call that gets past the
 * UCS-2 map check and is released by xdata_free() at exit.  Each table is
 * generated on first request and served from the cache afterwards.  The
 * target charset and the language are therefore sampled once and are
 * expected not to change between calls.
 *
 * Bytes that have no counterpart in the target charset are left unchanged
 * (the tables start as identity).
 *
 * Globals used: options.target_enc.charset, options.language.
 *
 * Returns: The conversion table [0x100]; #NULL on failure, that is when
 *          either charset has no UCS-2 map or does not belong to the
 *          language.
 **/
static const byte*
xtable(int from_charset)
{
  static int xtable_initialized = 0;
  /* whether the target charset belongs to the language; found out once
     during initialization */
  static int target_in_language = 0;

  unsigned int from_map[0x100];
  size_t i;
  ssize_t fidx;

  if (!enca_charset_has_ucs2_map(options.target_enc.charset)
      || !enca_charset_has_ucs2_map(from_charset))
    return NULL;

  /* Initialize when we are called the first time. */
  if (!xtable_initialized) {
    /* Allocate various tables.  Freed by xdata_free() at exit. */
    xdata.charsets = enca_get_language_charsets(options.language,
                                                &xdata.ncharsets);
    assert(xdata.ncharsets > 1);
    xdata.have_table = NEW(int, xdata.ncharsets);
    xdata.tables = NEW(byte, 0x100*xdata.ncharsets);
    xdata.ucs2_map = NEW(unsigned int, 0x10000);

    for (i = 0; i < xdata.ncharsets; i++)
      xdata.have_table[i] = 0;

    /* Initialize tables to identity */
    for (i = 0; i < 0x100; i++)
      xdata.tables[i] = (byte)i;
    for (i = 1; i < xdata.ncharsets; i++)
      memcpy(xdata.tables + 0x100*i, xdata.tables, 0x100);

    /* Everything is allocated now.  Mark the state initialized and register
     * the cleanup exactly once, whatever the outcome of the target check
     * below, so that no later call allocates the tables (and registers the
     * handler) again. */
    xtable_initialized = 1;
    atexit(xdata_free);

    /* Check whether target_charset belongs to given language */
    for (i = 0; i < xdata.ncharsets; i++) {
      if (xdata.charsets[i] == options.target_enc.charset) {
        target_in_language = 1;
        break;
      }
    }

    if (target_in_language) {
      int map_created;

      map_created = enca_charset_ucs2_map(options.target_enc.charset,
                                          xdata.target_map);
      assert(map_created);
    }
  }

  if (!target_in_language)
    return NULL;

  /* Check whether from_charset belongs to given language */
  fidx = -1;
  for (i = 0; i < xdata.ncharsets; i++) {
    if (xdata.charsets[i] == from_charset) {
      fidx = (ssize_t)i;
      break;
    }
  }
  if (fidx < 0)
    return NULL;

  /* Return table if cached. */
  if (xdata.have_table[fidx])
    return xdata.tables + 0x100*fidx;

  /* Otherwise it must be generated */
  {
    int map_created;

    map_created = enca_charset_ucs2_map(from_charset, from_map);
    assert(map_created);
  }

  /* Build the reverse map UCS-2 -> target byte.  Bytes are visited from 0xff
   * down to 0x00, so when several target bytes share one UCS-2 value the
   * lowest byte is written last and wins. */
  for (i = 0; i < 0x10000; i++)
    xdata.ucs2_map[i] = ENCA_NOT_A_CHAR;

  for (i = 0; i < 0x100; i++) {
    size_t j = 0xff - i;

    if (xdata.target_map[j] != ENCA_NOT_A_CHAR)
      xdata.ucs2_map[xdata.target_map[j]] = (unsigned int)j;
  }

  /* XXX XXX XXX XXX XXX Warning: Extreme brain damage! XXX XXX XXX XXX XXX
   * When converting to ibm866 we have to replace Belarusian/Ukrainian i/I
   * with Latin versions.  I've been told everybody expect this. */
  if (options.target_enc.charset == enca_name_to_charset("ibm866")) {
    xdata.ucs2_map[0x0406] = (byte)'I';
    xdata.ucs2_map[0x0456] = (byte)'i';
  }

  /* Compose source byte -> UCS-2 -> target byte.  Entries without a
   * counterpart keep their identity value. */
  for (i = 0; i < 0x100; i++) {
    size_t j = 0xff - i;

    if (from_map[j] != ENCA_NOT_A_CHAR
        && xdata.ucs2_map[from_map[j]] != ENCA_NOT_A_CHAR)
      xdata.tables[0x100*fidx + j] = (byte)xdata.ucs2_map[from_map[j]];
  }

  /* Remember the table so that it is not generated again. */
  xdata.have_table[fidx] = 1;

  return xdata.tables + 0x100*fidx;
}
/* release the arrays xtable() allocated in xdata
   xtable() registers this function with atexit() */
static void
xdata_free(void)
{
  /* the four arrays are independent of each other, so the order in which
     they are released does not matter */
  enca_free(xdata.ucs2_map);
  enca_free(xdata.have_table);
  enca_free(xdata.tables);
  enca_free(xdata.charsets);
}
/* vim: ts=2
*/