Blame utf8trans/utf8trans.c

Packit e4b6da
/* vim: sta et sw=4
Packit e4b6da
 */
Packit e4b6da
Packit e4b6da
/*
Packit e4b6da
 * $Id: utf8trans.c,v 1.12 2006/04/13 01:00:01 stevecheng Exp $
Packit e4b6da
 *
Packit e4b6da
 * (C) 2001 Steve Cheng <stevecheng@users.sourceforge.net>
Packit e4b6da
 *
Packit e4b6da
 * See ../COPYING for the copyright status of this software.
Packit e4b6da
 *
Packit e4b6da
 */
Packit e4b6da
Packit e4b6da
#ifdef HAVE_CONFIG_H
Packit e4b6da
#include "config.h"
Packit e4b6da
#endif
Packit e4b6da
Packit e4b6da
#define _GNU_SOURCE     /* For getline */
Packit e4b6da
Packit e4b6da
#include <stdio.h>
Packit e4b6da
#include <stdlib.h>
Packit e4b6da
#include <string.h>
Packit e4b6da
#include <errno.h>
Packit e4b6da
Packit e4b6da
#ifdef HAVE_UNISTD_H
Packit e4b6da
#include <unistd.h>
Packit e4b6da
#endif
Packit e4b6da
Packit e4b6da
#include "mtable.h"
Packit e4b6da
#include "strings_buffer.h"
Packit e4b6da
Packit e4b6da
/*
 * A single character held as a UCS-4 code point.
 * The value 0xFFFFFFFF is used in this file as an end-of-input marker.
 */
typedef unsigned int CHAR;
Packit e4b6da
Packit e4b6da
#ifdef HAVE_GETOPT_H
Packit e4b6da
#include <getopt.h>
Packit e4b6da
#endif
Packit e4b6da
Packit e4b6da
#ifdef HAVE_GETOPT_LONG
Packit e4b6da
/*
 * Long-option table handed to getopt_long().  None of the options takes an
 * argument, and each one maps onto its single-letter equivalent.
 */
struct option long_options[] =
{
    { "version", no_argument, NULL, 'v' },
    { "help",    no_argument, NULL, 'h' },
    { "modify",  no_argument, NULL, 'm' },

    /* End-of-table marker */
    { NULL, 0, NULL, 0 }
};
Packit e4b6da
#endif
Packit e4b6da
Packit e4b6da
/* Name used to prefix diagnostics; main() sets it to argv[0]. */
const char *prog_name;

/* Path of the character map file; set by main(), used in error messages. */
const char *charmap_filename = NULL;

/* Set to 1 by the -m/--modify option: rewrite the named files in place. */
int modify_in_place = 0;

/* Lookup table from code point to translation string. */
mtable_t charmap_table;

/* Passed to mtable_new() when the table is created.
 * NOTE(review): presumably the number of index bits per table level, with a
 * terminating 0 -- confirm against mtable.h. */
static int charmap_table_exponents[] = { 8, 8, 8, 8, 0 };

/* Storage for the translation strings referenced by charmap_table. */
strings_buffer_t charmap_strings;
Packit e4b6da
Packit e4b6da
/*
 * Forward declarations for the functions defined in this file.
 *
 * Every declaration is a full prototype, so calls are type-checked;
 * print_usage is declared with (void) rather than an empty parameter list.
 */
int do_options(int argc, char *argv[]);
void print_version(void);
void print_usage(void);
void add_translation(CHAR codepoint, char *translation);
char *get_translation(CHAR codepoint);
void parse_charmap(FILE *stream);
char *encode_utf8(CHAR codepoint);
CHAR read_utf8_char(FILE *stream);
void translate(FILE *in, FILE *out);

/* Fallback implementation, supplied by this file when the C library has
 * no getline(). */
#ifndef HAVE_GETLINE
ssize_t getline(char **lineptr, size_t *n, FILE *stream);
#endif
Packit e4b6da
Packit e4b6da
int
Packit e4b6da
main(int argc, char *argv[])
Packit e4b6da
{
Packit e4b6da
    FILE *charmap_f;
Packit e4b6da
    int optind;
Packit e4b6da
    
Packit e4b6da
    prog_name = argv[0];
Packit e4b6da
    optind = do_options(argc, argv);
Packit e4b6da
    
Packit e4b6da
    charmap_table = mtable_new(charmap_table_exponents);
Packit e4b6da
    charmap_strings = strings_buffer_new(4096);
Packit e4b6da
    
Packit e4b6da
    /* Read translation spec */
Packit e4b6da
    charmap_filename = argv[optind];
Packit e4b6da
    charmap_f = fopen(charmap_filename, "r");
Packit e4b6da
    if(!charmap_f) {
Packit e4b6da
        fprintf(stderr, "%s:%s: %s\n", 
Packit e4b6da
                prog_name,
Packit e4b6da
                charmap_filename,
Packit e4b6da
                strerror(errno));
Packit e4b6da
        exit(1);
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    parse_charmap(charmap_f);
Packit e4b6da
Packit e4b6da
    fclose(charmap_f);
Packit e4b6da
Packit e4b6da
    optind++;
Packit e4b6da
    
Packit e4b6da
    if(!argv[optind]) {
Packit e4b6da
        translate(stdin, stdout);
Packit e4b6da
    }
Packit e4b6da
    else {
Packit e4b6da
        int i;
Packit e4b6da
        FILE *f, *out;
Packit e4b6da
        for(i = optind; argv[i]; i++) {
Packit e4b6da
            out = stdout;
Packit e4b6da
Packit e4b6da
            f = fopen(argv[i], "r");
Packit e4b6da
            if(!f) {
Packit e4b6da
                fprintf(stderr, "%s:%s: %s\n",
Packit e4b6da
                        prog_name,
Packit e4b6da
                        argv[i],
Packit e4b6da
                        strerror(errno));
Packit e4b6da
                exit(1);
Packit e4b6da
            }
Packit e4b6da
Packit e4b6da
#ifdef HAVE_UNISTD_H
Packit e4b6da
            if(modify_in_place) {
Packit e4b6da
                if(unlink(argv[i]) < 0) {
Packit e4b6da
                    fprintf(stderr, "%s:%s: %s\n",
Packit e4b6da
                            prog_name,
Packit e4b6da
                            argv[i],
Packit e4b6da
                            strerror(errno));
Packit e4b6da
                    fclose(f);
Packit e4b6da
                    continue;
Packit e4b6da
                }
Packit e4b6da
                    
Packit e4b6da
                out = fopen(argv[i], "w");
Packit e4b6da
                if(!out) {
Packit e4b6da
                    fprintf(stderr, "%s:%s: %s\n",
Packit e4b6da
                            prog_name,
Packit e4b6da
                            argv[i],
Packit e4b6da
                            strerror(errno));
Packit e4b6da
                    fclose(f);
Packit e4b6da
                    continue;
Packit e4b6da
                }
Packit e4b6da
            }
Packit e4b6da
#endif
Packit e4b6da
            
Packit e4b6da
            translate(f, out);
Packit e4b6da
            fclose(f);
Packit e4b6da
Packit e4b6da
            if(modify_in_place)
Packit e4b6da
                fclose(out);
Packit e4b6da
        }
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    mtable_delete(charmap_table);
Packit e4b6da
    strings_buffer_delete(charmap_strings);
Packit e4b6da
Packit e4b6da
    return 0;
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
/*
 * Print the version banner, project URL and copyright notice to stdout.
 *
 * When config.h is in use, the package VERSION string is included in the
 * first line, separated from the package name by a space.
 */
void
print_version(void)
{
    puts("utf8trans (part of docbook2X"
#ifdef HAVE_CONFIG_H
         " " VERSION
#endif
         ")");

    puts("$Revision: 1.12 $ $Date: 2006/04/13 01:00:01 $");
    puts("<URL:http://docbook2x.sourceforge.net/>\n");

    puts("Copyright (C) 2000-2004 Steve Cheng\n"
         "This is free software; see the source for copying conditions.\n"
         "There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR\n"
         "A PARTICULAR PURPOSE.");
}
Packit e4b6da
Packit e4b6da
void
Packit e4b6da
print_usage()
Packit e4b6da
{
Packit e4b6da
    printf("Usage: %s [options] CHARMAP [FILES...]\n", prog_name);
Packit e4b6da
    puts("Transliterate UTF-8 characters according to a table.\n");
Packit e4b6da
Packit e4b6da
#ifdef HAVE_UNISTD_H
Packit e4b6da
#ifdef HAVE_GETOPT_LONG
Packit e4b6da
    puts("  -m, --modify            modify given files in-place\n"
Packit e4b6da
         "  -v, --version           display version information and exit\n"
Packit e4b6da
         "  -h, --help              display this usage information\n");
Packit e4b6da
#else
Packit e4b6da
    puts("  -m                      modify given files in-place\n"
Packit e4b6da
         "  -v                      display version information and exit\n"
Packit e4b6da
         "  -h                      display this usage information\n");
Packit e4b6da
#endif
Packit e4b6da
#endif
Packit e4b6da
Packit e4b6da
    puts("See utf8trans(1) for details on this program.\n");
Packit e4b6da
}
Packit e4b6da
    
Packit e4b6da
int
Packit e4b6da
do_options(int argc, char *argv[])
Packit e4b6da
{
Packit e4b6da
#ifdef HAVE_UNISTD_H    /* On a Unix, so have some version of getopt */
Packit e4b6da
    int optc;
Packit e4b6da
Packit e4b6da
#ifdef HAVE_GETOPT_LONG
Packit e4b6da
    while((optc = getopt_long(argc, argv, "vhm", 
Packit e4b6da
                long_options, NULL)) != -1)
Packit e4b6da
#else
Packit e4b6da
    while((optc = getopt(argc, argv, "vhm")) != -1)
Packit e4b6da
#endif  /* HAVE_GETOPT_LONG */
Packit e4b6da
    {
Packit e4b6da
        switch(optc) {
Packit e4b6da
Packit e4b6da
        /* --version */
Packit e4b6da
        case 'v':
Packit e4b6da
            print_version();
Packit e4b6da
            exit(0);
Packit e4b6da
            
Packit e4b6da
        /* --help */
Packit e4b6da
        case 'h':
Packit e4b6da
            print_usage();
Packit e4b6da
            exit(0);
Packit e4b6da
Packit e4b6da
        /* --modify */
Packit e4b6da
        case 'm':
Packit e4b6da
            modify_in_place = 1;
Packit e4b6da
            break;
Packit e4b6da
Packit e4b6da
        case '?':
Packit e4b6da
        default:
Packit e4b6da
            exit(1);
Packit e4b6da
        }
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    if(optind > argc-1) {
Packit e4b6da
        fprintf(stderr, "%s: must specify charmap\n", prog_name);
Packit e4b6da
        exit(1);
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    return optind;
Packit e4b6da
    
Packit e4b6da
#else   /* No getopt, so don't process any options.
Packit e4b6da
           They are all trivial, so that justifies ignoring them. */
Packit e4b6da
Packit e4b6da
    if(argc < 2) {
Packit e4b6da
        fprintf(stderr, "%s: must specify charmap\n", prog_name);
Packit e4b6da
        exit(1);
Packit e4b6da
    }
Packit e4b6da
    
Packit e4b6da
    return 1;
Packit e4b6da
Packit e4b6da
#endif  
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
void 
Packit e4b6da
add_translation(CHAR codepoint, char *translation)
Packit e4b6da
{
Packit e4b6da
    char *s = strings_buffer_add(&charmap_strings, translation);
Packit e4b6da
    mtable_set(charmap_table, codepoint, s);
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
char *
Packit e4b6da
get_translation(CHAR codepoint)
Packit e4b6da
{
Packit e4b6da
    char *translation = mtable_get(charmap_table, codepoint);
Packit e4b6da
    if(translation != NULL)
Packit e4b6da
        return translation;
Packit e4b6da
    else
Packit e4b6da
        return encode_utf8(codepoint);
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
char *
Packit e4b6da
encode_utf8(CHAR c)
Packit e4b6da
{
Packit e4b6da
    static char buf[7];
Packit e4b6da
    char *p = buf;
Packit e4b6da
    
Packit e4b6da
    if(c < 0x80) {
Packit e4b6da
        *p++ = c;
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else if(c < 0x800) {
Packit e4b6da
        *p++ = 0xC0 | (c>>6);
Packit e4b6da
        *p++ = 0x80 | (c & 0x3F);
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else if(c < 0x10000) {
Packit e4b6da
        *p++ = 0xE0 | (c>>12);
Packit e4b6da
        *p++ = 0x80 | ((c>>6) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | (c & 0x3F);
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else if (c < 0x200000) {
Packit e4b6da
        *p++ = 0xF0 | (c>>18);
Packit e4b6da
        *p++ = 0x80 | ((c>>12) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>6) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | (c & 0x3F);
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else if (c < 0x4000000) {
Packit e4b6da
        *p++ = 0xF8 | (c>>24);
Packit e4b6da
        *p++ = 0x80 | ((c>>18) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>12) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>6) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | (c & 0x3F);
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else if (c < 0x80000000) {
Packit e4b6da
        *p++ = 0xFC | (c>>30);
Packit e4b6da
        *p++ = 0x80 | ((c>>24) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>18) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>12) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | ((c>>6) & 0x3F);
Packit e4b6da
        *p++ = 0x80 | (c & 0x3F);
Packit e4b6da
        *p++ = '\0';
Packit e4b6da
    } else {
Packit e4b6da
        /* Oops */
Packit e4b6da
        abort();
Packit e4b6da
    }
Packit e4b6da
    
Packit e4b6da
    return buf;
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
/* Character classes used by the charmap parser.  The hex-digit ranges are
 * written as ASCII codes: '0'-'9' (0x30-0x39), 'A'-'F' (0x41-0x46) and
 * 'a'-'f' (0x61-0x66).  Both macros may evaluate their argument more than
 * once. */
#define IS_HEXDIGIT(c) (((c) >= 0x30 && (c) <= 0x39) || \
                        ((c) >= 0x41 && (c) <= 0x46) || \
                        ((c) >= 0x61 && (c) <= 0x66))
#define IS_SPACE(c) ((c) == '\t' || (c) == ' ')
Packit e4b6da
Packit e4b6da
void parse_charmap(FILE *stream)
Packit e4b6da
{
Packit e4b6da
    char *buf = NULL;
Packit e4b6da
    size_t bufsize = 0;
Packit e4b6da
    char *p, *c, *t;
Packit e4b6da
    int linecount = 0;
Packit e4b6da
Packit e4b6da
    CHAR codepoint;
Packit e4b6da
    
Packit e4b6da
    while(!feof(stream)) {
Packit e4b6da
        linecount++;
Packit e4b6da
        if(getline(&buf, &bufsize, stream) == -1) {
Packit e4b6da
            if(!feof(stream)) {
Packit e4b6da
                fprintf(stderr, "%s:%s: %s",
Packit e4b6da
                        prog_name, charmap_filename, 
Packit e4b6da
                        strerror(errno));
Packit e4b6da
                exit(2);
Packit e4b6da
            }
Packit e4b6da
            goto nextline;
Packit e4b6da
        }
Packit e4b6da
Packit e4b6da
        /* Chomp newline */
Packit e4b6da
        p = buf + (strlen(buf)-1);
Packit e4b6da
        if(*p == '\n') *p = '\0';
Packit e4b6da
Packit e4b6da
        /* Skip to codepoint */
Packit e4b6da
        for(c = buf; *c && IS_SPACE(*c); c++);
Packit e4b6da
Packit e4b6da
        /* Skip empty lines and comment lines */
Packit e4b6da
        if(*c == '\0' || *c == '#')
Packit e4b6da
            goto nextline;
Packit e4b6da
Packit e4b6da
        t = NULL;
Packit e4b6da
Packit e4b6da
        /* Parse the codepoint (a number in hex) */
Packit e4b6da
        for(p = c; *p; p++) {
Packit e4b6da
            if(!IS_HEXDIGIT(*p)) {
Packit e4b6da
                if(!IS_SPACE(*p)) {
Packit e4b6da
                    fprintf(stderr, "%s:%s:%d: %s",
Packit e4b6da
                            prog_name, charmap_filename, linecount,
Packit e4b6da
                            "(parsing codepoint) invalid hex number\n");
Packit e4b6da
                    goto nextline;
Packit e4b6da
                }
Packit e4b6da
Packit e4b6da
                *p = '\0';
Packit e4b6da
                
Packit e4b6da
                if(sscanf(c, "%x", &codepoint) != 1) {
Packit e4b6da
                    fprintf(stderr, "%s:%s:%d: %s",
Packit e4b6da
                            prog_name, charmap_filename, linecount,
Packit e4b6da
                            "(parsing codepoint) invalid hex number\n");
Packit e4b6da
                    goto nextline;
Packit e4b6da
                }
Packit e4b6da
Packit e4b6da
                t = ++p;
Packit e4b6da
                break;
Packit e4b6da
            }
Packit e4b6da
        }
Packit e4b6da
Packit e4b6da
        if(t) {
Packit e4b6da
            add_translation(codepoint, t);
Packit e4b6da
        } else {
Packit e4b6da
            /* No translation text */
Packit e4b6da
            if(sscanf(c, "%x", &codepoint) != 1) {
Packit e4b6da
                fprintf(stderr, "%s:%s:%d: %s",
Packit e4b6da
                        prog_name, charmap_filename, linecount,
Packit e4b6da
                        "(parsing codepoint) invalid hex number\n");
Packit e4b6da
                goto nextline;
Packit e4b6da
            }
Packit e4b6da
            add_translation(codepoint, "");
Packit e4b6da
        }
Packit e4b6da
nextline: ;
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    if(buf)
Packit e4b6da
        free(buf);
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
CHAR
Packit e4b6da
read_utf8_char(FILE *stream)
Packit e4b6da
{
Packit e4b6da
    CHAR character;
Packit e4b6da
    int b, n, i;
Packit e4b6da
Packit e4b6da
    b = fgetc(stream);
Packit e4b6da
    if(b == EOF)
Packit e4b6da
        return 0xFFFFFFFF;
Packit e4b6da
Packit e4b6da
    /* UTF-8 sequence leading byte */
Packit e4b6da
    if((b & 0xC0) == 0xC0) {
Packit e4b6da
        /* Count bytes and eat lead bits */
Packit e4b6da
        for(n = 0; b & 0x80; b<<=1, n++);
Packit e4b6da
        b = (b & 0xFF) >> n;
Packit e4b6da
Packit e4b6da
        if(n > 6 || n < 2) return 0xFFFD;
Packit e4b6da
Packit e4b6da
        switch(n) {
Packit e4b6da
            case 6: b <<= 6;
Packit e4b6da
            case 5: b <<= 6;
Packit e4b6da
            case 4: b <<= 6;
Packit e4b6da
            case 3: b <<= 6;
Packit e4b6da
            case 2: b <<= 6;
Packit e4b6da
        }
Packit e4b6da
        character = b;
Packit e4b6da
        
Packit e4b6da
        for(i = n; i>1; i--) {
Packit e4b6da
            b = fgetc(stream);
Packit e4b6da
            if(b == EOF) return 0xFFFD;
Packit e4b6da
            if((b & 0xC0) != 0x80) return 0xFFFD;
Packit e4b6da
            b &= 0x3F;
Packit e4b6da
Packit e4b6da
            switch(i) {
Packit e4b6da
                case 6: b <<= 6;
Packit e4b6da
                case 5: b <<= 6;
Packit e4b6da
                case 4: b <<= 6;
Packit e4b6da
                case 3: b <<= 6;
Packit e4b6da
                case 2: ;
Packit e4b6da
            }
Packit e4b6da
            
Packit e4b6da
            character |= b;
Packit e4b6da
        }
Packit e4b6da
Packit e4b6da
        /* Check for overlong sequences */
Packit e4b6da
        switch(n) {
Packit e4b6da
            case 6: if(character < 0x4000000) return 0xFFFD;
Packit e4b6da
            case 5: if(character < 0x200000) return 0xFFFD;
Packit e4b6da
            case 4: if(character < 0x10000) return 0xFFFD;
Packit e4b6da
            case 3: if(character < 0x800) return 0xFFFD;
Packit e4b6da
            case 2: if(character < 0x80) return 0xFFFD;
Packit e4b6da
        }
Packit e4b6da
Packit e4b6da
        return character;
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    /* UTF-8 sequence continuation byte */
Packit e4b6da
    else if((b & 0xC0) == 0x80) {
Packit e4b6da
        return 0xFFFD;
Packit e4b6da
    }
Packit e4b6da
Packit e4b6da
    /* ASCII character */
Packit e4b6da
    else {
Packit e4b6da
        return (CHAR)b;
Packit e4b6da
    }
Packit e4b6da
}
Packit e4b6da
    
Packit e4b6da
void 
Packit e4b6da
translate(FILE *in, FILE *out)
Packit e4b6da
{
Packit e4b6da
    CHAR character;
Packit e4b6da
    
Packit e4b6da
    while(!feof(in))
Packit e4b6da
    {
Packit e4b6da
        character = read_utf8_char(in);
Packit e4b6da
Packit e4b6da
        if(character == 0xFFFFFFFF)
Packit e4b6da
            break;
Packit e4b6da
        
Packit e4b6da
        /* Don't lose null characters in input */
Packit e4b6da
        if(character == 0 && !mtable_get(charmap_table, 0))
Packit e4b6da
            fputc(0, out);
Packit e4b6da
        else
Packit e4b6da
            fputs(get_translation(character), out);
Packit e4b6da
    }
Packit e4b6da
}
Packit e4b6da
Packit e4b6da
#if !HAVE_GETLINE
Packit e4b6da
/*
 * Fallback getline() for systems whose C library does not provide one.
 *
 * Reads from `stream` up to and including the next newline (or up to end of
 * input) into *lineptr, growing the buffer as needed and keeping *n equal
 * to its allocated size.  If *lineptr is NULL or *n is 0, an initial buffer
 * is allocated.  The stored text is always NUL-terminated.
 *
 * Returns the number of bytes stored, not counting the terminating NUL, or
 * -1 if nothing could be read before end of input or if memory ran out.
 * The caller owns *lineptr and must free() it.
 */
ssize_t getline(char **lineptr, size_t *n, FILE *stream)
{
    size_t k = 0;
    int c;

    if(!*lineptr || *n == 0) {
        /* realloc(NULL, ...) behaves like malloc */
        char *fresh = realloc(*lineptr, 256);
        if(!fresh)
            return -1;
        *lineptr = fresh;
        *n = 256;
    }

    for(;;) {
        c = fgetc(stream);
        if(c == EOF) {
            if(k == 0) {
                (*lineptr)[0] = 0;
                return -1;
            }

            break;
        }

        /* Keep room for this byte and for the terminating NUL */
        if(k + 1 >= *n) {
            size_t grown;
            char *p;

            if(*n > (size_t)-1 / 2) {
                (*lineptr)[k] = 0;
                return -1;
            }

            grown = *n * 2;
            p = realloc(*lineptr, grown);
            if(!p) {
                (*lineptr)[k] = 0;
                return -1;
            }

            *lineptr = p;
            /* Record the new capacity; without this the buffer would be
             * grown only once and then overrun. */
            *n = grown;
        }

        (*lineptr)[k++] = (char)c;

        if(c == '\n')
            break;
    }

    (*lineptr)[k] = 0;

    return (ssize_t)k;
}
Packit e4b6da
#endif