/* utf8trans.c - transliterate UTF-8 characters according to a table */
/* vim: sta et sw=4
 */

/*
 * $Id: utf8trans.c,v 1.12 2006/04/13 01:00:01 stevecheng Exp $
 *
 * (C) 2001 Steve Cheng <stevecheng@users.sourceforge.net>
 *
 * See ../COPYING for the copyright status of this software.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#define _GNU_SOURCE     /* For getline */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "mtable.h"
#include "strings_buffer.h"

/* UCS-4 character (one Unicode code point).
 * read_utf8_char() additionally uses 0xFFFFFFFF as its end-of-input
 * marker and 0xFFFD as the replacement for malformed input. */
typedef unsigned int CHAR;

#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif

#ifdef HAVE_GETOPT_LONG
/* Long-option specification handed to getopt_long() in do_options().
 * Every long option reports the same character as its short form in
 * the "vhm" option string, and none of them takes an argument. */
struct option long_options[] =
{
    { "version", 0, 0, 'v' },
    { "help", 0, 0, 'h' },
    { "modify", 0, 0, 'm' },
    
    { 0, 0, 0, 0 }      /* end-of-table marker */
};
#endif

/* Program name (argv[0]); used as the prefix of every diagnostic. */
const char *prog_name;
/* Name of the charmap file; set in main() and used in diagnostics. */
const char *charmap_filename = NULL;
/* Lookup table from code point to translation string. */
mtable_t charmap_table;
/* Passed to mtable_new().  NOTE(review): presumably the number of index
 * bits per table level, zero-terminated (4 x 8 = 32 bits) -- confirm
 * against mtable.h. */
static int charmap_table_exponents[] = { 8, 8, 8, 8, 0 };
/* Storage that owns the translation strings referenced by charmap_table. */
strings_buffer_t charmap_strings;
/* Non-zero when -m/--modify was given: rewrite the input files in place. */
int modify_in_place = 0;

/* Forward declarations of functions defined later in this file. */
int do_options(int argc, char *argv[]);
void print_version(void);
void print_usage();
void add_translation(CHAR codepoint, char *translation);
void parse_charmap(FILE *stream);
char *encode_utf8(CHAR codepoint);
void translate(FILE *in, FILE *out);

#ifndef HAVE_GETLINE
/* Declared here because a replacement getline() is supplied at the end
 * of this file when the C library does not provide one. */
ssize_t getline(char **lineptr, size_t *n, FILE *stream);
#endif

/*
 * Program entry point.
 *
 * Loads the character map named by the first non-option argument, then
 * transliterates every remaining file argument, or standard input when
 * there are none.  Output goes to standard output unless modify_in_place
 * is set, in which case each file is unlinked and rewritten under its
 * own name.
 *
 * Returns 0 on success and 1 if any file could not be rewritten or if
 * writing the output failed.  Failure to open the charmap or an input
 * file terminates the program immediately with status 1.
 */
int
main(int argc, char *argv[])
{
    FILE *charmap_f;
    int argi;           /* index into argv; deliberately not named optind
                           so that it does not shadow getopt's global */
    int status = 0;
    
    prog_name = argv[0];
    argi = do_options(argc, argv);
    
    charmap_table = mtable_new(charmap_table_exponents);
    charmap_strings = strings_buffer_new(4096);
    
    /* Read translation spec */
    charmap_filename = argv[argi];
    charmap_f = fopen(charmap_filename, "r");
    if(!charmap_f) {
        fprintf(stderr, "%s:%s: %s\n", 
                prog_name,
                charmap_filename,
                strerror(errno));
        exit(1);
    }

    parse_charmap(charmap_f);

    fclose(charmap_f);

    argi++;
    
    if(!argv[argi]) {
        translate(stdin, stdout);
    }
    else {
        int i;
        FILE *f, *out;
        for(i = argi; argv[i]; i++) {
            out = stdout;

            f = fopen(argv[i], "r");
            if(!f) {
                fprintf(stderr, "%s:%s: %s\n",
                        prog_name,
                        argv[i],
                        strerror(errno));
                exit(1);
            }

#ifdef HAVE_UNISTD_H
            if(modify_in_place) {
                /* The input stays readable through the open stream f
                   after its name has been removed. */
                if(unlink(argv[i]) < 0) {
                    fprintf(stderr, "%s:%s: %s\n",
                            prog_name,
                            argv[i],
                            strerror(errno));
                    fclose(f);
                    status = 1;
                    continue;
                }
                    
                out = fopen(argv[i], "w");
                if(!out) {
                    fprintf(stderr, "%s:%s: %s\n",
                            prog_name,
                            argv[i],
                            strerror(errno));
                    fclose(f);
                    status = 1;
                    continue;
                }
            }
#endif
            
            translate(f, out);
            fclose(f);

            if(modify_in_place) {
                /* The original file is already gone, so a failed write
                   must not pass silently.  fclose() flushes buffered
                   data and can be the first call to report the error. */
                int write_failed = ferror(out);

                if(fclose(out) != 0)
                    write_failed = 1;

                if(write_failed) {
                    fprintf(stderr, "%s:%s: write error\n",
                            prog_name,
                            argv[i]);
                    status = 1;
                }
            }
        }
    }

    /* Catch errors on standard output (full disk, closed pipe, ...) */
    if(fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "%s: write error on standard output\n", prog_name);
        status = 1;
    }

    mtable_delete(charmap_table);
    strings_buffer_delete(charmap_strings);

    return status;
}

/*
 * Write the version banner and copyright notice to standard output.
 * The package VERSION string is spliced into the first line only when
 * config.h is available.
 */
void
print_version(void)
{
    /* Each entry is written with puts(), which appends one newline. */
    static const char *const banner[] = {
        "utf8trans (part of docbook2X" 
#ifdef HAVE_CONFIG_H
        VERSION
#endif
        ")",

        "$Revision: 1.12 $ $Date: 2006/04/13 01:00:01 $",

        "<URL:http://docbook2x.sourceforge.net/>\n",

        "Copyright (C) 2000-2004 Steve Cheng\n"
        "This is free software; see the source for copying conditions.\n"
        "There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR\n"
        "A PARTICULAR PURPOSE."
    };
    size_t i;

    for(i = 0; i < sizeof banner / sizeof banner[0]; i++)
        puts(banner[i]);
}

/*
 * Write the usage summary to standard output.  The option list is shown
 * only when option parsing is compiled in (HAVE_UNISTD_H), and long
 * option names are listed only when getopt_long() is available.
 */
void
print_usage()
{
#ifdef HAVE_UNISTD_H
#ifdef HAVE_GETOPT_LONG
    static const char option_summary[] =
         "  -m, --modify            modify given files in-place\n"
         "  -v, --version           display version information and exit\n"
         "  -h, --help              display this usage information\n";
#else
    static const char option_summary[] =
         "  -m                      modify given files in-place\n"
         "  -v                      display version information and exit\n"
         "  -h                      display this usage information\n";
#endif
#endif

    printf("Usage: %s [options] CHARMAP [FILES...]\n", prog_name);
    puts("Transliterate UTF-8 characters according to a table.\n");

#ifdef HAVE_UNISTD_H
    puts(option_summary);
#endif

    puts("See utf8trans(1) for details on this program.\n");
}
    
/*
 * Process the command-line options.
 *
 * Returns the index in argv of the first non-option argument, which is
 * the charmap file name.  Exits with status 1 if no charmap argument is
 * present or an unknown option is given; -v and -h print their text and
 * exit with status 0.  -m sets modify_in_place.
 */
int
do_options(int argc, char *argv[])
{
#ifndef HAVE_UNISTD_H
    /* No getopt, so don't process any options.
       They are all trivial, so that justifies ignoring them. */

    if(argc < 2) {
        fprintf(stderr, "%s: must specify charmap\n", prog_name);
        exit(1);
    }
    
    return 1;

#else   /* On a Unix, so have some version of getopt */
    int opt;

    for(;;) {
#ifdef HAVE_GETOPT_LONG
        opt = getopt_long(argc, argv, "vhm", long_options, NULL);
#else
        opt = getopt(argc, argv, "vhm");
#endif  /* HAVE_GETOPT_LONG */

        if(opt == -1)
            break;

        if(opt == 'm') {
            /* --modify */
            modify_in_place = 1;
            continue;
        }

        if(opt == 'v') {
            /* --version */
            print_version();
            exit(0);
        }

        if(opt == 'h') {
            /* --help */
            print_usage();
            exit(0);
        }

        /* '?' (unrecognized option) or anything unexpected */
        exit(1);
    }

    /* The charmap argument is mandatory */
    if(optind >= argc) {
        fprintf(stderr, "%s: must specify charmap\n", prog_name);
        exit(1);
    }

    return optind;

#endif  
}

/*
 * Register `translation` as the replacement text for `codepoint`.
 *
 * The text is first handed to strings_buffer_add() and the pointer it
 * returns is what gets stored in charmap_table.
 * NOTE(review): presumably strings_buffer_add() copies the text, since
 * parse_charmap() passes a pointer into its reusable line buffer --
 * confirm against strings_buffer.h.
 */
void 
add_translation(CHAR codepoint, char *translation)
{
    char *stored;

    stored = strings_buffer_add(&charmap_strings, translation);
    mtable_set(charmap_table, codepoint, stored);
}

/*
 * Return the text to emit for `codepoint`: its entry in charmap_table
 * when one exists, otherwise the UTF-8 encoding of the code point itself.
 * In the latter case the result points at encode_utf8()'s static buffer
 * and is overwritten by the next call.
 */
char *
get_translation(CHAR codepoint)
{
    char *mapped = mtable_get(charmap_table, codepoint);

    return (mapped != NULL) ? mapped : encode_utf8(codepoint);
}

/*
 * Encode code point `c` as a NUL-terminated UTF-8 byte sequence.
 *
 * Sequences of one to six bytes are produced, covering values below
 * 0x80000000; larger values abort the program.  The result lives in a
 * static buffer that is overwritten by the next call.  Code point 0
 * yields an empty string.
 */
char *
encode_utf8(CHAR c)
{
    /* Marker bits of the leading byte, indexed by sequence length */
    static const unsigned char lead_marker[] =
        { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    static char buf[7];
    int length = 0;
    int i;

    if(c < 0x80) {
        length = 1;
    } else if(c < 0x800) {
        length = 2;
    } else if(c < 0x10000) {
        length = 3;
    } else if(c < 0x200000) {
        length = 4;
    } else if(c < 0x4000000) {
        length = 5;
    } else if(c < 0x80000000) {
        length = 6;
    } else {
        /* Oops */
        abort();
    }

    buf[length] = '\0';

    /* Fill the continuation bytes from the last one backwards, six
       payload bits at a time; what is left belongs to the lead byte. */
    for(i = length - 1; i > 0; i--) {
        buf[i] = 0x80 | (c & 0x3F);
        c >>= 6;
    }
    buf[0] = lead_marker[length] | c;

    return buf;
}

/* Character classification used by parse_charmap().
 *
 * IS_HEXDIGIT: 0 to 9, a to f, A to F, written as ASCII code values.
 * IS_SPACE:    space or horizontal tab only.
 *
 * Both macros evaluate their argument more than once, so the argument
 * must not have side effects. */
#define IS_HEXDIGIT(c) (((c) >= 48 && (c) <= 57) || \
                        ((c) >= 97 && (c) <= 102) || \
                        ((c) >= 65 && (c) <= 70))
#define IS_SPACE(c) ((c) == ' ' || (c) == '\t')

/*
 * Read a character map from `stream` and register every entry with
 * add_translation().
 *
 * Each line is: optional leading spaces/tabs, a code point in hex, and
 * optionally one space or tab followed by the translation text, which
 * runs to the end of the line.  A line holding only a code point maps
 * it to the empty string.  Blank lines and lines whose first non-blank
 * character is '#' are skipped.
 *
 * A malformed line is reported on stderr with its line number and then
 * skipped.  A read error terminates the program with status 2.
 */
void parse_charmap(FILE *stream)
{
    char *buf = NULL;
    size_t bufsize = 0;
    ssize_t len;
    int linecount = 0;

    while((len = getline(&buf, &bufsize, stream)) != -1) {
        char *c, *p, *t;
        CHAR codepoint;

        linecount++;

        /* Chomp newline.  The length reported by getline is used rather
           than strlen(), which is 0 for a line that starts with a NUL
           byte and would send the pointer before the buffer. */
        if(len > 0 && buf[len-1] == '\n')
            buf[len-1] = '\0';

        /* Skip to codepoint */
        for(c = buf; IS_SPACE(*c); c++);

        /* Skip empty lines and comment lines */
        if(*c == '\0' || *c == '#')
            continue;

        /* Find the end of the codepoint (a number in hex) */
        for(p = c; IS_HEXDIGIT(*p); p++);

        /* The number must be followed by a separator or end of line */
        if(*p != '\0' && !IS_SPACE(*p)) {
            fprintf(stderr, "%s:%s:%d: %s",
                    prog_name, charmap_filename, linecount,
                    "(parsing codepoint) invalid hex number\n");
            continue;
        }

        if(*p != '\0') {
            /* Cut the line at the separator; the translation text is
               everything after that single character. */
            *p = '\0';
            t = p + 1;
        } else {
            /* No translation text */
            t = "";
        }

        if(sscanf(c, "%x", &codepoint) != 1) {
            fprintf(stderr, "%s:%s:%d: %s",
                    prog_name, charmap_filename, linecount,
                    "(parsing codepoint) invalid hex number\n");
            continue;
        }

        add_translation(codepoint, t);
    }

    /* getline returns -1 both at end of file and on failure */
    if(!feof(stream)) {
        fprintf(stderr, "%s:%s: %s\n",
                prog_name, charmap_filename, 
                strerror(errno));
        exit(2);
    }

    free(buf);
}

/*
 * Read one UTF-8 encoded character from `stream`.
 *
 * Returns the decoded code point; sequences of up to six bytes are
 * accepted.  Returns 0xFFFFFFFF at end of input.  Returns 0xFFFD for
 * malformed input: a continuation byte with no leading byte, the bytes
 * 0xFE and 0xFF, a sequence cut short by end of input or by a byte that
 * is not a continuation byte, and overlong encodings.
 *
 * A byte that cuts a sequence short is pushed back onto the stream, so
 * the next call decodes it as the start of a new character and it is
 * not lost.
 */
CHAR
read_utf8_char(FILE *stream)
{
    /* Smallest code point that needs a sequence of n bytes, indexed by
       n; a smaller decoded value means the encoding was overlong. */
    static const CHAR min_for_length[] =
        { 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
    CHAR character;
    int b, n, i;

    b = fgetc(stream);
    if(b == EOF)
        return 0xFFFFFFFF;

    /* ASCII character */
    if((b & 0x80) == 0)
        return (CHAR)b;

    /* UTF-8 sequence continuation byte without a leading byte */
    if((b & 0xC0) == 0x80)
        return 0xFFFD;

    /* UTF-8 sequence leading byte: the number of leading 1 bits is the
       length of the whole sequence in bytes */
    for(n = 0; (b << n) & 0x80; n++);

    /* 0xFE and 0xFF would announce 7 or 8 bytes */
    if(n > 6)
        return 0xFFFD;

    /* Payload bits of the leading byte */
    character = (CHAR)b & (0x7Fu >> n);

    for(i = 1; i < n; i++) {
        b = fgetc(stream);
        if(b == EOF)
            return 0xFFFD;

        if((b & 0xC0) != 0x80) {
            /* Not part of this sequence; leave it for the next call */
            ungetc(b, stream);
            return 0xFFFD;
        }

        character = (character << 6) | (CHAR)(b & 0x3F);
    }

    /* Check for overlong sequences */
    if(character < min_for_length[n])
        return 0xFFFD;

    return character;
}
    
/*
 * Copy `in` to `out` one UTF-8 character at a time, replacing each
 * character by the text get_translation() gives for it.  Stops when
 * read_utf8_char() reports end of input (0xFFFFFFFF).
 */
void 
translate(FILE *in, FILE *out)
{
    while(!feof(in))
    {
        CHAR ch = read_utf8_char(in);

        if(ch == 0xFFFFFFFF)
            break;

        if(ch != 0 || mtable_get(charmap_table, 0)) {
            fputs(get_translation(ch), out);
        } else {
            /* Don't lose null characters in input: an unmapped NUL
               encodes to an empty string, which fputs would drop. */
            fputc(0, out);
        }
    }
}

#ifndef HAVE_GETLINE
/*
 * Replacement for getline() on systems whose C library lacks it.
 *
 * Reads one line from `stream`, including the trailing newline if there
 * is one, into *lineptr and NUL-terminates it.  *lineptr must be NULL or
 * a buffer obtained from malloc() whose size is *n; the buffer is
 * allocated or enlarged as needed and *n is kept equal to its size.
 * The caller frees *lineptr.
 *
 * Returns the number of characters stored, not counting the terminating
 * NUL, or -1 if nothing could be read (end of file or read error) or if
 * memory ran out.
 */
ssize_t getline(char **lineptr, size_t *n, FILE *stream)
{
    size_t k = 0;       /* number of characters stored so far */
    int c;

    if(!*lineptr || *n == 0) {
        /* realloc(NULL, size) behaves like malloc(size) */
        char *p = realloc(*lineptr, 256);
        if(!p)
            return -1;
        *lineptr = p;
        *n = 256;
    }

    do {
        c = fgetc(stream);
        if(c == EOF) {
            if(k == 0) {
                (*lineptr)[0] = 0;
                return -1;
            }
            
            break;
        }

        /* Keep room for this character plus the terminating NUL */
        if(k + 1 >= *n) {
            size_t newsize;
            char *p;

            if(*n > (size_t)-1 / 2) {
                (*lineptr)[k] = 0;
                return -1;
            }
            newsize = *n * 2;

            p = realloc(*lineptr, newsize);
            if(!p) {
                /* The old buffer is still valid; leave it terminated */
                (*lineptr)[k] = 0;
                return -1;
            }
            *lineptr = p;
            *n = newsize;       /* record the growth, or the next check
                                   compares against a stale size */
        }

        (*lineptr)[k++] = c;
    } while(c != '\n');

    (*lineptr)[k] = 0;
    
    return (ssize_t)k;
}
#endif