Blob Blame History Raw
/*****************************************************************************

NAME:
   iconvert.c -- provide iconv() support for bogofilter's lexer.

AUTHOR:
   David Relson <relson@osagesoftware.com>

******************************************************************************/

/**
 ** Note: 01/07/05
 **
 ** "make check" changes:
 **
 **    t.systest - msg.3.txt
 **		0x92	It’s
 **
 **    t.lexer.mbx - spam.mbx
 **	  msg**4
 **		0xAE	Club®
 **	  msg**20
 **		0xA0	MyNetOffers 
 **		0x93	“Your 
 **		0x94	Account”
 **/

#include "common.h"

#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "buff.h"
#include "iconvert.h"

/* Shared iconv conversion descriptor; declared here, defined outside this
 * section.  iconvert() converts with it and treats a NULL value as
 * "no conversion available". */
extern	iconv_t cd;

/*
 * iconv_print_error -- debug helper that reports a failed iconv() call.
 *
 *   err  errno value captured right after the failing iconv() call
 *   src  input buffer being converted; its text pointer, read offset,
 *        length and size are printed
 *
 * Output goes to dbgout, and only when DEBUG_ICONV(1) is true and err is
 * one of EILSEQ, EINVAL or E2BIG.  Any other value prints nothing.
 */
static void iconv_print_error(int err, buff_t *src)
{
    const char *msg;

    if (!(DEBUG_ICONV(1)))
	return;

    switch (err) {
    case EILSEQ:		/* invalid multibyte sequence */
	msg = "EILSEQ";
	break;
    case EINVAL:		/* incomplete multibyte sequence */
	msg = "EINVAL";
	break;
    case E2BIG:			/* output buffer has no more room */
	msg = "E2BIG";
	break;
    default:			/* other errors are not reported here */
	return;
    }

    /* %p requires a pointer to void, hence the explicit cast */
    fprintf(dbgout, "err: %s (%d), tx: %p, rd: %d, ln: %d, sz: %d\n",
	    msg, err, (void *)src->t.u.text, src->read, src->t.leng, src->size);
}

/*
 * convert -- run iconv() over the unread part of src, appending to dst.
 *
 *   xd   conversion descriptor handed to iconv()
 *   src  input buffer; bytes from offset src->read up to src->t.leng are
 *        converted, and src->read is advanced past what was consumed
 *   dst  output buffer; converted bytes are written starting at offset
 *        dst->t.leng, and dst->t.leng is updated to the new length
 *
 * The loop ends when the input is fully consumed, when no output room is
 * left, or when iconv() fails with E2BIG or an unrecognized error.  On
 * EILSEQ/EINVAL a single input byte is passed through (or replaced by '?'
 * when replace_nonascii_characters is set) and conversion is retried from
 * the following byte.
 */
static void convert(iconv_t xd, buff_t *restrict src, buff_t *restrict dst)
{
    bool done = false;

    while (!done) {
	char * inbuf;
	size_t inbytesleft;

	char * outbuf;
	size_t outbytesleft;
	size_t count;

	/* resume where the previous pass stopped reading */
	inbuf = (char *)src->t.u.text + src->read;
	inbytesleft = src->t.leng - src->read;

	/* append after what dst already holds */
	outbuf = (char *)dst->t.u.text + dst->t.leng;
	outbytesleft = dst->size - dst->read - dst->t.leng;

	/* nothing can be written; leave src->read and dst->t.leng as they are */
	if (outbytesleft == 0)
	    break;

	/*
	 * The iconv function converts one multibyte character at a time, and for
	 * each character conversion it increments *inbuf and decrements
	 * *inbytesleft by the number of converted input bytes, it increments
	 * *outbuf and decrements *outbytesleft by the number of converted output
	 * bytes, and it updates the conversion state contained in the
	 * conversion descriptor. The conversion can stop for four reasons
	 * (listed after the call below):
	 */

	count = iconv(xd, (ICONV_CONST char **)&inbuf, &inbytesleft, &outbuf, &outbytesleft);

	/*
	 * 1. An invalid multibyte sequence is encountered
	 * in the input. In this case it sets errno to
	 * EILSEQ and returns (size_t)(-1). *inbuf is left
	 * pointing to the beginning of the invalid
	 * multibyte sequence.
	 *
	 * 2. The input byte sequence has been entirely
	 * converted, i.e. *inbytesleft has gone down to
	 * 0. In this case iconv returns the number of
	 * non-reversible conversions performed during
	 * this call.
	 *
	 * 3. An incomplete multibyte sequence is
	 * encountered in the input, and the input byte
	 * sequence terminates after it. In this case it
	 * sets errno to EINVAL and returns
	 * (size_t)(-1). *inbuf is left pointing to the
	 * beginning of the incomplete multibyte sequence.
	 *
	 * 4. The output buffer has no more room for the
	 * next converted character. In this case it sets
	 * errno to E2BIG and returns (size_t)(-1).
	 *
	 * A different case is when inbuf is NULL or *inbuf is
	 * NULL, but outbuf is not NULL and *outbuf is not
	 * NULL. In this case, the iconv function attempts to
	 * set the descriptor's conversion state to the initial
	 * state and store a corresponding shift sequence at
	 * *outbuf. At most *outbytesleft bytes, starting at
	 * *outbuf, will be written. If the output buffer has no
	 * more room for this reset sequence, it sets errno to
	 * E2BIG and returns (size_t)(-1). Otherwise it increments
	 * *outbuf and decrements *outbytesleft by the number
	 * of bytes written.
	 *
	 * A third case is when inbuf is NULL or *inbuf is
	 * NULL, and outbuf is NULL or *outbuf is NULL. In this
	 * case, the iconv function sets the descriptor's
	 * conversion state to the initial state.
	 */

	if (count == (size_t)(-1)) {

	    /* save errno before any other call can overwrite it */
	    int err = errno;

	    iconv_print_error(err, src);

	    switch (err) {
	    case EILSEQ:		/* invalid multibyte sequence */
	    case EINVAL:		/* incomplete multibyte sequence */
		/* no room even for the one-byte fallback: stop instead of
		 * writing past the output space (the break below only
		 * leaves the switch; done ends the loop) */
		if (outbytesleft == 0) {
                    done = true;
                    break;
		}
		/* copy 1 byte (or substitute a '?') */
		if (!replace_nonascii_characters)
		    *outbuf = *inbuf;
		else
		    *outbuf = '?';
		/* update counts and pointers */
		inbytesleft -= 1;
		outbytesleft -= 1;
		inbuf  += 1;
		outbuf += 1;
		break;

	    case E2BIG:			/* output buffer has no more room */
					/* TODO:  Provide proper handling of E2BIG */
		done = true;
		break;

	    default:
                /* The Linux man page states that other error codes may
                 * occur, so leave the loop on any unknown error. */
		done = true;
		break;
	    }
	}
	/* publish progress: derive both offsets from the remaining counts */
	src->read = src->t.leng - inbytesleft;
	dst->t.leng = dst->size - dst->read - outbytesleft;

	/* all input consumed */
	if (src->read >= src->t.leng)
	    done = true;
    }

    Z(dst->t.u.text[dst->t.leng]);	/* for easier debugging - removable */

    /* report input that was left unconverted */
    if (DEBUG_ICONV(1) &&
	src->t.leng != src->read)
	fprintf(dbgout, "tx: %p, rd: %d, ln: %d, sz: %d\n",
		src->t.u.text, src->read, src->t.leng, src->size);
}

/*
 * copy -- plain byte copy used when no iconv conversion is performed.
 *
 * dst->t.leng becomes the smaller of src->t.leng and dst->size.  That many
 * bytes, plus D additional byte(s), are copied from the start of src's text
 * to the start of dst's text.  Neither buffer's read offset is consulted.
 */
static void copy(buff_t *restrict src, buff_t *restrict dst)
{
    /* take the source length, clamped to dst->size */
    dst->t.leng = src->t.leng;
    if (dst->t.leng > dst->size)
	dst->t.leng = dst->size;

    memcpy(dst->t.u.text, src->t.u.text, dst->t.leng + D);
}

/*
 * iconvert -- transfer src into dst using the shared descriptor cd.
 *
 *   src  input buffer
 *   dst  output buffer; must not share its text storage with src
 *
 * When cd is usable the text is run through convert(); otherwise it is
 * copied unchanged by copy().  Two values count as "not usable": NULL,
 * and (iconv_t)-1, which is what iconv_open() returns on failure and what
 * iconvert_cd() already checks for.  Handing either one to iconv() would
 * be invalid.
 */
void iconvert(buff_t *restrict src, buff_t *restrict dst)
{
    assert(src->t.u.text != dst->t.u.text);

    if (cd == NULL || cd == (iconv_t)-1)
	copy(src, dst);
    else
	convert(cd, src, dst);
}

/*
 * iconvert_cd -- transfer src into dst using a caller-supplied descriptor.
 *
 *   xd   conversion descriptor; (iconv_t)-1 means "none available"
 *   src  input buffer
 *   dst  output buffer; must not share its text storage with src
 *
 * With a descriptor other than (iconv_t)-1 the text goes through
 * convert(); otherwise it is copied unchanged by copy().
 */
void iconvert_cd(iconv_t xd, buff_t *restrict src, buff_t *restrict dst)
{
    assert(src->t.u.text != dst->t.u.text);

    if (xd != (iconv_t)-1) {
	convert(xd, src, dst);
	return;
    }

    /* no descriptor: pass the bytes through untouched */
    copy(src, dst);
}