Tree - source-git/bogofilter - CentOS Git server

source-git / bogofilter

Files

Commit: 2365ce638f24e2a80af308ec978cdf368ddc099b

Blob Blame History Raw

 /**
 * \file lexer.c
 * bogofilter's lexical analyzer (control routines)
 *
 * \date 2003-01-01 split out of lexer.l
 */
 
#include "common.h"
 
#include <ctype.h>
#include <stdlib.h>
#include <assert.h>
 
#include "base64.h"
#include "bogoconfig.h"
#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#ifndef	DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "lexer.h"
#include "memstr.h"
#include "mime.h"
#include "msgcounts.h"
#include "qp.h"
#include "textblock.h"
#include "token.h"
#include "word.h"
#include "xmalloc.h"
 
/* Global Variables */

/* Line counter shared with the scanner.  In this file it is reset by
 * yyinit() and bumped by yy_get_new_line() and skip_folded_line(). */
extern int yylineno;

/* True at start-up.  This file only reads it: while set, spam-header
 * lines are skipped and MIME body decoding is not attempted. */
bool msg_header = true;
/* Not referenced anywhere else in this file. */
bool have_body  = false;
/* Active lexer; yyinit() points it at v3_lexer unless msg_count_file is set. */
lexer_t *lexer = NULL;

/* Local Variables */

/* Pairs the scanner entry point yylex with lexer_v3_get_token. */
static lexer_t v3_lexer = {
    yylex,
    lexer_v3_get_token
};

/* Pairs read_msg_count_line with msg_count_get_token.  Has external
 * linkage; it is not selected anywhere in this file. */
lexer_t msg_count_lexer = {
    read_msg_count_line,
    msg_count_get_token
};

/* Function Prototypes */

static int yy_get_new_line(buff_t *buff);
static int get_decoded_line(buff_t *buff);
static int skip_folded_line(buff_t *buff);
 
/* Function Definitions */
 
/**
 * Reset the lexer-related subsystems.
 *
 * Calls, in this order, mime_reset(), token_init(), lexer_v3_init(NULL)
 * and init_charset_table(charset_default).  What each of those resets
 * is defined outside this file.
 */
void lexer_init(void)
{
    mime_reset();
    token_init();
    lexer_v3_init(NULL);
    init_charset_table(charset_default);
}
 
/**
 * Debug helper: write a one-line dump of \a buff to dbgout.
 *
 * The prefix shows yylineno-1, 'h' or 'b' depending on msg_header, the
 * value of yy_get_state() and the number of bytes between buff->read
 * and buff->t.leng.  A newline is appended when the buffer text does
 * not already end in one.
 */
static void lexer_display_buffer(buff_t *buff)
{
    const char part = msg_header ? 'h' : 'b';
    const long pending = (long)(buff->t.leng - buff->read);

    fprintf(dbgout, "*** %2d %c%c %2ld ",
            yylineno-1, part, yy_get_state(), pending);
    buff_puts(buff, 0, dbgout);

    /* an empty buffer has no last byte to inspect */
    if (buff->t.leng == 0)
        return;
    if (buff->t.u.text[buff->t.leng-1] != '\n')
        fputc('\n', dbgout);
}
 
/**
 * Tell whether the first \a count bytes of \a buf (or the bytes up to
 * the first NUL, whichever comes first) contain no token separator.
 *
 * A separator is any control, whitespace or punctuation character
 * other than '_'.  Such separator-free runs can make the scanner abort
 * with "input buffer overflow, can't enlarge buffer because scanner
 * uses REJECT".
 *
 * \return true when no separator was seen, false otherwise
 *
 * \bug this function must go, we need to fix the lexer
 */
static bool long_token(byte *buf, uint count)
{
    const byte *cur = buf;
    const byte *const stop = buf + count;

    /* Stopping at NUL dates from the 10/23/05 fix for a SIGSEGV with
    ** msg.1023.6479.txt, evidently caused by the 09/07/05 patch for
    ** 0.96.2.
    */
    for (; cur < stop && *cur != '\0'; cur++) {
        const byte c = *cur;

        if (c == '_')
            continue;
        if (iscntrl(c) || isspace(c) || ispunct(c))
            return false;
    }
    return true;
}
 
/**
 * Fetch one raw line through reader_getline into \a buff.
 *
 * Side effects visible here: yylineno is advanced for non-empty reads,
 * the scanner is put back into its initial state when the line is
 * recognised as a MIME boundary, and lines that start with
 * spam_header_name are dropped while in the top-level message header.
 *
 * \return the count reported by the reader (or by skip_folded_line()
 *         when a spam header was skipped), YY_NULL at end of input.
 *         Exits with EX_ERROR when the input stream is in error.
 */
static int yy_get_new_line(buff_t *buff)
{
    static size_t hdrlen = 0;           /* cached strlen(spam_header_name) */

    int count = (*reader_getline)(buff);
    const byte *buf = buff->t.u.text;

    if (hdrlen == 0)
        hdrlen = strlen(spam_header_name);

    if (count > 0)
        yylineno += 1;

    if (count == EOF) {
        if (fpin != NULL && ferror(fpin)) {
            print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
            exit(EX_ERROR);
        }
        return YY_NULL;
    }

    /* Mime header check needs to be performed on raw input
    ** -- before mime decoding.  Without it, flex aborts:
    ** "fatal flex scanner internal error--end of buffer missed" */
    if (buff->t.leng > 2 && buf[0] == '-' && buf[1] == '-') {
        if (got_mime_boundary(&buff->t))
            yy_set_state_initial();
    }

    if (count >= 0 && DEBUG_LEXER(0))
        lexer_display_buffer(buff);

    /* skip spam_header ("X-Bogosity:") lines */
    for (;;) {
        if (!msg_header || count == EOF)
            break;
        /* don't skip if inside message/rfc822 */
        if (msg_state->parent != NULL)
            break;
        if (buff->t.leng < hdrlen)
            break;
        if (memcmp(buff->t.u.text, spam_header_name, hdrlen) != 0)
            break;
        count = skip_folded_line(buff);
    }

    return count;
}
 
/**
 * Read the next input line and run it through the decoding pipeline.
 *
 * Steps, as far as this function is concerned:
 *  - choose the buffer to read into (\a buff itself, or a private
 *    scratch buffer when the text is going to be converted to unicode),
 *  - read a line via yy_get_new_line(),
 *  - hand the raw bytes to textblock_add() in passthrough mode,
 *  - MIME-decode body lines in place via mime_decode(),
 *  - convert to unicode into \a buff via iconvert(),
 *  - turn a trailing CRLF into a single NL.
 *
 * \return number of bytes now available, YY_NULL at end of input, or
 *         -2 when the unicode conversion produced no output for a
 *         non-empty line (the caller should keep reading).
 *         Exits with EX_ERROR on a stream error or when the scratch
 *         buffer cannot be allocated.
 */
static int get_decoded_line(buff_t *buff)
{
    int count;
    buff_t *linebuff;
    /* since msg_state might change during calls */
    bool mime_dont_decode = msg_state->mime_dont_decode;

#ifdef DISABLE_UNICODE
    linebuff = buff;
#else
    if (encoding == E_RAW ||
        mime_dont_decode ) {
        linebuff = buff;
    }
    else {
        static buff_t *tempbuff = NULL;

        if (tempbuff == NULL) {
            tempbuff = (buff_t *) calloc(1, sizeof(buff_t));
            /* calloc is not one of the x* wrappers, so check it here
             * instead of dereferencing a NULL pointer below */
            if (tempbuff == NULL) {
                print_error(__FILE__, __LINE__, "out of memory\n");
                exit(EX_ERROR);
            }
        }

        /* UTF-8 uses up to six octets per character.  Make input buffer
         * sufficiently small that the UTF-8 text can fit in the output
         * buffer */
        if (tempbuff->size < buff->size / 6) {
            xfree(tempbuff->t.u.text);
            tempbuff->size = buff->size / 6;
            tempbuff->t.u.text = (byte *) xmalloc(tempbuff->size+D);
        }

        tempbuff->t.leng = tempbuff->read = 0;
        linebuff = tempbuff;
    }
#endif

    /* note that this call might invoke got_mimeboundary() thus
     * changing the global msg_state variable */
    count = yy_get_new_line(linebuff);

    if (count == EOF) {
        /* same NULL guard as yy_get_new_line(): fpin may be unset */
        if (fpin == NULL || !ferror(fpin))
            return YY_NULL;
        else {
            print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
            exit(EX_ERROR);
        }
    }

    /* Save the text on a linked list of lines.
     * Note that we store fixed-length blocks here, not lines.
     * One very long physical line could break up into more
     * than one of these. */

    if (passthrough && count > 0)
        textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count);

    if ( !msg_header &&
         !mime_dont_decode &&
         msg_state->mime_type != MIME_TYPE_UNKNOWN)
    {
        word_t temp;
        uint decoded_count;

        temp.leng = (uint) count;
        temp.u.text = linebuff->t.u.text+linebuff->read;

        decoded_count = mime_decode(&temp);
        /*change buffer size only if the decoding worked */
        if (decoded_count != 0 && decoded_count < (uint) count) {
            linebuff->t.leng -= (uint) (count - decoded_count);
            count = (int) decoded_count;
            if (DEBUG_LEXER(1))
                lexer_display_buffer(linebuff);
        }
    }

#ifndef DISABLE_UNICODE
    if (encoding == E_UNICODE &&
        !mime_dont_decode &&
        count > 0)
    {
        iconvert(linebuff, buff);

        /* If we return count = 0 here, the caller will think we have
         * no more bytes left to read, even though before the iconvert
         * call we had a positive number of bytes. This *will* lead to
         * a message truncation which we try to avoid by simply
         * returning another in-band error code. */
        if (buff->t.leng == 0) {
            count = -2;
        } else {
            /* iconvert, treating multi-byte sequences, can shrink or enlarge
             * the output compared to its input.  Correct count. */
            count = buff->t.leng;
        }
    }
#endif

#ifdef EXCESSIVE_DEBUG
    /* debug */
    fprintf(dbgout, "%d: ", count);
    buff_puts(buff, 0, dbgout);
    fprintf(dbgout, "\n");
#endif

    /* CRLF -> NL */
    if (count >= 2) {
        byte *buf = buff->t.u.text;
        if (memcmp(buf + count - 2, CRLF, 2) == 0) {
            count --;
            --buff->t.leng;
            *(buf + count - 1) = (byte) '\n';
        }
    }

    if (buff->t.leng < buff->size)     /* for easier debugging - removable */
        Z(buff->t.u.text[buff->t.leng]);  /* for easier debugging - removable */

    return count;
}
 
/**
 * Discard the continuation lines of a folded header.
 *
 * Keeps reading lines into \a buff (overwriting it each time) until one
 * does not start with SP or HTAB, or is an empty line ending the
 * header, or the reader reports that nothing was read.
 *
 * \return the reader's count for the line left in \a buff; a value
 *         <= 0 means no line was read and the buffer holds no new data.
 */
static int skip_folded_line(buff_t *buff)
{
    for (;;) {
        int count;
        buff->t.leng = 0;
        count = reader_getline(buff);
        yylineno += 1;
        /* Nothing was read (EOF or error): the buffer still holds the
         * previous line, so looking at it could keep us looping on a
         * stale continuation line forever. */
        if (count <= 0)
            return count;
        /* only check for LWSP-char (RFC-822) aka. WSP (RFC-2822),
         * these only include SP and HTAB */
        if (buff->t.u.text[0] != ' ' &&
            buff->t.u.text[0] != '\t')
            return count;
        /* Check for empty line which terminates message header */
        if (is_eol((char *)buff->t.u.text, count))
            return count;
    }
}
 
/**
 * Prepare for scanning a new input: restart line counting and, unless
 * msg_count_file is set, select the v3 lexer.
 */
void yyinit(void)
{
    yylineno = 0;

    /* a message-count file keeps whatever lexer is already selected */
    if (msg_count_file)
        return;

    lexer = &v3_lexer;
}
 
/**
 * Input getter for the scanner.
 *
 * Fills \a buf (capacity \a size) with decoded text obtained from
 * get_decoded_line().  \a used is only consulted for the "nearly full"
 * test below.
 *
 * \return number of bytes placed in \a buf, 0 at end of input
 */
int yyinput(byte *buf, size_t used, size_t size)
{
    int cnt;
    int count = 0;
    buff_t buff;

    buff_init(&buff, buf, 0, (uint) size);

    /* After reading a line of text, check if it has special characters.
     * If not, trim some, but leave enough to match a max length token.
     * Then read more text.  This will ensure that a really long sequence
     * of alphanumerics, which bogofilter will ignore anyway, doesn't crash
     * the flex lexer.
     */

    while ((cnt = get_decoded_line(&buff)) != 0) {
        /* A negative value is get_decoded_line()'s in-band signal that
         * a non-empty line converted to nothing.  Returning 0 for it
         * would look like end of input to the scanner and truncate the
         * message, so fetch the next line instead. */
        if (cnt < 0) {
            count = (int) buff.t.leng;
            continue;
        }

        count = buff.t.leng;

        /* Note: some malformed messages can cause xfgetsl() to report
        ** "Invalid buffer size, exiting."  and then abort.  This
        ** can happen when the parser is in html mode and there's a
        ** leading '<' but no closing '>'.
        **
        ** The "fix" is to check for a nearly full lexer buffer and
        ** discard most of it.
        */

        /* if not nearly full */
        if (used < 1000 || used < size * 10)
            break;

        if (count >= MAX_TOKEN_LEN * 2 &&
            long_token(buff.t.u.text, (uint) count)) {
            /* Make sure not to shift bytes outside the buffer */
            if (buff.t.leng >= (uint) count) {
                    uint start = buff.t.leng - count;
                    uint length = count - max_token_len;
                    buff_shift(&buff, start, length);
            }
            count = buff.t.leng;
        }
        else
            break;
    }

    if (msg_state &&
        msg_state->mime_dont_decode &&
        (msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN)) {
        assert(size <= INT_MAX && count <= (int)size);
        return (count == EOF ? 0 : count);   /* not decode at all */
    }

#if defined(CP866) && !defined(ENABLE_ICONV)
    /* EK -  decoding things like &#1084 and charset_table */
    count = decode_and_htmlUNICODE_to_cp866(buf, count);
#endif

    if (replace_nonascii_characters) {
        /* do non-ascii replacement */
        int i;
        for (i = 0; i < count; i++ )
        {
            byte ch = buf[i];
            buf[i] = charset_table[ch];
        }
    }

    if (DEBUG_LEXER(2))
        fprintf(dbgout, "*** yyinput(\"%-.*s\", %lu, %lu) = %d\n", count, buf, (unsigned long)used, (unsigned long)size, count);

    assert(size <= INT_MAX && count <= (int)size);
    return (count == EOF ? 0 : count);
}
 
/**
 * Copy \a len bytes starting at \a txt into a NUL-terminated string.
 *
 * The result lives in a single buffer owned by this function; it is
 * reused (and possibly moved) by the next call, so the caller must not
 * free it or keep it across calls.
 *
 * \param txt  start of the charset name (need not be NUL-terminated)
 * \param len  number of bytes to copy
 * \return pointer to the function-owned, NUL-terminated copy
 */
static char *charset_as_string(const byte *txt, const size_t len)
{
    static char *charset_text = NULL;
    /* Capacity excluding the terminator.  Must be as wide as len: a
     * narrower counter would wrap for long names and leave the buffer
     * smaller than the memcpy below. */
    static size_t charset_leng = 0;

    if (charset_text == NULL) {
        charset_text = (char *)xmalloc(len + 1);
        charset_leng = len;
    }
    else if (charset_leng < len) {
        charset_text = (char *)xrealloc(charset_text, len + 1);
        charset_leng = len;
    }

    memcpy(charset_text, txt, len);
    /* terminate unconditionally; the result is used as a C string */
    charset_text[len] = '\0';

    return charset_text;
}
 
word_t *text_decode(word_t *w)
{
    word_t *r = w;
    byte *const beg = w->u.text;		/* base pointer, fixed */
    byte *const fin = beg + w->leng;	/* end+1 position */
 
    byte *txt = (byte *) memstr(w->u.text, w->leng, "=?");	/* input position */
    uint size = (uint) (txt - beg);				/* output offset */
 
#ifndef	DISABLE_UNICODE
    size_t max = w->leng * 4;
    static buff_t * buf = NULL;
#endif
 
    if (txt == NULL)
	return r;
 
#ifndef	DISABLE_UNICODE
    if (encoding == E_UNICODE) {
	if (buf == NULL)
	    buf = buff_new((byte *)xmalloc(max+D), 0, max);
	r = &buf->t;				/* Use buf to return unicode result */
 
	buf->t.leng = 0;
	if (buf->size < max) {
	    buf->size = max;
	    buf->t.u.text = (byte *) xrealloc(buf->t.u.text, buf->size+D);
	}
 
	buf->t.leng = size;
	memcpy(buf->t.u.text, beg, size );
	Z(buf->t.u.text[buf->t.leng]);		/* for easier debugging - removable */
    }
#endif
 
    if (DEBUG_LEXER(2)) {
	fputs("**1**  ", dbgout);
	word_puts(w, 0, dbgout);
	fputs("\n", dbgout);
    }
 
    while (txt < fin) {
	byte *typ, *tmp, *end;
	uint len;
	bool adjacent;
 
	char *charset;
 
	txt += 2;
	typ = (byte *) memchr((char *)txt+1, '?', fin-txt);	/* Encoding type - 'B' or 'Q' */
	*typ++ = '\0';						/* nul terminate */
 
	charset = charset_as_string(txt, typ - txt - 1);
 
	tmp = typ + 2;						/* start of encoded word */
	end = (byte *) memstr((char *)tmp, fin-tmp, "?=");	/* last byte of encoded word  */
	len = end - tmp;
 
	w->u.text = tmp;				/* Start of encoded word */
	w->leng = len;				/* Length of encoded word */
	Z(w->u.text[w->leng]);			/* for easier debugging - removable */
 
	if (DEBUG_LEXER(2)) {
	    fputs("**2**  ", dbgout);
	    word_puts(w, 0, dbgout);
	    fputs("\n", dbgout);
	}
 
	switch (tolower(*typ)) {		/* ... encoding type */
	case 'b':
	    if (base64_validate(w))
		len = base64_decode(w);		/* decode base64 */
	    break;
	case 'q':
	    if (qp_validate(w, RFC2047))
		len = qp_decode(w, RFC2047);	/* decode quoted-printable */
	    break;
	}
 
	/* move decoded word to where the encoded used to be */
	if (encoding == E_RAW) {
	    memmove(beg+size, w->u.text, len);
	    size += len;			/* bump output pointer */
	    Z(beg[size]);			/* for easier debugging - removable */
 
	    if (DEBUG_LEXER(3))
		fprintf(dbgout, "**3**  %s\n", beg);
	}
 
#ifndef	DISABLE_UNICODE
	if (encoding == E_UNICODE) {
	    iconv_t cd;
	    buff_t  src;
 
	    /* convert 'word_t *w' to 'buff_t src' because
	    ** iconvert_cd() needs buff_t pointers
	    */
	    src.t.u.text = w->u.text;
	    src.t.leng = len;
	    src.read   = 0;
	    src.size   = len;
 
	    cd = bf_iconv_open( charset_unicode, charset );
	    iconvert_cd(cd, &src, buf);
	    iconv_close(cd);
 
	    if (DEBUG_LEXER(3)) {
		fputs("**4**  ", dbgout);
		word_puts(&buf->t, 0, dbgout);
		fputs("\n", dbgout);
	    }
	}
#endif
 
	txt = end + 2;	/* skip ?= trailer */
	if (txt >= fin)
	    break;
 
	/* check for next encoded word */
	end = (byte *) memstr((char *)txt, fin-txt, "=?");
	adjacent = end != NULL;
 
	/* clear adjacent flag if non-whitespace character found between
	 * adjacent encoded words */
	if (adjacent) {
	    tmp = txt;
	    while (adjacent && tmp < end) {
		if (*tmp && strchr(" \t\r\n", *tmp))
		    tmp += 1;
		else
		    adjacent = false;
	    }
	}
 
	/* we have a next encoded word and we've had only whitespace
	 * between the current and the next */
	if (adjacent)
	    /* just skip whitespace */
	    txt = end;
	else
	    /* copy everything that was between the encoded words */
	    while (txt < end) {
		if (encoding == E_RAW)
		    beg[size++] = *txt++;
#ifndef	DISABLE_UNICODE
		if (encoding == E_UNICODE)
		    buf->t.u.text[buf->t.leng++] = *txt++;
#endif
	    }
    }
 
    if (encoding == E_RAW) {
	r->u.text = beg;
	r->leng = size;
    }
 
    return r;
}
 
/*
 * The following sets edit modes for GNU EMACS
 * Local Variables:
 * mode:c
 * End:
 */

	/**
	* \file lexer.c
	* bogofilter's lexical analyzer (control routines)
	*
	* \date 2003-01-01 split out of lexer.l
	*/

	#include "common.h"

	#include <ctype.h>
	#include <stdlib.h>
	#include <assert.h>

	#include "base64.h"
	#include "bogoconfig.h"
	#include "bogoreader.h"
	#include "charset.h"
	#include "error.h"
	#ifndef DISABLE_UNICODE
	#include "convert_unicode.h"
	#include "iconvert.h"
	#endif
	#include "lexer.h"
	#include "memstr.h"
	#include "mime.h"
	#include "msgcounts.h"
	#include "qp.h"
	#include "textblock.h"
	#include "token.h"
	#include "word.h"
	#include "xmalloc.h"

	/* Global Variables */

	/* Line counter shared with the scanner.  In this file it is reset by
	 * yyinit() and bumped by yy_get_new_line() and skip_folded_line(). */
	extern int yylineno;

	/* True at start-up.  This file only reads it: while set, spam-header
	 * lines are skipped and MIME body decoding is not attempted. */
	bool msg_header = true;
	/* Not referenced anywhere else in this file. */
	bool have_body = false;
	/* Active lexer; yyinit() points it at v3_lexer unless msg_count_file is set. */
	lexer_t *lexer = NULL;

	/* Local Variables */

	/* Pairs the scanner entry point yylex with lexer_v3_get_token. */
	static lexer_t v3_lexer = {
	yylex,
	lexer_v3_get_token
	};

	/* Pairs read_msg_count_line with msg_count_get_token.  Has external
	 * linkage; it is not selected anywhere in this file. */
	lexer_t msg_count_lexer = {
	read_msg_count_line,
	msg_count_get_token
	};

	/* Function Prototypes */

	static int yy_get_new_line(buff_t *buff);
	static int get_decoded_line(buff_t *buff);
	static int skip_folded_line(buff_t *buff);

	/* Function Definitions */

	/**
	 * Reset the lexer-related subsystems.
	 *
	 * Calls, in this order, mime_reset(), token_init(), lexer_v3_init(NULL)
	 * and init_charset_table(charset_default).  What each of those resets
	 * is defined outside this file.
	 */
	void lexer_init(void)
	{
	mime_reset();
	token_init();
	lexer_v3_init(NULL);
	init_charset_table(charset_default);
	}

	/**
	 * Debug helper: write a one-line dump of \a buff to dbgout.
	 *
	 * The prefix shows yylineno-1, 'h' or 'b' depending on msg_header, the
	 * value of yy_get_state() and the number of bytes between buff->read
	 * and buff->t.leng.  A newline is appended when the buffer text does
	 * not already end in one.
	 */
	static void lexer_display_buffer(buff_t *buff)
	{
	    const char part = msg_header ? 'h' : 'b';
	    const long pending = (long)(buff->t.leng - buff->read);

	    fprintf(dbgout, "*** %2d %c%c %2ld ",
	            yylineno-1, part, yy_get_state(), pending);
	    buff_puts(buff, 0, dbgout);

	    /* an empty buffer has no last byte to inspect */
	    if (buff->t.leng == 0)
	        return;
	    if (buff->t.u.text[buff->t.leng-1] != '\n')
	        fputc('\n', dbgout);
	}

	/**
	 * Tell whether the first \a count bytes of \a buf (or the bytes up to
	 * the first NUL, whichever comes first) contain no token separator.
	 *
	 * A separator is any control, whitespace or punctuation character
	 * other than '_'.  Such separator-free runs can make the scanner abort
	 * with "input buffer overflow, can't enlarge buffer because scanner
	 * uses REJECT".
	 *
	 * \return true when no separator was seen, false otherwise
	 *
	 * \bug this function must go, we need to fix the lexer
	 */
	static bool long_token(byte *buf, uint count)
	{
	    const byte *cur = buf;
	    const byte *const stop = buf + count;

	    /* Stopping at NUL dates from the 10/23/05 fix for a SIGSEGV with
	    ** msg.1023.6479.txt, evidently caused by the 09/07/05 patch for
	    ** 0.96.2.
	    */
	    for (; cur < stop && *cur != '\0'; cur++) {
	        const byte c = *cur;

	        if (c == '_')
	            continue;
	        if (iscntrl(c) || isspace(c) || ispunct(c))
	            return false;
	    }
	    return true;
	}

	/**
	 * Fetch one raw line through reader_getline into \a buff.
	 *
	 * Side effects visible here: yylineno is advanced for non-empty reads,
	 * the scanner is put back into its initial state when the line is
	 * recognised as a MIME boundary, and lines that start with
	 * spam_header_name are dropped while in the top-level message header.
	 *
	 * \return the count reported by the reader (or by skip_folded_line()
	 *         when a spam header was skipped), YY_NULL at end of input.
	 *         Exits with EX_ERROR when the input stream is in error.
	 */
	static int yy_get_new_line(buff_t *buff)
	{
	    static size_t hdrlen = 0;           /* cached strlen(spam_header_name) */

	    int count = (*reader_getline)(buff);
	    const byte *buf = buff->t.u.text;

	    if (hdrlen == 0)
	        hdrlen = strlen(spam_header_name);

	    if (count > 0)
	        yylineno += 1;

	    if (count == EOF) {
	        if (fpin == NULL || !ferror(fpin)) {
	            return YY_NULL;
	        }
	        else {
	            print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
	            exit(EX_ERROR);
	        }
	    }

	    /* Mime header check needs to be performed on raw input
	    ** -- before mime decoding. Without it, flex aborts:
	    ** "fatal flex scanner internal error--end of buffer missed" */

	    if (buff->t.leng > 2 &&
	        buf[0] == '-' && buf[1] == '-' &&
	        got_mime_boundary(&buff->t)) {
	        yy_set_state_initial();
	    }

	    if (count >= 0 && DEBUG_LEXER(0))
	        lexer_display_buffer(buff);

	    /* skip spam_header ("X-Bogosity:") lines */
	    while (msg_header
	           && count != EOF
	           /* don't skip if inside message/rfc822 */
	           && msg_state->parent == NULL
	           && buff->t.leng >= hdrlen
	           && memcmp(buff->t.u.text,spam_header_name,hdrlen) == 0) {
	        count = skip_folded_line(buff);
	    }

	    return count;
	}

	/**
	 * Read the next input line and run it through the decoding pipeline.
	 *
	 * Steps, as far as this function is concerned:
	 *  - choose the buffer to read into (\a buff itself, or a private
	 *    scratch buffer when the text is going to be converted to unicode),
	 *  - read a line via yy_get_new_line(),
	 *  - hand the raw bytes to textblock_add() in passthrough mode,
	 *  - MIME-decode body lines in place via mime_decode(),
	 *  - convert to unicode into \a buff via iconvert(),
	 *  - turn a trailing CRLF into a single NL.
	 *
	 * \return number of bytes now available, YY_NULL at end of input, or
	 *         -2 when the unicode conversion produced no output for a
	 *         non-empty line (the caller should keep reading).
	 *         Exits with EX_ERROR on a stream error or when the scratch
	 *         buffer cannot be allocated.
	 */
	static int get_decoded_line(buff_t *buff)
	{
	    int count;
	    buff_t *linebuff;
	    /* since msg_state might change during calls */
	    bool mime_dont_decode = msg_state->mime_dont_decode;

	#ifdef DISABLE_UNICODE
	    linebuff = buff;
	#else
	    if (encoding == E_RAW ||
	        mime_dont_decode ) {
	        linebuff = buff;
	    }
	    else {
	        static buff_t *tempbuff = NULL;

	        if (tempbuff == NULL) {
	            tempbuff = (buff_t *) calloc(1, sizeof(buff_t));
	            /* calloc is not one of the x* wrappers, so check it here
	             * instead of dereferencing a NULL pointer below */
	            if (tempbuff == NULL) {
	                print_error(__FILE__, __LINE__, "out of memory\n");
	                exit(EX_ERROR);
	            }
	        }

	        /* UTF-8 uses up to six octets per character. Make input buffer
	         * sufficiently small that the UTF-8 text can fit in the output
	         * buffer */
	        if (tempbuff->size < buff->size / 6) {
	            xfree(tempbuff->t.u.text);
	            tempbuff->size = buff->size / 6;
	            tempbuff->t.u.text = (byte *) xmalloc(tempbuff->size+D);
	        }

	        tempbuff->t.leng = tempbuff->read = 0;
	        linebuff = tempbuff;
	    }
	#endif

	    /* note that this call might invoke got_mimeboundary() thus
	     * changing the global msg_state variable */
	    count = yy_get_new_line(linebuff);

	    if (count == EOF) {
	        /* same NULL guard as yy_get_new_line(): fpin may be unset */
	        if (fpin == NULL || !ferror(fpin))
	            return YY_NULL;
	        else {
	            print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
	            exit(EX_ERROR);
	        }
	    }

	    /* Save the text on a linked list of lines.
	     * Note that we store fixed-length blocks here, not lines.
	     * One very long physical line could break up into more
	     * than one of these. */

	    if (passthrough && count > 0)
	        textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count);

	    if ( !msg_header &&
	         !mime_dont_decode &&
	         msg_state->mime_type != MIME_TYPE_UNKNOWN)
	    {
	        word_t temp;
	        uint decoded_count;

	        temp.leng = (uint) count;
	        temp.u.text = linebuff->t.u.text+linebuff->read;

	        decoded_count = mime_decode(&temp);
	        /* change buffer size only if the decoding worked */
	        if (decoded_count != 0 && decoded_count < (uint) count) {
	            linebuff->t.leng -= (uint) (count - decoded_count);
	            count = (int) decoded_count;
	            if (DEBUG_LEXER(1))
	                lexer_display_buffer(linebuff);
	        }
	    }

	#ifndef DISABLE_UNICODE
	    if (encoding == E_UNICODE &&
	        !mime_dont_decode &&
	        count > 0)
	    {
	        iconvert(linebuff, buff);

	        /* If we return count = 0 here, the caller will think we have
	         * no more bytes left to read, even though before the iconvert
	         * call we had a positive number of bytes. This will lead to
	         * a message truncation which we try to avoid by simply
	         * returning another in-band error code. */
	        if (buff->t.leng == 0) {
	            count = -2;
	        } else {
	            /* iconvert, treating multi-byte sequences, can shrink or enlarge
	             * the output compared to its input. Correct count. */
	            count = buff->t.leng;
	        }
	    }
	#endif

	#ifdef EXCESSIVE_DEBUG
	    /* debug */
	    fprintf(dbgout, "%d: ", count);
	    buff_puts(buff, 0, dbgout);
	    fprintf(dbgout, "\n");
	#endif

	    /* CRLF -> NL */
	    if (count >= 2) {
	        byte *buf = buff->t.u.text;
	        if (memcmp(buf + count - 2, CRLF, 2) == 0) {
	            count --;
	            --buff->t.leng;
	            *(buf + count - 1) = (byte) '\n';
	        }
	    }

	    if (buff->t.leng < buff->size) /* for easier debugging - removable */
	        Z(buff->t.u.text[buff->t.leng]); /* for easier debugging - removable */

	    return count;
	}

	/**
	 * Discard the continuation lines of a folded header.
	 *
	 * Keeps reading lines into \a buff (overwriting it each time) until one
	 * does not start with SP or HTAB, or is an empty line ending the
	 * header, or the reader reports that nothing was read.
	 *
	 * \return the reader's count for the line left in \a buff; a value
	 *         <= 0 means no line was read and the buffer holds no new data.
	 */
	static int skip_folded_line(buff_t *buff)
	{
	    for (;;) {
	        int count;
	        buff->t.leng = 0;
	        count = reader_getline(buff);
	        yylineno += 1;
	        /* Nothing was read (EOF or error): the buffer still holds the
	         * previous line, so looking at it could keep us looping on a
	         * stale continuation line forever. */
	        if (count <= 0)
	            return count;
	        /* only check for LWSP-char (RFC-822) aka. WSP (RFC-2822),
	         * these only include SP and HTAB */
	        if (buff->t.u.text[0] != ' ' &&
	            buff->t.u.text[0] != '\t')
	            return count;
	        /* Check for empty line which terminates message header */
	        if (is_eol((char *)buff->t.u.text, count))
	            return count;
	    }
	}

	/**
	 * Prepare for scanning a new input: restart line counting and, unless
	 * msg_count_file is set, select the v3 lexer.
	 */
	void yyinit(void)
	{
	    yylineno = 0;

	    /* a message-count file keeps whatever lexer is already selected */
	    if (msg_count_file)
	        return;

	    lexer = &v3_lexer;
	}

	/**
	 * Input getter for the scanner.
	 *
	 * Fills \a buf (capacity \a size) with decoded text obtained from
	 * get_decoded_line().  \a used is only consulted for the "nearly full"
	 * test below.
	 *
	 * \return number of bytes placed in \a buf, 0 at end of input
	 */
	int yyinput(byte *buf, size_t used, size_t size)
	{
	    int cnt;
	    int count = 0;
	    buff_t buff;

	    buff_init(&buff, buf, 0, (uint) size);

	    /* After reading a line of text, check if it has special characters.
	     * If not, trim some, but leave enough to match a max length token.
	     * Then read more text. This will ensure that a really long sequence
	     * of alphanumerics, which bogofilter will ignore anyway, doesn't crash
	     * the flex lexer.
	     */

	    while ((cnt = get_decoded_line(&buff)) != 0) {
	        /* A negative value is get_decoded_line()'s in-band signal that
	         * a non-empty line converted to nothing.  Returning 0 for it
	         * would look like end of input to the scanner and truncate the
	         * message, so fetch the next line instead. */
	        if (cnt < 0) {
	            count = (int) buff.t.leng;
	            continue;
	        }

	        count = buff.t.leng;

	        /* Note: some malformed messages can cause xfgetsl() to report
	        ** "Invalid buffer size, exiting." and then abort. This
	        ** can happen when the parser is in html mode and there's a
	        ** leading '<' but no closing '>'.
	        **
	        ** The "fix" is to check for a nearly full lexer buffer and
	        ** discard most of it.
	        */

	        /* if not nearly full */
	        if (used < 1000 || used < size * 10)
	            break;

	        if (count >= MAX_TOKEN_LEN * 2 &&
	            long_token(buff.t.u.text, (uint) count)) {
	            /* Make sure not to shift bytes outside the buffer */
	            if (buff.t.leng >= (uint) count) {
	                uint start = buff.t.leng - count;
	                uint length = count - max_token_len;
	                buff_shift(&buff, start, length);
	            }
	            count = buff.t.leng;
	        }
	        else
	            break;
	    }

	    if (msg_state &&
	        msg_state->mime_dont_decode &&
	        (msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN)) {
	        assert(size <= INT_MAX && count <= (int)size);
	        return (count == EOF ? 0 : count); /* not decode at all */
	    }

	#if defined(CP866) && !defined(ENABLE_ICONV)
	    /* EK - decoding things like &#1084 and charset_table */
	    count = decode_and_htmlUNICODE_to_cp866(buf, count);
	#endif

	    if (replace_nonascii_characters) {
	        /* do non-ascii replacement */
	        int i;
	        for (i = 0; i < count; i++ )
	        {
	            byte ch = buf[i];
	            buf[i] = charset_table[ch];
	        }
	    }

	    if (DEBUG_LEXER(2))
	        fprintf(dbgout, "*** yyinput(\"%-.*s\", %lu, %lu) = %d\n", count, buf, (unsigned long)used, (unsigned long)size, count);

	    assert(size <= INT_MAX && count <= (int)size);
	    return (count == EOF ? 0 : count);
	}

	/**
	 * Copy \a len bytes starting at \a txt into a NUL-terminated string.
	 *
	 * The result lives in a single buffer owned by this function; it is
	 * reused (and possibly moved) by the next call, so the caller must not
	 * free it or keep it across calls.
	 *
	 * \param txt  start of the charset name (need not be NUL-terminated)
	 * \param len  number of bytes to copy
	 * \return pointer to the function-owned, NUL-terminated copy
	 */
	static char *charset_as_string(const byte *txt, const size_t len)
	{
	    static char *charset_text = NULL;
	    /* Capacity excluding the terminator.  Must be as wide as len: a
	     * narrower counter would wrap for long names and leave the buffer
	     * smaller than the memcpy below. */
	    static size_t charset_leng = 0;

	    if (charset_text == NULL) {
	        charset_text = (char *)xmalloc(len + 1);
	        charset_leng = len;
	    }
	    else if (charset_leng < len) {
	        charset_text = (char *)xrealloc(charset_text, len + 1);
	        charset_leng = len;
	    }

	    memcpy(charset_text, txt, len);
	    /* terminate unconditionally; the result is used as a C string */
	    charset_text[len] = '\0';

	    return charset_text;
	}

	word_t text_decode(word_t w)
	{
	word_t *r = w;
	byte const beg = w->u.text; / base pointer, fixed */
	byte const fin = beg + w->leng; / end+1 position */

	byte txt = (byte ) memstr(w->u.text, w->leng, "=?"); /* input position */
	uint size = (uint) (txt - beg); /* output offset */

	#ifndef DISABLE_UNICODE
	size_t max = w->leng * 4;
	static buff_t * buf = NULL;
	#endif

	if (txt == NULL)
	return r;

	#ifndef DISABLE_UNICODE
	if (encoding == E_UNICODE) {
	if (buf == NULL)
	buf = buff_new((byte *)xmalloc(max+D), 0, max);
	r = &buf->t; /* Use buf to return unicode result */

	buf->t.leng = 0;
	if (buf->size < max) {
	buf->size = max;
	buf->t.u.text = (byte *) xrealloc(buf->t.u.text, buf->size+D);
	}

	buf->t.leng = size;
	memcpy(buf->t.u.text, beg, size );
	Z(buf->t.u.text[buf->t.leng]); /* for easier debugging - removable */
	}
	#endif

	if (DEBUG_LEXER(2)) {
	fputs("1 ", dbgout);
	word_puts(w, 0, dbgout);
	fputs("\n", dbgout);
	}

	while (txt < fin) {
	byte typ, tmp, *end;
	uint len;
	bool adjacent;

	char *charset;

	txt += 2;
	typ = (byte ) memchr((char )txt+1, '?', fin-txt); /* Encoding type - 'B' or 'Q' */
	typ++ = '\0'; / nul terminate */

	charset = charset_as_string(txt, typ - txt - 1);

	tmp = typ + 2; /* start of encoded word */
	end = (byte ) memstr((char )tmp, fin-tmp, "?="); /* last byte of encoded word */
	len = end - tmp;

	w->u.text = tmp; /* Start of encoded word */
	w->leng = len; /* Length of encoded word */
	Z(w->u.text[w->leng]); /* for easier debugging - removable */

	if (DEBUG_LEXER(2)) {
	fputs("2 ", dbgout);
	word_puts(w, 0, dbgout);
	fputs("\n", dbgout);
	}

	switch (tolower(typ)) { / ... encoding type */
	case 'b':
	if (base64_validate(w))
	len = base64_decode(w); /* decode base64 */
	break;
	case 'q':
	if (qp_validate(w, RFC2047))
	len = qp_decode(w, RFC2047); /* decode quoted-printable */
	break;
	}

	/* move decoded word to where the encoded used to be */
	if (encoding == E_RAW) {
	memmove(beg+size, w->u.text, len);
	size += len; /* bump output pointer */
	Z(beg[size]); /* for easier debugging - removable */

	if (DEBUG_LEXER(3))
	fprintf(dbgout, "3 %s\n", beg);
	}

	#ifndef DISABLE_UNICODE
	if (encoding == E_UNICODE) {
	iconv_t cd;
	buff_t src;

	/* convert 'word_t *w' to 'buff_t src' because
	** iconvert_cd() needs buff_t pointers
	*/
	src.t.u.text = w->u.text;
	src.t.leng = len;
	src.read = 0;
	src.size = len;

	cd = bf_iconv_open( charset_unicode, charset );
	iconvert_cd(cd, &src, buf);
	iconv_close(cd);

	if (DEBUG_LEXER(3)) {
	fputs("4 ", dbgout);
	word_puts(&buf->t, 0, dbgout);
	fputs("\n", dbgout);
	}
	}
	#endif

	txt = end + 2; /* skip ?= trailer */
	if (txt >= fin)
	break;

	/* check for next encoded word */
	end = (byte ) memstr((char )txt, fin-txt, "=?");
	adjacent = end != NULL;

	/* clear adjacent flag if non-whitespace character found between
	* adjacent encoded words */
	if (adjacent) {
	tmp = txt;
	while (adjacent && tmp < end) {
	if (tmp && strchr(" \t\r\n", tmp))
	tmp += 1;
	else
	adjacent = false;
	}
	}

	/* we have a next encoded word and we've had only whitespace
	* between the current and the next */
	if (adjacent)
	/* just skip whitespace */
	txt = end;
	else
	/* copy everything that was between the encoded words */
	while (txt < end) {
	if (encoding == E_RAW)
	beg[size++] = *txt++;
	#ifndef DISABLE_UNICODE
	if (encoding == E_UNICODE)
	buf->t.u.text[buf->t.leng++] = *txt++;
	#endif
	}
	}

	if (encoding == E_RAW) {
	r->u.text = beg;
	r->leng = size;
	}

	return r;
	}

	/*
	* The following sets edit modes for GNU EMACS
	* Local Variables:
	* mode:c
	* End:
	*/

source-git / bogofilter

Source Code

Files