/*****************************************************************************

NAME:
   token.c -- post-lexer token processing

   12/08/02 - split out from lexer.l

AUTHOR:
   David Relson <relson@osagesoftware.com>

******************************************************************************/

#include "common.h"

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>

#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#include "mime.h"
#include "msgcounts.h"
#include "word.h"
#include "token.h"
#include "xmemrchr.h"

/* Extra room for 2 10-digit numbers.  The expansion is parenthesized so
 * the macro evaluates as a single value in any surrounding expression. */
#define	MSG_COUNT_PADDING (2 * 10)

/* Message attributes captured while tokenizing.
 * These three words have external linkage; they are allocated in
 * token_init() and emptied by token_clear(). */

word_t	*msg_addr;	/* First IP Address in Received: statement */
word_t	*msg_id;	/* Message ID */
/* capacity given to msg_id in token_init(); caps the copy in set_msg_id() */
static size_t max_msg_id_len;
word_t	*queue_id;	/* Message's first queue ID */

/* Set to IPADDR by parse_new_token() when block_on_subnets is on, so the
 * following call returns the address saved in ipsave with its last octet
 * removed. */
static token_t save_class = NONE;
static word_t *ipsave;

/* Scratch word owned by this file: yylval.u.text points at yylval_text,
 * which is allocated with yylval_text_size (+D) bytes in token_init(). */
static byte  *yylval_text;
static size_t yylval_text_size;
static word_t yylval;

/* Prefix words, created in token_init() and selected by set_tag() */
static word_t *w_to   = NULL;	/* To:          */
static word_t *w_from = NULL;	/* From:        */
static word_t *w_rtrn = NULL;	/* Return-Path: */
static word_t *w_subj = NULL;	/* Subject:     */
static word_t *w_recv = NULL;	/* Received:    */
static word_t *w_head = NULL;	/* Header:      */
static word_t *w_mime = NULL;	/* Mime:        */
static word_t *w_ip   = NULL;	/* ip:          */
static word_t *w_url  = NULL;	/* url:         */

/* Global Variables */

/* when true, parse_new_token() also returns shortened forms of IP addresses */
bool block_on_subnets = false;

/* Prefix currently applied by get_token(); NULL means "no prefix".
 * Changed by set_tag() and reset by clr_tag(). */
static word_t *token_prefix = NULL;
/* length of the current prefix; written (and asserted on) in set_tag() */
static uint32_t token_prefix_len;

/* Text of the token copied into yylval for a non-empty end-of-header line */
#define NONBLANK "spc:invalid_end_of_header"
static word_t *nonblank_line = NULL;

/* Multi-word token state.
 * tok_count  - words stored so far; indexes the ring modulo
 *              multi_token_count and is reset to 0 by clr_tag() and by
 *              set_tag() when the prefix changes.
 * init_token - reset to 1 by add_token_to_array() and incremented by each
 *              build_token_from_array() call.
 * The four pointers below are allocated by init_token_array() and released
 * by free_token_array(). */
static uint tok_count         = 0;
static uint init_token        = 1;
static word_t *p_multi_words  = NULL;
static byte   *p_multi_buff   = NULL;
static byte   *p_multi_text   = NULL;
static word_t **w_token_array = NULL;

/* Function Prototypes */

/* Forward declarations of file-private helpers that are used before they
 * are defined.  parse_new_token() and token_clear() are defined further
 * down without the `static` keyword; because these declarations are seen
 * first, those definitions still have internal linkage. */
static void    token_clear(void);
static token_t parse_new_token(word_t *token);
static void    add_token_to_array(word_t *token);
static void    build_token_from_array(word_t *token);
static uint    token_copy_leng(const char *str, uint leng, byte *dest);

/* Function Definitions */

static void init_token_array(void)
{
    uint i;
    byte *text;
    word_t *words;
		    
    p_multi_words = (word_t *)calloc( max_token_len, sizeof(word_t) );
    p_multi_buff  = (byte *)malloc( max_multi_token_len+D );
    p_multi_text  = (byte *)calloc( max_token_len+1+D, multi_token_count );
    w_token_array = (word_t **)calloc( multi_token_count, sizeof(*w_token_array) );

    text = p_multi_text;
    words = p_multi_words;

    for (i = 0; i < multi_token_count; i += 1) {
	words->leng = 0;
	words->u.text = text;
	w_token_array[i] = words;
	words += 1;
	text += max_token_len+1+D;
    }
}

/*
 * Release the buffers allocated by init_token_array().
 *
 * Each pointer is reset to NULL afterwards so that a repeated call is a
 * harmless no-op (free(NULL)) instead of a double free.
 */
static void free_token_array(void)
{
    free(p_multi_words);
    p_multi_words = NULL;

    free(p_multi_text );
    p_multi_text = NULL;

    free(p_multi_buff );
    p_multi_buff = NULL;

    free(w_token_array);
    w_token_array = NULL;
}

/*
 * Store leng bytes of text in token's existing buffer and record the length.
 * A NUL is written at index leng, so the buffer must hold leng+1 bytes.
 */
static void token_set( word_t *token, byte *text, uint leng )
{
    byte *dst;

    token->leng = leng;

    dst = token->u.text;
    memcpy(dst, text, leng);	/* the payload only */
    dst[leng] = '\0';		/* terminator is added here, not copied */
}

/* Copy the text and length of src into dst's existing buffer. */
static inline void token_copy( word_t *dst, word_t *src )
{
    byte *from = src->u.text;
    uint  n    = src->leng;

    token_set(dst, from, n);
}

/*
 * Write prefix followed by token's text into temp's buffer, then make
 * token refer to that buffer.
 *
 * temp_size is the capacity used for the length check.  When prefix plus
 * token would reach temp_size, the combined length is cut down to
 * temp_size - prefix->leng - 1.
 */
static void build_prefixed_token( word_t *prefix, word_t *token,
				  word_t *temp, uint32_t temp_size )
{
    byte *out   = temp->u.text;
    uint  total = token->leng + prefix->leng;

    if (total >= temp_size)
	total = temp_size - prefix->leng - 1;

    temp->leng = total;

    /* Shift the token text into place first, using memmove: the token may
     * already live in temp's buffer, in which case the regions overlap and
     * the text must be moved before the prefix overwrites the front. */
    memmove(out + prefix->leng, token->u.text, total - prefix->leng);
    memcpy(out, prefix->u.text, prefix->leng);
    Z(out[temp->leng]);

    token->u.text = out;
    token->leng = temp->leng;
}

/* Map a running token number onto a slot of w_token_array
 * (multi_token_count slots).  multi_token_count must be non-zero. */
#define WRAP(n)	((n) % multi_token_count)

/*
 * Return the next token in *token together with its class.
 *
 * A fresh token is read with parse_new_token() (and remembered in the
 * ring when multi_token_count > 1) until enough words are stored; after
 * that, multi-word tokens are assembled from the ring and reported with
 * class TOKEN.  When a prefix is active it is prepended; if that makes a
 * freshly parsed token longer than max_token_len, the token is dropped
 * and the next one is fetched instead.
 */
token_t get_token(word_t *token)
{
    for (;;) {
	token_t cls;
	word_t *prefix;
	bool    combine;

	/* combining needs at least two stored words and an unused
	 * combination that still fits the configured word count */
	combine = (tok_count >= 2 &&
		   tok_count > init_token &&
		   multi_token_count > init_token);

	if (combine) {
	    cls = TOKEN;
	    build_token_from_array(token);
	}
	else {
	    cls = parse_new_token(token);

	    if (multi_token_count > 1)
		add_token_to_array(token);
	}

	if (token_prefix == NULL)
	    return cls;

	/* IP addresses get special prefix */
	if (save_class == IPADDR)
	    prefix = (wordlist_version >= IP_PREFIX) ? w_ip : w_url;
	else
	    prefix = token_prefix;

	build_prefixed_token(prefix, token, &yylval, yylval_text_size);

	/* if excessive length caused by prefix, get another token */
	if (combine || token->leng <= max_token_len)
	    return cls;
    }
}

/*
 * Fetch the next usable token from the current lexer.
 *
 * token - receives the text pointer and length of the token.
 *
 * Returns the token class reported by the lexer, or NONE at end of
 * message.  Tokens that are too short, too long, or belong to a MIME part
 * that is not being read as text are skipped.  MESSAGE_ID and QUEUE_ID
 * tokens are recorded in msg_id / queue_id and never returned.
 *
 * When block_on_subnets is set, an IP address is returned once in full and
 * then, on following calls, again with one more trailing octet removed
 * each time until no '.' is left.
 */
token_t parse_new_token(word_t *token)
{
    token_t cls = NONE;
    unsigned char *cp;
    bool done = false;

    /* If saved IPADDR, truncate last octet */
    if ( block_on_subnets && save_class == IPADDR )
    {
	byte *t = (byte *)xmemrchr(ipsave->u.text, '.', ipsave->leng);
	if (t == NULL)
	    save_class = NONE;
	else
	{
	    ipsave->leng = (uint) (t - ipsave->u.text);
	    token_set( token, ipsave->u.text, ipsave->leng);
	    cls = save_class;
	    done = true;
	}
    }

    while (!done) {
	uint leng;
	byte *text;

	cls = (*lexer->yylex)();

	token->leng = lexer->get_parser_token(&token->u.text);
	Z(token->u.text[token->leng]);	/* for easier debugging - removable */

	leng = token->leng;
	text = token->u.text;

	if (DEBUG_TEXT(2)) {
	    word_puts(token, 0, dbgout);
	    fputc('\n', dbgout);
	}
 
	if (cls == NONE) /* End of message */
	    break;

	switch (cls) {

	case EOH:	/* end of header - bogus if not empty */
	    if (leng > max_token_len)
		continue;

	    if (msg_state->mime_type == MIME_MESSAGE)
		mime_add_child(msg_state);
	    if (leng == 1)
		continue;
	    else {	/* "spc:invalid_end_of_header" */
		/* NOTE(review): only yylval receives the marker text;
		 * token still refers to the lexer's text here -- confirm
		 * that this is intended. */
		token_copy( &yylval, nonblank_line);
		done = true;
	    }
	    break;

	case BOUNDARY:	/* don't return boundary tokens to the user */
	    continue;

	case VERP:	/* Variable Envelope Return Path */
	{
	    byte *st = (byte *)text;
	    byte *in;
	    byte *fst = NULL;
	    byte *lst = NULL;

	    /* locate the first and the last '-' */
	    for (in = st; *in != '\0'; in += 1) {
		if (*in == '-') {
		    if (fst == NULL)
			fst = in;
		    lst = in;
		}
	    }

	    /* replace what lies between them by "-#" */
	    if (fst != NULL && lst != NULL && lst - fst  > 3) {
		byte *ot = fst;
		*ot++ = '-';
		*ot++ = '#';
		for (in = lst; *in != '\0'; in += 1, ot += 1)
		    *ot = *in;
		token->leng = leng = (uint) (ot - st);
	    }
	    Z(token->u.text[token->leng]);	/* for easier debugging - removable */
	}
	break;

	case HEADKEY:
	{
	    if (!header_line_markup || *text == '\0')
		continue;
	    else {
		const char *delim = strchr((const char *)text, ':');
		/* without a ':' there is no header name to extract, and
		 * subtracting from a NULL pointer would be undefined */
		if (delim == NULL)
		    continue;
		leng = (uint) (delim - (const char *)text);
		if (leng > max_token_len)
		    continue;
		token_set( &yylval, text, leng);
	    }
	}

	/*@fallthrough@*/

	case TOKEN:	/* ignore anything when not reading text MIME types */
	    if (leng < min_token_len)
		continue;

	/*@fallthrough@*/

	case MONEY:	/* 2 character money is OK */
	    if (leng > max_token_len)
		continue;

	    token->u.text = text;
	    token->leng = leng;

	    if (token_prefix == NULL) {
		switch (msg_state->mime_type) {
		case MIME_TEXT:
		case MIME_TEXT_HTML:
		case MIME_TEXT_PLAIN:
		case MIME_MULTIPART:
		    break;
		case MIME_MESSAGE:
		case MIME_APPLICATION:
		case MIME_IMAGE:
		    continue;
		default:
		    continue;
		}
	    }
	    break;

	case MESSAGE_ID:
	    /* special token;  saved for formatted output, but not returned to bogofilter */
	    /** \bug: the parser MUST be aligned with lexer_v3.l! */
	    if (leng < max_token_len)
	    {
		/* Skip the leading word, then the blanks after it.  Both
		 * scans stop at the end of the token so that text without
		 * any white space cannot run past the buffer or wrap
		 * leng around. */
		while (leng > 0 && !isspace(text[0])) {
		    text += 1;
		    leng -= 1;
		}
		while (leng > 0 && isspace(text[0])) {
		    text += 1;
		    leng -= 1;
		}
		token_set( msg_id, text, leng);
	    }
	    continue;

	case QUEUE_ID:
	    /* special token;  saved for formatted output, but not returned to bogofilter */
	    /** \bug: the parser MUST be aligned with lexer_v3.l! */
	    if (*queue_id->u.text == '\0' &&
		leng < max_token_len )
	    {
		/* every step below checks the remaining length first so
		 * that a short or empty token is never read out of bounds */
		while (leng > 0 && isspace(text[0])) {
		    text += 1;
		    leng -= 1;
		}
		if (leng >= 2 && memcmp(text, "id", 2) == 0) {
		    text += 2;
		    leng -= 2;
		}
		while (leng > 0 && isspace(text[0])) {
		    text += 1;
		    leng -= 1;
		}
		if (leng > 0 && text[0] == '<') {
		    text += 1;
		    leng -= 1;
		}
		if (leng > 0 && text[leng-1] == '>') {
		    leng -= 1;
		}
		/* queue_id->leng is used as the capacity of the buffer */
		leng = min(queue_id->leng, leng);
		memcpy( queue_id->u.text, text, leng );
		Z(queue_id->u.text[leng]);
	    }
	    continue;

	case MESSAGE_ADDR:
	{
	    /* a token too short to hold both brackets would make
	     * "leng -= 2" wrap around */
	    if (leng < 2)
		continue;

	    /* trim brackets */
	    text += 1;
	    leng -= 2;
	    Z(text[leng]);	/* for easier debugging - removable */
	    token_set( &yylval, text, leng);
	    /* if top level, no address, not localhost, .... */
	    if (token_prefix == w_recv &&
		msg_state->parent == NULL && 
		*msg_addr->u.text == '\0' &&
		strcmp((char *)text, "127.0.0.1") != 0)
	    {
		/* Not guaranteed to be the originating address of the message. */
		/* msg_addr->leng is used as the capacity of the buffer;
		 * the terminator goes at the clamped length so that a
		 * long address cannot write past the end of msg_addr */
		uint n = min(msg_addr->leng, yylval.leng);
		memcpy( msg_addr->u.text, yylval.u.text, n+D );
		Z(msg_addr->u.text[n]);
	    }
	}

	/*@fallthrough@*/

	case IPADDR:
	    if (block_on_subnets)
	    {
		int q1, q2, q3, q4;
		/*
		 * Trick collected by ESR in real time during John
		 * Graham-Cummings's talk at Paul Graham's spam conference
		 * in January 2003...  Some spammers know that people are
		 * doing recognition on spamhaus IP addresses.  They use
		 * the fact that HTML clients normally interpret IP addresses
		 * by doing a simple accumulate-and-shift algorithm; they
		 * add large random multiples of 256 to the quads to
		 * mask their origin.  Nuke the high bits to unmask the
		 * address.
		 */

		if (sscanf((const char *)text, "%d.%d.%d.%d", &q1, &q2, &q3, &q4) == 4)
		    /* safe because result string guaranteed to be shorter */
		    sprintf((char *)text, "%d.%d.%d.%d",
			    q1 & 0xff, q2 & 0xff, q3 & 0xff, q4 & 0xff);
		leng = strlen((const char *)text);

		/* ipsave was created with room for max_token_len bytes;
		 * eat longer text like any other over-long word instead
		 * of copying it */
		if (leng > max_token_len)
		    continue;

		token->u.text = text;
		token->leng = leng;

		token_copy( ipsave, token );

		save_class = IPADDR;

		return (cls);
	    }

	    token->u.text = text;
	    token->leng = leng;

	    break;

	case NONE:		/* nothing to do */
	    break;

	case MSG_COUNT_LINE:
	    msg_count_file = true;
	    multi_token_count = 1;
	    header_line_markup = false;
	    token_prefix = NULL;
	    lexer = &msg_count_lexer;
	    if (mbox_mode) {
		/* Allows processing multiple messages, **
		** but only a single file.              */
		reader_more = msgcount_more;
	    }
	    continue;

	case BOGO_LEX_LINE:
	    token_set( &yylval, text, leng);
	    done = true;
	    break;
	}

	if (DEBUG_TEXT(1)) {
	    word_puts(&yylval, 0, dbgout);
	    fputc('\n', dbgout);
	}

	/* eat all long words */
	if (token->leng <= max_token_len)
	    done = true;
    }

   if (!msg_count_file) {
	/* Remove trailing blanks */
	/* From "From ", for example */
	while (token->leng > 1 && token->u.text[token->leng-1] == ' ') {
	    token->leng -= 1;
	    token->u.text[token->leng] = (byte) '\0';
	}

	/* Remove trailing colon */
	if (token->leng > 1 && token->u.text[token->leng-1] == ':') {
	    token->leng -= 1;
	    token->u.text[token->leng] = (byte) '\0';
	}

	if (replace_nonascii_characters) {
	    /* replace nonascii characters by '?'s */
	    for (cp = token->u.text; cp < token->u.text+token->leng; cp += 1)
		*cp = casefold_table[*cp];
	}
    }

    return(cls);
}

/* save token in token array */

/*
 * Store a copy of token in the next ring slot and restart the multi-word
 * sequence (init_token = 1).
 *
 * The slot text is always NUL-terminated because build_token_from_array()
 * measures the stored words with strlen().
 *
 * NOTE(review): the copy is not length-checked; token->leng is assumed
 * not to exceed max_token_len, the room each slot text was given by
 * init_token_array() -- confirm against the callers.
 */
static void add_token_to_array(word_t *token)
{
    word_t *w = w_token_array[WRAP(tok_count)];

    w->leng = token->leng;
    memcpy(w->u.text, token->u.text, w->leng);
    w->u.text[w->leng] = '\0';	/* required: read back with strlen() */

    if (DEBUG_MULTI(1))
	fprintf(stderr, "%s:%d  %2s  %2u %2u %p %s\n", __FILE__, __LINE__,
		"", (unsigned) tok_count, (unsigned) w->leng,
		(void *) w->u.text, (const char *) w->u.text);

    tok_count += 1;
    init_token = 1;

    return;
}

/*
 * Join the most recent init_token+1 stored words, oldest first, with '*'
 * between them, into p_multi_buff and make token refer to the result.
 * The result is cut off at max_multi_token_len bytes.  Each call advances
 * init_token so the next call joins one more word.
 */
static void build_token_from_array(word_t *token)
{
    const char *glue = "";	/* no separator before the first word */
    byte *out  = p_multi_buff;
    uint  room;
    int   back;

    /* total size: one separator per extra word plus the text of each word */
    room = init_token;
    back = init_token;
    while (back >= 0) {
	uint slot = tok_count - 1 - back;
	room += strlen((char *) w_token_array[WRAP(slot)]->u.text);
	back -= 1;
    }

    if (room > max_multi_token_len)
	room = max_multi_token_len;

    token->leng = room;
    token->u.text = out;

    /* copy the pieces; room shrinks as the output fills up */
    back = init_token;
    while (back >= 0) {
	uint    slot = tok_count - 1 - back;
	word_t *w    = w_token_array[WRAP(slot)];
	uint    len  = w->leng;
	byte   *str  = w->u.text;
	uint    used;

	if (DEBUG_MULTI(1))
	    fprintf(stderr, "%s:%d  %2d  %2d %2d %p %s\n", __FILE__, __LINE__,
		    slot, tok_count, len, str, str);

	used = token_copy_leng((const char *)glue, room, out);
	room -= used;
	out += used;

	used = token_copy_leng((const char *)str, room, out);
	room -= used;
	out += used;

	glue = "*";
	back -= 1;
    }

    Z(token->u.text[token->leng]);	/* for easier debugging - removable */
    init_token += 1;			/* progress to next multi-token */
}

/*
 * Copy str (without its terminator) to dest, but no more than leng bytes.
 * Returns the number of bytes copied.
 */
static uint token_copy_leng(const char *str, uint leng, byte *dest)
{
    uint n = strlen(str);

    n = (leng < n) ? leng : n;

    if (n == 0)
	return (uint) n;

    memcpy(dest, str, n);
    return (uint) n;
}

/*
 * Prepare the tokenizer for a new message.
 *
 * The first call allocates the scratch word, the message words (msg_addr,
 * msg_id, queue_id), the prefix words and the multi-word token arrays.
 * Every call, including the first, re-initializes the lexer and empties
 * the message words.
 */
void token_init(void)
{
    static bool fTokenInit = false;

    yyinit();

    if (!fTokenInit) {
	fTokenInit = true;

	if (max_multi_token_len == 0)
	    max_multi_token_len = (max_token_len+1) * multi_token_count + MAX_PREFIX_LEN;

	yylval_text_size = max_multi_token_len + MSG_COUNT_PADDING;

	yylval_text = (byte *) malloc( yylval_text_size+D );
	if (yylval_text == NULL) {
	    /* every token operation writes through this buffer */
	    fprintf(stderr, "%s:%d  out of memory\n", __FILE__, __LINE__);
	    abort();
	}
	yylval.leng   = 0;
	yylval.u.text = yylval_text;

	/* First IP Address in Received: statement */
	msg_addr = word_new( NULL, max_token_len );

	/* Message ID */
	max_msg_id_len  = max_token_len * 3;
	msg_id = word_new( NULL, max_msg_id_len );

	/* Message's first queue ID */
	queue_id = word_new( NULL, max_token_len );

	ipsave = word_new( NULL, max_token_len );

	/* word_new() used to avoid compiler complaints */
	w_to   = word_news("to:");	/* To:          */
	w_from = word_news("from:");	/* From:        */
	w_rtrn = word_news("rtrn:");	/* Return-Path: */
	w_subj = word_news("subj:");	/* Subject:     */
	w_recv = word_news("rcvd:");	/* Received:    */
	w_head = word_news("head:");	/* Header:      */
	w_mime = word_news("mime:");	/* Mime:        */
	w_ip   = word_news("ip:");	/* ip:          */
	w_url  = word_news("url:");	/* url:         */
	nonblank_line = word_news(NONBLANK);

	/* do multi-word token initializations */
	init_token_array();
    }

    /* parse_new_token() tests the first byte of msg_addr and queue_id to
     * see whether they have been filled in, so they must start out empty
     * for the first message as well.
     * NOTE(review): assumes word_new(NULL, n) does not clear the text it
     * allocates -- clearing here is harmless either way. */
    token_clear();

    return;
}

/* Stop prefixing tokens and forget the words collected for multi-word
 * tokens. */
void clr_tag(void)
{
    tok_count = 0;
    token_prefix = NULL;
}

/*
 * Select the token prefix that matches a header name.
 *
 * text - header name; only its first character (and, for names starting
 *        with 'r', the third) is examined, case-insensitively.
 *
 * Does nothing unless header_line_markup is enabled.  Inside a
 * message/rfc822 part the prefix is cleared instead.  An unrecognized
 * first character is reported on stderr and ends the program with
 * EX_ERROR.  When the prefix changes from one header to another, the
 * words collected for multi-word tokens are discarded.
 */
void set_tag(const char *text)
{
    word_t *old_prefix = token_prefix;

    if (!header_line_markup)
	return;

    if (msg_state->parent != NULL &&
	msg_state->parent->mime_type == MIME_MESSAGE) {
	clr_tag();			/* don't tag if inside message/rfc822 */
	return;
    }

    switch (tolower((unsigned char)*text)) {
    case 'c':				/* CC: */
    case 't':
	token_prefix = w_to;		/* To: */
	break;
    case 'f':
	token_prefix = w_from;		/* From: */
	break;
    case 'h':
	if (msg_state->parent == NULL)
	    token_prefix = w_head;	/* Header: */
	else
	    token_prefix = w_mime;	/* Mime:   */
	break;
    case 'r':
	/* The third character tells "Return-Path" from "Received".
	 * text[1] is checked first so that a one-character name is
	 * never read past its terminating NUL. */
	if (text[1] != '\0' && tolower((unsigned char)text[2]) == 't')
	    token_prefix = w_rtrn;	/* Return-Path: */
	else
	    token_prefix = w_recv;	/* Received: */
	break;
    case 's':
	token_prefix = w_subj;		/* Subject: */
	break;
    default:
	fprintf(stderr, "%s:%d  invalid tag - '%s'\n",
		__FILE__, __LINE__,
		text);
	exit(EX_ERROR);
    }

    token_prefix_len = token_prefix->leng;
    assert(token_prefix_len <= MAX_PREFIX_LEN);

    if (DEBUG_LEXER(2)) {
	/* token_prefix was dereferenced just above, so it needs no
	 * NULL check here */
	fprintf(dbgout,"--- set_tag(%s) -> prefix=", text);
	word_puts(token_prefix, 0, dbgout);
	fputc('\n', dbgout);
    }

    /* discard tokens when prefix changes */
    if (old_prefix != NULL && old_prefix != token_prefix)
	tok_count = 0;

    return;
}

/* Record text as the message ID, keeping at most max_msg_id_len bytes. */
void set_msg_id(byte *text, uint leng)
{
    uint keep = leng;

    if (keep > max_msg_id_len)
	keep = (uint) max_msg_id_len;

    token_set( msg_id, text, keep );
}

/* Free a word and reset the pointer that referred to it.  Wrapped in
 * do { } while (0) so the macro is a single statement, e.g. after an
 * unbraced `if`. */
#define WFREE(n)	do { word_free(n); (n) = NULL; } while (0)

/* Cleanup storage allocation */
void token_cleanup()
{
    WFREE(w_to);
    WFREE(w_from);
    WFREE(w_rtrn);
    WFREE(w_subj);
    WFREE(w_recv);
    WFREE(w_head);
    WFREE(w_mime);
    WFREE(w_ip);
    WFREE(w_url);
    WFREE(nonblank_line);

    token_clear();

    /* do multi-word token cleanup */
    free_token_array();
}

/* Empty the saved address, message ID and queue ID.  Does nothing when
 * msg_addr has not been allocated. */
void token_clear()
{
    if (msg_addr == NULL)
	return;

    msg_addr->u.text[0] = '\0';
    msg_id->u.text[0]   = '\0';
    queue_id->u.text[0] = '\0';
}