Blob Blame History Raw
/**
* \file lexer.c
* bogofilter's lexical analyzer (control routines)
*
* \date 2003-01-01 split out of lexer.l
*/
#include "common.h"
#include <ctype.h>
#include <stdlib.h>
#include <assert.h>
#include "base64.h"
#include "bogoconfig.h"
#include "bogoreader.h"
#include "charset.h"
#include "error.h"
#ifndef DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "lexer.h"
#include "memstr.h"
#include "mime.h"
#include "msgcounts.h"
#include "qp.h"
#include "textblock.h"
#include "token.h"
#include "word.h"
#include "xmalloc.h"
/* Global Variables */
extern int yylineno;
bool msg_header = true;
bool have_body = false;
lexer_t *lexer = NULL;
/* Local Variables */
static lexer_t v3_lexer = {
yylex,
lexer_v3_get_token
};
lexer_t msg_count_lexer = {
read_msg_count_line,
msg_count_get_token
};
/* Function Prototypes */
static int yy_get_new_line(buff_t *buff);
static int get_decoded_line(buff_t *buff);
static int skip_folded_line(buff_t *buff);
/* Function Definitions */
void lexer_init(void)
{
mime_reset();
token_init();
lexer_v3_init(NULL);
init_charset_table(charset_default);
}
static void lexer_display_buffer(buff_t *buff)
{
fprintf(dbgout, "*** %2d %c%c %2ld ",
yylineno-1, msg_header ? 'h' : 'b', yy_get_state(),
(long)(buff->t.leng - buff->read));
buff_puts(buff, 0, dbgout);
if (buff->t.leng > 0 && buff->t.u.text[buff->t.leng-1] != '\n')
fputc('\n', dbgout);
}
/**
* Check for lines wholly composed of printable characters as they can
* cause a scanner abort "input buffer overflow, can't enlarge buffer
* because scanner uses REJECT"
*
* \bug this function must go, we need to fix the lexer
*/
static bool long_token(byte *buf, uint count)
{
uint i;
for (i=0; i < count; i += 1) {
byte c = buf[i];
/* 10/23/05 - fix SIGSEGV with msg.1023.6479.txt
** evidently caused by 09/07/05 patch for 0.96.2
*/
if (c == '\0')
break;
if ((iscntrl(c) || isspace(c) || ispunct(c)) && (c != '_'))
return false;
}
return true;
}
static int yy_get_new_line(buff_t *buff)
{
int count = (*reader_getline)(buff);
const byte *buf = buff->t.u.text;
static size_t hdrlen = 0;
if (hdrlen==0)
hdrlen=strlen(spam_header_name);
if (count > 0)
yylineno += 1;
if (count == EOF) {
if (fpin == NULL || !ferror(fpin)) {
return YY_NULL;
}
else {
print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
exit(EX_ERROR);
}
}
/* Mime header check needs to be performed on raw input
** -- before mime decoding. Without it, flex aborts:
** "fatal flex scanner internal error--end of buffer missed" */
if (buff->t.leng > 2 &&
buf[0] == '-' && buf[1] == '-' &&
got_mime_boundary(&buff->t)) {
yy_set_state_initial();
}
if (count >= 0 && DEBUG_LEXER(0))
lexer_display_buffer(buff);
/* skip spam_header ("X-Bogosity:") lines */
while (msg_header
&& count != EOF
/* don't skip if inside message/rfc822 */
&& msg_state->parent == NULL
&& buff->t.leng >= hdrlen
&& memcmp(buff->t.u.text,spam_header_name,hdrlen) == 0) {
count = skip_folded_line(buff);
}
return count;
}
static int get_decoded_line(buff_t *buff)
{
int count;
buff_t *linebuff;
/* since msg_state might change during calls */
bool mime_dont_decode = msg_state->mime_dont_decode;
#ifdef DISABLE_UNICODE
linebuff = buff;
#else
if (encoding == E_RAW ||
mime_dont_decode ) {
linebuff = buff;
}
else {
static buff_t *tempbuff = NULL;
if (tempbuff == NULL)
tempbuff = (buff_t *) calloc(sizeof(buff_t), 1);
/* UTF-8 uses up to six octets per character. Make input buffer
* sufficiently small that the UTF-8 text can fit in the output
* buffer */
if (tempbuff->size < buff->size / 6) {
xfree(tempbuff->t.u.text);
tempbuff->size = buff->size / 6;
tempbuff->t.u.text = (byte *) xmalloc(tempbuff->size+D);
}
tempbuff->t.leng = tempbuff->read = 0;
linebuff = tempbuff;
}
#endif
/* note that this call might invoke got_mimeboundary() thus
* changing the global msg_state variable */
count = yy_get_new_line(linebuff);
if (count == EOF) {
if ( !ferror(fpin))
return YY_NULL;
else {
print_error(__FILE__, __LINE__, "input in flex scanner failed\n");
exit(EX_ERROR);
}
}
/* Save the text on a linked list of lines.
* Note that we store fixed-length blocks here, not lines.
* One very long physical line could break up into more
* than one of these. */
if (passthrough && count > 0)
textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count);
if ( !msg_header &&
!mime_dont_decode &&
msg_state->mime_type != MIME_TYPE_UNKNOWN)
{
word_t temp;
uint decoded_count;
temp.leng = (uint) count;
temp.u.text = linebuff->t.u.text+linebuff->read;
decoded_count = mime_decode(&temp);
/*change buffer size only if the decoding worked */
if (decoded_count != 0 && decoded_count < (uint) count) {
linebuff->t.leng -= (uint) (count - decoded_count);
count = (int) decoded_count;
if (DEBUG_LEXER(1))
lexer_display_buffer(linebuff);
}
}
#ifndef DISABLE_UNICODE
if (encoding == E_UNICODE &&
!mime_dont_decode &&
count > 0)
{
iconvert(linebuff, buff);
/* If we return count = 0 here, the caller will think we have
* no more bytes left to read, even though before the iconvert
* call we had a positive number of bytes. This *will* lead to
* a message truncation which we try to avoid by simply
* returning another in-band error code. */
if (buff->t.leng == 0) {
count = -2;
} else {
/* iconvert, treating multi-byte sequences, can shrink or enlarge
* the output compared to its input. Correct count. */
count = buff->t.leng;
}
}
#endif
#ifdef EXCESSIVE_DEBUG
/* debug */
fprintf(dbgout, "%d: ", count);
buff_puts(buff, 0, dbgout);
fprintf(dbgout, "\n");
#endif
/* CRLF -> NL */
if (count >= 2) {
byte *buf = buff->t.u.text;
if (memcmp(buf + count - 2, CRLF, 2) == 0) {
count --;
--buff->t.leng;
*(buf + count - 1) = (byte) '\n';
}
}
if (buff->t.leng < buff->size) /* for easier debugging - removable */
Z(buff->t.u.text[buff->t.leng]); /* for easier debugging - removable */
return count;
}
static int skip_folded_line(buff_t *buff)
{
for (;;) {
int count;
buff->t.leng = 0;
count = reader_getline(buff);
yylineno += 1;
/* only check for LWSP-char (RFC-822) aka. WSP (RFC-2822),
* these only include SP and HTAB */
if (buff->t.u.text[0] != ' ' &&
buff->t.u.text[0] != '\t')
return count;
/* Check for empty line which terminates message header */
if (is_eol((char *)buff->t.u.text, count))
return count;
}
}
void yyinit(void)
{
yylineno = 0;
if ( !msg_count_file)
lexer = &v3_lexer;
}
int yyinput(byte *buf, size_t used, size_t size)
/* input getter for the scanner */
{
int cnt;
int count = 0;
buff_t buff;
buff_init(&buff, buf, 0, (uint) size);
/* After reading a line of text, check if it has special characters.
* If not, trim some, but leave enough to match a max length token.
* Then read more text. This will ensure that a really long sequence
* of alphanumerics, which bogofilter will ignore anyway, doesn't crash
* the flex lexer.
*/
while ((cnt = get_decoded_line(&buff)) != 0) {
if (cnt > 0)
count = buff.t.leng;
/* Note: some malformed messages can cause xfgetsl() to report
** "Invalid buffer size, exiting." and then abort. This
** can happen when the parser is in html mode and there's a
** leading '<' but no closing '>'.
**
** The "fix" is to check for a nearly full lexer buffer and
** discard most of it.
*/
/* if not nearly full */
if (used < 1000 || used < size * 10)
break;
if (count >= MAX_TOKEN_LEN * 2 &&
long_token(buff.t.u.text, (uint) count)) {
/* Make sure not to shift bytes outside the buffer */
if (buff.t.leng >= (uint) count) {
uint start = buff.t.leng - count;
uint length = count - max_token_len;
buff_shift(&buff, start, length);
}
count = buff.t.leng;
}
else
break;
}
if (msg_state &&
msg_state->mime_dont_decode &&
(msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN)) {
assert(size <= INT_MAX && count <= (int)size);
return (count == EOF ? 0 : count); /* not decode at all */
}
#if defined(CP866) && !defined(ENABLE_ICONV)
/* EK - decoding things like &#1084 and charset_table */
count = decode_and_htmlUNICODE_to_cp866(buf, count);
#endif
if (replace_nonascii_characters) {
/* do non-ascii replacement */
int i;
for (i = 0; i < count; i++ )
{
byte ch = buf[i];
buf[i] = charset_table[ch];
}
}
if (DEBUG_LEXER(2))
fprintf(dbgout, "*** yyinput(\"%-.*s\", %lu, %lu) = %d\n", count, buf, (unsigned long)used, (unsigned long)size, count);
assert(size <= INT_MAX && count <= (int)size);
return (count == EOF ? 0 : count);
}
static char *charset_as_string(const byte *txt, const size_t len)
{
static char *charset_text = NULL;
static unsigned short charset_leng = 0;
if (charset_text == NULL)
charset_text = (char *)xmalloc(len+D);
else {
if (charset_leng < len) {
charset_leng = len;
charset_text = (char *)xrealloc(charset_text, charset_leng+D);
}
}
memcpy(charset_text, txt, len);
Z(charset_text[len]); /* for easier debugging - removable */
return charset_text;
}
word_t *text_decode(word_t *w)
{
word_t *r = w;
byte *const beg = w->u.text; /* base pointer, fixed */
byte *const fin = beg + w->leng; /* end+1 position */
byte *txt = (byte *) memstr(w->u.text, w->leng, "=?"); /* input position */
uint size = (uint) (txt - beg); /* output offset */
#ifndef DISABLE_UNICODE
size_t max = w->leng * 4;
static buff_t * buf = NULL;
#endif
if (txt == NULL)
return r;
#ifndef DISABLE_UNICODE
if (encoding == E_UNICODE) {
if (buf == NULL)
buf = buff_new((byte *)xmalloc(max+D), 0, max);
r = &buf->t; /* Use buf to return unicode result */
buf->t.leng = 0;
if (buf->size < max) {
buf->size = max;
buf->t.u.text = (byte *) xrealloc(buf->t.u.text, buf->size+D);
}
buf->t.leng = size;
memcpy(buf->t.u.text, beg, size );
Z(buf->t.u.text[buf->t.leng]); /* for easier debugging - removable */
}
#endif
if (DEBUG_LEXER(2)) {
fputs("**1** ", dbgout);
word_puts(w, 0, dbgout);
fputs("\n", dbgout);
}
while (txt < fin) {
byte *typ, *tmp, *end;
uint len;
bool adjacent;
char *charset;
txt += 2;
typ = (byte *) memchr((char *)txt+1, '?', fin-txt); /* Encoding type - 'B' or 'Q' */
*typ++ = '\0'; /* nul terminate */
charset = charset_as_string(txt, typ - txt - 1);
tmp = typ + 2; /* start of encoded word */
end = (byte *) memstr((char *)tmp, fin-tmp, "?="); /* last byte of encoded word */
len = end - tmp;
w->u.text = tmp; /* Start of encoded word */
w->leng = len; /* Length of encoded word */
Z(w->u.text[w->leng]); /* for easier debugging - removable */
if (DEBUG_LEXER(2)) {
fputs("**2** ", dbgout);
word_puts(w, 0, dbgout);
fputs("\n", dbgout);
}
switch (tolower(*typ)) { /* ... encoding type */
case 'b':
if (base64_validate(w))
len = base64_decode(w); /* decode base64 */
break;
case 'q':
if (qp_validate(w, RFC2047))
len = qp_decode(w, RFC2047); /* decode quoted-printable */
break;
}
/* move decoded word to where the encoded used to be */
if (encoding == E_RAW) {
memmove(beg+size, w->u.text, len);
size += len; /* bump output pointer */
Z(beg[size]); /* for easier debugging - removable */
if (DEBUG_LEXER(3))
fprintf(dbgout, "**3** %s\n", beg);
}
#ifndef DISABLE_UNICODE
if (encoding == E_UNICODE) {
iconv_t cd;
buff_t src;
/* convert 'word_t *w' to 'buff_t src' because
** iconvert_cd() needs buff_t pointers
*/
src.t.u.text = w->u.text;
src.t.leng = len;
src.read = 0;
src.size = len;
cd = bf_iconv_open( charset_unicode, charset );
iconvert_cd(cd, &src, buf);
iconv_close(cd);
if (DEBUG_LEXER(3)) {
fputs("**4** ", dbgout);
word_puts(&buf->t, 0, dbgout);
fputs("\n", dbgout);
}
}
#endif
txt = end + 2; /* skip ?= trailer */
if (txt >= fin)
break;
/* check for next encoded word */
end = (byte *) memstr((char *)txt, fin-txt, "=?");
adjacent = end != NULL;
/* clear adjacent flag if non-whitespace character found between
* adjacent encoded words */
if (adjacent) {
tmp = txt;
while (adjacent && tmp < end) {
if (*tmp && strchr(" \t\r\n", *tmp))
tmp += 1;
else
adjacent = false;
}
}
/* we have a next encoded word and we've had only whitespace
* between the current and the next */
if (adjacent)
/* just skip whitespace */
txt = end;
else
/* copy everything that was between the encoded words */
while (txt < end) {
if (encoding == E_RAW)
beg[size++] = *txt++;
#ifndef DISABLE_UNICODE
if (encoding == E_UNICODE)
buf->t.u.text[buf->t.leng++] = *txt++;
#endif
}
}
if (encoding == E_RAW) {
r->u.text = beg;
r->leng = size;
}
return r;
}
/*
* The following sets edit modes for GNU EMACS
* Local Variables:
* mode:c
* End:
*/