| |
| |
| |
| |
| |
| |
| |
| #include "common.h" |
| |
| #include <ctype.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| |
| #include "base64.h" |
| #include "bogoconfig.h" |
| #include "bogoreader.h" |
| #include "charset.h" |
| #include "error.h" |
| #ifndef DISABLE_UNICODE |
| #include "convert_unicode.h" |
| #include "iconvert.h" |
| #endif |
| #include "lexer.h" |
| #include "memstr.h" |
| #include "mime.h" |
| #include "msgcounts.h" |
| #include "qp.h" |
| #include "textblock.h" |
| #include "token.h" |
| #include "word.h" |
| #include "xmalloc.h" |
| |
| |
| |
/* Line counter shared with the scanner: reset to 0 by yyinit() and
 * incremented in this file for each input line that is read. */
extern int yylineno;

/* Used in this file to tag debug output ('h' vs 'b'), to enable skipping of
 * spam_header_name lines, and to suppress MIME body decoding while true.
 * NOTE(review): presumably true while the message header is being read;
 * it is only read, never assigned, in the visible part of this file. */
bool msg_header = true;
/* Not referenced in the visible part of this file. */
bool have_body = false;
/* Active lexer; yyinit() points it at v3_lexer unless msg_count_file is set. */
lexer_t *lexer = NULL;
| |
| |
| |
/* Lexer descriptor selected by yyinit() when msg_count_file is not set.
 * NOTE(review): lexer_t's field names are not visible here; the two slots
 * look like a "scan" entry point followed by a "get token" entry point. */
static lexer_t v3_lexer = {
    yylex,
    lexer_v3_get_token
};

/* Lexer descriptor built from the message-count reader functions.
 * It has external linkage; nothing in the visible part of this file
 * installs it. */
lexer_t msg_count_lexer = {
    read_msg_count_line,
    msg_count_get_token
};
| |
| |
| |
/* Forward declarations of the file-local line readers defined below. */
static int yy_get_new_line(buff_t *buff);	/* read one raw line, skipping spam_header_name lines */
static int get_decoded_line(buff_t *buff);	/* read one line and apply MIME/charset decoding */
static int skip_folded_line(buff_t *buff);	/* read past continuation lines of a folded header */
| |
| |
| |
| void lexer_init(void) |
| { |
| mime_reset(); |
| token_init(); |
| lexer_v3_init(NULL); |
| init_charset_table(charset_default); |
| } |
| |
| static void lexer_display_buffer(buff_t *buff) |
| { |
| fprintf(dbgout, "*** %2d %c%c %2ld ", |
| yylineno-1, msg_header ? 'h' : 'b', yy_get_state(), |
| (long)(buff->t.leng - buff->read)); |
| buff_puts(buff, 0, dbgout); |
| if (buff->t.leng > 0 && buff->t.u.text[buff->t.leng-1] != '\n') |
| fputc('\n', dbgout); |
| } |
| |
| |
| |
| |
| |
| |
| |
| |
| static bool long_token(byte *buf, uint count) |
| { |
| uint i; |
| for (i=0; i < count; i += 1) { |
| byte c = buf[i]; |
| |
| |
| |
| if (c == '\0') |
| break; |
| if ((iscntrl(c) || isspace(c) || ispunct(c)) && (c != '_')) |
| return false; |
| } |
| return true; |
| } |
| |
| static int yy_get_new_line(buff_t *buff) |
| { |
| int count = (*reader_getline)(buff); |
| const byte *buf = buff->t.u.text; |
| |
| static size_t hdrlen = 0; |
| if (hdrlen==0) |
| hdrlen=strlen(spam_header_name); |
| |
| if (count > 0) |
| yylineno += 1; |
| |
| if (count == EOF) { |
| if (fpin == NULL || !ferror(fpin)) { |
| return YY_NULL; |
| } |
| else { |
| print_error(__FILE__, __LINE__, "input in flex scanner failed\n"); |
| exit(EX_ERROR); |
| } |
| } |
| |
| |
| |
| |
| |
| if (buff->t.leng > 2 && |
| buf[0] == '-' && buf[1] == '-' && |
| got_mime_boundary(&buff->t)) { |
| yy_set_state_initial(); |
| } |
| |
| if (count >= 0 && DEBUG_LEXER(0)) |
| lexer_display_buffer(buff); |
| |
| |
| while (msg_header |
| && count != EOF |
| |
| && msg_state->parent == NULL |
| && buff->t.leng >= hdrlen |
| && memcmp(buff->t.u.text,spam_header_name,hdrlen) == 0) { |
| count = skip_folded_line(buff); |
| } |
| |
| return count; |
| } |
| |
| static int get_decoded_line(buff_t *buff) |
| { |
| int count; |
| buff_t *linebuff; |
| |
| bool mime_dont_decode = msg_state->mime_dont_decode; |
| |
| #ifdef DISABLE_UNICODE |
| linebuff = buff; |
| #else |
| if (encoding == E_RAW || |
| mime_dont_decode ) { |
| linebuff = buff; |
| } |
| else { |
| static buff_t *tempbuff = NULL; |
| |
| if (tempbuff == NULL) |
| tempbuff = (buff_t *) calloc(sizeof(buff_t), 1); |
| |
| |
| |
| |
| if (tempbuff->size < buff->size / 6) { |
| xfree(tempbuff->t.u.text); |
| tempbuff->size = buff->size / 6; |
| tempbuff->t.u.text = (byte *) xmalloc(tempbuff->size+D); |
| } |
| |
| tempbuff->t.leng = tempbuff->read = 0; |
| linebuff = tempbuff; |
| } |
| #endif |
| |
| |
| |
| count = yy_get_new_line(linebuff); |
| |
| if (count == EOF) { |
| if ( !ferror(fpin)) |
| return YY_NULL; |
| else { |
| print_error(__FILE__, __LINE__, "input in flex scanner failed\n"); |
| exit(EX_ERROR); |
| } |
| } |
| |
| |
| |
| |
| |
| |
| if (passthrough && count > 0) |
| textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count); |
| |
| if ( !msg_header && |
| !mime_dont_decode && |
| msg_state->mime_type != MIME_TYPE_UNKNOWN) |
| { |
| word_t temp; |
| uint decoded_count; |
| |
| temp.leng = (uint) count; |
| temp.u.text = linebuff->t.u.text+linebuff->read; |
| |
| decoded_count = mime_decode(&temp); |
| |
| if (decoded_count != 0 && decoded_count < (uint) count) { |
| linebuff->t.leng -= (uint) (count - decoded_count); |
| count = (int) decoded_count; |
| if (DEBUG_LEXER(1)) |
| lexer_display_buffer(linebuff); |
| } |
| } |
| |
| #ifndef DISABLE_UNICODE |
| if (encoding == E_UNICODE && |
| !mime_dont_decode && |
| count > 0) |
| { |
| iconvert(linebuff, buff); |
| |
| |
| |
| |
| |
| |
| if (buff->t.leng == 0) { |
| count = -2; |
| } else { |
| |
| |
| count = buff->t.leng; |
| } |
| } |
| #endif |
| |
| #ifdef EXCESSIVE_DEBUG |
| |
| fprintf(dbgout, "%d: ", count); |
| buff_puts(buff, 0, dbgout); |
| fprintf(dbgout, "\n"); |
| #endif |
| |
| |
| if (count >= 2) { |
| byte *buf = buff->t.u.text; |
| if (memcmp(buf + count - 2, CRLF, 2) == 0) { |
| count --; |
| --buff->t.leng; |
| *(buf + count - 1) = (byte) '\n'; |
| } |
| } |
| |
| if (buff->t.leng < buff->size) |
| Z(buff->t.u.text[buff->t.leng]); |
| |
| return count; |
| } |
| |
| static int skip_folded_line(buff_t *buff) |
| { |
| for (;;) { |
| int count; |
| buff->t.leng = 0; |
| count = reader_getline(buff); |
| yylineno += 1; |
| |
| |
| if (buff->t.u.text[0] != ' ' && |
| buff->t.u.text[0] != '\t') |
| return count; |
| |
| if (is_eol((char *)buff->t.u.text, count)) |
| return count; |
| } |
| } |
| |
| void yyinit(void) |
| { |
| yylineno = 0; |
| |
| if ( !msg_count_file) |
| lexer = &v3_lexer; |
| } |
| |
| int yyinput(byte *buf, size_t used, size_t size) |
| |
| { |
| int cnt; |
| int count = 0; |
| buff_t buff; |
| |
| buff_init(&buff, buf, 0, (uint) size); |
| |
| |
| |
| |
| |
| |
| |
| |
| while ((cnt = get_decoded_line(&buff)) != 0) { |
| if (cnt > 0) |
| count = buff.t.leng; |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| if (used < 1000 || used < size * 10) |
| break; |
| |
| if (count >= MAX_TOKEN_LEN * 2 && |
| long_token(buff.t.u.text, (uint) count)) { |
| |
| if (buff.t.leng >= (uint) count) { |
| uint start = buff.t.leng - count; |
| uint length = count - max_token_len; |
| buff_shift(&buff, start, length); |
| } |
| count = buff.t.leng; |
| } |
| else |
| break; |
| } |
| |
| if (msg_state && |
| msg_state->mime_dont_decode && |
| (msg_state->mime_disposition != MIME_DISPOSITION_UNKNOWN)) { |
| assert(size <= INT_MAX && count <= (int)size); |
| return (count == EOF ? 0 : count); |
| } |
| |
| #if defined(CP866) && !defined(ENABLE_ICONV) |
| |
| count = decode_and_htmlUNICODE_to_cp866(buf, count); |
| #endif |
| |
| if (replace_nonascii_characters) { |
| |
| int i; |
| for (i = 0; i < count; i++ ) |
| { |
| byte ch = buf[i]; |
| buf[i] = charset_table[ch]; |
| } |
| } |
| |
| if (DEBUG_LEXER(2)) |
| fprintf(dbgout, "*** yyinput(\"%-.*s\", %lu, %lu) = %d\n", count, buf, (unsigned long)used, (unsigned long)size, count); |
| |
| assert(size <= INT_MAX && count <= (int)size); |
| return (count == EOF ? 0 : count); |
| } |
| |
| static char *charset_as_string(const byte *txt, const size_t len) |
| { |
| static char *charset_text = NULL; |
| static unsigned short charset_leng = 0; |
| |
| if (charset_text == NULL) |
| charset_text = (char *)xmalloc(len+D); |
| else { |
| if (charset_leng < len) { |
| charset_leng = len; |
| charset_text = (char *)xrealloc(charset_text, charset_leng+D); |
| } |
| } |
| |
| memcpy(charset_text, txt, len); |
| Z(charset_text[len]); |
| |
| return charset_text; |
| } |
| |
| word_t *text_decode(word_t *w) |
| { |
| word_t *r = w; |
| byte *const beg = w->u.text; |
| byte *const fin = beg + w->leng; |
| |
| byte *txt = (byte *) memstr(w->u.text, w->leng, "=?"); |
| uint size = (uint) (txt - beg); |
| |
| #ifndef DISABLE_UNICODE |
| size_t max = w->leng * 4; |
| static buff_t * buf = NULL; |
| #endif |
| |
| if (txt == NULL) |
| return r; |
| |
| #ifndef DISABLE_UNICODE |
| if (encoding == E_UNICODE) { |
| if (buf == NULL) |
| buf = buff_new((byte *)xmalloc(max+D), 0, max); |
| r = &buf->t; |
| |
| buf->t.leng = 0; |
| if (buf->size < max) { |
| buf->size = max; |
| buf->t.u.text = (byte *) xrealloc(buf->t.u.text, buf->size+D); |
| } |
| |
| buf->t.leng = size; |
| memcpy(buf->t.u.text, beg, size ); |
| Z(buf->t.u.text[buf->t.leng]); |
| } |
| #endif |
| |
| if (DEBUG_LEXER(2)) { |
| fputs("**1** ", dbgout); |
| word_puts(w, 0, dbgout); |
| fputs("\n", dbgout); |
| } |
| |
| while (txt < fin) { |
| byte *typ, *tmp, *end; |
| uint len; |
| bool adjacent; |
| |
| char *charset; |
| |
| txt += 2; |
| typ = (byte *) memchr((char *)txt+1, '?', fin-txt); |
| *typ++ = '\0'; |
| |
| charset = charset_as_string(txt, typ - txt - 1); |
| |
| tmp = typ + 2; |
| end = (byte *) memstr((char *)tmp, fin-tmp, "?="); |
| len = end - tmp; |
| |
| w->u.text = tmp; |
| w->leng = len; |
| Z(w->u.text[w->leng]); |
| |
| if (DEBUG_LEXER(2)) { |
| fputs("**2** ", dbgout); |
| word_puts(w, 0, dbgout); |
| fputs("\n", dbgout); |
| } |
| |
| switch (tolower(*typ)) { |
| case 'b': |
| if (base64_validate(w)) |
| len = base64_decode(w); |
| break; |
| case 'q': |
| if (qp_validate(w, RFC2047)) |
| len = qp_decode(w, RFC2047); |
| break; |
| } |
| |
| |
| if (encoding == E_RAW) { |
| memmove(beg+size, w->u.text, len); |
| size += len; |
| Z(beg[size]); |
| |
| if (DEBUG_LEXER(3)) |
| fprintf(dbgout, "**3** %s\n", beg); |
| } |
| |
| #ifndef DISABLE_UNICODE |
| if (encoding == E_UNICODE) { |
| iconv_t cd; |
| buff_t src; |
| |
| |
| |
| |
| src.t.u.text = w->u.text; |
| src.t.leng = len; |
| src.read = 0; |
| src.size = len; |
| |
| cd = bf_iconv_open( charset_unicode, charset ); |
| iconvert_cd(cd, &src, buf); |
| iconv_close(cd); |
| |
| if (DEBUG_LEXER(3)) { |
| fputs("**4** ", dbgout); |
| word_puts(&buf->t, 0, dbgout); |
| fputs("\n", dbgout); |
| } |
| } |
| #endif |
| |
| txt = end + 2; |
| if (txt >= fin) |
| break; |
| |
| |
| end = (byte *) memstr((char *)txt, fin-txt, "=?"); |
| adjacent = end != NULL; |
| |
| |
| |
| if (adjacent) { |
| tmp = txt; |
| while (adjacent && tmp < end) { |
| if (*tmp && strchr(" \t\r\n", *tmp)) |
| tmp += 1; |
| else |
| adjacent = false; |
| } |
| } |
| |
| |
| |
| if (adjacent) |
| |
| txt = end; |
| else |
| |
| while (txt < end) { |
| if (encoding == E_RAW) |
| beg[size++] = *txt++; |
| #ifndef DISABLE_UNICODE |
| if (encoding == E_UNICODE) |
| buf->t.u.text[buf->t.leng++] = *txt++; |
| #endif |
| } |
| } |
| |
| if (encoding == E_RAW) { |
| r->u.text = beg; |
| r->leng = size; |
| } |
| |
| return r; |
| } |
| |
| |
| |
| |
| |
| |
| |