Blob Blame History Raw
%{
/*
 * NAME
 *   lexer_v3.l -- bogofilter's lexical analyzer for message headers
 *
 *   01/01/2003 - split out of lexer.l
 *
*/

/*
 * Our lexical analysis is different from Paul Graham's rules: 
 *
 * We throw away headers that are readily identifiable as dates.
 * We throw away all digit strings that don't look like IP address parts.
 * We throw away lines beginning with <tab>id<space> -- mailer UIDs.
 *
 * These are optimizations to keep the token lists from bloating.
 * The big win is recognizing machine-generated unique IDs that
 * we'll never see again and shouldn't keep in the token lists.
 *
 * We don't treat dot between two alphanumerics as a separator,
 * because we want to keep domain names and IP addresses together as 
 * recognizable units. 
 *
 * Having done the above, there isn't much need to recognize URLs.  
 * If a URL is a spam indicator, very likely any other URL from the
 * same site is as well, so the hostname part should be an adequate
 * statistical trigger.  
 *
 * LEXED_TOKENS, which are found in "msg-count" files need a special pattern
 * because they can be:
 *	1 - normal bogofilter tokens
 *	2 - url:xxx and subj: tokens
 *	3 - mime boundaries
 */

/* 12 May 2003
 * Added Paul Graham's latest ideas on parsing.
 * (From http://www.paulgraham.com/better.html)
 *
 * 1. Case is preserved.
 *
 * 2. Exclamation points are constituent characters.
 *
 * 3. Periods and commas are constituents if they occur between two
 *    digits. This lets me get ip addresses and prices intact.
 *
 * 4. A price range like $20-25 yields two tokens, $20 and $25.
 *
 * 5. Tokens that occur within the To, From, Subject, and Return-Path
 *    lines, or within urls, get marked accordingly.
 *    For example, "foo" in the Subject line becomes "subj:foo".
*/

/* DR 08/29/2003:
**
** With flex-2.5.31 and '%option never-interactive noreject', file
** msg.dr.0118.base64 (in tests/bogofilter/inputs/split.d) parses
** incorrectly because line 24 isn't base64 decoded.
*/

#define YY_NO_INPUT

#include "common.h"

#include <ctype.h>
#include <stdlib.h>

#include "buff.h"
#include "charset.h"
#include "lexer.h"
#include "mime.h"		/* for mime_*() */
#include "msgcounts.h"
#include "textblock.h"
#include "token.h"
#include "xmalloc.h"

#define YY_DECL token_t yylex(void)
    YY_DECL;			/* declare function */

#define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, result, max_size)
#define YY_EXIT_FAILURE EX_ERROR

#undef	stderr
#define	stderr	dbgout		/* for debug & -D options */

static int lineno;

#define	FLEX_VER(MAJ, MIN, SUB) ( MAJ * 1000 + MIN * 100 + SUB)

#ifndef	YY_FLEX_SUBMINOR_VERSION
#define	FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, 0)
#else
#define	FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, YY_FLEX_SUBMINOR_VERSION)
#endif

#if	FLEX_VERSION_BF < 2531
int yylineno;
#endif

/* Function Prototypes/Forward Declarations */

static word_t *yy_text(void);
static void html_char(void);
static void html_reorder(void);

static void url_char(void);

static void skip_to(char chr);
static void yy_unput(const byte *txt, uint len);

char yy_get_state(void);
void yy_set_state_initial(void);

static void header(void);

/* Function Definitions */

/*
 * Wrap the scanner's current match in a word_t.
 *
 * The text pointer refers directly to yytext (nothing is copied) and
 * the length is yyleng.  The result lives in one static object, so
 * every call overwrites what the previous call returned.
 */
static word_t *yy_text(void)
{
    static word_t current;

    current.leng = yyleng;
    current.u.text = (byte *)yytext;

    return &current;
}

%}

/* Scanner generation options. */
%option warn
/* NOTE(review): both "nodebug" and "debug" are listed on the next line;
   presumably the later one takes effect -- confirm against the flex manual.
   yy_set_state_initial() sets yy_flex_debug only when FLEX_DEBUG is defined. */
%option nodebug debug
%option align caseless 8bit
/***********************************************************************
 WARNING: The scanner must be interactive so as not to look ahead past
 \n = EOH and other header/body delimiters, else it will fail when MIME
 decoding, because part of the body has already been read ahead.
 ***********************************************************************/
%option interactive always-interactive
%option noreject noyywrap

/* UINT8: a decimal number of one to three digits in the range 0..255.
   IPADDR: four UINT8 values separated by literal dots. */
UINT8		([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5]))
IPADDR		{UINT8}\.{UINT8}\.{UINT8}\.{UINT8}
/* Characters accepted inside a MIME boundary string.  Within the bracket
   expression ",-." is a range from ',' to '.', which also covers '-'.
   BCHARS additionally allows a space; MIME_BOUNDARY must end in a
   non-space character. */
BCHARSNOSPC	[[:alnum:]()+_,-./:=?#\']
BCHARS		[[:alnum:]()+_,-./:=?#\' ]
MIME_BOUNDARY	{BCHARS}*{BCHARSNOSPC}

/* ID: alphanumerics, '-' and '.', optionally wrapped in angle brackets
   (used by the QUEUE_ID rule).
   CHARSET: charset name as used in "charset=" and in encoded words.
   VERPID: id part of a VERP address, containing at least one digit
   between two non-empty runs of alphanumerics, '#' or '-'.
   MTYPE: optional blanks followed by a (possibly empty) type name. */
ID		<?[[:alnum:]\-\.]+>?
CHARSET		[[:alnum:]-]+
VERPID		[[:alnum:]#-]+[[:digit:]]+[[:alnum:]#-]+
MTYPE		[[:blank:]]*[[:alnum:]/-]*

/* NUM: a run of decimal digits.
   NUM_NUM: two NUMs, each preceded by exactly one space. */
NUM		[[:digit:]]+
NUM_NUM		\ {NUM}\ {NUM}
/* MSG_COUNT: the quoted ".MSG_COUNT" key at the start of a line.  The dot
   is escaped so that only a literal '.' is accepted; an unescaped dot
   matches any character and would let an ordinary quoted word ending in
   MSG_COUNT be taken for the message-count line. */
MSG_COUNT	^\"\.MSG_COUNT\"

/* Character classes used to build TOKEN.  All are negated sets:
   FRONT_CHAR   - first character: no blank, control, digit or punctuation.
   MID_CHAR     - interior characters: no blank, control or listed separator.
   BOGOLEX_CHAR - like MID_CHAR but without excluding ':', '$' and '*'
                  (used for quoted tokens in BOGO_LEX lines).
   BACK_CHAR    - last character: MID_CHAR minus . _ ~ ' ` - as well. */
FRONT_CHAR	[^[:blank:][:cntrl:][:digit:][:punct:]]
MID_CHAR	[^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]]
BOGOLEX_CHAR	[^[:blank:][:cntrl:]   <>;=()&%#@+|/\\{}^\"?,\[\]]
BACK_CHAR	[^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-]

/* TOKEN: either a single FRONT_CHAR, or a FRONT_CHAR followed by any
   number of MID_CHARs and a closing BACK_CHAR. */
TOKEN		{FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?

/*  RFC2047.2
    encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
    charset = token    ; see section 3
    encoding = token   ; see section 4
    token = 1*<Any CHAR except SPACE, CTLs, and especials>
    especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
		<"> / "/" / "[" / "]" / "?" / "." / "="
    encoded-text = 1*<Any printable ASCII character other than "?"
		      or SPACE>
		   ; (but see "Use of encoded-words in message
		   ; headers", section 5)
*/

/* 09/01/03
  Using "[^?]" in the pattern and validating the charset in 'C'
  reduces executable size by approx 290k.
  new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
  old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?=

  BASE64	[0-9a-zA-Z/+=]+
  QP		[!->@-~]+
*/

/* Blank or newline, and its complement. */
WHITESPACE	[[:blank:]\n]
NOTWHITESPACE	[^[:blank:]\n]

/* HTML_ENCODING: numeric character reference "&#", optional 'x', one or
   more hex digits, ';'.
   URL_ENCODING: '%' followed by exactly two hex digits. */
HTML_ENCODING	"&#"x?[[:xdigit:]]+";"
URL_ENCODING	"%"[[:xdigit:]][[:xdigit:]]

/* ENCODED_WORD: "=?charset?b|q?text?=" where text contains neither '?'
   nor newline; the charset is validated in C (see the 09/01/03 note above).
   ENCODED_TOKEN: an optional plain token prefix followed by one or more
   encoded words separated by whitespace. */
ENCODED_WORD	=\?{CHARSET}\?[bq]\?[^?\n]*\?=
ENCODED_TOKEN	({FRONT_CHAR}{MID_CHAR}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}

/*
HTML_WI_COMMENTS	"<"[^>]*">"
HTML_WO_COMMENTS	"<"[^!][^>]*">"\|"<>"
*/

/* HTMLTOKEN: any "<...>" tag, from '<' up to the first '>'. */
HTMLTOKEN		"<"[^>]*">"

/*
 * Generally, there are some html tags that cause an "eyebreak" and some
 * that do not. For example, the "P" tag or the "BR" tag cause a break,
 * and can be interpreted in place, while the B (bold) tag does not.
 * No close tags seem to cause a break.
 * Comments do not.  This is an attempt to make an exhaustive list of
 * tags that cause an "eyebreak". When the exit tag also causes a break,
 * we include the /?. I believe this to be a complete list of tags that
 * can cause a formatting break.
 */

HBREAK		p|br|li|h[1-6]|hr|title|table|center|dd|dt|iframe|img|input|select|td|textarea|th|\/?(div|blockquote|pre|dir|dl|fieldset|legend|form|menu|ol|ul)

/* BREAKHTML: an HBREAK tag, either bare or followed by whitespace and
   arbitrary attributes up to the closing '>'. */
BREAKHTML	"<"({HBREAK}({WHITESPACE}[^>]*|""))">"

/* VERP: address of the form token-verpid-token=token@token. */
VERP		{TOKEN}-{VERPID}-{TOKEN}={TOKEN}@{TOKEN}

/* Start conditions used by the rules below: message body text, HTML,
   msg-count file lines, the inside of html tags and comments, and the
   head and body of a PGP signature. */
%s TEXT HTML BOGO_LEX
%s HTOKEN HDISCARD SCOMMENT LCOMMENT
%s PGP_HEAD PGP_BODY

%%

<INITIAL,BOGO_LEX>{MSG_COUNT}{NUM_NUM}		{ if (lineno == 0) { 
						      /* a count line seen while lineno is still 0 switches to BOGO_LEX */
						      BEGIN BOGO_LEX; 
						      /* pass on the text after the first space, i.e. the two numbers */
						      set_msg_counts_from_str(strchr(yytext, ' ') + 1); 
						  }
						  return MSG_COUNT_LINE;
						}
<BOGO_LEX>^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$	{ return BOGO_LEX_LINE; }	/* quoted token followed by two numbers */
<BOGO_LEX>\n					{ lineno += 1; }	/* count lines */

<INITIAL>{ENCODED_TOKEN}			{ /* decode the encoded word(s) and push the decoded text back onto the input */
						  word_t *raw = yy_text();
						  word_t *txt = text_decode(raw);
						  yy_unput(txt->u.text, txt->leng);
						}

<INITIAL>^(To|CC|From|Return-Path|Subject|Received):	{ set_tag(yytext); }	/* header name (with colon) is passed to set_tag() */
<INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); header(); return TOKEN; }	/* whole match goes to mime_content(); only the text before ':' is returned as the token */

<INITIAL>^Message-ID:.*				{ /* save token for logging */
						  unsigned long off = 11;	/* 11 == strlen("Message-ID:") */
						  /* skip whitespace between the colon and the id itself */
						  while(isspace((unsigned char)yytext[off]) && off < INT_MAX && off < (unsigned long)yyleng)
						      off++;
						  /* hand over the rest of the line, starting at the first non-space character */
						  set_msg_id((unsigned char *)(yytext+off), yyleng-off);
						  header();
						  return HEADKEY;
						}
<INITIAL>^(Delivery-)?Date:.*			|
<INITIAL>^Resent-Message-ID:.*			|
<INITIAL>^(In-Reply-To|References):.* 		{ header(); return HEADKEY; }	/* whole line is matched as one unit and reported as HEADKEY */

<INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }	/* whole "boundary=..." match is passed on */
<INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); header(); return TOKEN; }	/* record charset, then return just the text before '=' as the token */

<INITIAL>(file)?name=\"?			/* ignore */
<INITIAL>[[:blank:]]id{WHITESPACE}+{ID}		{ return QUEUE_ID; }	/* blank, "id", whitespace, then an ID */

 /**********************************************************************
 WARNING: Do NOT add header (<INITIAL>) rules that require characters
 beyond a LF character (\n) - doing so will make the parser read ahead
 parts of the body in the wrong MIME decoding mode and goof up
 seriously.
 **********************************************************************/

<INITIAL>^\n					{ /* empty line while in header mode: the header ends here */
						  enum mimetype type = get_content_type();
						  have_body = true;
						  msg_header = false;
						  clr_tag();
						  /* choose the scanner state for the body from the content type */
						  switch (type) { 
						  case MIME_TEXT_HTML:	BEGIN HTML; break;
						  case MIME_MESSAGE:	yy_set_state_initial(); break;	/* nested message: back to header mode */
						  default:		BEGIN TEXT; 
						  }
						  if (DEBUG_LEXER(1))
						      fprintf(dbgout, "*** end of header\n");
						  return EOH;
						}
<INITIAL>^{TOKEN}				{ header(); return TOKEN; }	/* token at the start of a header line: apply the "Header" tag first */
<INITIAL>\n					{ lineno += 1; }

<INITIAL>{VERP} 				{ skip_to('='); return VERP; }	/* keep only the part before '=' */

^-----BEGIN\ PGP\ SIGNATURE-----$		{ BEGIN PGP_HEAD;
						  /* push the matched line back onto the input */
						  yy_unput((byte *)yytext, yyleng);
						}
<PGP_HEAD>^\n					{ BEGIN PGP_BODY; }	/* empty line ends the signature head */
<PGP_BODY>{TOKEN}				{ /* ignore */ }
^-----END\ PGP\ SIGNATURE-----$ 		{ BEGIN TEXT;
						  /* push the matched line back onto the input */
						  yy_unput((byte *)yytext, yyleng);
						}
^--{MIME_BOUNDARY}(--)?$			{ if (got_mime_boundary(yy_text())) {
						      /* recognized boundary: the next part starts in header mode */
						      yy_set_state_initial();
						      return BOUNDARY;
						  } else {
						      /* not a known boundary: keep only the leading "--" matched */
						      yyless(2);
						  }
						}

  /* This has to match just as much or more than the below rules, so as to be the 
     controlling rule. */
<HTML>{TOKEN}({HTMLTOKEN}*{BREAKHTML}+{HTMLTOKEN}*.?|{HTMLTOKEN}+{WHITESPACE})		{ 
    			char *chr = (char *)memchr(yytext, '<', yyleng);	/* find start of html tag */
			size_t len = chr - yytext;	/* length of the token in front of the first tag */
			yyless(len);			/* keep only that token; tags go back to the input */
			return TOKEN;
			}

<HTML>{TOKEN}{HTMLTOKEN}+/{NOTWHITESPACE}	{ html_reorder(); }	/* token, then tag(s), then more text: html_reorder() pushes it back with the tags moved in front of the token */

<HTML>"<!--"					{ BEGIN SCOMMENT; }	/* strict comment, ended only by "-->" */
<HTML>"<!"					{ BEGIN LCOMMENT; }	/* loose comment, ended by ">" */
<HTML>"<"(a|img|font){WHITESPACE}		{ BEGIN HTOKEN; }	/* tags whose innards are tokenized */
<HTML>"<"					{ BEGIN HDISCARD; }	/* unknown tag */

<HTOKEN>{TOKEN}					{ return TOKEN; }
<HDISCARD,LCOMMENT,SCOMMENT>{TOKEN}		{ /* discard innards of html tokens and comments */ }

<HTOKEN,HDISCARD,LCOMMENT>">"			{ BEGIN HTML; }	/* end of tag, loose comment; return to normal html processing */
<SCOMMENT>"-->"					{ BEGIN HTML; }	/* end of strict comment; return to normal html processing */
"<"\!DOCTYPE\ HTML\ PUBLIC\ .*">" 		{ BEGIN HTML; }	/* a DOCTYPE declaration switches to html processing from any state */

{IPADDR}					{ return IPADDR;}
"\["({IPADDR})"\]"				{ return MESSAGE_ADDR;}	/* IP address in square brackets */

{TOKEN}						{ return TOKEN;}

<HTML>{TOKEN}?{HTML_ENCODING}			{ html_char(); }	/* process escaped chars, eg '&#97;' is 'a' */
<HTOKEN>"/"[^/[:blank:]\n%]*{URL_ENCODING}+	{ url_char(); }		/* process escaped chars, eg '%61'    is 'a' */

\${NUM}(\.{NUM})?				{ return MONEY;}	/* Dollars and cents */

.						/* ignore character */
\n						{ lineno += 1; clr_tag(); }
<<EOF>>						{ return NONE; }
%%

/*
 * Select the generic header tag by calling set_tag() with "Header".
 * Presumably this marks the tokens that follow as coming from an
 * otherwise untagged header line -- confirm against set_tag().
 */
static void header(void)
{
    set_tag("Header");
}

/*
 * Prepare the scanner for a new input stream.
 *
 * fp - stream handed to yyrestart() as the scanner's input
 *
 * Resets the line counter and the have_body flag, re-enters the
 * INITIAL (header) state through yy_set_state_initial(), and then
 * restarts the flex scanner on fp.
 */
void lexer_v3_init(FILE *fp)
{
    lineno = 0;
    have_body = false;
    yy_set_state_initial();
    yyrestart(fp);
}

/*
 * Shorten the current match so that it ends just before the first
 * occurrence of chr; chr and everything after it is given back to the
 * scanner through yyless().
 *
 * chr - delimiter to look for in yytext
 *
 * If chr does not occur in yytext the match is left untouched.  Without
 * this guard the NULL result of strchr() would be used in a pointer
 * subtraction (undefined behaviour) and a meaningless length would be
 * passed to yyless().
 */
static void skip_to(char chr)
{
    const char *hit = strchr(yytext, chr);
    size_t len;

    if (hit == NULL)		/* delimiter absent: nothing to give back */
	return;

    len = hit - yytext;		/* number of characters to keep */
    yyless(len);
}

/*
 * Swap the two halves of a "text<tag>..." match and push the result
 * back onto the input: everything from the first '<' onward comes
 * first, the text that preceded it comes last.
 *
 * The swap is done in a temporary heap buffer, which is released
 * once its contents have been handed to yy_unput().
 */
static void html_reorder(void)
{
    const size_t total = yyleng;
    const char *tag = (const char *)memchr(yytext, '<', total);	/* first html tag */
    const size_t head_len = tag - yytext;	/* text in front of the tag */
    const size_t tail_len = total - head_len;	/* tag(s) up to the end of the match */
    char *swapped = (char *)xmalloc(total + 1);	/* room for a closing NUL */

    memcpy(swapped, tag, tail_len);			/* tag(s) first ... */
    memcpy(swapped + tail_len, yytext, head_len);	/* ... leading text after them */
    swapped[total] = '\0';				/* for debugging */

    yy_unput((byte *)swapped, total);
    xfree(swapped);
}

/*
 * Convert leading hexadecimal digits of a string to an integer.
 *
 * in  - text to convert; need not be NUL terminated within len bytes
 * len - maximum number of characters to examine
 *
 * Returns the value of the hex digits found (upper or lower case),
 * or 0 when in does not start with a hex digit or len is 0.
 * Conversion stops at the first non-hex character or after len
 * characters, whichever comes first.
 *
 * The remaining length is tested before the character is looked at,
 * so the byte at in[len] is never read.
 */
static int xtoi(char *in, size_t len)
{
    int val = 0;

    while (len > 0 && isxdigit((unsigned char) *in)) {
	const int c = (unsigned char) *in++;

	val <<= 4;
	val += isdigit(c)
	    ? (c - '0')
	    : (tolower(c) - 'a' + 10);
	len--;
    }
    return val;
}

/*
 * Handle a match that ends in a numeric character reference "&#...;",
 * optionally preceded by ordinary token text.
 *
 * If the reference decodes to a printable character in the range
 * 1..255, the decoded character is pushed back onto the input,
 * followed by a push-back of the text that preceded the reference, so
 * the scanner sees the token with the character substituted.
 * Otherwise, when there was preceding text, the whole match is pushed
 * back with its last character replaced by a space; when there was
 * none, nothing is pushed back.
 *
 * Decimal references are converted with strtol() and range checked,
 * because atoi() has undefined behaviour when the digit string does
 * not fit in an int, and the digit string comes from the message.
 */
static void html_char(void)
{
    char *txt = strstr(yytext, "&#");	/* find decodable char */
    size_t len = txt - yytext;		/* length of the text in front of "&#" */
    int  val;
    char *yycopy = NULL;

    if (len != 0) {
	/* keep a private copy of the match for the push-back below */
	yycopy = (char *)xmalloc(yyleng + 1);	/* +1 for NUL byte below */
	memcpy(yycopy, yytext, yyleng);	/* copy tag to start of buffer */
	yycopy[yyleng] = '\0';		/* for debugging */
    }

    txt += 2;				/* step over "&#" */
    if (isdigit((byte) *txt)) {
	/* out-of-range values, including overflow, become 0 == "not usable" */
	long num = strtol(txt, NULL, 10);
	val = (num > 0 && num < 256) ? (int) num : 0;
    }
    else {
	/* NOTE(review): the first character is skipped as if it were 'x',
	 * but HTML_ENCODING also admits a hex letter directly after "&#"
	 * -- confirm whether that case needs separate handling. */
	val = xtoi(txt+1, 4);		/* xtoi() limits conversion to 4 characters */
    }

    if ((val > 0) && (val < 256) && 
	isprint(val)) {			/* use it if printable */
	yyunput(val, yytext);
	yyleng = len;			/* adjust len to pre-char count */
    }
    else {
	if (yycopy)
	    yycopy[yyleng-1] = ' ';	/* prevents parsing loop */
    }

    if (yycopy != NULL) {
	yy_unput((byte *)yycopy, yyleng);
	xfree(yycopy);
    }
}

/*
 * Decode "%XX" escapes in the current match and push the decoded text
 * back onto the input.
 *
 * Decoding is done in place inside yytext: dst trails src, so the
 * decoded text is never longer than the original.  Afterwards the
 * decoded characters are handed to yyunput() one at a time, last
 * character first.
 */
static void url_char(void)
{
    char *src, *dst;
    src = dst = yytext;

    /* pass 1: compact yytext, replacing each "%XX" by the byte it encodes */
    while (src < yytext + yyleng) {
	char c = *src++;
	if (c == '%') {
	    c = xtoi(src, 2);	/* value of up to two hex digits after '%' */
	    src += 2;		/* always steps over two characters */
	}
	*dst++ = c;
    }
    /* pass 2: give the decoded characters back, from the end to the start */
    while (dst > yytext) {
	yyunput(*--dst, yytext);
    }
}

/*
 * Push len bytes starting at txt back onto the scanner input.
 *
 * txt - first byte of the text to push back
 * len - number of bytes
 *
 * The bytes are passed to yyunput() from the last to the first.
 */
static void yy_unput(const byte *txt, uint len)
{
    const byte *cur = txt + len;	/* one past the last byte */

    while (cur != txt) {
	--cur;
	yyunput(*cur, yytext);
    }
}

/*
 * Report the scanner's current start condition as a single letter:
 *   'i' INITIAL, 't' TEXT, 'h' HTML or HTOKEN,
 *   's' SCOMMENT, 'l' LCOMMENT, 'o' any other state.
 */
char yy_get_state()
{
    const int state = YYSTATE;

    if (state == INITIAL)
	return 'i';
    if (state == TEXT)
	return 't';
    if (state == HTML || state == HTOKEN)
	return 'h';
    if (state == SCOMMENT)
	return 's';
    if (state == LCOMMENT)
	return 'l';

    return 'o';
}

/*
 * Put the scanner into the INITIAL start condition, which the rules
 * use for message headers.
 *
 * Also sets msg_header, selects the generic header tag via header(),
 * writes a trace line to dbgout when DEBUG_LEXER(1) is true, and, when
 * the scanner was built with FLEX_DEBUG, sets yy_flex_debug from
 * BOGOTEST('L').
 */
void yy_set_state_initial(void)
{
    BEGIN INITIAL;
    msg_header = true;
    header();

    if (DEBUG_LEXER(1))
	fprintf(dbgout, "BEGIN INITIAL\n");

#ifdef	FLEX_DEBUG
    yy_flex_debug = BOGOTEST('L');
#endif
}

/*
 * Give the caller access to the scanner's current match.
 *
 * output - receives a pointer to yytext (the text is not copied)
 *
 * Returns the length of the match (yyleng).
 */
long lexer_v3_get_token(byte **output)
{
    const long length = yyleng;

    *output = (byte *)yytext;
    return length;
}

/*
 * The following sets edit modes for GNU EMACS
 * Local Variables:
 * mode:c
 * indent-tabs-mode:t
 * End:
 */