Blob Blame History Raw
/*****************************************************************************

NAME:
   bogofilter.c -- detect spam and bogons presented on standard input.

AUTHORS:
   Eric S. Raymond <esr@thyrsus.com>
   David Relson    <relson@osagesoftware.com>
   Matthias Andree <matthias.andree@gmx.de>
   Greg Louis      <glouis@dynamicro.on.ca>

THEORY:

   Originally implemented as Paul Graham's variant of Bayes filtering,
   as described in 

     "A Plan For Spam", http://www.paulgraham.com/spam.html

   Updated in accordance with Gary Robinson's proposed modifications,
   as described at

    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html

******************************************************************************/

#include "common.h"

#include <string.h>
#include <stdlib.h>

#include "bogofilter.h"
#include "bogoconfig.h"
#include "bogoreader.h"
#include "collect.h"
#include "format.h"
#include "passthrough.h"
#include "register.h"
#include "rstats.h"
#include "score.h"

/*
** Outline of the processing done by bogofilter(); the option letters
** below refer to command-line flags.
**
** Input modes:
**    case B_NORMAL:
**    case B_STDIN:     * '-b' - streaming (stdin) mode *
**    case B_CMDLINE:   * '-B' - command line mode *
**
** loop:
**    read & parse a message
**        if -p, save textblocks
**    register if -snSN && -pe
**    classify if -pue && ! -snSN
**    register if -u
**    write    if -p
**    if (-snSN && -pe) || -u
**        free tokens
**    else
**        accumulate tokens
**
** end:
**    register if -snSN && ! -pe
*/

/* Function Definitions */

/*
** print_stats - print statistics to the given stream.
**
** Thin wrapper: the request is forwarded unchanged to msg_print_stats().
**
** fp: stream passed through to msg_print_stats().
*/
void print_stats(FILE *fp)
{
    msg_print_stats(fp);
}

/*
** bogofilter - main message-processing loop.
**
** Reads messages one at a time through the bogoreader interface, collects
** the tokens of each message into a word hash and, depending on run_type
** and the passthrough / Rtable / logflag settings, registers the tokens,
** classifies the message, and writes and/or logs the result.
**
** argc, argv: forwarded to bogoreader_init().
**
** Returns the result of query_config() when 'query' is set.  Otherwise
** returns the status of the last message that was classified, or RC_OK
** if no message was classified.
**
** Calls exit(EX_ERROR) if fDie is set once a message has been processed.
*/
rc_t bogofilter(int argc, char **argv)
{
    uint msgcount = 0;
    rc_t status = RC_OK;

    /* Mode flags, derived once from the global option state. */
    const bool do_register  = (run_type & (REG_SPAM | UNREG_SPAM | REG_GOOD | UNREG_GOOD)) != 0;
    const bool register_now = do_register && passthrough;
    const bool collect_all  = (do_register && !passthrough) || (run_type & RUN_UPDATE) != 0;
    const bool do_write     = passthrough || Rtable;
    const bool do_classify  = do_write || (run_type & (RUN_NORMAL | RUN_UPDATE)) != 0;

    /* Tokens accumulated over all messages; only allocated if collect_all. */
    wordhash_t *all_words = NULL;

    score_initialize();                 /* initialize constants */

    if (query)
        return query_config();

    if (collect_all)
        all_words = wordhash_new();

    bogoreader_init(argc, (const char * const *) argv);

    for (;;) {
        wordhash_t *msg_words;

        if (!(*reader_more)())
            break;

        /* Tokens of the current message only; freed at the end of the pass. */
        msg_words = wordhash_new();

        rstats_init();
        passthrough_setup();

        collect_words(msg_words);
        wordhash_sort(msg_words);
        ++msgcount;

        format_set_counts(msg_words->count, msgcount);

        if (!passthrough_keepopen())
            bogoreader_close_ifeof();

        if (do_register && DEBUG_REGISTER(1))
            fprintf(dbgout, "Message #%ld\n", (long) msgcount);

        /* Register this message immediately, with a message count of 1. */
        if (register_now)
            register_words(run_type, msg_words, 1);

        /* Fold this message's tokens into the running total. */
        if (collect_all)
            wordhash_add(all_words, msg_words, &wordprop_init);

        /* do_classify already covers the do_write case. */
        if (do_classify) {
            double spam_score;

            lookup_words(msg_words);    /* This reads the database */
            spam_score = msg_compute_spamicity(msg_words);
            status = msg_status();

            /* Note: don't register if RC_UNSURE */
            if (run_type & RUN_UPDATE) {
                if (status == RC_SPAM) {
                    if (spam_score <= 1.0 - thresh_update)
                        register_words(REG_SPAM, msg_words, msgcount);
                }
                if (status == RC_HAM) {
                    if (spam_score >= thresh_update)
                        register_words(REG_GOOD, msg_words, msgcount);
                }
            }

            /* Prefix the result with the reader's file name, if it has one. */
            if (verbose && !passthrough && !quiet) {
                const char *name = (*reader_filename)();

                if (name != NULL)
                    fprintf(fpo, "%s ", name);
            }

            write_message(status);      /* passthrough */

            /* Log per message and restart the count when not registering. */
            if (logflag && !do_register) {
                write_log_message(status);
                msgcount = 0;
            }
        }

        wordhash_free(msg_words);

        passthrough_cleanup();
        rstats_cleanup();

        if (DEBUG_MEMORY(2))
            MEMDISPLAY;

        if (fDie)
            exit(EX_ERROR);
    }

    bogoreader_fini();

    if (DEBUG_MEMORY(1))
        MEMDISPLAY;

    /* Register the accumulated tokens in one go, except in update mode. */
    if (collect_all && !(run_type & RUN_UPDATE)) {
        wordhash_sort(all_words);
        register_words(run_type, all_words, msgcount);
    }

    score_cleanup();

    /* When registering, a single log entry is written after all messages. */
    if (logflag && do_register)
        write_log_message(status);

    wordhash_free(all_words);

    if (DEBUG_MEMORY(1))
        MEMDISPLAY;

    return status;
}

/* Done */