#!/bin/sh

#	parmtest.sh - a parameter testing script.
#
#	This script can be used to test a bogofilter option
#	to see whether the option makes bogofilter more
#	effective or less effective.
#
#	It is presently set to test 'tag_header_lines', but
#	by creating different config files can be used to test
#	other config file options.
#
#	For each test named in variable TESTS, a corresponding 
#	config file, e.g. $test.cf, is needed.  Bogofilter is
#	trained using $test.cf to create wordlists for the test.
#	The newly created wordlists are saved in directory $test.d.
#	This allows a test to be run using the same parameters
#	used to build the wordlists.
#
#	For each mbox file used in testing, an output file is
#	created that gives the result of testing each message
#	in the mbox.  The result lines include:
#
#		S/H/U - the spam/ham/unsure classification.
#		spamicity - 0.0 to 1.0
#		message number - 0000, 0001, ...
#		subject line - "Subject: This is a test"
#
#	A summary line is generated for each file giving
#	test name, filename, S count, H count, and U count, 
#	for example:
#
#		default.spam.mbx.out 1603    5  136
#		default.good.mbx.out    3 4934  108

# Directory containing the bogofilter executables used for the tests.
BIN=~/bin
BOGOUTIL="$BIN/bogoutil"
BOGOFILTER="$BIN/bogofilter"

# Working directory for the wordlists in use.  It is exported as
# BOGOFILTER_DIR so that child processes started by this script see it.
TEST_DIR="./test.bogofilter"
BOGOFILTER_DIR="$TEST_DIR"
export BOGOFILTER_DIR

# specify names of tests to run (each test uses a config file $test.cf)
TESTS="default tag_header_lines"

# The mbox lists are only read when the script runs without arguments.
# When the script is re-invoked with arguments (once per message), skip
# the lookups so that no 'ls' processes are spawned for every message.
if [ -z "$1" ] ; then
    # specify files for creating the test wordlists
    SPAM_TRAIN=$(ls spam_train*mbx)
    GOOD_TRAIN=$(ls good_train*mbx)

    # specify files for testing
    SPAM_TEST=$(ls spam_test*mbx)
    GOOD_TEST=$(ls good_test*mbx)
fi

# The script has two modes, selected by the presence of arguments:
#
#   no arguments     - driver: write the config files, train wordlists
#                      for each test, then score every test mbox.
#   <cfg> <outfile>  - worker: read one message on stdin, classify it
#                      using config file <cfg> and append one result
#                      line to <outfile>.  The driver starts the worker
#                      through "formail -s" for each message of an mbox.
if [ -z "$1" ] ; then

    # create standard (base) config file

    cat <<EOF > base.cf
algorithm = fisher
ham_cutoff = 0.10
spam_cutoff = 0.95
terse_format = %1.1c %f
header_format = %1.1c
spamicity_tags = S, H, U
spamicity_formats = %0.6f %0.6f %0.6f
EOF

    # copy base config file for each test
    for test in $TESTS ; do
        cat base.cf > "$test.cf"
    done

    # customize config files
    cat <<EOF >> default.cf
tag_header_lines=no
EOF

    cat <<EOF >> tag_header_lines.cf
tag_header_lines=yes
EOF

    mkdir -p "$TEST_DIR"

    date

    for test in $TESTS ; do

        echo "$test"

        # train (build wordlists for testing); skipped when $test.d
        # already exists, so earlier wordlists are reused
        if [ ! -d "$test.d" ] ; then
            mkdir "$test.d"
            rm -f "$TEST_DIR"/*.db
            # NOTE(review): the two checks below look for spamlist.db and
            # goodlist.db in the current directory, not in $TEST_DIR or
            # $test.d - confirm that this is what is intended.
            if [ ! -f spamlist.db ] ; then
                for m in $SPAM_TRAIN ; do
                    echo "$m"
                    "$BOGOFILTER" -v -c "$test.cf" -s < "$m"
                done
                cp -p "$TEST_DIR/spamlist.db" "$test.d"
            fi
            if [ ! -f goodlist.db ] ; then
                for m in $GOOD_TRAIN ; do
                    echo "$m"
                    # train with the same config file as the spam pass
                    "$BOGOFILTER" -v -c "$test.cf" -n < "$m"
                done
                cp -p "$TEST_DIR/goodlist.db" "$test.d"
            fi
            "$BOGOUTIL" -w "$test.d" .MSG_COUNT
        fi

        # copy wordlists for use
        cp -pf "$test.d"/*.db "$TEST_DIR"

        # test (score accuracy)
        cfg="$test.cf"
        list="$SPAM_TEST $GOOD_TEST"

        for mbox in $list ; do
            b=$(basename "$mbox")
            f="$test.$b.out"
            # start with an empty result file
            : > "$f"
            # run this script (worker mode) on each message of the mbox;
            # FILENO supplies the message number written by the worker
            FILENO=0000 formail -s "$0" "$cfg" "$f" < "$mbox"
            # each result line starts with its S/H/U classification
            s=$(grep -c '^S' "$f")
            h=$(grep -c '^H' "$f")
            u=$(grep -c '^U' "$f")
            printf '%-20s %4d %4d %4d\n' "$(basename "$f")" "$s" "$h" "$u"
        done

        echo ""
    done
else
    # worker mode: $1 = config file, $2 = result file, stdin = message
    cfg="$1"
    out="$2"
    msg="$$.tmp"

    # the message is read twice (scoring and subject), so save it first
    cat > "$msg"
    result=$("$BOGOFILTER" -t -v -c "$cfg" < "$msg" | head -1)
    subject=$(grep '^Subject:' "$msg" | head -1)

    # printf with quoted arguments keeps glob characters and backslashes
    # in the subject line from being expanded or interpreted
    printf '%s %s %s\n' "$result" "$FILENO" "$subject" >> "$out"
    rm -f "$msg"
fi