#!/bin/sh
# Train bogofilter from a ham and spam corpus
#
# Copyright 2003 by Trevor Harrison (trevor-trainbogo@harrison.org)
#
# This file is released under the GPL. See http://www.gnu.org/licenses/gpl.txt
# Note: this script has not yet been reviewed by the bogofilter maintainers.
# If you have any doubts about its security, do not run it.
# Print the command-line synopsis and the list of supported options to stdout.
# Takes no arguments. Every flag accepted by the getopts loop is listed,
# including -v.
usage()
{
  echo "USAGE:"
  echo
  echo " trainbogo.sh [options]"
  echo
  echo "OPTIONS:"
  echo
  echo " Required arguments:"
  echo " -H hamdir points to directory with all your ham"
  echo " -S spamdir points to directory with all your spam"
  echo
  echo " Optional arguments:"
  echo " -s statdir directory where stat and tmp files are created."
  echo " default is ./stats.tmp"
  echo " -b pathtobogofilter points to the bogofilter executable,"
  echo " with any bogofilter options you need."
  echo " ex. -b \"/usr/local/bin/bogofilter -d /etc/bogodb\""
  echo " -f force rebuild of ham and spam directory index. Will"
  echo " cause msgs to be sorted into new order unless"
  echo " -p and -t are used."
  echo " -c cleanup statdir when done. (default is not to)"
  echo " -p rndseed specify the pid.timestamp used to randomize the msgs."
  echo " ex. -p 5432.1049498805"
  echo " -m don't test or train bogofilter, just show cached stats."
  echo " -n don't train bogofilter, just test."
  echo " -q don't show stats or dots. (quiet)"
  echo " -v show extra progress messages. (verbose)"
  echo " -h show help."
  echo
}
# Print the long description of the script to stdout, then the option
# summary produced by usage(). Takes no arguments.
help()
{
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    "trainbogo.sh" \
    "" \
    " Train bogofilter from a qmail maildir type ham and spam corpus" \
    "" \
    " This script relies on you having seperated your qmail maildir messages into" \
    " ham and spam directories. This script randomizes the message order, and" \
    " then feeds each message in turn into bogofilter, noting if bogofilter" \
    " correctly identified the message as ham or spam. If mis-identified, it" \
    " trains bogofilter with that message, and then re-tests to see if bogofilter" \
    " correctly identifies the message." \
    "" \
    " When I've used this script on my ham/spam collection, it takes about 4" \
    " consecutive executions to get my wordlists to a 0 false positive state." \
    " Just because this script reports 0 failed trainings doesn't mean that you" \
    " are ready to go. Run the script a second time to make sure. You should" \
    " keep running the script until you get 0 misdetections and, of course, 0" \
    " retrain failed's." \
    "" \
    " While running, trainbogo.sh will write some dots and dashes to the screen." \
    "" \
    " . = successfully categorized the message." \
    " - = failed to categorized the message, and training was turned off (-n)." \
    " + = successfully categorized the message after being retrained." \
    " f = failed to categorize the message after training." \
    "" \
    " The results of the testing can be found in the statsdir. Log files have" \
    " the filename of each message that match the logfile name:" \
    "" \
    " trainbogo.log.[0,1].[success,fail]" \
    " 0 = spam message log" \
    " 1 = ham message log" \
    " success/fail = were/weren't correctly categorized." \
    ""
  usage
}
# Print the arguments as one line on stdout, but only when the global
# ${verbose} is non-empty (set by -v). Returns 0 whether or not it printed.
verbose()
{
  if [ -n "${verbose}" ]; then
    # "$*" keeps the message intact: no word splitting and no glob expansion,
    # which the former unquoted $@ was subject to.
    printf '%s\n' "$*"
  fi
}
# Print the arguments as one line on stdout unless the global ${quiet} is
# non-empty (set by -q). Returns 0 whether or not it printed.
normal()
{
  if [ -z "${quiet}" ]; then
    # "$*" keeps the message intact: no word splitting and no glob expansion,
    # which the former unquoted $@ was subject to.
    printf '%s\n' "$*"
  fi
}
# Like normal(), but writes the arguments without a trailing newline.
# Prints nothing and returns 1 when the global ${quiet} is non-empty.
normaln()
{
  if [ -n "${quiet}" ]; then
    return 1
  fi
  printf '%s' "$*"
}
# Remove the result logs and the cached file list, then remove the stats
# directory if this run created it (madestatsdir=y).
# Does nothing unless docleanup=y (-c) and both ${log} and ${list} are set.
# Returns 0, or the status of rmdir when the directory removal is attempted.
cleanup()
{
  verbose "Performing cleanup"
  if [ -z "${log}" ] || [ -z "${list}" ] || [ "${docleanup}" != "y" ]; then
    return 0
  fi
  # The path prefix is quoted so a stats dir containing whitespace or glob
  # characters is handled; only the [01] part is left to expand as a pattern.
  rm -f -- "${log}".[01].success "${log}".[01].fail \
    "${log}".[01].train.success "${log}".[01].train.fail \
    "${list}"
  # An explicit if keeps a "did not create the dir" case from turning into a
  # non-zero return status for the whole script.
  if [ "${madestatsdir}" = "y" ] && [ -n "${statsdir}" ]; then
    rmdir --ignore-fail-on-non-empty -- "${statsdir}"
  fi
}
# Defaults. Every variable the option loop or later code reads is given a
# value here so that same-named variables inherited from the environment
# cannot influence the run.
hamdir=
spamdir=
rndseed=
madestatsdir=
dofilelist=
dotrain=y
dotest=y
docleanup=
verbose=
quiet=
statsdir="${PWD}/stats.tmp/"
# Remembered so later code can tell the default apart from a -s value.
origstatsdir="${statsdir}"
bf=bogofilter
# Parse the command line; usage() describes each flag.
while getopts "H:S:s:b:p:fcmnqvh" optname; do
  case "${optname}" in
    H) hamdir="${OPTARG}" ;;
    S) spamdir="${OPTARG}" ;;
    s) statsdir="${OPTARG}" ;;
    b) bf="${OPTARG}" ;;
    f) dofilelist=y ;;
    c) docleanup=y ;;
    p) rndseed="${OPTARG}" ;;
    m) dotest= ; dotrain= ;;
    n) dotrain= ;;
    q) quiet=y ;;
    v) verbose=y ;;
    h) help; exit 0 ;;
    # Unknown flag or missing argument: getopts has already printed a
    # diagnostic, so show the synopsis and stop instead of carrying on.
    *) usage >&2; exit 1 ;;
  esac
done
# Check for required options. Every failure below is reported on stderr and
# ends the script with exit status 1.
if [ -z "${hamdir}" ] || [ ! -d "${hamdir}" ]; then
  echo "Missing or bad -H option" >&2
  usage >&2
  exit 1
fi
if [ -z "${spamdir}" ] || [ ! -d "${spamdir}" ]; then
  echo "Missing or bad -S option" >&2
  usage >&2
  exit 1
fi
if [ -z "${statsdir}" ]; then
  echo "Bad statsdir option" >&2
  usage >&2
  exit 1
fi
# Make the stats dir if it's missing, but only if it's the default stats dir
# and not user specified. madestatsdir records that this run created it.
if [ "${statsdir}" = "${origstatsdir}" ] && [ ! -d "${statsdir}" ]; then
  mkdir "${statsdir}" && madestatsdir=y
fi
if [ ! -d "${statsdir}" ]; then
  echo "Missing statsdir (-s option)" >&2
  exit 1
fi
# Check for bogofilter. ${bf} may carry options after the executable, so only
# the first space-separated word is looked up.
if ! bfbin=$(command -v "${bf%% *}"); then
  echo "Missing bogofilter, not in path? (${bf})" >&2
  exit 1
fi
if [ ! -x "${bfbin}" ]; then
  echo "Missing or bad bogofilter binary! (${bf})" >&2
  exit 1
fi
# Cached message list and the common prefix of all result logs.
list="${statsdir}/trainbogo.filenames.txt"
log="${statsdir}/trainbogo.log"
# Init log files: truncate (or create) all eight result logs when a test or
# train pass is requested, or when the first log does not exist yet.
if [ -n "${dotest}" ] || [ -n "${dotrain}" ] || [ ! -f "${log}.0.success" ]; then
  verbose "init log files"
  for logsuffix in \
    0.success 1.success \
    0.fail 1.fail \
    0.train.success 0.train.fail \
    1.train.success 1.train.fail
  do
    : > "${log}.${logsuffix}"
  done
fi
# First make a randomly sorted list of all the ham and spam files (if needed).
# Each line of the finished list is "<expected> <path>", where <expected> is
# the bogofilter exit status the message should produce: 1 for ham, 0 for spam.
if [ ! -f "${list}" ] || [ -n "${dofilelist}" ]; then
  [ -z "${rndseed}" ] && rndseed="$$.$(date +%s)"
  normal "MD5'ing ham and spam corpus, rndseed used: ${rndseed}"
  # The expected status is attached while each directory is walked, so the
  # directory names are never used as a regular expression and one directory
  # name being a prefix of the other cannot mislabel messages.
  for expected in 1 0; do
    if [ "${expected}" -eq 1 ]; then
      corpusdir="${hamdir}"
    else
      corpusdir="${spamdir}"
    fi
    for i in "${corpusdir}"/*; do
      [ -f "${i}" ] || continue
      # md5sum prints the hash followed by a file marker; keep only the hash.
      md5=$(printf '%s' "${rndseed}${i}" | md5sum)
      printf '%s %s %s\n' "${md5%% *}" "${expected}" "${i}"
    done
  done > "${list}.tmp"
  if [ ! -s "${list}.tmp" ]; then
    # Nothing found: do not leave an empty or stale list behind.
    rm -f -- "${list}" "${list}.tmp"
    echo "No files to work on!!!" >&2
    exit 1
  fi
  # This randomizes the file names by sorting on the md5 hash; the C locale
  # keeps the order for a given rndseed independent of the user's locale.
  normal "Randomizing ham and spam"
  # Drop the hash (first field), keeping "<expected> <path>".
  LC_ALL=C sort "${list}.tmp" | cut -d ' ' -f 2- > "${list}"
  rm -f -- "${list}.tmp"
fi
# Read each "<expected> <path>" line from the file list and test bogofilter
# on that message; when training is enabled, train on every misdetection and
# test again. One progress character is written per message as soon as its
# outcome is known, so the last message is reported too.
if [ -n "${dotest}" ] || [ -n "${dotrain}" ]; then
  normal "Training bogofilter"
  # -r keeps backslashes in file names literal.
  while read -r spamstatus fname; do
    case "${spamstatus}" in
      0|1) ;;
      *)
        echo "Skipping malformed list entry: ${spamstatus} ${fname}" >&2
        continue
        ;;
    esac
    # A failed input redirection would yield status 1, which is
    # indistinguishable from "ham detected", so skip such messages up front.
    if [ ! -r "${fname}" ]; then
      echo "Skipping unreadable message: ${fname}" >&2
      continue
    fi
    # ${bf} is deliberately unquoted: -b may supply the executable plus
    # options. Only the exit status is used, so the -v report is discarded.
    ${bf} -v < "${fname}" > /dev/null
    ret=$?
    if [ "${spamstatus}" -eq "${ret}" ]; then
      # bogofilter detected this message correctly
      printf '%s\n' "${fname}" >> "${log}.${spamstatus}.success"
      normaln "."
      continue
    fi
    # bogofilter failed to detect the msg correctly
    printf '%s\n' "${fname}" >> "${log}.${spamstatus}.fail"
    if [ -z "${dotrain}" ]; then
      normaln "-"
      continue
    fi
    # Set the bogofilter option for training: -s for spam (0), -n for ham (1)
    if [ "${spamstatus}" -eq 0 ]; then
      bfopt="-s"
    else
      bfopt="-n"
    fi
    # Train bogofilter, then test again
    ${bf} ${bfopt} < "${fname}"
    ${bf} -v < "${fname}" > /dev/null
    ret=$?
    # Did it train successfully?
    if [ "${spamstatus}" -eq "${ret}" ]; then
      testresult="success"
      dot="+"
    else
      testresult="fail"
      dot="f"
    fi
    # Log train result
    printf '%s\n' "${fname}" >> "${log}.${spamstatus}.train.${testresult}"
    normaln "${dot}"
  done < "${list}"
fi
# Report totals unless -q was given; quiet mode prints nothing at all here.
if [ -z "${quiet}" ]; then
  # Finish the line of progress characters and leave a blank line.
  echo
  echo
  total_msg=$(wc -l < "${list}")
  # Ham and spam totals are counted from the file list (lines tagged 1 or 0)
  # so they refer to the same set of messages as the total and the logs.
  total_ham_msg=$(grep -c '^1 ' < "${list}")
  total_ham_fail=$(wc -l < "${log}.1.fail")
  total_ham_train_fail=$(wc -l < "${log}.1.train.fail")
  total_spam_msg=$(grep -c '^0 ' < "${list}")
  total_spam_fail=$(wc -l < "${log}.0.fail")
  total_spam_train_fail=$(wc -l < "${log}.0.train.fail")
  echo "Total messages: ${total_msg}"
  echo
  echo "Total ham: ${total_ham_msg}"
  echo "Misdetected ham: ${total_ham_fail}"
  if [ -n "${dotrain}" ]; then
    echo " retrain fail: ${total_ham_train_fail}"
  fi
  echo
  echo "Total spam: ${total_spam_msg}"
  echo "Misdetected spam: ${total_spam_fail}"
  if [ -n "${dotrain}" ]; then
    echo " retrain fail: ${total_spam_train_fail}"
  fi
  echo
fi
normal "Done"
# cleanup only removes anything when -c was given.
cleanup
# done