/*****************************************************************************

NAME:
   bogohist.c -- print bogofilter histogram

AUTHOR:
   Gyepi Sam <gyepi@praxis-sw.com>

******************************************************************************/

#include "common.h"

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <sys/stat.h>

#include "bogohist.h"
#include "prob.h"
#include "datastore.h"
#include "msgcounts.h"
#include "word.h"
#include "wordlists.h"
#include "xmalloc.h"

/* Token tallies accumulated by ds_histogram_hook(). */
static uint ham_only;		/* tokens with spamcount == 0 */
static uint ham_hapax;		/* ham_only tokens with goodcount == 1 */
static uint spam_only;		/* tokens with goodcount == 0 */
static uint spam_hapax;		/* spam_only tokens with spamcount == 1 */

/* Message counts read from the datastore in histogram() and passed to
** calc_prob() by ds_histogram_hook(). */
static uint mgood;
static uint mbad;

/* Number of equal-width score buckets in the histogram. */
#define	INTERVALS	20

/*
** n expressed as a percentage of `count`.
**
** The macro does not take the total as an argument: it relies on a variable
** named `count` being in scope wherever it is expanded.  The argument and
** the whole expansion are parenthesised so that the macro stays correct
** when it is given an expression or used inside a larger one.
*/
#define PCT(n)		(100.0 * (n) / count)

/* Histogram storage: one counter per score bucket. */
typedef struct rhistogram_s rhistogram_t;
struct rhistogram_s {
    uint32_t count[INTERVALS];
};

/* Function Prototypes */

/* Function Definitions */

/*
** Callback handed to ds_foreach() by histogram(): account for one entry.
**
** key      - the token; entries whose text starts with '.' are treated as
**            meta-tokens and skipped.
** data     - the entry's goodcount / spamcount.
** userdata - the rhistogram_t being filled in.
**
** Increments the histogram bucket selected by the token's calc_prob() score
** and updates the file-scope pure-ham / pure-spam and hapax tallies.
** Always returns EX_OK.
*/
static ex_t ds_histogram_hook(word_t *key, dsv_t *data,
			     void *userdata)
{
    rhistogram_t *hist = (rhistogram_t *)userdata;
    double fw;
    uint idx;

    /* ignore meta-tokens -- tested first so no score is computed for them */
    if (*key->u.text == (byte) '.')
	return EX_OK;

    fw = calc_prob(data->goodcount, data->spamcount, mgood, mbad);

    /* scale the score to a bucket number, capped at the last bucket */
    idx = min(fw * INTERVALS, INTERVALS-1);

    hist->count[idx] += 1;

    /* token never seen in spam */
    if (data->spamcount == 0) {
	ham_only += 1;
	if (data->goodcount == 1)
	    ham_hapax += 1;
    }

    /* token never seen in ham */
    if (data->goodcount == 0) {
	spam_only += 1;
	if (data->spamcount == 1)
	    spam_hapax += 1;
    }

    return EX_OK;
}

static int print_histogram(rhistogram_t *hist)
{
    uint i, r;
    uint maxcnt = 0;
    uint count = 0;

    if (verbose == 0)
	(void)printf("Histogram\n");

    if (verbose == 1) {
	hist->count[0]           -= ham_hapax;
	hist->count[INTERVALS-1] -= spam_hapax;
	(void)printf("Histogram without hapaxes\n");
    }

    if (verbose == 2) {
	hist->count[0]           -= ham_only;
	hist->count[INTERVALS-1] -= spam_only;
	(void)printf("Histogram without pure ham and spam\n");
    }

    (void)printf("%5s%8s  %3s  %s\n", "score", "count", "pct", "histogram");

    for (i=0; i<INTERVALS; i+=1)
    {
	uint32_t cnt = hist->count[i];
	if (cnt > maxcnt) 
	    maxcnt = cnt;
	count += cnt;
    }

    /* Print histogram */
    for (i=0; i<INTERVALS; i+=1)
    {
	uint32_t cnt = hist->count[i];
	double beg = 1.0 * i / INTERVALS;
	double pct = PCT(cnt);

	/* print interval, count, probability, percent, and spamicity */
	(void)printf("%3.2f %8u %5.2f ", beg, cnt, pct);

	/* scale histogram to 48 characters */
	if (maxcnt>48) cnt = (cnt * 48 + maxcnt - 1) / maxcnt;

	/* display histogram */
	for (r=0; r<cnt; r+=1)
	    (void)fputc( '#', stdout);
	(void)fputc( '\n', stdout);
    }

    (void)printf("tot  %8u\n", count);

    return count;
}

/*
** Walk the datastore identified by bfp and print, on stdout, a score
** histogram followed by the hapax and pure-ham / pure-spam token tallies.
**
** Returns EX_ERROR if the datastore cannot be initialised or opened, or if
** the transaction cannot be begun or committed (the latter two also print a
** message on stderr); otherwise returns the result of ds_foreach().
*/
ex_t histogram(bfpath *bfp)
{
    ex_t rc;
    uint count;
    void *dsh, *dbe;
    dsv_t val;

    rhistogram_t hist;

    dbe = ds_init(bfp);
    if (dbe == NULL)
	return EX_ERROR;

    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL) {
	/* release the environment, as every other error path does */
	ds_cleanup(dbe);
	return EX_ERROR;
    }

    if (DST_OK != ds_txn_begin(dsh)) {
	ds_close(dsh);
	ds_cleanup(dbe);
	fprintf(stderr, "cannot begin transaction!\n");
	return EX_ERROR;
    }

    /* The result of ds_get_msgcounts() is not checked here, so start from
    ** zero in case it leaves val untouched. */
    memset(&val, 0, sizeof(val));
    ds_get_msgcounts(dsh, &val);
    mgood = val.goodcount;
    mbad = val.spamcount;

    /* The tallies are file-scope statics; clear them so that a repeated
    ** call does not add to the previous run's numbers. */
    ham_only  = ham_hapax  = 0;
    spam_only = spam_hapax = 0;

    memset(&hist, 0, sizeof(hist));
    rc = ds_foreach(dsh, ds_histogram_hook, &hist);

    if (DST_OK != ds_txn_commit(dsh)) {
	ds_close(dsh);
	ds_cleanup(dbe);
	fprintf(stderr, "cannot commit transaction!\n");
	return EX_ERROR;
    }

    ds_close(dsh);
    ds_cleanup(dbe);

    /* `count` is also what the PCT() macro divides by below */
    count = print_histogram(&hist);

    if (verbose > 0) {
	printf("hapaxes:  ham %7u, spam %7u\n", ham_hapax, spam_hapax);
	printf("   pure:  ham %7u, spam %7u\n", ham_only,  spam_only);
    }
    else {
	/* guard the percentages against an empty histogram (count == 0) */
	double ham_hapax_pct  = count ? PCT(ham_hapax)  : 0.0;
	double spam_hapax_pct = count ? PCT(spam_hapax) : 0.0;
	double ham_only_pct   = count ? PCT(ham_only)   : 0.0;
	double spam_only_pct  = count ? PCT(spam_only)  : 0.0;

	printf("hapaxes:  ham %7u (%5.2f%%), spam %7u (%5.2f%%)\n", ham_hapax, ham_hapax_pct, spam_hapax, spam_hapax_pct);
	printf("   pure:  ham %7u (%5.2f%%), spam %7u (%5.2f%%)\n", ham_only,  ham_only_pct,  spam_only,  spam_only_pct);
    }

    return rc;
}

/* for a standalone program:
**
**	cc -o bogohist.prog.o -DMAIN -c bogohist.c
**	cc -o bogohist bogohist.prog.o libbogofilter.a strlcpy.o strlcat.o -ldb  -lm
*/

#ifdef	MAIN
/* Program name used in the usage message. */
const char *progname = "bogohist";

/*
** Standalone entry point: takes the bogofilter directory as the first
** argument, runs histogram() on it and exits with its result.  Prints a
** usage line on stderr and exits with status 1 when no argument is given.
**
** NOTE(review): histogram() is defined in this file as taking a bfpath *,
** but a const char * is passed here -- confirm that the -DMAIN build still
** compiles.
*/
int main(int argc, char *argv[])
{
    const char *path;
    int rc;

    if (argc < 2) {
	fprintf(stderr, "usage: %s BOGOFILTER_DIR\n", progname);
	exit(1);
    }

    path = argv[1];
    rc = histogram(path);
    exit(rc);
}
#endif