Blob Blame History Raw
/*****************************************************************************

NAME:
   robx.c -- computes robx value by reading wordlist.db

AUTHOR:
   David Relson - C version
   Greg Louis - perl version
   
******************************************************************************/

#include "bftypes.h"
#include "common.h"

#include <errno.h>

#include "datastore.h"
#include "rand_sleep.h"
#include "robx.h"
#include "wordlists.h"

/* Function Prototypes */

/* Function Definitions */

/* Accumulator state handed to robx_hook() through ds_foreach()'s
 * userdata pointer while compute_robinson_x() walks the wordlist. */
typedef struct robhook_data {
    double   sum;		/* running total of the per-token probabilities added by robx_accum() */
    uint32_t count;		/* number of tokens that have contributed to sum */
    uint32_t spam_cnt;		/* wordlist->msgcount[IX_SPAM], as read by compute_robinson_x() */
    uint32_t good_cnt;		/* wordlist->msgcount[IX_GOOD], as read by compute_robinson_x() */
    dsh_t    *dsh;		/* datastore handle of the wordlist being walked (set, but not read, in this file) */
    double   scalefactor;	/* spam_cnt / good_cnt; multiplies a token's good count in robx_accum() */
} rhd_t;

/* Minimum combined (good + spam) count a token needs before it is
 * allowed to contribute to the robx average. */
enum { ROBX_MIN_TOKEN_COUNT = 10 };

/** Fold one token's counts into the running robx totals.
 *
 * The token's probability is spamcount / (goodcount * scalefactor + spamcount).
 * It is added to rh->sum, and rh->count is incremented, only when
 * goodcount + spamcount reaches ROBX_MIN_TOKEN_COUNT.
 *
 * rh   - accumulator; sum and count are updated, scalefactor is read
 * key  - the token; used only for the debug output
 * data - the token's good and spam counts
 */
static void robx_accum(rhd_t *rh,
		       const word_t *key,
		       const dsv_t *data)
{
    uint32_t goodness = data->goodcount;
    uint32_t spamness = data->spamcount;
    double denom = goodness * rh->scalefactor + spamness;
    /* A token with no counts gives a zero denominator; use 0 for its
     * probability instead of dividing by zero. */
    double prob = (denom > 0.0) ? spamness / denom : 0.0;
    bool doit = goodness + spamness >= ROBX_MIN_TOKEN_COUNT;

    if (doit) {
	rh->sum += prob;
	rh->count += 1;
    }

    /* print accumulated tokens if -vvv, every token if -vvvv */
    if ((verbose > 2 && doit) || verbose > 3) {
	/* With -vvvv a token can be printed before anything has been
	 * accumulated, so guard the average against count == 0. */
	double ratio = (rh->count > 0) ? rh->sum / rh->count : 0.0;

	fprintf(dbgout, "cnt: %4lu,  sum: %11.6f,  ratio: %9.6f,"
		"  sp: %3lu,  gd: %3lu,  p: %9.6f,  t: %.*s\n",
		(unsigned long)rh->count, rh->sum, ratio,
		(unsigned long)spamness, (unsigned long)goodness, prob,
		CLAMP_INT_MAX(key->leng), key->u.text);
    }
}

/* ds_foreach() callback: passes every ordinary token on to
 * robx_accum() and always reports EX_OK.  Keys whose first byte is
 * '.' are system meta-data and are skipped.  userdata is the
 * struct robhook_data supplied by the caller of ds_foreach(). */
static ex_t robx_hook(word_t *key, dsv_t *data,
		     void *userdata)
{
    rhd_t *totals = userdata;
    bool is_meta = (key->u.text[0] == '.');

    if (!is_meta) {
	robx_accum(totals, key, data);
    }

    return EX_OK;
}

/** Compute the Robinson x value from the default wordlist.
 *
 * Opens the wordlists for reading, walks the default wordlist with
 * robx_hook() and averages the probabilities it accumulated, then
 * closes the wordlists again.
 *
 * Returns the average (sum / count), or -1 for failure: a message
 * count of zero, a failed traversal, or no token accumulated.
 *
 * used by bogoutil and bogotune */
double compute_robinson_x(void)
{
    int ret;
    double rx = /* C89 does not support NAN, C99 does. */
#ifdef NAN
	    NAN
#else
	    -999
#endif
	    ;
    dsh_t *dsh;
    wordlist_t *wordlist;

    struct robhook_data rh;

    open_wordlists(DS_READ);
    wordlist = get_default_wordlist(word_lists);

    dsh = wordlist->dsh;

    rh.spam_cnt = wordlist->msgcount[IX_SPAM];
    rh.good_cnt = wordlist->msgcount[IX_GOOD];

    if (rh.spam_cnt == 0 || rh.good_cnt == 0) {
	wordlist_error(ENOENT);
	/* NOTE(review): wordlist_error() presumably does not return.
	 * If it ever does, fail here rather than go on with a zero or
	 * infinite scale factor. */
	close_wordlists(true);
	return -1;
    }

    rh.scalefactor = (double)rh.spam_cnt/(double)rh.good_cnt;

    rh.dsh = dsh;

    do {
	/* Start every pass from zero.  A pass that ends with
	 * DS_ABORT_RETRY may already have accumulated some tokens, and
	 * the retry (presumably a fresh traversal - TODO confirm) would
	 * otherwise count them a second time. */
	rh.sum = 0.0;
	rh.count = 0;

	ret = ds_foreach(dsh, robx_hook, &rh);
	if (ret == DS_ABORT_RETRY) {
	    rand_sleep(1000, 1000000);
	    begin_wordlist(wordlist);
	}
    } while (ret == DS_ABORT_RETRY);

    if (rh.count == 0)
	ret = -1;
    else
	rx = rh.sum/rh.count;

    /* uint32_t values are cast to unsigned long for %lu, the same way
     * robx_accum() prints them. */
    if (verbose > 2)
	printf("%s: %lu, %lu, scale: %f, sum: %f, cnt: %6d, .ROBX: %f\n",
	       MSG_COUNT,
	       (unsigned long)rh.spam_cnt, (unsigned long)rh.good_cnt,
	       rh.scalefactor, rh.sum, (int)rh.count, rx);

    close_wordlists(true);

    return ret ? -1 : rx;
}