/*****************************************************************************
NAME:
maint.c -- wordlist maintenance functions
AUTHOR:
David Relson
******************************************************************************/
#include "common.h"
#include <assert.h>
#include <stdlib.h>
#include "buff.h"
#include "datastore.h"
#include "error.h"
#include "charset.h"
#ifndef DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "maint.h"
#include "transaction.h"
#include "wordlists.h"
#include "xmalloc.h"
#include "xstrdup.h"
/*
 * Maintenance tunables.  For the four thresholds a value of zero means
 * "constraint not set" (see discard_token()).
 */
uint32_t thresh_count = 0;	/* keep_count() keeps counts strictly above this */
YYYYMMDD thresh_date = 0;	/* keep_date() keeps dates strictly after this */
size_t size_min = 0;		/* keep_size(): inclusive lower bound on token length */
size_t size_max = 0;		/* keep_size(): inclusive upper bound on token length */
bool timestamp_tokens = true;	/* not referenced by the code in this part of the file */
/* When true, maintain_hook() rewrites "url:" tokens of version-0 wordlists
 * to "ip:" and maintain_wordlist() records CURRENT_VERSION afterwards. */
bool upgrade_wordlist_version = false;
#ifndef DISABLE_UNICODE
/* Encodings compared by the maintenance pass; both are assigned in
 * maintain_wordlist() before the wordlist is traversed. */
e_enc old_encoding;
e_enc new_encoding;
/* Select the charset name for an encoding: charset_default for E_RAW,
 * charset_unicode otherwise.  The argument and the whole expansion are
 * parenthesized so the macro is safe inside larger expressions. */
#define DEFAULT_OR_UNICODE(enc) (((enc) == E_RAW) ? charset_default : charset_unicode)
#endif
/* Function Prototypes */
/* Function Definitions */
/*
 * keep_count -- keep high counts.
 *
 * Returns true when COUNT is strictly greater than thresh_count.  Non-zero
 * counts are traced to dbgout when database debugging is enabled at level 1.
 */
static bool keep_count(uint32_t count)
{
    const bool keep = thresh_count < count;

    if (count > 0) {
	if (DEBUG_DATABASE(1)) {
	    fprintf(dbgout, "keep_count: %lu > %lu -> %c\n",
		    (unsigned long)count, (unsigned long)thresh_count,
		    keep ? 't' : 'f' );
	}
    }

    return keep;
}
/*
 * keep_date -- keep recent dates.
 *
 * Returns true when DATE is strictly later than thresh_date.  The comparison
 * is traced to dbgout when database debugging is enabled at level 1.
 */
static bool keep_date(YYYYMMDD date)
{
    const bool recent = date > thresh_date;

    if (DEBUG_DATABASE(1)) {
	fprintf(dbgout, "keep_date: %ld < %ld -> %c\n",
		(long)thresh_date, (long)date, recent ? 't' : 'f' );
    }

    return recent;
}
/*
 * keep_size -- keep sizes within bounds.
 *
 * Returns true when size_min <= SIZE <= size_max (both bounds inclusive).
 * The comparison is traced to dbgout when database debugging is enabled at
 * level 1.
 */
static bool keep_size(size_t size)
{
    bool in_range = true;

    if (size < size_min)
	in_range = false;
    else if (size > size_max)
	in_range = false;

    if (DEBUG_DATABASE(1)) {
	fprintf(dbgout, "keep_size: %lu <= %lu <= %lu -> %c\n",
		(unsigned long)size_min, (unsigned long)size, (unsigned long)size_max,
		in_range ? 't' : 'f' );
    }

    return in_range;
}
/*
 * merge_tokens -- fold the record of old_token into new_token.
 *
 * old_token is deleted through the transaction.  If new_token can already be
 * read, the spam and good counts of in_val are added to it and the later of
 * the two dates is kept; otherwise in_val is used unchanged.  The result is
 * written under new_token, bracketed by set_date(date) / set_date(0).
 */
static void merge_tokens(const word_t *old_token, const word_t *new_token, const dsv_t *in_val, ta_t *transaction, void *vhandle)
{
    dsv_t merged;

    /* the old spelling goes away in either case */
    ta_delete(transaction, vhandle, old_token);

    if (ta_read(transaction, vhandle, new_token, &merged) != EX_OK) {
	/* no record under the new spelling yet: take the incoming one as is */
	memcpy(&merged, in_val, sizeof(merged));
    } else {
	/* combine with the record already stored under the new spelling */
	merged.spamcount += in_val->spamcount;
	merged.goodcount += in_val->goodcount;
	merged.date = max(merged.date, in_val->date);	/* date in form YYYYMMDD */
    }

    set_date(merged.date);		/* set timestamp */
    ta_write(transaction, vhandle, new_token, &merged);
    set_date(0);
}
/*
 * replace_token -- rename a wordlist entry.
 *
 * Deletes old_token and writes the unmodified record in_val under new_token,
 * both through the given transaction.  Unlike merge_tokens(), any record that
 * may already exist under new_token is not read or combined.
 */
static void replace_token(const word_t *old_token, const word_t *new_token, const dsv_t *in_val, ta_t *transaction, void *vhandle)
{
/* delete original token */
ta_delete(transaction, vhandle, old_token);
/* write the record under the new token; set_date() is called with the
 * record's own date before the write and reset to 0 afterwards
 * (presumably so the write keeps that date -- confirm against set_date()) */
set_date(in_val->date); /* set timestamp */
ta_write(transaction, vhandle, new_token, in_val);
set_date(0);
}
/*
 * discard_token -- decide whether a wordlist entry should be dropped.
 *
 * The bookkeeping tokens MSG_COUNT, ROBX_W and WORDLIST_ENCODING are never
 * discarded.  When no constraint is set (thresh_count, thresh_date, size_min
 * and size_max all zero) nothing is discarded.  Otherwise the token is
 * discarded unless at least one of the active constraints votes to keep it:
 *   - count: spam count or good count passes keep_count()
 *   - date:  the record's date passes keep_date()
 *   - size:  the token length passes keep_size()
 *
 * Returns true when the token should be discarded.
 */
bool discard_token(word_t *token, const dsv_t *in_val)
{
    bool keep = false;

    /* keep .ENCODING, .MSG_COUNT, and .ROBX */
    if (token->u.text[0] == '.' &&
	(0 == word_cmps(token, MSG_COUNT) ||
	 0 == word_cmps(token, ROBX_W) ||
	 0 == word_cmps(token, WORDLIST_ENCODING)))
	return false;

    /* without any user given constraint there is nothing to discard */
    if (thresh_count == 0 && thresh_date == 0 && size_min == 0 && size_max == 0)
	return false;

    /* Every active constraint is evaluated, even once a keep vote has been
     * cast, so that each check still emits its debug trace. */
    if (thresh_count != 0) {
	if (keep_count(in_val->spamcount) || keep_count(in_val->goodcount))
	    keep = true;
    }

    if (thresh_date != 0 && keep_date(in_val->date))
	keep = true;

    if ((size_min != 0 || size_max != 0) && keep_size(token->leng))
	keep = true;

    return !keep;
}
/*
 * do_replace_nonascii_characters -- overwrite high-bit bytes with '?'.
 *
 * Scans the first LEN bytes of STR (which must not be NULL) and replaces, in
 * place, every byte that has bit 0x80 set.
 *
 * Returns true when at least one byte was replaced.
 */
bool do_replace_nonascii_characters(byte *str, size_t len)
{
    size_t pos;
    bool replaced = false;

    assert(str != NULL);

    for (pos = 0; pos < len; pos++) {
	if ((str[pos] & 0x80) != 0) {
	    str[pos] = '?';
	    replaced = true;
	}
    }

    return replaced;
}
/* Context that maintain_wordlist() hands to maintain_hook() through the
 * userdata pointer of ds_foreach(). */
struct userdata_t {
void *vhandle; /* datastore handle forwarded to the ta_*() calls */
ta_t *transaction; /* transaction obtained from ta_init() in maintain_wordlist() */
};
static ex_t maintain_hook(word_t *w_key, dsv_t *in_val,
void *userdata)
{
size_t len;
word_t token;
void *vhandle = ((struct userdata_t *) userdata)->vhandle;
ta_t *transaction = ((struct userdata_t *) userdata)->transaction;
token.u.text = w_key->u.text;
token.leng = w_key->leng;
len = strlen(MSG_COUNT);
if (len == token.leng &&
strncmp((char *)token.u.text, MSG_COUNT, token.leng) == 0)
return EX_OK;
if (discard_token(&token, in_val)) {
ex_t ret = ta_delete(transaction, vhandle, &token) ? EX_ERROR : EX_OK;
if (DEBUG_DATABASE(0))
fprintf(dbgout, "deleting '%.*s'\n", (int)min(INT_MAX, token.leng), (char *)token.u.text);
return ret;
}
if (replace_nonascii_characters)
{
word_t new_token;
new_token.u.text = (byte *)xmalloc(token.leng + 1);
memcpy(new_token.u.text, token.u.text, token.leng);
new_token.leng = token.leng;
new_token.u.text[new_token.leng] = '\0';
if (do_replace_nonascii_characters(new_token.u.text, new_token.leng))
merge_tokens(&token, &new_token, in_val, transaction, vhandle);
xfree(new_token.u.text);
}
#ifndef DISABLE_UNICODE
if (old_encoding != new_encoding)
{
buff_t new_buff;
buff_t old_buff;
old_buff.read = 0;
old_buff.size = token.leng;
old_buff.t.u.text = token.u.text;
old_buff.t.leng = token.leng;
new_buff.read = 0;
new_buff.size = token.leng * 6;
new_buff.t.leng = 0;
new_buff.t.u.text = (byte *)xmalloc(new_buff.size);
iconvert(&old_buff, &new_buff);
if (old_buff.t.leng != new_buff.t.leng ||
memcmp(old_buff.t.u.text, new_buff.t.u.text, new_buff.t.leng) != 0) {
if (DEBUG_ICONV(2)) {
fputs("*** ", dbgout); word_puts(&old_buff.t, 0, dbgout); fputs( "\n", dbgout);
fputs("*** ", dbgout); word_puts(&new_buff.t, 0, dbgout); fputs( "\n", dbgout);
}
merge_tokens(&old_buff.t, &new_buff.t, in_val, transaction, vhandle);
}
xfree(new_buff.t.u.text);
}
#endif
if (upgrade_wordlist_version)
{
switch (wordlist_version)
{
case IP_PREFIX:
{
/* up-to-date - nothing to do */
break;
}
case 0:
{
/* update to "ip:" prefix level */
const char *url_hdr = "url:";
size_t url_len = strlen(url_hdr);
const char *ip_hdr = "ip:";
size_t ip_len = strlen(ip_hdr);
if (token.leng > url_len && memcmp(token.u.text, url_hdr, url_len) == 0)
{
word_t new_token;
new_token.leng = token.leng + ip_len - url_len;
new_token.u.text = (byte *)xmalloc(new_token.leng + 1);
memcpy(new_token.u.text, ip_hdr, ip_len);
memcpy(new_token.u.text+ip_len, token.u.text+url_len, token.leng - url_len);
new_token.u.text[new_token.leng] = '\0';
replace_token(&token, &new_token, in_val, transaction, vhandle);
xfree(new_token.u.text);
}
break;
}
}
}
return EX_OK;
}
/*
 * check_wordlist_version -- report whether the wordlist is already current.
 *
 * Looks up the stored wordlist version record and returns true when its
 * first count is at least CURRENT_VERSION.
 */
static bool check_wordlist_version(dsh_t *dsh)
{
    dsv_t val;

    /* The lookup result is not checked here, so start from a zeroed record:
     * if the lookup leaves val untouched (e.g. no version record stored),
     * the comparison then sees version 0 instead of an indeterminate value. */
    memset(&val, 0, sizeof(val));
    ds_get_wordlist_version(dsh, &val);

    return val.count[0] >= CURRENT_VERSION;
}
/*
 * maintain_wordlist -- run the maintenance pass over an open wordlist.
 *
 * database is the datastore handle (also used as a dsh_t * for the version
 * check).  Inside a datastore transaction every token is passed to
 * maintain_hook().  Afterwards the wordlist version record is written when an
 * upgrade was requested and is still pending, the encoding record is written
 * when the encoding changed (unless DISABLE_UNICODE), and both the token
 * transaction and the datastore transaction are committed.
 *
 * Returns the result of ds_foreach(), or EX_ERROR when the transaction could
 * not be started or either commit fails.
 *
 * NOTE(review): when ds_txn_begin() fails the function still goes on to the
 * metadata writes and commits below; confirm whether that is intended.
 */
static ex_t maintain_wordlist(void *database)
{
    ta_t *transaction = ta_init();
    struct userdata_t userdata;
    ex_t ret;
    bool done = false;

    userdata.vhandle = database;
    userdata.transaction = transaction;

    if (DST_OK == ds_txn_begin(database)) {
#ifndef DISABLE_UNICODE
	dsv_t val;
	int rc = ds_get_wordlist_encoding(database, &val);

	new_encoding = encoding;
	if (rc == 0)
	    old_encoding = (e_enc)val.spamcount;	/* found | FIXME: is the cast correct? */
	else
	    old_encoding = E_RAW;			/* not found */

	/* prepare the conversion tables used by maintain_hook() */
	if (old_encoding != new_encoding) {
	    const char *from_charset = DEFAULT_OR_UNICODE(old_encoding);
	    const char *to_charset = DEFAULT_OR_UNICODE(new_encoding);
	    init_charset_table_iconv(from_charset, to_charset);
	}
#endif
	ret = ds_foreach(database, maintain_hook, &userdata);
    } else
	ret = EX_ERROR;

    if (upgrade_wordlist_version) {
	done = check_wordlist_version((dsh_t *)database);
	if (!done)
	    fprintf(dbgout, "Upgrading wordlist.\n");
	else
	    fprintf(dbgout, "Wordlist has already been upgraded.\n");
    }

    if (!done && upgrade_wordlist_version)
    {
	dsv_t val;

	val.count[0] = CURRENT_VERSION;
	val.count[1] = 0;
	/* initialize the date as well, as done for the encoding record
	 * below, so no indeterminate field is handed to the datastore */
	val.date = 0;
	ds_set_wordlist_version(database, &val);
    }

#ifndef DISABLE_UNICODE
    if (old_encoding != new_encoding) {
	dsv_t val;
	word_t enco;

	/* record the encoding the tokens are now stored in */
	enco.u.text = (byte *)xstrdup(WORDLIST_ENCODING);
	enco.leng = strlen(WORDLIST_ENCODING);
	val.count[0] = new_encoding;
	val.count[1] = 0;
	val.date = 0;
	ds_write(database, &enco, &val);
	xfree(enco.u.text);
    }
#endif

    if (ta_commit(transaction) != TA_OK)
	ret = EX_ERROR;

    if (DST_OK != ds_txn_commit(database))
	ret = EX_ERROR;

    return ret;
}
/*
 * maintain_wordlist_file -- open a wordlist for writing and maintain it.
 *
 * bfp identifies the wordlist.  The datastore environment is set up with
 * ds_init(), the wordlist is opened with DS_WRITE, maintain_wordlist() is
 * run on it, and handle and environment are released again.
 *
 * Returns EX_ERROR when the wordlist cannot be opened, otherwise the result
 * of maintain_wordlist().
 */
ex_t maintain_wordlist_file(bfpath *bfp)
{
    ex_t rc = EX_ERROR;
    dsh_t *dsh;
    void *dbe;

    dbe = ds_init(bfp);
    dsh = (dsh_t *)ds_open(dbe, bfp, DS_WRITE);

    if (dsh != NULL) {
	rc = maintain_wordlist(dsh);
	ds_close(dsh);
    }

    /* release what ds_init() set up on every path, including a failed
     * open, which previously returned without cleaning up */
    ds_cleanup(dbe);

    return rc;
}