/*****************************************************************************

NAME:
   maint.c -- wordlist maintenance functions

AUTHOR:
   David Relson

******************************************************************************/

#include "common.h"

/* Standard C headers */
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include "buff.h"
#include "datastore.h"
#include "error.h"
#include "charset.h"
#ifndef DISABLE_UNICODE
#include "convert_unicode.h"
#include "iconvert.h"
#endif
#include "maint.h"
#include "transaction.h"
#include "wordlists.h"
#include "xmalloc.h"
#include "xstrdup.h"

/* User supplied maintenance thresholds; a value of 0 means "not set". */
uint32_t thresh_count = 0;
YYYYMMDD thresh_date = 0;
size_t size_min = 0;
size_t size_max = 0;
bool timestamp_tokens = true;
bool upgrade_wordlist_version = false;

#ifndef DISABLE_UNICODE
e_enc old_encoding;
e_enc new_encoding;
/* Fully parenthesized so the macro is safe inside larger expressions. */
#define DEFAULT_OR_UNICODE(enc) (((enc) == E_RAW) ? charset_default : charset_unicode)
#endif

/* Function Prototypes */

/* Function Definitions */

/* Keep high counts.
 * Returns true when count is strictly greater than thresh_count.
 * The debug trace is only emitted for non-zero counts.
 */
static bool keep_count(uint32_t count)
{
    bool ok = count > thresh_count;

    if (count > 0 && DEBUG_DATABASE(1))
        fprintf(dbgout, "keep_count: %lu > %lu -> %c\n",
                (unsigned long)count, (unsigned long)thresh_count,
                ok ? 't' : 'f' );

    return ok;
}

/* Keep recent dates.
 * Returns true when date is strictly later than thresh_date.
 */
static bool keep_date(YYYYMMDD date)
{
    bool ok = thresh_date < date;

    if (DEBUG_DATABASE(1))
        fprintf(dbgout, "keep_date: %ld < %ld -> %c\n",
                (long)thresh_date, (long)date,
                ok ? 't' : 'f' );

    return ok;
}

/* Keep sizes within bounds.
 * Returns true when size_min <= size <= size_max.
 */
static bool keep_size(size_t size)
{
    bool ok = (size_min <= size) && (size <= size_max);

    if (DEBUG_DATABASE(1))
        fprintf(dbgout, "keep_size: %lu <= %lu <= %lu -> %c\n",
                (unsigned long)size_min, (unsigned long)size,
                (unsigned long)size_max,
                ok ? 't' : 'f' );

    return ok;
}

/* Replace old_token by new_token, combining with any existing new_token.
 *
 * old_token is deleted.  If new_token can already be read, in_val's
 * counts are added to the stored counts and the later of the two dates
 * is kept; otherwise in_val is written unchanged under new_token.
 * The results of ta_delete() and ta_write() are not checked.
 */
static void merge_tokens(const word_t *old_token, const word_t *new_token,
                         const dsv_t *in_val, ta_t *transaction, void *vhandle)
{
    int ret;
    dsv_t tmp;

    /* delete original token */
    ta_delete(transaction, vhandle, old_token);

    /* retrieve and update replacement token */
    ret = ta_read(transaction, vhandle, new_token, &tmp);

    if (ret == EX_OK) {
        tmp.spamcount += in_val->spamcount;
        tmp.goodcount += in_val->goodcount;
        tmp.date = max(tmp.date, in_val->date);   /* date in form YYYYMMDD */
    }
    else {
        tmp = *in_val;
    }

    set_date(tmp.date);     /* set timestamp */
    ta_write(transaction, vhandle, new_token, &tmp);
    set_date(0);
}

/* Replace old_token by new_token without merging.
 *
 * old_token is deleted and in_val is written under new_token; a value
 * already stored under new_token is not read first.
 * The results of ta_delete() and ta_write() are not checked.
 */
static void replace_token(const word_t *old_token, const word_t *new_token,
                          const dsv_t *in_val, ta_t *transaction, void *vhandle)
{
    /* delete original token */
    ta_delete(transaction, vhandle, old_token);

    /* write the value under the new token */
    set_date(in_val->date); /* set timestamp */
    ta_write(transaction, vhandle, new_token, in_val);
    set_date(0);
}

/* Decide whether a token should be removed from the wordlist.
 *
 * Tokens starting with '.' that compare equal to MSG_COUNT, ROBX_W or
 * WORDLIST_ENCODING are never discarded.
 *
 * Otherwise the token is discarded only when at least one threshold
 * (thresh_count, thresh_date, size_min, size_max) is set and none of the
 * tests belonging to the set thresholds says "keep":
 *   - count: either spamcount or goodcount exceeds thresh_count
 *   - date:  the value's date is later than thresh_date
 *   - size:  the token length lies within [size_min, size_max]
 *
 * Returns true to discard, false to keep.
 */
bool discard_token(word_t *token, const dsv_t *in_val)
{
    bool discard;

    if (token->u.text[0] == '.') {      /* keep .ENCODING, .MSG_COUNT, and .ROBX */
        if (0 == word_cmps(token, MSG_COUNT))
            return false;
        if (0 == word_cmps(token, ROBX_W))
            return false;
        if (0 == word_cmps(token, WORDLIST_ENCODING))
            return false;
    }

    /* with no thresholds set, nothing is discarded */
    discard = (thresh_count != 0) || (thresh_date != 0) ||
              (size_min != 0) || (size_max != 0);

    if (discard) {
        if (thresh_count != 0 &&
            (keep_count(in_val->spamcount) || keep_count(in_val->goodcount)))
            discard = false;
        if (thresh_date != 0 && keep_date(in_val->date))
            discard = false;
        if ((size_min != 0 || size_max != 0) && keep_size(token->leng))
            discard = false;
    }

    return discard;
}

/* Overwrite every byte of str[0..len) that has its high bit set with '?'.
 *
 * str must not be NULL (asserted).
 * Returns true when at least one byte was changed.
 */
bool do_replace_nonascii_characters(byte *str, size_t len)
{
    bool change = false;

    assert(str != NULL);

    while (len--) {
        if (*str & 0x80) {
            *str = '?';
            change = true;
        }
        str++;
    }

    return change;
}

/* Context handed to maintain_hook() through ds_foreach(). */
struct userdata_t {
    void *vhandle;
    ta_t *transaction;
};

/* Per-token callback used by maintain_wordlist().
 *
 * In order:
 *   1. the MSG_COUNT key is skipped untouched;
 *   2. tokens rejected by discard_token() are deleted and the hook returns;
 *   3. if replace_nonascii_characters is set and the token contains
 *      high-bit bytes, it is merged into its '?'-substituted form;
 *   4. if the wordlist encoding is changing, the token is run through
 *      iconvert() and merged into the converted form when that differs;
 *   5. if upgrade_wordlist_version is set and the wordlist is at version 0,
 *      a leading "url:" is rewritten to "ip:".
 *
 * Steps 3-5 always work from the original key and in_val, even when an
 * earlier step has already rewritten the token.
 *
 * Returns EX_ERROR only when the deletion in step 2 fails, else EX_OK.
 */
static ex_t maintain_hook(word_t *w_key, dsv_t *in_val, void *userdata)
{
    size_t len;
    word_t token;
    void *vhandle = ((struct userdata_t *) userdata)->vhandle;
    ta_t *transaction = ((struct userdata_t *) userdata)->transaction;

    token.u.text = w_key->u.text;
    token.leng = w_key->leng;

    len = strlen(MSG_COUNT);
    if (len == token.leng &&
        strncmp((char *)token.u.text, MSG_COUNT, token.leng) == 0)
        return EX_OK;

    if (discard_token(&token, in_val)) {
        ex_t ret = ta_delete(transaction, vhandle, &token) ? EX_ERROR : EX_OK;
        if (DEBUG_DATABASE(0))
            fprintf(dbgout, "deleting '%.*s'\n",
                    (int)min(INT_MAX, token.leng), (char *)token.u.text);
        return ret;
    }

    if (replace_nonascii_characters) {
        word_t new_token;

        /* work on a NUL terminated private copy of the key */
        new_token.u.text = (byte *)xmalloc(token.leng + 1);
        memcpy(new_token.u.text, token.u.text, token.leng);
        new_token.leng = token.leng;
        new_token.u.text[new_token.leng] = '\0';

        if (do_replace_nonascii_characters(new_token.u.text, new_token.leng))
            merge_tokens(&token, &new_token, in_val, transaction, vhandle);

        xfree(new_token.u.text);
    }

#ifndef DISABLE_UNICODE
    if (old_encoding != new_encoding) {
        buff_t new_buff;
        buff_t old_buff;

        old_buff.read = 0;
        old_buff.size = token.leng;
        old_buff.t.u.text = token.u.text;
        old_buff.t.leng = token.leng;

        /* NOTE(review): the factor 6 is presumably the worst-case growth
         * of a converted token -- confirm against iconvert() */
        new_buff.read = 0;
        new_buff.size = token.leng * 6;
        new_buff.t.leng = 0;
        new_buff.t.u.text = (byte *)xmalloc(new_buff.size);

        iconvert(&old_buff, &new_buff);

        /* only touch the database when the conversion changed the token */
        if (old_buff.t.leng != new_buff.t.leng ||
            memcmp(old_buff.t.u.text, new_buff.t.u.text, new_buff.t.leng) != 0) {
            if (DEBUG_ICONV(2)) {
                fputs("*** ", dbgout); word_puts(&old_buff.t, 0, dbgout); fputs( "\n", dbgout);
                fputs("*** ", dbgout); word_puts(&new_buff.t, 0, dbgout); fputs( "\n", dbgout);
            }
            merge_tokens(&old_buff.t, &new_buff.t, in_val, transaction, vhandle);
        }

        xfree(new_buff.t.u.text);
    }
#endif

    if (upgrade_wordlist_version) {
        switch (wordlist_version) {
        case IP_PREFIX:
        {
            /* up-to-date - nothing to do */
            break;
        }
        case 0:
        {
            /* update to "ip:" prefix level */
            const char *url_hdr = "url:";
            size_t url_len = strlen(url_hdr);
            const char *ip_hdr = "ip:";
            size_t ip_len = strlen(ip_hdr);

            if (token.leng > url_len &&
                memcmp(token.u.text, url_hdr, url_len) == 0) {
                word_t new_token;

                new_token.leng = token.leng + ip_len - url_len;
                new_token.u.text = (byte *)xmalloc(new_token.leng + 1);
                memcpy(new_token.u.text, ip_hdr, ip_len);
                memcpy(new_token.u.text+ip_len, token.u.text+url_len,
                       token.leng - url_len);
                new_token.u.text[new_token.leng] = '\0';
                replace_token(&token, &new_token, in_val, transaction, vhandle);
                xfree(new_token.u.text);
            }
            break;
        }
        default:
            /* any other version: nothing to do */
            break;
        }
    }

    return EX_OK;
}

/* Returns true when the stored wordlist version is at least CURRENT_VERSION.
 *
 * The value is zeroed before the lookup, so if ds_get_wordlist_version()
 * leaves it untouched the wordlist is treated as version 0 (not upgraded)
 * instead of reading an uninitialized count.
 */
static bool check_wordlist_version(dsh_t *dsh)
{
    dsv_t val;

    memset(&val, 0, sizeof(val));
    ds_get_wordlist_version(dsh, &val);

    return val.count[0] >= CURRENT_VERSION;
}

/* Run maintenance over every token of an open wordlist.
 *
 * Begins a datastore transaction, applies maintain_hook() to each token,
 * records the new wordlist version and encoding where applicable and then
 * commits both the token transaction and the datastore transaction.
 *
 * Returns the result of ds_foreach(), or EX_ERROR when the transaction
 * cannot be started or either commit fails.
 */
static ex_t maintain_wordlist(void *database)
{
    ta_t *transaction = ta_init();
    struct userdata_t userdata;
    ex_t ret;
    bool done = false;

    userdata.vhandle = database;
    userdata.transaction = transaction;

    if (DST_OK == ds_txn_begin(database)) {
#ifndef DISABLE_UNICODE
        dsv_t val;
        int rc = ds_get_wordlist_encoding(database, &val);

        new_encoding = encoding;
        if (rc == 0)
            old_encoding = (e_enc)val.spamcount;    /* found | FIXME: is the cast correct? */
        else
            old_encoding = E_RAW;                   /* not found */

        if (old_encoding != new_encoding) {
            const char *from_charset = DEFAULT_OR_UNICODE(old_encoding);
            const char *to_charset = DEFAULT_OR_UNICODE(new_encoding);
            init_charset_table_iconv(from_charset, to_charset);
        }
#endif
        ret = ds_foreach(database, maintain_hook, &userdata);
    }
    else
        ret = EX_ERROR;

    /* NOTE(review): the bookkeeping below also runs when ds_txn_begin()
     * failed; in that case old_encoding/new_encoding keep whatever values
     * they had before this call -- confirm this is intended. */

    if (upgrade_wordlist_version) {
        done = check_wordlist_version((dsh_t *)database);
        if (!done)
            fprintf(dbgout, "Upgrading wordlist.\n");
        else
            fprintf(dbgout, "Wordlist has already been upgraded.\n");
    }

    if (!done && upgrade_wordlist_version) {
        dsv_t val;

        val.count[0] = CURRENT_VERSION;
        val.count[1] = 0;
        val.date = 0;       /* do not hand over an indeterminate date */
        ds_set_wordlist_version(database, &val);
    }

#ifndef DISABLE_UNICODE
    if (old_encoding != new_encoding) {
        dsv_t val;
        word_t enco;

        enco.u.text = (byte *)xstrdup(WORDLIST_ENCODING);
        enco.leng = strlen(WORDLIST_ENCODING);
        val.count[0] = new_encoding;
        val.count[1] = 0;
        val.date = 0;
        ds_write(database, &enco, &val);
        xfree(enco.u.text);
    }
#endif

    if (ta_commit(transaction) != TA_OK)
        ret = EX_ERROR;

    if (DST_OK != ds_txn_commit(database))
        ret = EX_ERROR;

    return ret;
}

/* Open the wordlist named by bfp for writing and run maintenance on it.
 *
 * Returns EX_ERROR when the wordlist cannot be opened, otherwise the
 * result of maintain_wordlist().  The environment obtained from ds_init()
 * is released on every path.
 */
ex_t maintain_wordlist_file(bfpath *bfp)
{
    ex_t rc;
    dsh_t *dsh;
    void *dbe;

    dbe = ds_init(bfp);
    dsh = (dsh_t *)ds_open(dbe, bfp, DS_WRITE);

    if (dsh == NULL) {
        /* release the environment; it was previously leaked here */
        ds_cleanup(dbe);
        return EX_ERROR;
    }

    rc = maintain_wordlist(dsh);

    ds_close(dsh);
    ds_cleanup(dbe);

    return rc;
}