Blob Blame History Raw
/* collect.c -- tokenize input and cap word frequencies, return a wordhash */

#include "common.h"

#include <assert.h>
#include <stdlib.h>

#include "charset.h"
#include "mime.h"
#include "wordhash.h"
#include "token.h"

#include "collect.h"

void wordprop_init(void *vwordprop)
{
    wordprop_t *wp = (wordprop_t *)vwordprop;
    memset(wp, 0, sizeof(*wp));
}

void wordcnts_init(void *vwordcnts)
{
    wordcnts_t *wc = (wordcnts_t *)vwordcnts;
    memset(wc, 0, sizeof(*wc));
}

void wordcnts_incr(wordcnts_t *w1, wordcnts_t *w2)
{
    w1->good += w2->good;
    w1->bad  += w2->bad;
}

/* Tokenize input text and save words in the wordhash_t hash table.
 *
 * Returns:  true if the EOF token has not been read.
 */
void collect_words(wordhash_t *wh)
{
    if (DEBUG_WORDLIST(2)) fprintf(dbgout, "### collect_words() begins\n");

    lexer_init();

    for (;;){
	wordprop_t *wp;
	word_t token;
	token_t cls = get_token( &token );

	if (cls == NONE)
	    break;

	if (cls == BOGO_LEX_LINE)
	{
	    char *beg = (char *)token.u.text+1;	/* skip leading quote mark */
	    char *end = strchr(beg, '"');
	    assert(end);
	    token.leng = end - beg;
	    memmove(token.u.text, token.u.text + 1, token.leng + 1);
	    token.u.text[token.leng] = '\0';	/* ensure nul termination */
	}

	wp = (wordprop_t *)wordhash_insert(wh, &token, sizeof(wordprop_t), &wordprop_init);
	if (wh->type != WH_CNTS)
	    wp->freq = 1;

/******* EK **********/

#ifdef	CP866
/* mime charset hack */
	{
	    static bool hasCharset=false;
	    if (hasCharset)  /* prev token == charset */
	    {
		if (token.leng > 5 &&
		    !strncmp(token.text, "mime:", 5))
		    set_charset(token.text+5);
	    }
	    hasCharset = 0;
	    if (token.leng == 5+7)
	    {
		if (!strncmp(token.text, "mime:", 5) &&
		    !strncasecmp(token.text+5, "charset", 7))
		    hasCharset = true;
	    }
	}
#endif

/******* end of EK addition **********/

	if (DEBUG_WORDLIST(3)) {
	    fprintf(dbgout, "%3d ", (int) wh->count);
	    word_puts(&token, 0, dbgout);
	    fputc('\n', dbgout);
	}

	if (cls == BOGO_LEX_LINE)
	{
	    char *s = (char *)token.u.text;
	    assert(s != 0);
	    s += token.leng + 2;
	    wp->cnts.bad = atoi(s);
	    s = strchr(s+1, ' ');
	    assert(s != 0);
	    wp->cnts.good = atoi(s + 1);
	    wp->cnts.msgs_good = msgs_good;
	    wp->cnts.msgs_bad = msgs_bad;
	}
    }
    
    if (DEBUG_WORDLIST(2)) fprintf(dbgout, "### collect_words() ends\n");

    return;
}