/*****************************************************************************
NAME:
bogoreader.c -- process input files
AUTHORS: (C) Copyright 2003-2005 by
David Relson <relson@osagesoftware.com>
Matthias Andree <matthias.andree@gmx.de>
******************************************************************************/
/*
** Formats supported:
**
** mbox
** Maildir
** MH folder
** rmail
** ANT RISC-OS only
**
** msg-count special for bogofilter
*/
#include "common.h"
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include "bogoreader.h"
#include "error.h"
#include "fgetsl.h"
#include "lexer.h"
#include "paths.h"
#include "token.h"
#include "xmalloc.h"
/* Cleanup hook for the current mailstore; called by bogoreader_fini().
 * Set to dummy_fini by bogoreader_init() and to dir_fini by dir_init(). */
static void (*fini)(void);
/* Remaining command-line file names; set by bogoreader_init() in
 * B_CMDLINE mode and consumed one at a time by b_args_next_mailstore(). */
static int argc;
static const char * const *argv;
/* Name of the file/mailstore currently being read; returned by get_filename(). */
static const char *filename;
/* Scratch buffer for file names: holds names read from stdin and the
 * directory + entry paths built by dir_next_mail(). */
static char namebuff[PATH_LEN + 1 + MEMBERSIZE(struct dirent, d_name)];
/* Directory of the current MH folder/Maildir as stored by save_dirname(),
 * i.e. with one trailing directory separator removed. */
static char dir_name[PATH_LEN+1];
/* Assigned from fpin by bogoreader_init() in B_NORMAL mode; not read
 * anywhere in this part of the file. */
static FILE *yy_file;
/* Kind of mailstore being iterated.  open_mailstore() assigns MS_MAILDIR or
 * MS_MH for directories; MS_FILE is never assigned in this file. */
typedef enum ms_e {MS_FILE, MS_MAILDIR, MS_MH } ms_t;
static ms_t mailstore_type;
static bool mail_first = true; /* for the _next_mail functions */
static bool mailstore_first = true; /* for the _next_mailstore functions */
static bool firstline = true; /* for mailbox /^From / match */
/* Set by the separator-aware getline functions when they hit the start of a
 * further message; read by mailbox_next_mail(). */
static bool have_message = false;
/* Lexer-Reader Interface */
/* reader_more and reader_filename are installed by bogoreader_init();
 * reader_getline is chosen per mailstore (see open_mailstore() and
 * stdin_next_mailstore()). */
reader_more_t *reader_more;
reader_line_t *reader_getline;
reader_file_t *reader_filename;
/* Function Prototypes */
/* these functions check if there are more file names in bulk modes,
* read-mail/mbox-from-stdin for uniformity */
static reader_more_t stdin_next_mailstore;
static reader_more_t b_stdin_next_mailstore;
static reader_more_t b_args_next_mailstore;
/* these functions check if there is more mail in a mailbox/maildir/...
* to process, trivial mail_next_mail for uniformity */
static reader_more_t dir_next_mail;
static reader_more_t mail_next_mail;
static reader_more_t mailbox_next_mail;
/* maildir is the mailbox format specified in
* http://cr.yp.to/proto/maildir.html */
static reader_line_t simple_getline; /* ignores /^From / */
static reader_line_t mailbox_getline; /* minds /^From / */
static reader_line_t rmail_getline; /* minds /^#! rmail/ */
static reader_line_t ant_getline; /* minds /^MAIL FROM:/ */
static reader_file_t get_filename;
static void bogoreader_close(void);
/* Batch formats that are recognised by their message separator line. */
typedef enum { MBOX, MC, RMAIL, ANT } mbox_t;
/* One separator entry: the separator text, its length in bytes, the format
 * it identifies and the getline function used to read that format. */
typedef struct {
const char *sep;
uint len;
mbox_t type;
reader_line_t *fcn;
} sep_2_box_t;
/* Table searched by get_reader_line(), which compares only the first
 * character of each separator against the first byte of the input. */
static sep_2_box_t sep_2_box[] = {
{ "From ", 5, MBOX, mailbox_getline },
{ "\".MSG_COUNT\"", 12, MC, mailbox_getline }, /* msg-count */
{ "#! rmail", 8, RMAIL, rmail_getline },
{ "MAIL FROM:", 10, ANT, ant_getline } /* RISC-OS only */
};
/* Separator selected by get_reader_line(); seplen == 0 means none has been
 * selected yet.  Both keep their previous values when no entry matches. */
static uint seplen = 0;
static const char *separator = NULL;
static void dir_init(const char *name);
static void dir_fini(void);
/* Result of the isdir()/ismaildir() checks. */
typedef enum st_e { IS_DIR, IS_FILE, IS_ERR } st_t;
/* Function Definitions */
/* Returns true when the len bytes at buf are exactly one line terminator,
 * i.e. a single NL or a CRLF pair; false for any other length or content. */
bool is_eol(const char *buf, size_t len)
{
    if (len == 1)
        return memcmp(buf, NL, 1) == 0;
    if (len == 2)
        return memcmp(buf, CRLF, 2) == 0;
    return false;
}
/* Picks the getline function for the stream fp by peeking at its first byte.
 *
 * The byte is compared with the first character of each sep_2_box entry; on
 * the first match that entry's reader is chosen and seplen/separator are
 * updated.  Without a match mailbox_getline stays selected and
 * seplen/separator are left untouched.  Outside mbox mode, mailbox_getline
 * is replaced by simple_getline.
 *
 * Returns NULL when fp is NULL, otherwise the selected reader.
 */
static reader_line_t *get_reader_line(FILE *fp)
{
    reader_line_t *reader = mailbox_getline;
    sep_2_box_t *entry;
    sep_2_box_t *const end = sep_2_box + COUNTOF(sep_2_box);
    int first;

    if (fp == NULL)
        return NULL;

    /* peek at the first byte and push it straight back */
    first = fgetc(fp);
    ungetc(first, fp);

    for (entry = sep_2_box; entry != end; ++entry) {
        if (entry->sep[0] != first)
            continue;
        reader = entry->fcn;
        seplen = entry->len;
        separator = entry->sep;
        break;
    }

    if (!mbox_mode && reader == mailbox_getline)
        reader = simple_getline;

    return reader;
}
/* Classifies name via stat():
 *   IS_ERR  - stat() failed (errno is left as set by stat()),
 *   IS_DIR  - name is a directory,
 *   IS_FILE - name is anything else.
 */
static st_t isdir(const char *name)
{
    struct stat info;

    if (stat(name, &info) != 0)
        return IS_ERR;
    if (S_ISDIR(info.st_mode))
        return IS_DIR;
    return IS_FILE;
}
/* Copies name into dir_name, truncated to fit, and strips one trailing
 * directory separator so callers can append DIRSEP_C themselves.
 * An empty name yields an empty dir_name.
 */
static void save_dirname(const char *name)
{
    size_t l = strlen(name);

    l = min(l, sizeof(dir_name)-2);
    memcpy(dir_name, name, l);
    /* l > 0: an empty name must not make us read dir_name[-1] */
    if (l > 0 && dir_name[l-1] == DIRSEP_C)
        l -= 1;
    dir_name[l] = '\0';
}
/* Maildir subdirectories that are checked by ismaildir() and scanned by
 * dir_next_mail(); NULL-terminated. */
static const char* const maildir_subs[]={ DIRSEP_S "new", DIRSEP_S "cur", NULL };
/* Next entry of maildir_subs to open; reset by dir_init() and advanced by
 * dir_next_mail(). */
static const char *const *maildir_sub;
/* Directory currently being read by dir_next_mail(), or NULL when none is open. */
static DIR *reader_dir;
/* MA: Decide whether dir is a Maildir.  For simplicity the tmp subdirectory
 * is not required: dir itself and every entry of maildir_subs (new, cur)
 * must be directories.
 *
 * Returns IS_DIR when all checks pass; otherwise the result (IS_FILE or
 * IS_ERR) of the first check that failed.
 */
static st_t ismaildir(const char *dir)
{
    /* room for one appended subdirectory; presumably DIRSEP_S is a single
     * character followed by a three-letter name -- TODO confirm */
    const size_t sublen = 4;
    const char *const *sub;
    size_t baselen;
    char *path;
    st_t result = isdir(dir);

    if (result != IS_DIR)
        return result;

    baselen = strlen(dir);
    path = (char *)xmalloc(baselen + sublen /* append */ + 1 /* NUL */);
    memcpy(path, dir, baselen);

    for (sub = maildir_subs; *sub != NULL; sub++) {
        strlcpy(path + baselen, *sub, sublen + 1);
        result = isdir(path);
        if (result != IS_DIR)
            break;
    }

    xfree(path);
    return result;
}
/* Cleanup hook that does nothing; used when the mailstore needs no teardown. */
static void dummy_fini(void)
{
    return;
}
/* Opens the next mailstore; installed by bogoreader_init() according to bulk_mode. */
static reader_more_t *mailstore_next_store;
/* Advances to the next mail inside the current mailstore; NULL while no
 * mailstore is active (see reader__next_mail()). */
static reader_more_t *mailstore_next_mail = NULL;
/* 'Nesting driver' for the input.
 * mailstore := one of { mail, mbox, maildir }
 *
 * While a mailstore-specific handler is installed it is asked first whether
 * the current mailstore holds another mail.  Once it reports none, the
 * handler is dropped and mailstore_next_store -- which opens the next
 * mailstore -- is tried.  Returns true as soon as a mail is available and
 * false when no further mailstore can be opened.
 */
static bool reader__next_mail(void)
{
    do {
        if (mailstore_next_mail != NULL) {
            /* more mails in the current mailstore? */
            if (mailstore_next_mail())
                return true;
            /* exhausted: forget it and move on to the next mailstore */
            mailstore_next_mail = NULL;
        }
    } while (mailstore_next_store());

    return false;
}
/* open mailstore (Maildir, mbox file or file with a single mail) and set
* _getline and _next_mail pointers dependent on the mailstore's type.
*
* - automatically detects maildir
* - does not automatically distinguish between mbox and mail
* and takes mbox_mode instead
*/
static bool open_mailstore(const char *name)
{
filename = name;
bogoreader_close();
firstline = true;
switch (isdir(filename)) {
case IS_FILE:
if (DEBUG_READER(0))
fprintf(dbgout, "%s:%d - assuming %s is a %s\n", __FILE__, __LINE__, filename, mbox_mode ? "mbox" : "message");
fpin = fopen( filename, "r" );
if (fpin == NULL) {
fprintf(stderr, "Can't open file '%s': %s\n", filename,
strerror(errno));
return false;
} else {
mail_first = true;
msg_count_file = false;
reader_getline = get_reader_line(fpin);
mailstore_next_mail = mbox_mode ? mailbox_next_mail : mail_next_mail;
return true;
}
case IS_DIR:
if (ismaildir(filename) == IS_DIR) {
/* MAILDIR */
mailstore_type = MS_MAILDIR;
dir_init(filename);
reader_getline = simple_getline;
mailstore_next_mail = dir_next_mail;
return true;
} else {
/* MH */
mailstore_type = MS_MH;
dir_init(filename);
reader_getline = simple_getline;
mailstore_next_mail = dir_next_mail;
return true;
}
case IS_ERR:
fprintf(stderr, "Can't stat mailstore '%s': %s\n",
filename, strerror(errno));
break;
default:
fprintf(stderr, "Can't identify type of mailstore '%s'\n", filename);
break;
}
return false;
}
/*** _next_mailstore functions ***********************************************/
/* Sets up reading a single mail or an mbox from the already open fpin.
 * Returns true only on the first call; returns false right away when
 * get_reader_line() yields no reader (fpin is NULL). */
static bool stdin_next_mailstore(void)
{
    const bool is_first = mailstore_first;

    reader_getline = get_reader_line(fpin);
    if (reader_getline == NULL)
        return false;

    if (mbox_mode)
        mailstore_next_mail = mailbox_next_mail;
    else
        mailstore_next_mail = mail_next_mail;

    mailstore_first = false;
    return is_first;
}
/* Reads the next mailstore name from stdin (one per line), strips a trailing
 * newline and opens it according to its type.  Returns false when no more
 * input is available or the mailstore cannot be opened. */
static bool b_stdin_next_mailstore(void)
{
    int nread;

    filename = namebuff;
    nread = fgetsl(namebuff, sizeof(namebuff), stdin);
    if (nread <= 0)
        return false;

    if (namebuff[nread - 1] == '\n')
        namebuff[nread - 1] = '\0';

    return open_mailstore(filename);
}
/* Takes the next mailstore name from the saved command-line arguments and
 * opens it according to its type.  Returns false once the arguments are
 * used up or when the mailstore cannot be opened. */
static bool b_args_next_mailstore(void)
{
    if (argc <= 0)
        return false;

    filename = *argv++;
    --argc;

    return open_mailstore(filename);
}
/*** _next_mail functions ***********************************************/
/* Single-mail store: reports a mail exactly once (while mail_first is set)
 * and false on every later call. */
static bool mail_next_mail(void)
{
    if (!mail_first)
        return false;

    mail_first = false;
    return true;
}
/* Mailbox store: the first call always reports a mail; later calls report
 * one only if the reader flagged a further message via have_message. */
static bool mailbox_next_mail(void)
{
    if (mail_first) {
        mail_first = false;
        return true;
    }
    return have_message;
}
/* Iterates over the mails of the current MH folder or Maildir.
 *
 * For a Maildir the subdirectories listed in maildir_subs are scanned one
 * after the other; for an MH folder dir_name itself is scanned.  Entries
 * that do not look like mails (dot files in a Maildir, names not starting
 * with a digit in an MH folder), files that cannot be opened and
 * non-regular files are skipped.
 *
 * On success the file is open in fpin, its path is in filename and true is
 * returned.  Returns false when the store is exhausted.  A directory that
 * cannot be opened is reported on stderr and skipped (Maildir: continue
 * with the next subdirectory; MH: end of store).  A readdir() failure is
 * fatal and exits with EX_ERROR.
 */
static bool dir_next_mail(void)
{
    struct dirent *dirent;
    struct stat st;

    for (;;) {
        if (reader_dir == NULL) {
            char *x = dir_name;

            /* open next directory */
            if (mailstore_type == MS_MAILDIR) {
                size_t siz;

                if (*maildir_sub == NULL)
                    return false; /* IMPORTANT for termination */
                siz = strlen(dir_name) + 4 + 1;
                x = (char *)xmalloc(siz);
                strlcpy(x, dir_name, siz);
                strlcat(x, *(maildir_sub++), siz);
            }

            reader_dir = opendir(x);
            if (reader_dir == NULL)
                fprintf(stderr, "cannot open directory '%s': %s\n", x,
                        strerror(errno));
            if (x != dir_name)
                xfree(x);

            /* never hand a NULL stream to readdir(): skip what we could
             * not open */
            if (reader_dir == NULL) {
                if (mailstore_type == MS_MAILDIR)
                    continue;   /* try the next subdirectory */
                return false;   /* MH: nothing else to read */
            }
        }

        /* errno is cleared before each readdir() so that end-of-directory
         * can be told apart from a read error afterwards */
        while ((errno = 0, dirent = readdir(reader_dir)) != NULL) {
            /* skip private files */
            if ((mailstore_type == MS_MAILDIR && dirent->d_name[0] != '.') ||
                (mailstore_type == MS_MH && isdigit((unsigned char)dirent->d_name[0])))
                break;
        }

        if (errno) {
            fprintf(stderr, "Cannot read directory %s: %s\n",
                    dir_name, strerror(errno));
            exit(EX_ERROR);
        }

        if (dirent == NULL) {
            /* end of this directory */
            closedir(reader_dir);
            reader_dir = NULL;
            if (mailstore_type == MS_MAILDIR)
                continue;       /* go on with the next subdirectory */
            return false;
        }

        filename = namebuff;
        snprintf(namebuff, sizeof(namebuff), "%s%s%c%s", dir_name,
                 (mailstore_type == MS_MH) ? "" : *(maildir_sub-1),
                 DIRSEP_C, dirent->d_name);

        bogoreader_close();
        fpin = fopen( filename, "r" );
        if (fpin == NULL) {
            fprintf(stderr, "Warning: can't open file '%s': %s\n", filename,
                    strerror(errno));
            /* don't barf, the file may have been changed by another MUA,
             * or a directory that just doesn't belong there, just skip it */
            continue;
        }

        /* skip non-regular files; the stream is closed by the
         * bogoreader_close() call of the next iteration */
        if (0 == fstat(fileno(fpin), &st) && !S_ISREG(st.st_mode))
            continue;

        if (DEBUG_READER(0))
            fprintf(dbgout, "%s:%d - reading %s (%p)\n", __FILE__, __LINE__,
                    filename, (void *)fpin);

        return true;
    }
}
/*** _getline functions ***********************************************/
/* Reads the next line of a mailbox from fpin into buff, paying attention to
 * separator lines (^From and friends, as selected by get_reader_line()).
 *
 * A separator is only honoured on the first line of the store or directly
 * after an empty line.  On the first line it merely clears firstline.  Later
 * on, the separator line is stashed in `saved`, have_message is set and EOF
 * is returned so that the caller sees the end of the current message; the
 * following call hands the stashed line back as the first line of the next
 * message.
 *
 * Returns the count reported by buff_fgetsl() (or the stashed line's
 * length), or EOF at a message boundary.
 */
static int mailbox_getline(buff_t *buff)
{
/* buf marks the position in buff where the new line gets appended */
uint used = buff->t.leng;
byte *buf = buff->t.u.text + used;
int count;
/* separator line held back from the previous call */
static word_t *saved = NULL;
static bool emptyline = false; /* for mailbox /^From / match */
if (saved != NULL) {
count = saved->leng;
buff_add(buff, saved);
word_free(saved);
saved = NULL;
return count;
}
count = buff_fgetsl(buff, fpin);
have_message = false;
/* XXX FIXME: do we need to unescape the >From, >>From, >>>From, ... lines
* by discarding the first ">"? */
/* DR 08/25/03 - NO!!! */
if ((firstline || emptyline) &&
seplen != 0 && count >= (int) seplen && memcmp(separator, buf, seplen) == 0)
{
if (firstline) {
firstline = false;
}
else {
/* start of the next message: keep the line for the next call */
have_message = true;
saved = word_new(buf, count);
count = EOF;
}
}
else {
if (buff->t.leng < buff->size) /* for easier debugging - removable */
Z(buff->t.u.text[buff->t.leng]); /* for easier debugging - removable */
}
/* remember whether this line was empty; count is EOF (so never 1 or 2)
 * when the line was just stashed */
emptyline = is_eol((char *)buf, count);
return count;
}
/* Reads the next line of an rmail batch from fpin into buff, paying
 * attention to ^#! rmail lines.
 *
 * The digits following the separator are parsed into bytesleft.  While
 * bytesleft is non-zero, lines are read with buff_fgetsln() limited to
 * bytesleft and are not checked for separators.  A separator on the first
 * line only clears firstline; any later one is stashed in `saved`,
 * have_message is set and EOF is returned, and the stashed line is handed
 * back by the next call.
 *
 * Returns the count reported by the read helper (or the stashed line's
 * length), or EOF at a message boundary.
 */
static int rmail_getline(buff_t *buff)
{
int count;
/* buf marks the position in buff where the new line gets appended */
uint used = buff->t.leng;
byte *buf = buff->t.u.text + used;
/* separator line held back from the previous call */
static word_t *saved = NULL;
/* remaining size of the current message as announced by its separator */
static uint bytesleft = 0;
if (saved != NULL) {
count = saved->leng;
buff_add(buff, saved);
word_free(saved);
saved = NULL;
return count;
}
if (bytesleft) {
count = buff_fgetsln(buff, fpin, bytesleft);
if (count > 0)
bytesleft -= count;
return count;
}
count = buff_fgetsl(buff, fpin);
have_message = false;
if (count >= (int) seplen && memcmp(separator, buf, seplen) == 0)
{
uint i;
bytesleft = 0;
/* parse the decimal number after the separator: whitespace is skipped,
 * the first other non-digit ends the number */
for (i = seplen; i < (uint) count; i++) {
if (isspace(buf[i])) continue;
if (!isdigit(buf[i])) break;
bytesleft = bytesleft * 10 + (buf[i] - '0');
}
if (firstline) {
firstline = false;
}
else {
/* start of the next message: keep the line for the next call */
have_message = true;
saved = word_new(buf, count);
count = EOF;
}
} else {
if (buff->t.leng < buff->size) /* for easier debugging - removable */
Z(buff->t.u.text[buff->t.leng]); /* for easier debugging - removable */
}
return count;
}
/* Reads the next line of an ANT batch from fpin into buff, paying attention
 * to ^MAIL FROM: lines.
 *
 * A separator is only honoured while dot_found is set, i.e. at the start of
 * the input or after a line consisting of a single dot.  Seeing a separator
 * clears dot_found again.  A separator on the first line only clears
 * firstline; any later one is stashed in `saved`, have_message is set and
 * EOF is returned, and the stashed line is handed back by the next call.
 *
 * Returns the count reported by buff_fgetsl() (or the stashed line's
 * length), or EOF at a message boundary.
 */
static int ant_getline(buff_t *buff)
{
int count;
/* buf marks the position in buff where the new line gets appended */
uint used = buff->t.leng;
byte *buf = buff->t.u.text + used;
/* separator line held back from the previous call */
static word_t *saved = NULL;
static bool dot_found = true;
if (saved != NULL) {
count = saved->leng;
buff_add(buff, saved);
word_free(saved);
saved = NULL;
return count;
}
count = buff_fgetsl(buff, fpin);
have_message = false;
if (dot_found && count >= (int) seplen && memcmp(separator, buf, seplen) == 0)
{
dot_found = false; /* ignore until dot */
if (firstline) {
firstline = false;
}
else {
/* start of the next message: keep the line for the next call */
have_message = true;
saved = word_new(buf, count);
count = EOF;
}
} else {
/* a 2- or 3-byte line made of '.' followed by CR or LF */
if ((count == 2 || count == 3) &&
(buf[0] == '.') &&
(buf[1] == '\r' || buf[1] == '\n'))
dot_found = true; /* dot found. look for separator */
if (buff->t.leng < buff->size) /* for easier debugging - removable */
Z(buff->t.u.text[buff->t.leng]); /* for easier debugging - removable */
}
return count;
}
/* Reads the next line from fpin into buff, treating the whole file as one
 * mail (no separator detection).  Returns what buff_fgetsl() returned. */
static int simple_getline(buff_t *buff)
{
    const int nread = buff_fgetsl(buff, fpin);

    /* for easier debugging - removable */
    if (buff->size > buff->t.leng)
        Z(buff->t.u.text[buff->t.leng]);

    return nread;
}
/* Prepares iteration over an MH folder or over the Maildir subdirectories
 * (new and cur): installs dir_fini as cleanup hook, clears the directory
 * handle, rewinds the Maildir subdirectory list and records the directory
 * name. */
static void dir_init(const char *name)
{
    fini = dir_fini;
    reader_dir = NULL;

    if (mailstore_type == MS_MAILDIR)
        maildir_sub = maildir_subs;

    save_dirname(name);
}
/* Finishes an MH/Maildir store: closes the directory that is still open,
 * if any, and clears the handle. */
static void dir_fini(void)
{
    if (reader_dir != NULL) {
        closedir(reader_dir);
        reader_dir = NULL;
    }
}
/* Accessor for the name of the file currently being read. */
static const char *get_filename(void)
{
    const char *current = filename;

    return current;
}
/* Global reader initialization, exported.
 *
 * Resets the "first" flags, installs the reader callbacks and selects the
 * mailstore source according to bulk_mode:
 *   B_NORMAL  - mail (or mbox) from stdin; mbox mode is forced when
 *               run_type has a registration/unregistration flag set
 *   B_STDIN   - '-b': mailstore names are read from stdin
 *   B_CMDLINE - '-B': mailstore names are taken from _argc/_argv
 * Any other bulk_mode is reported on stderr and aborts the program.
 */
void bogoreader_init(int _argc, const char * const *_argv)
{
    mail_first = true;
    mailstore_first = true;
    reader_more = reader__next_mail;
    fini = dummy_fini;

    if (bulk_mode == B_NORMAL) {
        /* read mail (mbox) from stdin */
        yy_file = fpin;
        mailstore_next_store = stdin_next_mailstore;
        if ((run_type & (REG_SPAM|REG_GOOD|UNREG_SPAM|UNREG_GOOD)) != 0)
            mbox_mode = true;
    } else if (bulk_mode == B_STDIN) {
        /* '-b' - streaming (stdin) mode */
        mailstore_next_store = b_stdin_next_mailstore;
    } else if (bulk_mode == B_CMDLINE) {
        /* '-B' - command line mode */
        argc = _argc;
        argv = _argv;
        mailstore_next_store = b_args_next_mailstore;
        mailstore_next_mail = NULL;
    } else {
        fprintf(stderr, "Unknown bulk_mode = %d\n", (int) bulk_mode);
        abort();
    }

    reader_filename = get_filename;
}
/* For bogoconfig to distinguish '-I file' from '-I dir' */
/* global reader initialization, exported */
void bogoreader_name(const char *name)
{
bool ok;
switch (isdir(name)) {
case IS_FILE:
fpin = fopen( name, "r" );
ok = fpin != NULL;
break;
case IS_DIR:
ok = open_mailstore(name);
break;
default:
ok = false;
break;
}
if (!ok) {
fprintf(stderr, "Can't read '%s'\n", name);
exit(EX_ERROR);
}
}
/* Cleanup after reading a message, exported.
 * Only called if the passthrough code says it is ok to close the file.
 * Closes the input only when it is open and already at end-of-file. */
void bogoreader_close_ifeof(void)
{
    if (fpin == NULL)
        return;

    if (feof(fpin))
        bogoreader_close();
}
/* global cleanup, exported */
void bogoreader_fini(void)
{
bogoreader_close();
fini();
}
/* Closes the current input stream unless it is stdin and, in every case,
 * resets fpin to NULL. */
static void bogoreader_close(void)
{
    FILE *stream = fpin;

    fpin = NULL;
    if (stream == NULL || stream == stdin)
        return;

    fclose(stream);
}