Tree - source-git/ghc-hs-bibutils

source-git / ghc-hs-bibutils

Files

Commit: b0dfe5ed912f3d22dbd8cb3ff9b9f8918f48f386
Blob Blame History Raw
/*
 * bibtexin.c
 *
 * Copyright (c) Chris Putnam 2003-2018
 *
 * Program and source code released under the GPL version 2
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "is_ws.h"
#include "str.h"
#include "utf8.h"
#include "str_conv.h"
#include "fields.h"
#include "slist.h"
#include "name.h"
#include "title.h"
#include "url.h"
#include "reftypes.h"
#include "bibformats.h"
#include "generic.h"

static slist find    = { 0, 0, 0, NULL };
static slist replace = { 0, 0, 0, NULL };

extern variants bibtex_all[];
extern int bibtex_nall;

/*****************************************************
 PUBLIC: void bibtexin_initparams()
*****************************************************/

static int  bibtexin_convertf( fields *bibin, fields *info, int reftype, param *p );
static int  bibtexin_processf( fields *bibin, char *data, char *filename, long nref, param *p );
static int  bibtexin_cleanf( bibl *bin, param *p );
static int  bibtexin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset );
static int  bibtexin_typef( fields *bibin, char *filename, int nrefs, param *p );

void
bibtexin_initparams( param *p, const char *progname )
{
	p->readformat       = BIBL_BIBTEXIN;
	p->charsetin        = BIBL_CHARSET_DEFAULT;
	p->charsetin_src    = BIBL_SRC_DEFAULT;
	p->latexin          = 1;
	p->xmlin            = 0;
	p->utf8in           = 0;
	p->nosplittitle     = 0;
	p->verbose          = 0;
	p->addcount         = 0;
	p->output_raw       = 0;

	p->readf    = bibtexin_readf;
	p->processf = bibtexin_processf;
	p->cleanf   = bibtexin_cleanf;
	p->typef    = bibtexin_typef;
	p->convertf = bibtexin_convertf;
	p->all      = bibtex_all;
	p->nall     = bibtex_nall;

	slist_init( &(p->asis) );
	slist_init( &(p->corps) );

	if ( !progname ) p->progname = NULL;
	else p->progname = strdup( progname );
}

/*****************************************************
 PUBLIC: int bibtexin_readf()
*****************************************************/

/*
 * readf can "read too far", so we store this information in line, thus
 * the next new text is in line, either from having read too far or
 * from the next chunk obtained via str_fget()
 *
 * return 1 on success, 0 on error/end-of-file
 *
 */
static int
readmore( FILE *fp, char *buf, int bufsize, int *bufpos, str *line )
{
	if ( line->len ) return 1;
	else return str_fget( fp, buf, bufsize, bufpos, line );
}

/*
 * readf()
 *
 * returns zero if cannot get reference and hit end of-file
 * returns 1 if last reference in file, 2 if reference within file
 */
static int
bibtexin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset )
{
	int haveref = 0;
	char *p;
	*fcharset = CHARSET_UNKNOWN;
	while ( haveref!=2 && readmore( fp, buf, bufsize, bufpos, line ) ) {
		if ( line->len == 0 ) continue; /* blank line */
		p = &(line->data[0]);
		/* Recognize UTF8 BOM */
		if ( line->len > 2 && 
				(unsigned char)(p[0])==0xEF &&
				(unsigned char)(p[1])==0xBB &&
				(unsigned char)(p[2])==0xBF ) {
			*fcharset = CHARSET_UNICODE;
			p += 3;
		}
		p = skip_ws( p );
		if ( *p == '%' ) { /* commented out line */
			str_empty( line );
			continue;
		}
		if ( *p == '@' ) haveref++;
		if ( haveref && haveref<2 ) {
			str_strcatc( reference, p );
			str_addchar( reference, '\n' );
			str_empty( line );
		} else if ( !haveref ) str_empty( line );
	
	}
	return haveref;
}

/*****************************************************
 PUBLIC: int bibtexin_processf()
*****************************************************/

static char*
process_bibtextype( char *p, str *type )
{
	str tmp;
	str_init( &tmp );

	if ( *p=='@' ) p++;
	p = str_cpytodelim( &tmp, p, "{( \t\r\n", 0 );
	p = skip_ws( p );
	if ( *p=='{' || *p=='(' ) p++;
	p = skip_ws( p );

	if ( str_has_value( &tmp ) ) str_strcpy( type, &tmp );
	else str_empty( type );

	str_free( &tmp );
	return p;
}

static char*
process_bibtexid( char *p, str *id )
{
	char *start_p = p;
	str tmp;

	str_init( &tmp );
	p = str_cpytodelim( &tmp, p, ",", 1 );

	if ( str_has_value( &tmp ) ) {
		if ( strchr( tmp.data, '=' ) ) {
			/* Endnote writes bibtex files w/o fields, try to
			 * distinguish via presence of an equal sign.... if
			 * it's there, assume that it's a tag/data pair instead
			 * and roll back.
			 */
			p = start_p;
			str_empty( id );
		} else {
			str_strcpy( id, &tmp );
		}
	} else {
		str_empty( id );
	}

	str_free( &tmp );
	return skip_ws( p );
}

static char *
bibtex_tag( char *p, str *tag )
{
	p = str_cpytodelim( tag, skip_ws( p ), "= \t\r\n", 0 );
	if ( str_memerr( tag ) ) return NULL;
	return skip_ws( p );
}

static char *
bibtex_data( char *p, fields *bibin, slist *tokens, long nref, param *pm )
{
	unsigned int nbracket = 0, nquotes = 0;
	char *startp = p;
	str tok, *t;

	str_init( &tok );
	while ( p && *p ) {
		if ( !nquotes && !nbracket ) {
			if ( *p==',' || *p=='=' || *p=='}' || *p==')' )
				goto out;
		}
		if ( *p=='\"' && nbracket==0 && ( p==startp || *(p-1)!='\\' ) ) {
			nquotes = !nquotes;
			str_addchar( &tok, *p );
			if ( !nquotes ) {
				if ( str_memerr( &tok ) ) { p=NULL; goto out; }
				t = slist_add( tokens, &tok );
				if ( !t ) { p=NULL; goto out0; }
				str_empty( &tok );
			}
		} else if ( *p=='#' && !nquotes && !nbracket ) {
			if ( str_has_value( &tok ) ) {
				if ( str_memerr( &tok ) ) { p=NULL; goto out; }
				t = slist_add( tokens, &tok );
				if ( !t ) { p=NULL; goto out0; }
			}
			str_strcpyc( &tok, "#" );
			t = slist_add( tokens, &tok );
			if ( !t ) { p=NULL; goto out0; }
			str_empty( &tok );
		} else if ( *p=='{' && !nquotes && ( p==startp || *(p-1)!='\\' ) ) {
			nbracket++;
			str_addchar( &tok, *p );
		} else if ( *p=='}' && !nquotes && ( p==startp || *(p-1)!='\\' ) ) {
			nbracket--;
			str_addchar( &tok, *p );
			if ( nbracket==0 ) {
				if ( str_memerr( &tok ) ) { p=NULL; goto out; }
				t = slist_add( tokens, &tok );
				if ( !t ) { p=NULL; goto out; }
				str_empty( &tok );
			}
		} else if ( !is_ws( *p ) || nquotes || nbracket ) {
			if ( !is_ws( *p ) ) str_addchar( &tok, *p );
			else {
				if ( tok.len!=0 && *p!='\n' && *p!='\r' )
					str_addchar( &tok, *p );
				else if ( tok.len!=0 && (*p=='\n' || *p=='\r')) {
					str_addchar( &tok, ' ' );
					while ( is_ws( *(p+1) ) ) p++;
				}
			}
		} else if ( is_ws( *p ) ) {
			if ( tok.len ) {
				if ( str_memerr( &tok ) ) { p=NULL; goto out; }
				t = slist_add( tokens, &tok );
				if ( !t ) { p=NULL; goto out; }
				str_empty( &tok );
			}
		}
		p++;
	}
out:
	if ( nbracket!=0 ) {
		fprintf( stderr, "%s: Mismatch in number of brackets in reference %ld.\n", pm->progname, nref );
	}
	if ( nquotes!=0 ) {
		fprintf( stderr, "%s: Mismatch in number of quotes in reference %ld.\n", pm->progname, nref );
	}
	if ( str_has_value( &tok ) ) {
		if ( str_memerr( &tok ) ) { p = NULL; goto out; }
		t = slist_add( tokens, &tok );
		if ( !t ) p = NULL;
	}
out0:
	str_free( &tok );
	return p;
}

/* replace_strings()
 *
 * do string replacement -- only if unprotected by quotation marks or curly brackets
 */
static void
replace_strings( slist *tokens, fields *bibin, param *pm )
{
	int i, n, ok;
	char *q;
	str *s;
	i = 0;
	while ( i < tokens->n ) {
		s = slist_str( tokens, i );
		if ( !strcmp( s->data, "#" ) ) {
		} else if ( s->data[0]!='\"' && s->data[0]!='{' ) {
			n = slist_find( &find, s );
			if ( n!=-1 ) {
				str_strcpy( s, slist_str( &replace, n ) );
			} else {
				q = s->data;
				ok = 1;
				while ( *q && ok ) {
					if ( !isdigit( *q ) ) ok = 0;
					q++;
				}
			}
		}
		i++;
	}
}

static int
string_concatenate( slist *tokens, fields *bibin, long nref, param *pm )
{
	int i, status;
	str *s, *t;
	i = 0;
	while ( i < tokens->n ) {
		s = slist_str( tokens, i );
		if ( !strcmp( s->data, "#" ) ) {
			if ( i==0 || i==tokens->n-1 ) {
				fprintf( stderr, "%s: Warning: Stray string concatenation "
					"('#' character) in reference %ld\n", pm->progname, nref );
				status = slist_remove( tokens, i );
				if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
				continue;
			}
			s = slist_str( tokens, i-1 );
			if ( s->data[0]!='\"' && s->data[s->len-1]!='\"' )
				fprintf( stderr, "%s: Warning: String concentation should "
					"be used in context of quotations marks in reference %ld\n", pm->progname, nref );
			t = slist_str( tokens, i+1 );
			if ( t->data[0]!='\"' && t->data[s->len-1]!='\"' )
				fprintf( stderr, "%s: Warning: String concentation should "
					"be used in context of quotations marks in reference %ld\n", pm->progname, nref );
			if ( ( s->data[s->len-1]=='\"' && t->data[0]=='\"') || (s->data[s->len-1]=='}' && t->data[0]=='{') ) {
				str_trimend( s, 1 );
				str_trimbegin( t, 1 );
				str_strcat( s, t );
			} else {
				str_strcat( s, t );
			}
			status = slist_remove( tokens, i );
			if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
			status = slist_remove( tokens, i );
			if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
		} else i++;
	}
	return BIBL_OK;
}

/* return NULL on memory error */
static char *
process_bibtexline( char *p, str *tag, str *data, uchar stripquotes, fields *bibin, long nref, param *pm )
{
	int i, status;
	slist tokens;
	str *s;

	str_empty( data );

	p = bibtex_tag( p, tag );

	if ( str_is_empty( tag ) ) {
		/* ...skip this line */
		while ( *p && *p!='\n' && *p!='\r' ) p++;
		while ( *p=='\n' || *p=='\r' ) p++;
		return p;
	}

	slist_init( &tokens );

	if ( *p=='=' ) {
		p = bibtex_data( p+1, bibin, &tokens, nref, pm );
		if ( p==NULL ) goto out;
	}

	replace_strings( &tokens, bibin, pm );

	status = string_concatenate( &tokens, bibin, nref, pm );
	if ( status!=BIBL_OK ) {
		p = NULL;
		goto out;
	}

	for ( i=0; i<tokens.n; i++ ) {
		s = slist_str( &tokens, i );
		if ( ( stripquotes && s->data[0]=='\"' && s->data[s->len-1]=='\"' ) ||
		     ( s->data[0]=='{' && s->data[s->len-1]=='}' ) ) {
			str_trimbegin( s, 1 );
			str_trimend( s, 1 );
		}
		str_strcat( data, slist_str( &tokens, i ) );
	}
out:
	slist_free( &tokens );
	return p;
}

/* process_cite()
 *
 */
static int
process_cite( fields *bibin, char *p, char *filename, long nref, param *pm )
{
	int fstatus, status = BIBL_OK;
	str type, id, tag, data;

	strs_init( &type, &id, &tag, &data, NULL );

	p = process_bibtextype( p, &type );
	p = process_bibtexid( p, &id );

	if ( str_is_empty( &type ) || str_is_empty( &id ) ) goto out;

	fstatus = fields_add( bibin, "INTERNAL_TYPE", str_cstr( &type ), 0 );
	if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }

	fstatus = fields_add( bibin, "REFNUM", str_cstr( &id), 0 );
	if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }

	while ( *p ) {
		p = process_bibtexline( p, &tag, &data, 1, bibin, nref, pm );
		if ( p==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
		/* no anonymous or empty fields allowed */
		if ( str_has_value( &tag ) && str_has_value( &data ) ) {
			fstatus = fields_add( bibin, str_cstr( &tag ), str_cstr( &data ), 0 );
			if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }
		}
		strs_empty( &tag, &data, NULL );
	}
out:
	strs_free( &type, &id, &tag, &data, NULL );
	return status;
}

/* process_string()
 *
 * Handle lines like:
 *
 * '@STRING{TL = {Tetrahedron Lett.}}'
 *
 * p should point to just after '@STRING'
 *
 * In BibTeX, if a string is defined several times, the last one is kept.
 *
 */
static int
process_string( char *p, long nref, param *pm )
{
	int n, status = BIBL_OK;
	str s1, s2, *t;
	strs_init( &s1, &s2, NULL );
	while ( *p && *p!='{' && *p!='(' ) p++;
	if ( *p=='{' || *p=='(' ) p++;
	p = process_bibtexline( skip_ws( p ), &s1, &s2, 0, NULL, nref, pm );
	if ( p==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
	if ( str_has_value( &s2 ) ) {
		str_findreplace( &s2, "\\ ", " " );
	}
	if ( str_has_value( &s1 ) ) {
		n = slist_find( &find, &s1 );
		if ( n==-1 ) {
			t = slist_add( &find, &s1 );
			if ( t==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
			if ( str_has_value( &s2 ) ) t = slist_add( &replace, &s2 );
			else t = slist_addc( &replace, "" );
			if ( t==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
		} else {
			if ( str_has_value( &s2 ) ) t = slist_set( &replace, n, &s2 );
			else t = slist_setc( &replace, n, "" );
			if ( t==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
		}
	}
out:
	strs_free( &s1, &s2, NULL );
	return status;
}

/* bibtexin_processf()
 *
 * Handle '@STRING', '@reftype', and ignore '@COMMENT'
 */
static int
bibtexin_processf( fields *bibin, char *data, char *filename, long nref, param *p )
{
	if ( !strncasecmp( data, "@STRING", 7 ) ) {
		process_string( data+7, nref, p );
		return 0;
	} else if ( !strncasecmp( data, "@COMMENT", 8 ) ) {
		/* Not sure if these are real Bibtex, but not references */
		return 0;
	} else {
		process_cite( bibin, data, filename, nref, p );
		return 1;
	}
}

/*****************************************************
 PUBLIC: void bibtexin_cleanf()
*****************************************************/

static int
bibtex_protected( str *data )
{
	if ( data->data[0]=='{' && data->data[data->len-1]=='}' ) return 1;
	if ( data->data[0]=='\"' && data->data[data->len-1]=='\"' ) return 1;
	return 0;
}

static int
bibtex_split( slist *tokens, str *s )
{
	int i, n = s->len, nbrackets = 0, status = BIBL_OK;
	str tok, *t;

	str_init( &tok );

	for ( i=0; i<n; ++i ) {
		if ( s->data[i]=='{' && ( i==0 || s->data[i-1]!='\\' ) ) {
			nbrackets++;
			str_addchar( &tok, '{' );
		} else if ( s->data[i]=='}' && ( i==0 || s->data[i-1]!='\\' ) ) {
			nbrackets--;
			str_addchar( &tok, '}' );
		} else if ( !is_ws( s->data[i] ) || nbrackets ) {
			str_addchar( &tok, s->data[i] );
		} else if ( is_ws( s->data[i] ) ) {
			if ( str_has_value( &tok ) ) {
				t = slist_add( tokens, &tok );
				if ( !t ) {
					status = BIBL_ERR_MEMERR;
					goto out;
				}
			}
			str_empty( &tok );
		}
	}
	if ( str_has_value( &tok ) ) {
		t = slist_add( tokens, &tok );
		if ( !t ) {
			status = BIBL_ERR_MEMERR;
			goto out;
		}
	}

	for ( i=0; i<tokens->n; ++i ) {
		str_trimstartingws( slist_str( tokens, i ) );
		str_trimendingws( slist_str( tokens, i ) );
	}
out:
	str_free( &tok );
	return status;
}

static int
bibtex_addtitleurl( fields *info, str *in )
{
	int fstatus, status = BIBL_OK;
	str s;
	char *p;

	str_init( &s );

	/* ...skip past "\href{" and copy to "}" */
	p = str_cpytodelim( &s, in->data + 6, "}", 1 );
	if ( str_memerr( &s ) ) { status = BIBL_ERR_MEMERR; goto out; }

	/* ...add to URL */
	fstatus = fields_add( info, "URL", s.data, 0 );
	if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }

	/* ...return deleted fragment to str in */
	(void) str_cpytodelim( &s, p, "", 0 );
	if ( str_memerr( &s ) ) { status = BIBL_ERR_MEMERR; goto out; }
	str_swapstrings( &s, in );

out:
	str_free( &s );
	return status;
}

static int
is_url_tag( str *tag )
{
	if ( str_has_value( tag ) ) {
		if ( !strcasecmp( str_cstr( tag ), "url" ) ) return 1;
	}
	return 0;
}

static int
is_name_tag( str *tag )
{
	if ( str_has_value( tag ) ) {
		if ( !strcasecmp( str_cstr( tag ), "author" ) ) return 1;
		if ( !strcasecmp( str_cstr( tag ), "editor" ) ) return 1;
	}
	return 0;
}

static void
bibtex_process_tilde( str *s )
{
	char *p, *q;
	int n = 0;

	p = q = s->data;
	if ( !p ) return;
	while ( *p ) {
		if ( *p=='~' ) {
			*q = ' ';
		} else if ( *p=='\\' && *(p+1)=='~' ) {
			n++;
			p++;
			*q = '~';
		} else {
			*q = *p;
		}
		p++;
		q++;
	}
	*q = '\0';
	s->len -= n;
}

static void
bibtex_process_bracket( str *s )
{
	char *p, *q;
	int n = 0;

	p = q = s->data;
	if ( !p ) return;
	while ( *p ) {
		if ( *p=='\\' && ( *(p+1)=='{' || *(p+1)=='}' ) ) {
			n++;
			p++;
			*q = *p;
			q++;
		} else if ( *p=='{' || *p=='}' ) {
			n++;
		} else {
			*q = *p;
			q++;
		}
		p++;
	}
	*q = '\0';
	s->len -= n;
}

static void
bibtex_cleantoken( str *s )
{
	/* 'textcomp' annotations */
	str_findreplace( s, "\\textit", "" );
	str_findreplace( s, "\\textbf", "" );
	str_findreplace( s, "\\textsl", "" );
	str_findreplace( s, "\\textsc", "" );
	str_findreplace( s, "\\textsf", "" );
	str_findreplace( s, "\\texttt", "" );
	str_findreplace( s, "\\textsubscript", "" );
	str_findreplace( s, "\\textsuperscript", "" );
	str_findreplace( s, "\\emph", "" );
	str_findreplace( s, "\\url", "" );
	str_findreplace( s, "\\mbox", "" );

	/* Other text annotations */
	str_findreplace( s, "\\it ", "" );
	str_findreplace( s, "\\em ", "" );

	str_findreplace( s, "\\%", "%" );
	str_findreplace( s, "\\$", "$" );
	while ( str_findreplace( s, "  ", " " ) ) {}

	/* 'textcomp' annotations that we don't want to substitute on output*/
	str_findreplace( s, "\\textdollar", "$" );
	str_findreplace( s, "\\textunderscore", "_" );

	bibtex_process_bracket( s );
	bibtex_process_tilde( s );

}

static int
bibtex_cleandata( str *tag, str *s, fields *info, param *p )
{
	int i, status;
	slist tokens;
	str *tok;
	if ( str_is_empty( s ) ) return BIBL_OK;
	/* protect url from undergoing any parsing */
	if ( is_url_tag( tag ) ) return BIBL_OK;
	slist_init( &tokens );
	status = bibtex_split( &tokens, s );
	if ( status!=BIBL_OK ) goto out;
	for ( i=0; i<tokens.n; ++i ) {
		tok = slist_str( &tokens, i );
		if ( bibtex_protected( tok ) ) {
			if (!strncasecmp(tok->data,"\\href{", 6)) {
				bibtex_addtitleurl( info, tok );
			}
		}
		if ( p->latexin && !is_name_tag( tag ) && !is_url_tag( tag ) )
			bibtex_cleantoken( tok );
	}
	str_empty( s );
	for ( i=0; i<tokens.n; ++i ) {
		tok = slist_str( &tokens, i );
		if ( i>0 ) str_addchar( s, ' ' );
		str_strcat( s, tok );
	}
out:
	slist_free( &tokens );
	return status;
}

static int
bibtexin_cleanref( fields *bibin, param *p )
{
	int i, n, status;
	str *t, *d;
	n = fields_num( bibin );
	for ( i=0; i<n; ++i ) {
		t = fields_tag( bibin, i, FIELDS_STRP_NOUSE );
		d = fields_value( bibin, i, FIELDS_STRP_NOUSE );
		status = bibtex_cleandata( t, d, bibin, p );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

static long
bibtexin_findref( bibl *bin, char *citekey )
{
	int n;
	long i;
	for ( i=0; i<bin->nrefs; ++i ) {
		n = fields_find( bin->ref[i], "refnum", LEVEL_ANY );
		if ( n==FIELDS_NOTFOUND ) continue;
		if ( !strcmp( bin->ref[i]->data[n].data, citekey ) ) return i;
	}
	return -1;
}

static void
bibtexin_nocrossref( bibl *bin, long i, int n, param *p )
{
	int n1 = fields_find( bin->ref[i], "REFNUM", LEVEL_ANY );
	if ( p->progname ) fprintf( stderr, "%s: ", p->progname );
	fprintf( stderr, "Cannot find cross-reference '%s'",
			bin->ref[i]->data[n].data );
	if ( n1!=FIELDS_NOTFOUND ) fprintf( stderr, " for reference '%s'\n",
			bin->ref[i]->data[n1].data );
	fprintf( stderr, "\n" );
}

static int
bibtexin_crossref_oneref( fields *bibref, fields *bibcross )
{
	int j, n, nl, ntype, fstatus, status = BIBL_OK;
	char *type, *nt, *nv;

	ntype = fields_find( bibref, "INTERNAL_TYPE", LEVEL_ANY );
	type = ( char * ) fields_value( bibref, ntype, FIELDS_CHRP_NOUSE );

	n = fields_num( bibcross );
	for ( j=0; j<n; ++j ) {
		nt = ( char * ) fields_tag( bibcross, j, FIELDS_CHRP_NOUSE );
		if ( !strcasecmp( nt, "INTERNAL_TYPE" ) ) continue;
		if ( !strcasecmp( nt, "REFNUM" ) ) continue;
		if ( !strcasecmp( nt, "TITLE" ) ) {
			if ( !strcasecmp( type, "Inproceedings" ) ||
			     !strcasecmp( type, "Incollection" ) )
				nt = "booktitle";
		}
		nv = ( char * ) fields_value( bibcross, j, FIELDS_CHRP_NOUSE );
		nl = fields_level( bibcross, j ) + 1;
		fstatus = fields_add( bibref, nt, nv, nl );
		if ( fstatus!=FIELDS_OK ) {
			status = BIBL_ERR_MEMERR;
			goto out;
		}
	}
out:
	return status;
}

static int
bibtexin_crossref( bibl *bin, param *p )
{
	int i, n, ncross, status = BIBL_OK;
	fields *bibref, *bibcross;

	for ( i=0; i<bin->nrefs; ++i ) {
		bibref = bin->ref[i];
		n = fields_find( bibref, "CROSSREF", LEVEL_ANY );
		if ( n==FIELDS_NOTFOUND ) continue;
		fields_setused( bibref, n );
		ncross = bibtexin_findref( bin, (char*) fields_value( bibref, n, FIELDS_CHRP ) );
		if ( ncross==-1 ) {
			bibtexin_nocrossref( bin, i, n, p );
			continue;
		}
		bibcross = bin->ref[ncross];
		status = bibtexin_crossref_oneref( bibref, bibcross );
		if ( status!=BIBL_OK ) goto out;
	}
out:
	return status;
}

static int
bibtexin_cleanf( bibl *bin, param *p )
{
	int status = BIBL_OK;
	long i;

        for ( i=0; i<bin->nrefs; ++i )
		status = bibtexin_cleanref( bin->ref[i], p );
	bibtexin_crossref( bin, p );
	return status;
}

/*****************************************************
 PUBLIC: int bibtexin_typef()
*****************************************************/

static int
bibtexin_typef( fields *bibin, char *filename, int nrefs, param *p )
{
	int ntypename, nrefname, is_default;
	char *refname = "", *typename = "";

	ntypename = fields_find( bibin, "INTERNAL_TYPE", LEVEL_MAIN );
	nrefname  = fields_find( bibin, "REFNUM",        LEVEL_MAIN );
	if ( nrefname!=FIELDS_NOTFOUND )  refname  = fields_value( bibin, nrefname,  FIELDS_CHRP_NOUSE );
	if ( ntypename!=FIELDS_NOTFOUND ) typename = fields_value( bibin, ntypename, FIELDS_CHRP_NOUSE );

	return get_reftype( typename, nrefs, p->progname, p->all, p->nall, refname, &is_default, REFTYPE_CHATTY );
}

/*****************************************************
 PUBLIC: int bibtexin_convertf(), returns BIBL_OK or BIBL_ERR_MEMERR
*****************************************************/

static int
bibtex_matches_list( fields *bibout, char *tag, char *suffix, str *data, int level,
		slist *names, int *match )
{
	int i, fstatus, status = BIBL_OK;
	str newtag;

	*match = 0;
	if ( names->n==0 ) return status;

	str_init( &newtag );

	for ( i=0; i<names->n; ++i ) {
		if ( strcmp( str_cstr( data ), slist_cstr( names, i ) ) ) continue;
		str_initstrc( &newtag, tag );
		str_strcatc( &newtag, suffix );
		fstatus = fields_add( bibout, str_cstr( &newtag ), str_cstr( data ), level );
		if ( fstatus!=FIELDS_OK ) {
			status = BIBL_ERR_MEMERR;
			goto out;
		}
		*match = 1;
		goto out;
	}

out:
	str_free( &newtag );
	return status;
}

/**** bibtexin_btorg ****/

/*
 * BibTeX uses 'organization' in lieu of publisher if that field is missing.
 * Otherwise output as
 * <name type="corporate">
 *    <namePart>The organization</namePart>
 *    <role>
 *       <roleTerm authority="marcrelator" type="text">organizer of meeting</roleTerm>
 *    </role>
 * </name>
 */

static int
bibtexin_btorg( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int n, fstatus;
	n = fields_find( bibin, "publisher", LEVEL_ANY );
	if ( n==FIELDS_NOTFOUND )
		fstatus = fields_add( bibout, "PUBLISHER", str_cstr( invalue ), level );
	else
		fstatus = fields_add( bibout, "ORGANIZER:CORP", str_cstr( invalue ), level );
	if ( fstatus==FIELDS_OK ) return BIBL_OK;
	else return BIBL_ERR_MEMERR;
}

/**** bibtexin_btsente() ****/

/*
 * sentelink = {file://localhost/full/path/to/file.pdf,Sente,PDF}
 *
 * Sente is an academic reference manager for MacOSX and Apple iPad.
 */

static int
bibtexin_btsente( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int fstatus, status = BIBL_OK;
	str link;

	str_init( &link );
	str_cpytodelim( &link, skip_ws( invalue->data ), ",", 0 );
	str_trimendingws( &link );
	if ( str_memerr( &link ) ) status = BIBL_ERR_MEMERR;

	if ( status==BIBL_OK && link.len ) {
		fstatus = fields_add( bibout, "FILEATTACH", str_cstr( &link ), level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}

	str_free( &link );
	return status;
}

/**** bibtexin_linkedfile() ****/

static int
count_colons( char *p )
{
	int n = 0;
	while ( *p ) {
		if ( *p==':' ) n++;
		p++;
	}
	return n;
}

static int
first_colon( char *p )
{
	int n = 0;
	while ( p[n] && p[n]!=':' ) n++;
	return n;
}

static int
last_colon( char *p )
{
	int n = strlen( p ) - 1;
	while ( n>0 && p[n]!=':' ) n--;
	return n;
}

/*
 * file={Description:/full/path/to/file.pdf:PDF}
 */
static int
bibtexin_linkedfile( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int fstatus, status = BIBL_OK;
	char *p = invalue->data;
	int i, n, n1, n2;
	str link;

	n = count_colons( p );
	if ( n > 1 ) {
		/* A DOS file can contain a colon ":C:/....pdf:PDF" */
		/* Extract after 1st and up to last colons */
		n1 = first_colon( p ) + 1;
		n2 = last_colon( p );
		str_init( &link );
		for ( i=n1; i<n2; ++i ) {
			str_addchar( &link, p[i] );
		}
		str_trimstartingws( &link );
		str_trimendingws( &link );
		if ( str_memerr( &link ) ) {
			status = BIBL_ERR_MEMERR;
			goto out;
		}
		if ( link.len ) {
			fstatus = fields_add( bibout, "FILEATTACH", link.data, level );
			if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
		}
out:
		str_free( &link );
	} else {
		/* This field isn't formatted properly, so just copy directly */
		fstatus = fields_add( bibout, "FILEATTACH", p, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	return status;

}

/**** bibtexin_howpublished() ****/

/*    howpublished={},
 *
 * Normally indicates the manner in which something was
 * published in lieu of a formal publisher, so typically
 * 'howpublished' and 'publisher' will never be in the
 * same reference.
 *
 * Occassionally, people put Diploma thesis information
 * into the field, so check that first.
 *
 * Returns BIBL_OK or BIBL_ERR_MEMERR
 */

static int
bibtexin_howpublished( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int fstatus, status = BIBL_OK;
	if ( !strncasecmp( str_cstr( invalue ), "Diplom", 6 ) ) {
		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Diploma thesis", level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	else if ( !strncasecmp( str_cstr( invalue ), "HSabilitation", 13 ) ) {
		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Habilitation thesis", level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	else if ( !strncasecmp( str_cstr( invalue ), "Licentiate", 10 ) ) {
		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Licentiate thesis", level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	else if ( is_embedded_link( str_cstr( invalue ) ) ) {
		status =  urls_split_and_add( str_cstr( invalue ), bibout, level );
	}
	else {
		fstatus = fields_add( bibout, "PUBLISHER", str_cstr( invalue ), level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	return status;
}

/**** bibtexin_eprint() ****/

/* Try to capture situations like
 *
 * eprint="1605.02026",
 * archivePrefix="arXiv",
 *
 * or
 *
 * eprint="13211131",
 * eprinttype="medline",
 *
 * If we don't know anything, concatenate archivePrefix:eprint
 * and push into URL. (Could be wrong)
 *
 * If no info, just push eprint into URL. (Could be wrong)
 */
static int
process_eprint_with_prefix( fields *bibout, char *prefix, str *value, int level )
{
	int fstatus, status = BIBL_OK;
	str merge;

	if ( !strcmp( prefix, "arXiv" ) ) {
		fstatus = fields_add( bibout, "ARXIV", value->data, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}

	else if ( !strcmp( prefix, "jstor" ) ) {
		fstatus = fields_add( bibout, "JSTOR", value->data, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}

	else if ( !strcmp( prefix, "medline" ) ) {
		fstatus = fields_add( bibout, "MEDLINE", value->data, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}

	else if ( !strcmp( prefix, "pubmed" ) ) {
		fstatus = fields_add( bibout, "PMID", value->data, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}

	/* ...if this is unknown prefix, merge prefix & eprint */
	else {
		str_init( &merge );
		str_mergestrs( &merge, prefix, ":", value->data, NULL );
		fstatus = fields_add( bibout, "URL", merge.data, level );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
		str_free( &merge );
	}

	return status;
}
static int
process_eprint_without_prefix( fields *bibout, str *value, int level )
{
	int fstatus;

	/* ...no archivePrefix, need to handle just 'eprint' tag */
	fstatus = fields_add( bibout, "URL", value->data, level );

	if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	else return BIBL_OK;
}

static int
bibtexin_eprint( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	char *prefix;
	int n;

	/* ...do we have an archivePrefix too? */
	n = fields_find( bibin, "ARCHIVEPREFIX", level );
	if ( n==FIELDS_NOTFOUND ) n = fields_find( bibin, "EPRINTTYPE", level );
	if ( n!=FIELDS_NOTFOUND ) {
		prefix = fields_value( bibin, n, FIELDS_CHRP );
		return process_eprint_with_prefix( bibout, prefix, invalue, level );
	}

	/* ...no we don't */
	return process_eprint_without_prefix( bibout, invalue, level );
}

/**** bibtexin_keyword() ****/

/* Split keywords="" with semicolons.
 * Commas are also frequently used, but will break
 * entries like:
 *       keywords="Microscopy, Confocal"
 * Returns BIBL_OK or BIBL_ERR_MEMERR
 */

static int
bibtexin_keyword( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int fstatus, status = BIBL_OK;
	str keyword;
	char *p;

	p = invalue->data;
	str_init( &keyword );

	while ( *p ) {
		p = str_cpytodelim( &keyword, skip_ws( p ), ";", 1 );
		str_trimendingws( &keyword );
		if ( str_memerr( &keyword ) ) {
			status = BIBL_ERR_MEMERR;
			goto out;
		}
		if ( keyword.len ) {
			fstatus = fields_add( bibout, "KEYWORD", keyword.data, level );
			if ( fstatus!=FIELDS_OK ) {
				status = BIBL_ERR_MEMERR;
				goto out;
			}
		}
	}
out:
	str_free( &keyword );
	return status;
}

/*
 * bibtex_names( bibout, newtag, field, level);
 *
 * split names in author list separated by and's (use '|' character)
 * and add names
 *
 * returns BIBL_OK on success, BIBL_ERR_MEMERR on memory error
 */

static int
bibtexin_person( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int begin, end, ok, n, etal, i, status, match;
	slist tokens;

	/* If we match the asis or corps list add and bail. */
	status = bibtex_matches_list( bibout, outtag, ":ASIS", invalue, level, &(pm->asis), &match );
	if ( match==1 || status!=BIBL_OK ) return status;
	status = bibtex_matches_list( bibout, outtag, ":CORP", invalue, level, &(pm->corps), &match );
	if ( match==1 || status!=BIBL_OK ) return status;

	slist_init( &tokens );

	bibtex_split( &tokens, invalue );
	for ( i=0; i<tokens.n; ++i )
		bibtex_cleantoken( slist_str( &tokens, i ) );

	etal = name_findetal( &tokens );

	begin = 0;
	n = tokens.n - etal;
	while ( begin < n ) {

		end = begin + 1;

		while ( end < n && strcasecmp( slist_cstr( &tokens, end ), "and" ) )
			end++;

		if ( end - begin == 1 ) {
			ok = name_addsingleelement( bibout, outtag, slist_cstr( &tokens, begin ), level, 0 );
			if ( !ok ) { status = BIBL_ERR_MEMERR; goto out; }
		} else {
			ok = name_addmultielement( bibout, outtag, &tokens, begin, end, level );
			if ( !ok ) { status = BIBL_ERR_MEMERR; goto out; }
		}

		begin = end + 1;

		/* Handle repeated 'and' errors: authors="G. F. Author and and B. K. Author" */
		while ( begin < n && !strcasecmp( slist_cstr( &tokens, begin ), "and" ) )
			begin++;
	}

	if ( etal ) {
		ok = name_addsingleelement( bibout, outtag, "et al.", level, 0 );
		if ( !ok ) status = BIBL_ERR_MEMERR;
	}

out:
	slist_free( &tokens );
	return status;

}

/**** bibtexin_title() ****/

/* bibtexin_titleinbook_isbooktitle()
 *
 * Normally, the title field of inbook refers to the book.  The
 * section in a @inbook reference is untitled.  If it's titled,
 * the @incollection should be used.  For example, in:
 *
 * @inbook{
 *    title="xxx"
 * }
 *
 * the booktitle is "xxx".
 *
 * However, @inbook is frequently abused (and treated like
 * @incollection) so that title and booktitle are present
 * and title is now 'supposed' to refer to the section.  For example:
 *
 * @inbook{
 *     title="yyy",
 *     booktitle="xxx"
 * }
 *
 * Therefore report whether or not booktitle is present as well
 * as title in @inbook references.  If not, then make 'title'
 * correspond to the title of the book, not the section.
 *
 */
static int
bibtexin_titleinbook_isbooktitle( fields *bibin, char *intag )
{
	int n;

	/* ...look only at 'title="xxx"' elements */
	if ( strcasecmp( intag, "TITLE" ) ) return 0;

	/* ...look only at '@inbook' references */
	n = fields_find( bibin, "INTERNAL_TYPE", LEVEL_ANY );
	if ( n==FIELDS_NOTFOUND ) return 0;
	if ( strcasecmp( fields_value( bibin, n, FIELDS_CHRP ), "INBOOK" ) ) return 0;

	/* ...look to see if 'booktitle="yyy"' exists */
	n = fields_find( bibin, "BOOKTITLE", LEVEL_ANY );
	if ( n==FIELDS_NOTFOUND ) return 0;
	else return 1;
}

static int
bibtexin_title( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
{
	int ok;

	if ( bibtexin_titleinbook_isbooktitle( bibin, intag->data ) ) level=LEVEL_MAIN;
	ok = title_process( bibout, "TITLE", invalue->data, level, pm->nosplittitle );
	if ( ok ) return BIBL_OK;
	else return BIBL_ERR_MEMERR;
}

static void
bibtexin_notag( param *p, char *tag )
{
	if ( p->verbose && strcmp( tag, "INTERNAL_TYPE" ) ) {
		if ( p->progname ) fprintf( stderr, "%s: ", p->progname );
		fprintf( stderr, "Cannot find tag '%s'\n", tag );
	}
}

static int
bibtexin_convertf( fields *bibin, fields *bibout, int reftype, param *p )
{
	static int (*convertfns[NUM_REFTYPES])(fields *, int, str *, str *, int, param *, char *, fields *) = {
		[ 0 ... NUM_REFTYPES-1 ] = generic_null,
		[ SIMPLE       ] = generic_simple,
		[ TITLE        ] = bibtexin_title,
		[ PERSON       ] = bibtexin_person,
		[ PAGES        ] = generic_pages,
		[ KEYWORD      ] = bibtexin_keyword,
		[ EPRINT       ] = bibtexin_eprint,
		[ HOWPUBLISHED ] = bibtexin_howpublished,
		[ LINKEDFILE   ] = bibtexin_linkedfile,
		[ NOTES        ] = generic_notes,
		[ GENRE        ] = generic_genre,
		[ BT_SENTE     ] = bibtexin_btsente,
		[ BT_ORG       ] = bibtexin_btorg,
		[ URL          ] = generic_url
	};

	int process, level, i, nfields, status = BIBL_OK;
	str *intag, *invalue;
	char *outtag;

	nfields = fields_num( bibin );
	for ( i=0; i<nfields; ++i ) {

		if ( fields_used( bibin, i ) )   continue; /* e.g. successful crossref */
		if ( fields_notag( bibin, i ) )  continue;
		if ( fields_nodata( bibin, i ) ) continue;

		intag   = fields_tag( bibin, i, FIELDS_STRP );
		invalue = fields_value( bibin, i, FIELDS_STRP );

		if ( !translate_oldtag( str_cstr( intag ), reftype, p->all, p->nall, &process, &level, &outtag ) ) {
			bibtexin_notag( p, str_cstr( intag ) );
			continue;
		}

		status = convertfns[ process ] ( bibin, i, intag, invalue, level, p, outtag, bibout );
		if ( status!=BIBL_OK ) return status;
	}

	if ( status==BIBL_OK && p->verbose ) fields_report( bibout, stderr );

	return status;
}
source-git / ghc-hs-bibutils

Source Code

Files