/*
 * endxmlin.c
 *
 * Copyright (c) Chris Putnam 2006-2018
 *
 * Program and source code released under the GPL version 2
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include "str.h"
#include "str_conv.h"
#include "fields.h"
#include "name.h"
#include "xml.h"
#include "xml_encoding.h"
#include "reftypes.h"
#include "bibformats.h"

/*
 * attribs - one row of a lookup table that pairs an XML element name
 * with the internal tag its contents are stored under.
 */
typedef struct {
	/* XML element name; compared against a node with xml_tag_matches() */
	char *attrib;
	/* tag passed on when a matching element's data is added to the fields */
	char *internal;
} attribs;

/* Table and element count defined elsewhere; installed as p->all / p->nall
 * by endxmlin_initparams().
 */
extern variants end_all[];
extern int end_nall;

/* Callbacks implemented in this file (installed as p->readf / p->processf) */
static int endxmlin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset );
static int endxmlin_processf( fields *endin, char *p, char *filename, long nref, param *pm );
/* Callbacks implemented outside this file (installed as p->typef / p->convertf) */
extern int endin_typef( fields *endin, char *filename, int nrefs, param *p );
extern int endin_convertf( fields *endin, fields *info, int reftype, param *p );
/* NOTE(review): declared but not referenced in the code visible here;
 * endxmlin_initparams() sets p->cleanf to NULL -- confirm that is intended.
 */
extern int endin_cleanf( bibl *bin, param *p );


/*****************************************************
 PUBLIC: void endxmlin_initparams()
*****************************************************/
/*
 * endxmlin_initparams()
 *
 * Fill in *p for reading EndNote XML: input format and character set
 * defaults, the reader callbacks, the end_all lookup table, and two
 * freshly initialised string lists (asis, corps).
 *
 * progname is duplicated with strdup(); a NULL progname is stored as
 * NULL. The result of strdup() is not checked.
 *
 * NOTE(review): strdup() is declared in <string.h>, which this file does
 * not include directly -- confirm one of the project headers provides it.
 */
void
endxmlin_initparams( param *p, const char *progname )
{
	/* callbacks and lookup table used by the generic reader */
	p->readf    = endxmlin_readf;
	p->processf = endxmlin_processf;
	p->cleanf   = NULL;
	p->typef    = endin_typef;
	p->convertf = endin_convertf;
	p->all      = end_all;
	p->nall     = end_nall;

	/* input description */
	p->readformat       = BIBL_ENDNOTEXMLIN;
	p->charsetin        = BIBL_CHARSET_DEFAULT;
	p->charsetin_src    = BIBL_SRC_DEFAULT;
	p->latexin          = 0;
	p->xmlin            = 1;
	p->utf8in           = 1;

	/* processing options */
	p->nosplittitle     = 0;
	p->verbose          = 0;
	p->addcount         = 0;
	p->output_raw       = 0;

	slist_init( &(p->asis) );
	slist_init( &(p->corps) );

	p->progname = progname ? strdup( progname ) : NULL;
}

/*****************************************************
 PUBLIC: int endxmlin_readf()
*****************************************************/

/*
 * xml_readmore()
 *
 * Read the next chunk of input (at most bufsize-1 characters, stopping
 * after a newline) from fp into buf.
 *
 * Returns 0 when a chunk was read and 1 when there is nothing more to
 * read (end of file or read error). In the latter case buf is set to
 * the empty string so a caller that appends buf unconditionally does
 * not append the previous chunk a second time.
 *
 * bufpos is accepted for interface compatibility and is not used.
 */
static int
xml_readmore( FILE *fp, char *buf, int bufsize, int *bufpos )
{
	(void) bufpos;

	if ( !feof( fp ) && fgets( buf, bufsize, fp ) ) return 0;

	/* fgets() leaves buf untouched at EOF and indeterminate on error */
	if ( bufsize > 0 ) buf[0] = '\0';
	return 1;
}

/*
 * endxmlin_readf()
 *
 * Extract the next RECORD element from the input.
 *
 * Input is accumulated in line (which persists between calls) one
 * chunk at a time until both the start and the end of a RECORD element
 * have been located. The span between them is copied to reference and
 * whatever follows the end is kept in line for the next call.
 *
 * *fcharset receives the last value other than CHARSET_UNKNOWN that
 * xml_getencoding() reported for line during this call, or
 * CHARSET_UNKNOWN if there was none.
 *
 * Returns 1 when a reference was extracted, 0 when input ran out first.
 */
static int
endxmlin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset )
{
	str tmp;
	char *startptr = NULL, *endptr = NULL;
	int haveref = 0, inref = 0, done = 0, file_charset = CHARSET_UNKNOWN, m;
	str_init( &tmp );
	while ( !haveref && !done ) {
		if ( line->data ) {
			if ( !inref ) {
				startptr = xml_find_start( line->data, "RECORD" );
				if ( startptr ) inref = 1;
			}
			/* Look for the end in the same pass that found the
			 * start; otherwise a record that is already complete
			 * in line is lost when the extra read hits end of file.
			 */
			if ( inref )
				endptr = xml_find_end( line->data, "RECORD" );
		}

		/* If no <record> tag, we can trim up to last 8 bytes */
		/* Emptying string can lose fragments of <record> tag */
		if ( !startptr ) {
			if ( line->len > 8 ) {
				int n = 8;
				char *p = &(line->data[line->len-1]);
				while ( *p && n ) { p--; n--; }
				str_segdel( line, line->data, p );
			}
		}

		if ( !startptr || !endptr ) {
			done = xml_readmore( fp, buf, bufsize, bufpos );
			/* only append when something was actually read; on
			 * failure buf may still hold the previous chunk
			 */
			if ( !done ) str_strcatc( line, buf );
		} else {
			/* we can reallocate in the str_strcat, so re-find */
			startptr = xml_find_start( line->data, "RECORD" );
			endptr = xml_find_end( line->data, "RECORD" );
			str_segcpy( reference, startptr, endptr );
			/* clear out information in line; go through tmp because
			 * endptr points into line's own buffer
			 */
			str_strcpyc( &tmp, endptr );
			str_strcpy( line, &tmp );
			haveref = 1;
		}
		if ( line->data ) {
			m = xml_getencoding( line );
			if ( m!=CHARSET_UNKNOWN ) file_charset = m;
		}
	}
	str_free( &tmp );
	*fcharset = file_charset;
	return haveref;
}

/*****************************************************
 PUBLIC: int endxmlin_processf()
*****************************************************/

/*
 * add data to fields
 */

/*
 * endxmlin_datar()
 *
 * Append the text of a field to s. The text may sit directly in the
 * element or be spread over one or more <style> pieces:
 *
 *   <datatype>aaaaaa</datatype>
 *
 *   <datatype>
 *          <style>aaaaa</style>
 *   </datatype>
 *
 *   <datatype>
 *          <style>aaa</style><style>aaaa</style>
 *   </datatype>
 *
 * The node's own value is appended first. The first child is followed
 * only when it is a <style> element, and the next sibling is followed
 * only when the current node is itself a <style> element.
 *
 * Returns BIBL_OK, or BIBL_ERR_MEMERR if s reports a memory error.
 */
static int
endxmlin_datar( xml *node, str *s )
{
	int rc = BIBL_OK;

	if ( xml_has_value( node ) ) {
		str_strcat( s, &(node->value) );
		if ( str_memerr( s ) ) return BIBL_ERR_MEMERR;
	}

	if ( node->down && xml_tag_matches( node->down, "style" ) )
		rc = endxmlin_datar( node->down, s );

	if ( rc==BIBL_OK && xml_tag_matches( node, "style" ) && node->next )
		rc = endxmlin_datar( node->next, s );

	return rc;
}

/*
 * endxmlin_data()
 *
 * Gather the text of node (see endxmlin_datar()) and, if there is any,
 * add it to info under inttag at the given level.
 *
 * Returns BIBL_OK on success, the status from endxmlin_datar() if
 * gathering fails, or BIBL_ERR_MEMERR if the field cannot be added.
 * The temporary string is released on every path.
 */
static int
endxmlin_data( xml *node, char *inttag, fields *info, int level )
{
	int status;
	str s;

	str_init( &s );

	status = endxmlin_datar( node, &s );

	if ( status==BIBL_OK && str_has_value( &s ) ) {
		if ( fields_add( info, inttag, s.data, level )!=FIELDS_OK )
			status = BIBL_ERR_MEMERR;
	}

	/* single exit so s is never leaked on an error return */
	str_free( &s );
	return status;
}

/* <titles>
 *    <title>
 *       <style>ACTUAL TITLE HERE</style><style>MORE TITLE</style>
 *    </title>
 * </titles>
 *
 * endxmlin_titles()
 *
 * Walk node and its following siblings. For each element that matches
 * an entry of the table below and has children, gather its text, trim
 * leading and trailing whitespace, and add it to info at level 0 under
 * the corresponding internal tag.
 *
 * Returns BIBL_OK, or BIBL_ERR_MEMERR if gathering the text or adding
 * the field fails. The scratch string is released on every path.
 */
static int
endxmlin_titles( xml *node, fields *info )
{
	attribs a[] = {
		{ "title", "%T" },
		{ "secondary-title", "%B" },
		{ "tertiary-title", "%S" },
		{ "alt-title", "%!" },
		{ "short-title", "SHORTTITLE" },
	};
	int i, status = BIBL_OK, n = sizeof( a ) / sizeof ( a[0] );
	str title;

	str_init( &title );

	/* iterate over the siblings so there is one scratch string and
	 * one place where it is freed
	 */
	for ( ; node && status==BIBL_OK; node = node->next ) {
		for ( i=0; i<n; ++i ) {
			if ( !xml_tag_matches( node, a[i].attrib ) || !node->down )
				continue;
			str_empty( &title );
			if ( endxmlin_datar( node, &title )!=BIBL_OK ) {
				status = BIBL_ERR_MEMERR;
				break;
			}
			str_trimstartingws( &title );
			str_trimendingws( &title );
			if ( fields_add( info, a[i].internal, title.data, 0 )!=FIELDS_OK ) {
				status = BIBL_ERR_MEMERR;
				break;
			}
		}
	}

	str_free( &title );
	return status;
}

/* <contributors>
 *    <secondary-authors>
 *        <author>
 *             <style>ACTUAL AUTHORS HERE</style>
 *        </author>
 *    </secondary-authors>
 * </contributors>
 */
/* <!ATTLIST author
 *      corp-name CDATA #IMPLIED
 *      first-name CDATA #IMPLIED
 *      initials CDATA #IMPLIED
 *      last-name CDATA #IMPLIED
 *      middle-initial CDATA #IMPLIED
 *      role CDATA #IMPLIED
 *      salutation CDATA #IMPLIED
 *      suffix CDATA #IMPLIED
 *      title CDATA #IMPLIED
 * >
 *
 * endxmlin_contributor()
 *
 * Add the text of node and of each following sibling to info under
 * int_tag at the given level. Stops at, and returns, the first status
 * that is not BIBL_OK.
 */
static int
endxmlin_contributor( xml *node, fields *info, char *int_tag, int level )
{
	int status;

	do {
		status = endxmlin_data( node, int_tag, info, level );
		if ( status!=BIBL_OK ) break;
		node = node->next;
	} while ( node );

	return status;
}

/*
 * endxmlin_contributors()
 *
 * Walk node and its following siblings. The children of every element
 * that matches a row of the table below are added to info at level 0
 * under that row's internal tag.
 *
 * Returns BIBL_OK or the first failing status.
 */
static int
endxmlin_contributors( xml *node, fields *info )
{
	attribs roles[] = {
		{ "authors", "%A" },
		{ "secondary-authors", "%E" },
		{ "tertiary-authors", "%Y" },
		{ "subsidiary-authors", "%?" },
		{ "translated-authors", "%?" },
	};
	int nroles = sizeof( roles ) / sizeof ( roles[0] );
	int k, status;

	do {
		for ( k=0; k<nroles; ++k ) {
			if ( !xml_tag_matches( node, roles[k].attrib ) ) continue;
			if ( !node->down ) continue;
			status = endxmlin_contributor( node->down, info, roles[k].internal, 0 );
			if ( status!=BIBL_OK ) return status;
		}
		node = node->next;
	} while ( node );

	return BIBL_OK;
}

/*
 * endxmlin_keyword()
 *
 * Add the text of every <keyword> element among node and its following
 * siblings to info as "%K" at level 0. Returns BIBL_OK or the first
 * failing status.
 */
static int
endxmlin_keyword( xml *node, fields *info )
{
	int status;

	do {
		if ( xml_tag_matches( node, "keyword" ) ) {
			status = endxmlin_data( node, "%K", info, 0 );
			if ( status!=BIBL_OK ) return status;
		}
		node = node->next;
	} while ( node );

	return BIBL_OK;
}

/*
 * endxmlin_keywords()
 *
 * If the first child of node is a <keyword> element, process it and
 * its siblings with endxmlin_keyword(); otherwise do nothing.
 */
static int
endxmlin_keywords( xml *node, fields *info )
{
	xml *first = node->down;

	if ( !first || !xml_tag_matches( first, "keyword" ) ) return BIBL_OK;

	return endxmlin_keyword( first, info );
}

/*
 *<electronic-resource-num><style face="normal" font="default" 
 * size="100%">10.1007/BF00356334</style></electronic-resource-num>
 *
 * endxmlin_ern()
 *
 * Add the text of an <electronic-resource-num> element to info as
 * "DOI" at level 0. Any other element is ignored.
 */
static int
endxmlin_ern( xml *node, fields *info )
{
	if ( !xml_tag_matches( node, "electronic-resource-num" ) )
		return BIBL_OK;

	return endxmlin_data( node, "DOI", info, 0 );
}

/*
 * endxmlin_language()
 *
 * Add the text of a <language> element to info as "%G" at level 0.
 * Any other element is ignored.
 */
static int
endxmlin_language( xml *node, fields *info )
{
	if ( !xml_tag_matches( node, "language" ) )
		return BIBL_OK;

	return endxmlin_data( node, "%G", info, 0 );
}

/*
 * <urls>
 *    <pdf-urls>
 *           <url>internal-pdf://Zukin_1995_The_Cultures_of_Cities-0000551425/Zukin_1995_The_Cultures_of_Cities.pdf</url>
 *    </pdf-urls>
 * </urls>
 *
 * endxmlin_fileattach()
 *
 * Visit node, its descendants and its following siblings; the text of
 * every <url> element found is added to info as "FILEATTACH" at
 * level 0. Returns BIBL_OK or the first failing status.
 */
static int
endxmlin_fileattach( xml *node, fields *info )
{
	int status;

	do {
		if ( xml_tag_matches( node, "url" ) ) {
			status = endxmlin_data( node, "FILEATTACH", info, 0 );
			if ( status!=BIBL_OK ) return status;
		}
		/* children are handled before moving on to the next sibling */
		if ( node->down ) {
			status = endxmlin_fileattach( node->down, info );
			if ( status!=BIBL_OK ) return status;
		}
		node = node->next;
	} while ( node );

	return BIBL_OK;
}

/*
 * endxmlin_urls()
 *
 * Walk node and its following siblings:
 *   - a <pdf-urls> element with children is handed to
 *     endxmlin_fileattach();
 *   - a <url> element is added to info as "%U" at level 0;
 *   - any other element is descended into only when its first child is
 *     <related-urls>, <pdf-urls> or <url>.
 *
 * Returns BIBL_OK or the first failing status.
 */
static int
endxmlin_urls( xml *node, fields *info )
{
	int status;

	do {
		status = BIBL_OK;

		if ( xml_tag_matches( node, "pdf-urls" ) && node->down )
			status = endxmlin_fileattach( node->down, info );
		else if ( xml_tag_matches( node, "url" ) )
			status = endxmlin_data( node, "%U", info, 0 );
		else if ( node->down &&
		          ( xml_tag_matches( node->down, "related-urls" ) ||
		            xml_tag_matches( node->down, "pdf-urls" ) ||
		            xml_tag_matches( node->down, "url" ) ) )
			status = endxmlin_urls( node->down, info );

		if ( status!=BIBL_OK ) return status;

		node = node->next;
	} while ( node );

	return BIBL_OK;
}

/*
 * endxmlin_pubdates()
 *
 * A <date> element is added to info as "%8" at level 0. For any other
 * element, the first child is processed the same way if it is a <date>
 * element. Siblings are not visited.
 */
static int
endxmlin_pubdates( xml *node, fields *info )
{
	xml *child;

	if ( xml_tag_matches( node, "date" ) )
		return endxmlin_data( node, "%8", info, 0 );

	child = node->down;
	if ( child && xml_tag_matches( child, "date" ) )
		return endxmlin_pubdates( child, info );

	return BIBL_OK;
}

/*
 * endxmlin_dates()
 *
 * Walk node and its following siblings:
 *   - a <year> element is added to info as "%D" at level 0;
 *   - a <pub-dates> element is handed to endxmlin_pubdates(), wherever
 *     it appears among the siblings (not only as the first child);
 *   - any other element is descended into when its first child is
 *     <year> or <pub-dates>, so both orderings of the children are
 *     handled.
 *
 * NOTE(review): the caller in this file passes the <dates> element
 * itself, so the sibling walk also continues over the elements that
 * follow <dates> in the record.
 *
 * Returns BIBL_OK or the first failing status.
 */
static int
endxmlin_dates( xml *node, fields *info )
{
	int status;
	if ( xml_tag_matches( node, "year" ) ) {
		status = endxmlin_data( node, "%D", info, 0 );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "pub-dates" ) ) {
		status = endxmlin_pubdates( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( node->down ) {
		if ( xml_tag_matches( node->down, "year" ) ||
		     xml_tag_matches( node->down, "pub-dates" ) ) {
			/* the recursive call walks all of the children */
			status = endxmlin_dates( node->down, info );
			if ( status!=BIBL_OK ) return status;
		}
	}
	if ( node->next ) {
		status = endxmlin_dates( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

/*
 * <ref-type name="Journal Article">17</ref-type>
 *
 * endxmlin_reftype()
 *
 * Add the value of the element's "name" attribute, when it has one, to
 * info as "%0" at level 0. Returns BIBL_ERR_MEMERR if the field cannot
 * be added, BIBL_OK otherwise.
 */
static int
endxmlin_reftype( xml *node, fields *info )
{
	str *name = xml_attribute( node, "name" );

	if ( !str_has_value( name ) ) return BIBL_OK;

	if ( fields_add( info, "%0", str_cstr( name ), 0 )!=FIELDS_OK )
		return BIBL_ERR_MEMERR;

	return BIBL_OK;
}

/*
 * endxmlin_record()
 *
 * Process node and, recursively, each following sibling as one child
 * element of a record:
 *   - structured elements (ref-type, contributors, titles, keywords,
 *     urls, electronic-resource-num, dates, language) go to their
 *     dedicated handlers;
 *   - the elements listed with empty branches are recognised and
 *     skipped;
 *   - anything else is looked up in the table below and, on a match,
 *     added to info at level 0 under the internal tag.
 *
 * Returns BIBL_OK or the first failing status from any handler,
 * including the titles handler.
 */
static int
endxmlin_record( xml *node, fields *info )
{
	attribs a[] = {
		{ "volume", "%V" },
		{ "num-vol", "%6" },
		{ "pages",  "%P" },
		{ "number", "%N" },
		{ "issue",  "%N" },
		{ "label",  "%F" },
		{ "auth-address", "%C" },
		{ "auth-affiliation", "%C" },
		{ "pub-location", "%C" },
		{ "publisher", "%I" },
		{ "abstract", "%X" },
		{ "edition", "%7" },
		{ "reprint-edition", "%)" },
		{ "section", "%&" },
		{ "accession-num", "%M" },
		{ "call-num", "%L" },
		{ "isbn", "%@" },
		{ "notes", "%O" },
		{ "custom1", "%1" },
		{ "custom2", "%2" },
		{ "custom3", "%3" },
		{ "custom4", "%4" },
		{ "custom5", "%#" },
		{ "custom6", "%$" },
	};
	int i, status, n = sizeof ( a ) / sizeof( a[0] );
	if ( xml_tag_matches( node, "DATABASE" ) ) {
	} else if ( xml_tag_matches( node, "SOURCE-APP" ) ) {
	} else if ( xml_tag_matches( node, "REC-NUMBER" ) ) {
	} else if ( xml_tag_matches( node, "ref-type" ) ) {
		status = endxmlin_reftype( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "contributors" ) ) {
		if ( node->down ) {
			status = endxmlin_contributors( node->down, info );
			if ( status!=BIBL_OK ) return status;
		}
	} else if ( xml_tag_matches( node, "titles" ) ) {
		if ( node->down ) {
			/* propagate failures like every other handler */
			status = endxmlin_titles( node->down, info );
			if ( status!=BIBL_OK ) return status;
		}
	} else if ( xml_tag_matches( node, "keywords" ) ) {
		status = endxmlin_keywords( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "urls" ) ) {
		status = endxmlin_urls( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "electronic-resource-num" ) ) {
		status = endxmlin_ern( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "dates" ) ) {
		status = endxmlin_dates( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "language" ) ) {
		status = endxmlin_language( node, info );
		if ( status!=BIBL_OK ) return status;
	} else if ( xml_tag_matches( node, "periodical" ) ) {
	} else if ( xml_tag_matches( node, "secondary-volume" ) ) {
	} else if ( xml_tag_matches( node, "secondary-issue" ) ) {
	} else if ( xml_tag_matches( node, "reprint-status" ) ) {
	} else if ( xml_tag_matches( node, "orig-pub" ) ) {
	} else if ( xml_tag_matches( node, "report-id" ) ) {
	} else if ( xml_tag_matches( node, "coden" ) ) {
	} else if ( xml_tag_matches( node, "caption" ) ) {
	} else if ( xml_tag_matches( node, "research-notes" ) ) {
	} else if ( xml_tag_matches( node, "work-type" ) ) {
	} else if ( xml_tag_matches( node, "reviewed-item" ) ) {
	} else if ( xml_tag_matches( node, "availability" ) ) {
	} else if ( xml_tag_matches( node, "remote-source" ) ) {
	} else if ( xml_tag_matches( node, "meeting-place" ) ) {
	} else if ( xml_tag_matches( node, "work-location" ) ) {
	} else if ( xml_tag_matches( node, "work-extent" ) ) {
	} else if ( xml_tag_matches( node, "pack-method" ) ) {
	} else if ( xml_tag_matches( node, "size" ) ) {
	} else if ( xml_tag_matches( node, "repro-ratio" ) ) {
	} else if ( xml_tag_matches( node, "remote-database-name" ) ) {
	} else if ( xml_tag_matches( node, "remote-database-provider" ) ) {
	} else if ( xml_tag_matches( node, "access-date" ) ) {
	} else if ( xml_tag_matches( node, "modified-data" ) ) {
	} else if ( xml_tag_matches( node, "misc1" ) ) {
	} else if ( xml_tag_matches( node, "misc2" ) ) {
	} else if ( xml_tag_matches( node, "misc3" ) ) {
	} else {
		for ( i=0; i<n; ++i ) {
			if ( xml_tag_matches( node, a[i].attrib ) ) {
				status = endxmlin_data( node, a[i].internal, info, 0 );
				if ( status!=BIBL_OK ) return status;
			}
		}
	}
	if ( node->next ) {
		status = endxmlin_record( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

/*
 * endxmlin_assembleref()
 *
 * Locate the RECORD element and process its children. A node with an
 * empty tag is skipped by descending to its first child; a RECORD node
 * with children is handed to endxmlin_record(). Anything else, or a
 * node without children, yields BIBL_OK with nothing added.
 */
static int
endxmlin_assembleref( xml *node, fields *info )
{
	if ( str_is_empty( &(node->tag) ) )
		return node->down ? endxmlin_assembleref( node->down, info ) : BIBL_OK;

	if ( xml_tag_matches( node, "RECORD" ) && node->down )
		return endxmlin_record( node->down, info );

	return BIBL_OK;
}

/* endxmlin_processf()
 *
 * Parse the XML text of one reference and store its contents in fin
 * using EndNote-style tags; the conversion from those tags to MODS
 * happens later, in convertf.
 *
 * The intermediate step is needed because the XML format is as
 * overloaded as the tags of the Refer format it mirrors.
 *
 * filename, nref and pm are not used here.
 *
 * Returns 1 on success and 0 on failure.
 */
static int
endxmlin_processf( fields *fin, char *data, char *filename, long nref, param *pm )
{
	xml top;
	int ok;

	xml_init( &top );
	xml_parse( data, &top );
	ok = ( endxmlin_assembleref( &top, fin )==BIBL_OK );
	xml_free( &top );

	return ok;
}