Blob Blame History Raw
/*
 * medin.c
 *
 * Copyright (c) Chris Putnam 2004-2018
 *
 * Source code released under the GPL version 2
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include "is_ws.h"
#include "str.h"
#include "str_conv.h"
#include "fields.h"
#include "xml.h"
#include "xml_encoding.h"
#include "iso639_2.h"
#include "bibutils.h"
#include "bibformats.h"

static int medin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset );
static int medin_processf( fields *medin, char *data, char *filename, long nref, param *p );


/*****************************************************
 PUBLIC: void medin_initparams()
*****************************************************/
void
medin_initparams( param *p, const char *progname )
{
	p->readformat       = BIBL_MEDLINEIN;
	p->charsetin        = BIBL_CHARSET_UNICODE;
	p->charsetin_src    = BIBL_SRC_DEFAULT;
	p->latexin          = 0;
	p->xmlin            = 1;
	p->utf8in           = 1;
	p->nosplittitle     = 0;
	p->verbose          = 0;
	p->addcount         = 0;
	p->output_raw       = BIBL_RAW_WITHMAKEREFID |
	                      BIBL_RAW_WITHCHARCONVERT;

	p->readf    = medin_readf;
	p->processf = medin_processf;
	p->cleanf   = NULL;
	p->typef    = NULL;
	p->convertf = NULL;
	p->all      = NULL;
	p->nall     = 0;

	slist_init( &(p->asis) );
	slist_init( &(p->corps) );

	if ( !progname ) p->progname = NULL;
	else p->progname = strdup( progname );
}

/*****************************************************
 PUBLIC: int medin_readf()
*****************************************************/

/*
 * The only difference between MEDLINE and PUBMED in format is
 * that the entire library is wrapped in <MedlineCitationSet>
 * or <PubmedArticle> tags...
 */
static char *wrapper[] = { "PubmedArticle", "MedlineCitation" };
static int nwrapper = sizeof( wrapper ) / sizeof( wrapper[0] );

static char *
medin_findstartwrapper( char *buf, int *ntype )
{
	char *startptr=NULL;
	int i;
	for ( i=0; i<nwrapper && startptr==NULL; ++i ) {
		startptr = xml_find_start( buf, wrapper[ i ] );
		if ( startptr && *ntype==-1 ) *ntype = i;
	}
	return startptr;
}

static char *
medin_findendwrapper( char *buf, int ntype )
{
	char *endptr = xml_find_end( buf, wrapper[ ntype ] );
	return endptr;
}

static int
medin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset )
{
	str tmp;
	char *startptr = NULL, *endptr;
	int haveref = 0, inref = 0, file_charset = CHARSET_UNKNOWN, m, type = -1;
	str_init( &tmp );
	while ( !haveref && str_fget( fp, buf, bufsize, bufpos, line ) ) {
		if ( line->data ) {
			m = xml_getencoding( line );
			if ( m!=CHARSET_UNKNOWN ) file_charset = m;
		}
		if ( line->data ) {
			startptr = medin_findstartwrapper( line->data, &type );
		}
		if ( startptr || inref ) {
			if ( inref ) str_strcat( &tmp, line );
			else {
				str_strcatc( &tmp, startptr );
				inref = 1;
			}
			endptr = medin_findendwrapper( str_cstr( &tmp ), type );
			if ( endptr ) {
				str_segcpy( reference, str_cstr( &tmp ), endptr );
				haveref = 1;
			}
		}
	}
	str_free( &tmp );
	*fcharset = file_charset;
	return haveref;
}

/*****************************************************
 PUBLIC: int medin_processf()
*****************************************************/

typedef struct xml_convert {
	char *in;       /* The input tag */
	char *a, *aval; /* The attribute="attribute_value" pair, if nec. */
	char *out;      /* The output tag */
	int level;
} xml_convert;

static int
medin_doconvert( xml *node, fields *info, xml_convert *c, int nc, int *found )
{
	int i, fstatus;
	char *d;
	*found = 0;
	if ( !xml_has_value( node ) ) return BIBL_OK;
	d = xml_value_cstr( node );
	for ( i=0; i<nc && *found==0; ++i ) {
		if ( c[i].a==NULL ) {
			if ( xml_tag_matches( node, c[i].in ) ) {
				*found = 1;
				fstatus = fields_add( info, c[i].out, d, c[i].level );
				if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
			}
		} else {
			if ( xml_tag_has_attribute( node, c[i].in, c[i].a, c[i].aval ) ) {
				*found = 1;
				fstatus = fields_add( info, c[i].out, d, c[i].level );
				if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
			}
		}
	
	}
	return BIBL_OK;
}

/* <ArticleTitle>Mechanism and.....</ArticleTitle>
 */
static int
medin_articletitle( xml *node, fields *info )
{
	int fstatus, status = BIBL_OK;
	if ( xml_has_value( node ) ) {
		fstatus = fields_add( info, "TITLE", xml_value_cstr( node ), 0 );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	return status;
}

/*            <MedlineDate>2003 Jan-Feb</MedlineDate> */
static int
medin_medlinedate( fields *info, char *p, int level )
{
	int fstatus;
	str tmp;

	str_init( &tmp );

	p = str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 );
	if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR;

	if ( str_has_value( &tmp ) ) {
		fstatus = fields_add( info, "PARTDATE:YEAR", str_cstr( &tmp ), level );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}

	p = str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 );
	if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR;

	if ( str_has_value( &tmp ) ) {
		str_findreplace( &tmp, "-", "/" );
		fstatus = fields_add( info, "PARTDATE:MONTH", str_cstr( &tmp ), level );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}

	(void) str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 );
	if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR;

	if ( str_has_value( &tmp ) ) {
		fstatus = fields_add( info, "PARTDATE:DAY", str_cstr( &tmp ), level );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}

	str_free( &tmp );

	return BIBL_OK;
}

/* <Langauge>eng</Language>
 */
static int
medin_language( xml *node, fields *info, int level )
{
	char *code, *language;
	int fstatus;
	code = xml_value_cstr( node );
	if ( !code ) return BIBL_OK;
	language = iso639_2_from_code( code );
	if ( language )
		fstatus = fields_add( info, "LANGUAGE", language, level );
	else
		fstatus = fields_add( info, "LANGUAGE", code, level );
	if ( fstatus==FIELDS_OK ) return BIBL_OK;
	else return BIBL_ERR_MEMERR;
}

/* <Journal>
 *    <ISSN>0027-8424</ISSN>
 *    <JournalIssue PrintYN="Y">
 *       <Volume>100</Volume>
 *       <Issue>21</Issue>
 *       <PubDate>
 *          <Year>2003</Year>
 *          <Month>Oct</Month>
 *          <Day>14</Day>
 *       </PubDate>
 *    </Journal Issue>
 * </Journal>
 *
 * or....
 *
 * <Journal>
 *    <ISSN IssnType="Print">0735-0414</ISSN>
 *    <JournalIssue CitedMedium="Print">
 *        <Volume>38</Volume>
 *        <Issue>1</Issue>
 *        <PubDate>
 *            <MedlineDate>2003 Jan-Feb</MedlineDate>
 *        </PubDate>
 *    </JournalIssue>
 *    <Title>Alcohol and alcoholism (Oxford, Oxfordshire)  </Title>
 *    <ISOAbbreviation>Alcohol Alcohol.</ISOAbbreviation>
 * </Journal>
 */
static int
medin_journal1( xml *node, fields *info )
{
	xml_convert c[] = {
		{ "Title",           NULL, NULL, "TITLE",          1 },
		{ "ISOAbbreviation", NULL, NULL, "SHORTTITLE",     1 },
		{ "ISSN",            NULL, NULL, "ISSN",           1 },
		{ "Volume",          NULL, NULL, "VOLUME",         1 },
		{ "Issue",           NULL, NULL, "ISSUE",          1 },
		{ "Year",            NULL, NULL, "PARTDATE:YEAR",  1 },
		{ "Month",           NULL, NULL, "PARTDATE:MONTH", 1 },
		{ "Day",             NULL, NULL, "PARTDATE:DAY",   1 },
	};
	int nc = sizeof( c ) / sizeof( c[0] ), status, found;
	if ( xml_has_value( node ) ) {
		status = medin_doconvert( node, info, c, nc, &found );
		if ( status!=BIBL_OK ) return status;
		if ( !found ) {
			if ( xml_tag_matches( node, "MedlineDate" ) ) {
				status = medin_medlinedate( info, xml_value_cstr( node ), 1 );
				if ( status!=BIBL_OK ) return status;
			}
			if ( xml_tag_matches( node, "Language" ) ) {
				status = medin_language( node, info, 1 );
				if ( status!=BIBL_OK ) return status;
			}
		}
	}
	if ( node->down ) {
		status = medin_journal1( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) {
		status = medin_journal1( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

/* <Pagination>
 *    <MedlinePgn>12111-6</MedlinePgn>
 * </Pagination>
 */
static int
medin_pagination( xml *node, fields *info )
{
	int i, fstatus, status;
	str sp, ep;
	char *p, *pp;
	if ( xml_tag_matches( node, "MedlinePgn" ) && node->value.len ) {
		strs_init( &sp, &ep, NULL );
		p = str_cpytodelim( &sp, xml_value_cstr( node ), "-", 1 );
		if ( str_memerr( &sp ) ) return BIBL_ERR_MEMERR;
		if ( str_has_value( &sp ) ) {
			fstatus = fields_add( info, "PAGES:START", str_cstr( &sp ), 1 );
			if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		}
		(void) str_cpytodelim( &ep, p, "", 0 );
		if ( str_memerr( &ep ) ) return BIBL_ERR_MEMERR;
		if ( str_has_value( &ep ) ) {
			if ( sp.len > ep.len ) {
				for ( i=sp.len-ep.len; i<sp.len; ++i )
					sp.data[i] = ep.data[i-sp.len+ep.len];
				pp = sp.data;
			} else  pp = ep.data;
			fstatus = fields_add( info, "PAGES:STOP", pp, 1 );
			if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		}
		strs_free( &sp, &ep, NULL );
	}
	if ( node->down ) {
		status = medin_pagination( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) {
		status = medin_pagination( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

/* <Abstract>
 *    <AbstractText>ljwejrelr</AbstractText>
 * </Abstract>
 */
static int
medin_abstract( xml *node, fields *info )
{
	int fstatus;
	if ( xml_tag_matches_has_value( node, "AbstractText" ) ) {
		fstatus = fields_add( info, "ABSTRACT", xml_value_cstr( node ), 0 );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	} else if ( node->next ) return medin_abstract( node->next, info );
	return BIBL_OK;
}

/* <AuthorList CompleteYN="Y">
 *    <Author>
 *        <LastName>Barondeau</LastName>
 *        <ForeName>David P</ForeName>
 *        ( or <FirstName>David P</FirstName> )
 *        <Initials>DP</Initials>
 *    </Author>
 *    <Author>
 *        <CollectiveName>Organization</CollectiveName>
 *    </Author>
 * </AuthorList>
 */
static int
medin_author( xml *node, str *name )
{
	char *p;
	if ( xml_tag_matches( node, "LastName" ) ) {
		if ( str_has_value( name ) ) {
			str_prepend( name, "|" );
			str_prepend( name, xml_value_cstr( node ) );
		}
		else str_strcat( name, xml_value( node ) );
	} else if ( xml_tag_matches( node, "ForeName" ) ||
	            xml_tag_matches( node, "FirstName" ) ) {
		p = xml_value_cstr( node );
		while ( p && *p ) {
			if ( str_has_value( name ) ) str_addchar( name, '|' );
			while ( *p==' ' ) p++;
			while ( *p && *p!=' ' ) str_addchar( name, *p++ );
		}
	} else if ( xml_tag_matches( node, "Initials" ) && !strchr( name->data, '|' )) {
		p = xml_value_cstr( node );
		while ( p && *p ) {
			if ( str_has_value( name ) ) str_addchar( name, '|' );
			if ( !is_ws(*p) ) str_addchar( name, *p++ );
		}
	}
	if ( node->next ) medin_author( node->next, name );
	return BIBL_OK;
}

static int
medin_corpauthor( xml *node, str *name )
{
	if ( xml_tag_matches( node, "CollectiveName" ) ) {
		str_strcpy( name, xml_value( node ) );
	} else if ( node->next ) medin_corpauthor( node->next, name );
	return BIBL_OK;
}

static int
medin_authorlist( xml *node, fields *info )
{
	int fstatus, status;
	str name;
	char *tag;
	str_init( &name );
	node = node->down;
	while ( node ) {
		if ( xml_tag_matches( node, "Author" ) && node->down ) {
			status = medin_author( node->down, &name );
			tag = "AUTHOR";
			if ( str_is_empty( &name ) ) {
				status = medin_corpauthor( node->down, &name );
				tag = "AUTHOR:CORP";
			}
			if ( str_memerr( &name ) || status!=BIBL_OK ) return BIBL_ERR_MEMERR;
			if ( str_has_value( &name ) ) {
				fstatus = fields_add(info,tag,name.data,0);
				if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
			}
			str_empty( &name );
		}
		node = node->next;
	}
	str_free( &name );
	return BIBL_OK;
}

/* <PublicationTypeList>
 *    <PublicationType>Journal Article</PublicationType>
 * </PublicationTypeList>
 */

/* <MedlineJournalInfo>
 *    <Country>United States</Country>
 *    <MedlineTA>Proc Natl Acad Sci U S A</MedlineTA>
 *    <NlmUniqueID>7507876</NlmUniqueID>
 * </MedlineJournalInfo>
 */

static int
medin_journal2( xml *node, fields *info )
{
	int fstatus, status = BIBL_OK;
	if ( xml_tag_matches_has_value( node, "MedlineTA" ) && fields_find( info, "TITLE", LEVEL_HOST )==FIELDS_NOTFOUND ) {
		fstatus = fields_add( info, "TITLE", xml_value_cstr( node ), 1 );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}
	if ( node->down ) {
		status = medin_journal2( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) status = medin_journal2( node->next, info );
	return status;
}

/*
<MeshHeadingList>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Biophysics</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Crystallography, X-Ray</DescriptorName>
</MeshHeading>
</MeshHeadingList>
*/
static int
medin_meshheading( xml *node, fields *info )
{
	int fstatus, status = BIBL_OK;
	if ( xml_tag_matches_has_value( node, "DescriptorName" ) ) {
		fstatus = fields_add( info, "KEYWORD", xml_value_cstr( node ), 0 );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}
	if ( node->next ) status = medin_meshheading( node->next, info );
	return status;
}

static int
medin_meshheadinglist( xml *node, fields *info )
{
	int status = BIBL_OK;
	if ( xml_tag_matches( node, "MeshHeading" ) && node->down ) {
		status = medin_meshheading( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) status = medin_meshheadinglist( node->next, info );
	return status;
}

/* <PubmedData>
 *     ....
 *     <ArticleIdList>
 *         <ArticleId IdType="pubmed">14523232</ArticleId>
 *         <ArticleId IdType="doi">10.1073/pnas.2133463100</ArticleId>
 *         <ArticleId IdType="pii">2133463100</ArticleId>
 *         <ArticleId IdType="pmc">PMC4833866</ArticleId>
 *     </ArticleIdList>
 * </PubmedData>
 *
 * I think "pii" is "Publisher Item Identifier"
 */
static int
medin_pubmeddata( xml *node, fields *info )
{
	xml_convert c[] = {
		{ "ArticleId", "IdType", "doi",     "DOI",     0 },
		{ "ArticleId", "IdType", "pubmed",  "PMID",    0 },
		{ "ArticleId", "IdType", "medline", "MEDLINE", 0 },
		{ "ArticleId", "IdType", "pmc",     "PMC",     0 },
		{ "ArticleId", "IdType", "pii",     "PII",     0 },
	};
	int nc = sizeof( c ) / sizeof( c[0] ), found, status;
	status = medin_doconvert( node, info, c, nc, &found );
	if ( status!=BIBL_OK ) return status;
	if ( node->next ) {
		status = medin_pubmeddata( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->down ) {
		medin_pubmeddata( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	return BIBL_OK;
}

static int
medin_article( xml *node, fields *info )
{
	int fstatus, status = BIBL_OK;
	if ( xml_tag_matches( node, "Journal" ) )
		status = medin_journal1( node, info );
	else if ( xml_tag_matches( node, "ArticleTitle" ) )
		status = medin_articletitle( node, info );
	else if ( xml_tag_matches( node, "Pagination" ) && node->down )
		status = medin_pagination( node->down, info );
	else if ( xml_tag_matches( node, "Abstract" ) && node->down )
		status = medin_abstract( node->down, info );
	else if ( xml_tag_matches( node, "AuthorList" ) )
		status = medin_authorlist( node, info );
	else if ( xml_tag_matches( node, "Language" ) )
		status = medin_language( node, info, 0 );
	else if ( xml_tag_matches( node, "Affiliation" ) ) {
		fstatus = fields_add( info, "ADDRESS", xml_value_cstr( node ), 0 );
		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
	}
	if ( status!=BIBL_OK ) return status;
	if ( node->next ) status = medin_article( node->next, info );
	return status;
}

static int
medin_medlinecitation( xml *node, fields *info )
{
	int fstatus, status = BIBL_OK;
	if ( xml_tag_matches_has_value( node, "PMID" ) ) {
		fstatus = fields_add( info, "PMID", xml_value_cstr( node ), 0 );
		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
	}
	if ( node->down ) {
		if ( xml_tag_matches( node, "Article" ) ) {
			status = medin_article( node->down, info );
		} else if ( xml_tag_matches( node, "MedlineJournalInfo" ) ) {
			status = medin_journal2( node->down, info );
		} else if ( xml_tag_matches( node, "MeshHeadingList" ) )
			status = medin_meshheadinglist( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) status = medin_medlinecitation( node->next, info );
	return status;
}

static int
medin_pubmedarticle( xml *node, fields *info )
{
	int status = BIBL_OK;
	if ( node->down ) {
		if ( xml_tag_matches( node, "MedlineCitation" ) )
			status = medin_medlinecitation( node->down, info );
		else if ( xml_tag_matches( node, "PubmedData" ) )
			status = medin_pubmeddata( node->down, info );
		if ( status!=BIBL_OK ) return status;
	}
	if ( node->next ) status = medin_pubmedarticle( node->next, info );
	return status;
}

static int
medin_assembleref( xml *node, fields *info )
{
	int status = BIBL_OK;
	if ( node->down ) {
		if ( xml_tag_matches( node, "PubmedArticle" ) )
			status = medin_pubmedarticle( node->down, info );
		else if ( xml_tag_matches( node, "MedlineCitation" ) )
			status = medin_medlinecitation( node->down, info );
		else
			status = medin_assembleref( node->down, info );
	}
	if ( status!=BIBL_OK ) return status;

	if ( node->next ) {
		status = medin_assembleref( node->next, info );
		if ( status!=BIBL_OK ) return status;
	}

	/* assume everything is a journal article */
	if ( fields_num( info ) ) {
		status = fields_add( info, "RESOURCE", "text", 0 );
		if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		status = fields_add( info, "ISSUANCE", "continuing", 1 );
		if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		status = fields_add( info, "GENRE:MARC", "periodical", 1 );
		if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		status = fields_add( info, "GENRE:BIBUTILS", "academic journal", 1 );
		if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR;
		status = BIBL_OK;
	}

	return status;
}

static int
medin_processf( fields *medin, char *data, char *filename, long nref, param *p )
{
	int status;
	xml top;

	xml_init( &top );
	xml_parse( data, &top );
	status = medin_assembleref( &top, medin );
	xml_free( &top );

	if ( status==BIBL_OK ) return 1;
	return 0;
}