/*
 * url.c
 *
 * doi_to_url()
 *	Handle outputting DOI as a URL (Endnote and RIS formats)
 *	1) Prepend https://doi.org as necessary
 *	2) Check for overlap with pre-existing URL for the DOI
 *
 * is_doi()
 *	Check for DOI buried in another field.
 *
 * Copyright (c) Chris Putnam 2008-2018
 *
 * Source code released under the GPL version 2
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include "bibutils.h"
#include "url.h"

static void
|
|
Packit |
89ede9 |
construct_url( char *prefix, str *id, str *id_url, char sep )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
if ( !strncasecmp( str_cstr( id ), "http:", 5 ) )
|
|
Packit |
89ede9 |
str_strcpy( id_url, id );
|
|
Packit |
89ede9 |
else {
|
|
Packit |
89ede9 |
str_strcpyc( id_url, prefix );
|
|
Packit |
89ede9 |
if ( sep!='\0' ) {
|
|
Packit |
89ede9 |
if ( id->data[0]!=sep ) str_addchar( id_url, sep );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
str_strcat( id_url, id );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
url_exists( fields *f, char *urltag, str *doi_url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int i, n;
|
|
Packit |
89ede9 |
if ( urltag ) {
|
|
Packit |
89ede9 |
n = fields_num( f );
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
if ( strcmp( fields_tag( f, i, FIELDS_CHRP ), urltag ) ) continue;
|
|
Packit |
89ede9 |
if ( strcmp( fields_value( f, i, FIELDS_CHRP ), str_cstr( doi_url ) ) ) continue;
|
|
Packit |
89ede9 |
return 1;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
return 0;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
xxx_to_url( fields *f, int n, char *http_prefix, char *urltag, str *xxx_url, char sep )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
str_empty( xxx_url );
|
|
Packit |
89ede9 |
construct_url( http_prefix, fields_value( f, n, FIELDS_STRP ), xxx_url, sep );
|
|
Packit |
89ede9 |
if ( url_exists( f, urltag, xxx_url ) )
|
|
Packit |
89ede9 |
str_empty( xxx_url );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
doi_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "https://doi.org", urltag, url, '/' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
jstor_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "http://www.jstor.org/stable", urltag, url, '/' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
pmid_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "http://www.ncbi.nlm.nih.gov/pubmed", urltag, url, '/' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
pmc_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "http://www.ncbi.nlm.nih.gov/pmc/articles", urltag, url, '/' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
arxiv_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "http://arxiv.org/abs", urltag, url, '/' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
void
|
|
Packit |
89ede9 |
mrnumber_to_url( fields *f, int n, char *urltag, str *url )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
xxx_to_url( f, n, "http://www.ams.org/mathscinet-getitem?mr=", urltag, url, '\0' );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * string_pattern()
 *
 * Test whether the start of s matches pattern.  Only the first
 * strlen( pattern ) characters of s are examined; anything after them is
 * ignored.
 *
 * Rules for the pattern:
 *   '#'       = any decimal digit
 *   isalpha() = match precisely (matchcase==1) or match regardless of case
 *               (matchcase==0)
 *   all others must match precisely
 *
 * Returns 1 on a match, 0 otherwise (including when s is shorter than
 * pattern).
 */
static int
string_pattern( char *s, char *pattern, int matchcase )
{
	size_t patlen, i;
	unsigned char pc, sc;
	int match;

	patlen = strlen( pattern );
	if ( strlen( s ) < patlen ) return 0; /* too short */

	for ( i=0; i<patlen; ++i ) {
		/* <ctype.h> functions need values in unsigned char range */
		pc = (unsigned char) pattern[i];
		sc = (unsigned char) s[i];
		if ( pc=='#' )
			match = ( isdigit( sc ) != 0 );
		else if ( !matchcase && isalpha( pc ) )
			match = ( tolower( pc ) == tolower( sc ) );
		else
			match = ( pc == sc );
		if ( !match ) return 0;
	}

	return 1;
}

/*
 * is_doi()
 *
 * Check whether s begins with a DOI, optionally behind a "doi:" label
 * (letters in the label match regardless of case).
 *
 * Returns the offset within s at which the "##.####/" part starts:
 *	 0  for "##.####/..."
 *	 4  for "doi:##.####/..."
 *	 5  for "doi: ##.####/..."
 *	10  for "doi: DOI: ##.####/..."
 * or -1 if none of these patterns match.
 *
 * science direct is now doing "M3 - doi: DOI: 10.xxxx/xxxxx"
 */
int
is_doi( char *s )
{
	if ( string_pattern( s, "##.####/", 0 ) ) return 0;
	if ( string_pattern( s, "doi:##.####/", 0 ) ) return 4;
	if ( string_pattern( s, "doi: ##.####/", 0 ) ) return 5;
	if ( string_pattern( s, "doi: DOI: ##.####/", 0 ) ) return 10;
	return -1;
}

/*
 * is_uri_remote_scheme()
 *
 * Determine if the string starts with one of the recognized Uniform
 * Resource Identifier schemes ("http:", "https:", "ftp:", "git:",
 * "gopher:"), compared without regard to case.
 *
 * returns -1, if not true
 * returns offset that skips over the URI scheme (including its ':'),
 *         if true
 */
int
is_uri_remote_scheme( char *p )
{
	static const char *scheme[] = { "http:", "https:", "ftp:", "git:", "gopher:" };
	size_t i, len, nschemes = sizeof( scheme ) / sizeof( scheme[0] );

	for ( i=0; i<nschemes; ++i ) {
		/* length is taken from the string itself so the table cannot
		 * drift out of step with a hand-counted length array */
		len = strlen( scheme[i] );
		if ( !strncasecmp( p, scheme[i], len ) ) return (int) len;
	}

	return -1;
}

/*
 * is_reference_database()
 *
 * Determine if the string starts with one of the recognized reference
 * database labels ("arXiv:", "pubmed:", "medline:", "isi:"), compared
 * without regard to case.
 *
 * returns -1, if not true
 * returns offset that skips over the label (including its ':'), if true
 */
int
is_reference_database( char *p )
{
	static const char *scheme[] = { "arXiv:", "pubmed:", "medline:", "isi:" };
	size_t i, len, nschemes = sizeof( scheme ) / sizeof( scheme[0] );

	for ( i=0; i<nschemes; ++i ) {
		/* length is taken from the string itself so the table cannot
		 * drift out of step with a hand-counted length array */
		len = strlen( scheme[i] );
		if ( !strncasecmp( p, scheme[i], len ) ) return (int) len;
	}

	return -1;
}

/*
 * is_embedded_link()
 *
 * many fields have been abused to embed URLs, DOIs, etc.
 *
 * Returns 1 if s starts with a remote URI scheme, a reference database
 * label, or a DOI (as judged by is_uri_remote_scheme(),
 * is_reference_database() and is_doi() respectively); 0 otherwise.
 */
int
is_embedded_link( char *s )
{
	if ( is_uri_remote_scheme( s ) != -1 ) return 1;
	if ( is_reference_database( s ) != -1 ) return 1;
	if ( is_doi( s ) != -1 ) return 1;
	return 0;
}

/* Pairs an output tag with the textual prefix that identifies it. */
typedef struct url_t {
	char *tag;	/* tag to store the value under, e.g. "DOI" */
	char *prefix;	/* leading text that marks a value of this type */
	int offset;	/* length of prefix; the bare identifier starts here */
} url_t;

static url_t prefixes[] = {
|
|
Packit |
89ede9 |
/* 00000000001111111112222222222333333333344444444445 */
|
|
Packit |
89ede9 |
/* 12345678901234567890123456789012345678901234567890 */
|
|
Packit |
89ede9 |
{ "ARXIV", "http://arxiv.org/abs/", 21 },
|
|
Packit |
89ede9 |
{ "DOI", "https://doi.org/", 16 },
|
|
Packit |
89ede9 |
{ "JSTOR", "http://www.jstor.org/stable/", 28 },
|
|
Packit |
89ede9 |
{ "MRNUMBER", "http://www.ams.org/mathscinet-getitem?mr=", 41 },
|
|
Packit |
89ede9 |
{ "PMID", "http://www.ncbi.nlm.nih.gov/pubmed/", 35 },
|
|
Packit |
89ede9 |
{ "PMC", "http://www.ncbi.nlm.nih.gov/pmc/articles/", 41 },
|
|
Packit |
89ede9 |
{ "ISIREFNUM", "isi:", 4 },
|
|
Packit |
89ede9 |
};
|
|
Packit |
89ede9 |
static int nprefixes = sizeof( prefixes ) / sizeof( prefixes[0] );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* do not add, but recognize */
|
|
Packit |
89ede9 |
static url_t extraprefixes[] = {
|
|
Packit |
89ede9 |
/* 00000000001111111112222222222333333333344444444445 */
|
|
Packit |
89ede9 |
/* 12345678901234567890123456789012345678901234567890 */
|
|
Packit |
89ede9 |
{ "ARXIV", "arXiv:", 6 },
|
|
Packit |
89ede9 |
{ "DOI", "http://dx.doi.org/", 18 },
|
|
Packit |
89ede9 |
{ "JSTOR", "jstor:", 6 },
|
|
Packit |
89ede9 |
{ "PMID", "pmid:", 5 },
|
|
Packit |
89ede9 |
{ "PMID", "pubmed:", 7 },
|
|
Packit |
89ede9 |
{ "PMC", "pmc:", 4 },
|
|
Packit |
89ede9 |
{ "URL", "\\urllink", 8 },
|
|
Packit |
89ede9 |
{ "URL", "\\url", 4 },
|
|
Packit |
89ede9 |
};
|
|
Packit |
89ede9 |
static int nextraprefixes = sizeof( extraprefixes ) / sizeof( extraprefixes[0] );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
find_prefix( char *s, url_t *p, int np )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int i;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
if ( !strncmp( p[i].prefix, s, p[i].offset ) ) return i;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
return -1;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
int
|
|
Packit |
89ede9 |
urls_split_and_add( char *value_in, fields *out, int lvl_out )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int n, fstatus, status = BIBL_OK;
|
|
Packit |
89ede9 |
char *tag = "URL";
|
|
Packit |
89ede9 |
int offset = 0;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
n = find_prefix( value_in, prefixes, nprefixes );
|
|
Packit |
89ede9 |
if ( n!=-1 ) {
|
|
Packit |
89ede9 |
tag = prefixes[n].tag;
|
|
Packit |
89ede9 |
offset = prefixes[n].offset;
|
|
Packit |
89ede9 |
} else {
|
|
Packit |
89ede9 |
n = find_prefix( value_in, extraprefixes, nextraprefixes );
|
|
Packit |
89ede9 |
if ( n!=-1 ) {
|
|
Packit |
89ede9 |
tag = extraprefixes[n].tag;
|
|
Packit |
89ede9 |
offset = extraprefixes[n].offset;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
fstatus = fields_add( out, tag, &(value_in[offset]), lvl_out );
|
|
Packit |
89ede9 |
if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
return status;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* urls_add_type()
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Append urls of a specific type with a specific prefix (which can be empty).
|
|
Packit |
89ede9 |
* We don't allow duplications here.
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
urls_merge_and_add_type( fields *out, char *tag_out, int lvl_out, char *prefix, vplist *values )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int fstatus, status = BIBL_OK;
|
|
Packit |
89ede9 |
vplist_index i;
|
|
Packit |
89ede9 |
str url;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
str_init( &url );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
for ( i=0; i<values->n; ++i ) {
|
|
Packit |
89ede9 |
str_strcpyc( &url, prefix );
|
|
Packit |
89ede9 |
str_strcatc( &url, ( char * ) vplist_get( values, i ) );
|
|
Packit |
89ede9 |
fstatus = fields_add( out, tag_out, str_cstr( &url ), lvl_out );
|
|
Packit |
89ede9 |
if ( fstatus!=FIELDS_OK ) {
|
|
Packit |
89ede9 |
status = BIBL_ERR_MEMERR;
|
|
Packit |
89ede9 |
goto out;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
out:
|
|
Packit |
89ede9 |
str_free( &url );
|
|
Packit |
89ede9 |
return status;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
|
|
Packit |
89ede9 |
* urls_merge_and_add()
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Append urls of types controlled by the list type and automatically append appropriate
|
|
Packit |
89ede9 |
* prefixes. If no prefix is found for the entry, don't add one (e.g. "URL" entries).
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Control of the types to be added by list type is necessary as some reference formats
|
|
Packit |
89ede9 |
* like bibtex ought to do special things with DOI, ARXIV, MRNUMBER, and the like.
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
int
|
|
Packit |
89ede9 |
urls_merge_and_add( fields *in, int lvl_in, fields *out, char *tag_out, int lvl_out, slist *types )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int i, j, status = BIBL_OK;
|
|
Packit |
89ede9 |
char *tag, *prefix, *empty="";
|
|
Packit |
89ede9 |
vplist a;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
vplist_init( &a );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
for ( i=0; i<types->n; ++i ) {
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
tag = slist_cstr( types, i );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* ...look for data of requested type; if not found skip */
|
|
Packit |
89ede9 |
vplist_empty( &a );
|
|
Packit |
89ede9 |
fields_findv_each( in, lvl_in, FIELDS_CHRP, &a, tag );
|
|
Packit |
89ede9 |
if ( a.n==0 ) continue;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* ...find the prefix (if present) */
|
|
Packit |
89ede9 |
prefix = empty;
|
|
Packit |
89ede9 |
for ( j=0; j
|
|
Packit |
89ede9 |
if ( !strcmp( prefixes[j].tag, tag ) ) {
|
|
Packit |
89ede9 |
prefix = prefixes[j].prefix;
|
|
Packit |
89ede9 |
break; /* take the first prefix in the list */
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* ...append all data of this type */
|
|
Packit |
89ede9 |
status = urls_merge_and_add_type( out, tag_out, lvl_out, prefix, &a );
|
|
Packit |
89ede9 |
if ( status!=BIBL_OK ) goto out;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
out:
|
|
Packit |
89ede9 |
vplist_free( &a );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
return status;
|
|
Packit |
89ede9 |
}
|