/*
* name.c
*
* mangle names w/ and w/o commas
*
* Copyright (c) Chris Putnam 2004-2018
*
* Source code released under the GPL version 2
*
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "utf8.h"
#include "unicode.h"
#include "is_ws.h"
#include "str.h"
#include "fields.h"
#include "slist.h"
#include "intlist.h"
#include "name.h"
/* name_build_withcomma()
 *
 * Reconstruct a parsed name in the internal format
 * 'family|given|given||suffix' as 'family suffix, given given'.
 *
 * s - output string; it is emptied before the name is rebuilt
 * p - NUL-terminated name in the internal format; the '||suffix'
 *     part is optional
 *
 * A given-name element that is exactly one byte long gets a '.'
 * appended, so 'Author|H|F' becomes 'Author, H. F.'.
 *
 * A name made up of only a family name and a suffix ('Author||Jr.')
 * is written as 'Author Jr.'.
 */
void
name_build_withcomma( str *s, char *p )
{
	int nseps = 0, nch;
	char *suffix, *stopat;

	str_empty( s );

	/* ...everything from the first "||" onward is suffix, not name parts */
	suffix = strstr( p, "||" );
	if ( suffix ) stopat = suffix;
	else stopat = strchr( p, '\0' );

	while ( p != stopat ) {
		nch = 0;

		/* ...suffix and comma go between family and first given name */
		if ( nseps==1 ) {
			if ( suffix ) {
				str_strcatc( s, " " );
				str_strcatc( s, suffix+2 );
			}
			str_addchar( s, ',' );
		}
		if ( nseps ) str_addchar( s, ' ' );

		/* ...copy one '|'-delimited element, counting its bytes */
		while ( p!=stopat && *p!='|' ) {
			str_addchar( s, *p++ );
			nch++;
		}
		if ( p!=stopat && *p=='|' ) p++;

		/* ...one-byte given names are initials and get a period */
		if ( nseps!=0 && nch==1 ) str_addchar( s, '.' );

		nseps++;
	}

	/* ...without given names the loop above never reaches the place
	 * ...where the suffix is written, so add it here instead of
	 * ...silently dropping it */
	if ( nseps==1 && suffix && suffix[2]!='\0' ) {
		str_strcatc( s, " " );
		str_strcatc( s, suffix+2 );
	}
}
/* name_findetal()
 *
 * Report how many tokens at the end of a token list spell out
 * "et al." or one of its variants, i.e. the number of final tokens
 * to be skipped when processing a list of names.
 *
 * Returns 1 when the last token alone is a complete variant
 * (e.g. "et al."), 2 when the last two tokens are "et" followed by
 * an "al." variant, and 0 otherwise (including for an empty list).
 * All comparisons ignore case.
 */
int
name_findetal( slist *tokens )
{
	static const char *whole[] = {
		"et alia", "et al.", "et al.,", "et al",
		"etalia", "etal.", "etal"
	};
	static const char *tail[] = { "alia", "al.", "al.,", "al" };
	const int nwhole = sizeof( whole ) / sizeof( whole[0] );
	const int ntail = sizeof( tail ) / sizeof( tail[0] );
	str *last, *prev;
	int k;

	if ( tokens->n==0 ) return 0;

	/* ...a single final token holding the whole phrase */
	last = slist_str( tokens, tokens->n - 1 );
	for ( k=0; k<nwhole; ++k ) {
		if ( !strcasecmp( last->data, whole[k] ) ) return 1;
	}

	if ( tokens->n==1 ) return 0;

	/* ...the phrase split over the final two tokens */
	prev = slist_str( tokens, tokens->n - 2 );
	if ( strcasecmp( prev->data, "et" ) ) return 0;

	for ( k=0; k<ntail; ++k ) {
		if ( !strcasecmp( last->data, tail[k] ) ) return 2;
	}

	return 0;
}
/* Bit flags describing a generational suffix found in a name.
 * WITHCOMMA is or'ed in when the suffix token itself ends in a comma. */
#define WITHCOMMA (1)
#define JUNIOR    (2)
#define SENIOR    (4)
#define THIRD     (8)
#define FOURTH    (16)

/* One entry of the suffix lookup table: the literal token text and
 * the flag value reported for it. */
typedef struct {
	const char *s;
	unsigned short value;
} suffix_value_t;

/* identify_suffix()
 *
 * Compare a token against the known generational suffixes.
 * The match is exact and case-sensitive.
 *
 * p - NUL-terminated token to look up
 *
 * Returns JUNIOR, SENIOR, THIRD or FOURTH, or'ed with WITHCOMMA when
 * the matched spelling ends in a comma; returns 0 when the token is
 * not a recognized suffix.
 */
static int
identify_suffix( const char *p )
{
	/* ...static const so the table is built once, not on every call */
	static const suffix_value_t suffixes[] = {
		{ "Jr."  , JUNIOR             },
		{ "Jr"   , JUNIOR             },
		{ "Jr.," , JUNIOR | WITHCOMMA },
		{ "Jr,"  , JUNIOR | WITHCOMMA },
		{ "Sr."  , SENIOR             },
		{ "Sr"   , SENIOR             },
		{ "Sr.," , SENIOR | WITHCOMMA },
		{ "Sr,"  , SENIOR | WITHCOMMA },
		{ "III"  , THIRD              },
		{ "III," , THIRD  | WITHCOMMA },
		{ "IV"   , FOURTH             },
		{ "IV,"  , FOURTH | WITHCOMMA },
	};
	const int nsuffixes = sizeof( suffixes ) / sizeof( suffixes[0] );
	int i;

	for ( i=0; i<nsuffixes; ++i ) {
		if ( !strcmp( p, suffixes[i].s ) )
			return suffixes[i].value;
	}

	return 0;
}
/* has_suffix()
 *
 * Look for a generational suffix among tokens[begin..end-1].
 *
 * The last token is tried first (e.g. "H. F. Author, Sr.").  Failing
 * that, every token that directly follows a token ending in a comma
 * is tried (e.g. "Author, Sr., H. F.").
 *
 * Returns the value reported by identify_suffix() and stores the
 * index of the suffix token in *suffixpos.  Returns 0 and leaves
 * *suffixpos untouched when no suffix is found.
 */
static int
has_suffix( slist *tokens, int begin, int end, int *suffixpos )
{
	str *tok;
	int pos, flags;

	/* ...suffix in final position */
	tok = slist_str( tokens, end - 1 );
	flags = identify_suffix( tok->data );
	if ( flags ) {
		*suffixpos = end - 1;
		return flags;
	}

	/* ...suffix right after a comma-terminated token */
	for ( pos=begin+1; pos<end; ++pos ) {
		tok = slist_str( tokens, pos - 1 );
		if ( !tok->len || tok->data[ tok->len - 1 ]!=',' ) continue;

		tok = slist_str( tokens, pos );
		flags = identify_suffix( tok->data );
		if ( flags ) {
			*suffixpos = pos;
			return flags;
		}
	}

	return 0;
}
/* add_given_split()
 *
 * Append the initials packed into a single given-name token to name,
 * each as its own '|'-prefixed element: "HF" and "H.F." both add
 * "|H|F".  Hyphenated initials are kept together: "H-F" and "H.-F."
 * both add "|H.-F.".  Whitespace, commas and any other periods are
 * dropped.
 *
 * The token is read with utf8_decode() and written back with
 * utf8_encode_str(), one code point at a time.
 *
 * name - string being built; only appended to
 * s    - token holding the run-together initials
 *
 * Always returns 1.
 */
static int
add_given_split( str *name, str *s )
{
	unsigned int unicode_char;
	unsigned int pos = 0;
	char utf8s[7];

	while ( pos < s->len ) {

		unicode_char = utf8_decode( s->data, &pos );

		/* ...only ASCII code points may be narrowed to char; the cast
		 * ...keeps just the low byte of anything larger, so a code
		 * ...point such as U+0420 (low byte 0x20, an ASCII space)
		 * ...could be misclassified as whitespace and lost */
		if ( unicode_char < 128 && is_ws( (char) unicode_char ) ) continue;

		else if ( unicode_char==(unsigned int)'.' ) {
			/* ...".-" joins two initials; a lone period is dropped */
			if ( s->data[pos]=='-' ) {
				str_strcatc( name, ".-" );
				pos += 1;
				unicode_char = utf8_decode( s->data, &pos );
				utf8_encode_str( unicode_char, utf8s );
				str_strcatc( name, utf8s );
				str_addchar( name, '.' );
			}
		} else if ( unicode_char==(unsigned int)'-' ) {
			/* ...bare hyphen: supply the period on both sides */
			str_strcatc( name, ".-" );
			unicode_char = utf8_decode( s->data, &pos );
			utf8_encode_str( unicode_char, utf8s );
			str_strcatc( name, utf8s );
			str_addchar( name, '.' );
		} else if ( unicode_char==(unsigned int)',' ) { /* nothing */
		} else {
			/* ...any other character starts a new given-name element */
			str_addchar( name, '|' );
			utf8_encode_str( unicode_char, utf8s );
			str_strcatc( name, utf8s );
		}
	}

	return 1;
}
/* token_has_no_upper()
 *
 * Returns 1 when the classification of token n reported by
 * unicode_utf8_classify_str() lacks the UNICODE_UPPER flag,
 * and 0 when the flag is present.
 */
static unsigned char
token_has_no_upper( slist *tokens, int n )
{
	unsigned short flags;

	flags = unicode_utf8_classify_str( slist_str( tokens, n ) );

	return ( flags & UNICODE_UPPER ) ? 0 : 1;
}
/* token_has_upper()
 *
 * Logical inverse of token_has_no_upper(): returns 1 when that
 * function returns 0 for token n, and 0 otherwise.
 */
static unsigned char
token_has_upper( slist *tokens, int n )
{
	return token_has_no_upper( tokens, n ) ? 0 : 1;
}
/* name_multielement_nocomma()
 *
 * Sort the token indices of a name written without a comma
 * ("Given Given Family") into the given and family lists.
 *
 * The family name is the last token, or the one before it when the
 * last token is the suffix.  If that token contains an upper-case
 * character, the scan moves backward to the nearest token without
 * one and on through every directly preceding token that also has
 * none; that whole run is made part of the family name, e.g. the
 * quoted parts of
 *     Ludwig 'von Beethoven'
 *     Johannes Diderik 'van der Waals'
 *     Charles Louis Xavier Joseph 'de la Valla Poussin'
 *
 * All remaining tokens in begin..end-2, except the suffix, become
 * given names.
 *
 * Always returns 1.
 */
static int
name_multielement_nocomma( intlist *given, intlist *family, slist *tokens, int begin, int end, int suffixpos )
{
	int first, last, lowest, pos;

	/* ...family name proper: final token that is not the suffix */
	last = end - 1;
	if ( last == suffixpos ) last = end - 2;
	first = last;

	/* ...pull in a run of tokens without upper-case characters */
	if ( token_has_upper( tokens, first ) ) {
		lowest = -1;
		for ( pos=first-1; pos>=begin; pos-- ) {
			if ( token_has_no_upper( tokens, pos ) ) lowest = pos;
			else if ( lowest != -1 ) break; /* run has ended */
		}
		if ( lowest != -1 ) first = lowest;
	}

	for ( pos=first; pos<=last; pos++ )
		intlist_add( family, pos );

	/* ...whatever is left over is a given name */
	for ( pos=begin; pos<end-1; pos++ ) {
		if ( pos>=first && pos<=last ) continue;
		if ( pos==suffixpos ) continue;
		intlist_add( given, pos );
	}

	return 1;
}
/* name_multielement_comma()
 *
 * Sort the token indices of a name written with a comma
 * ("Family, Given Given") into the given and family lists.
 *
 * Tokens begin..comma form the family name and tokens comma+1..end-1
 * the given names; the token at suffixpos is left out of both.
 * The last character of the token at index comma (its comma) is
 * trimmed off in place.
 *
 * Always returns 1.
 */
static int
name_multielement_comma( intlist *given, intlist *family, slist *tokens, int begin, int end, int comma, int suffixpos )
{
	int pos;

	/* ...family name: everything before the comma-bearing token */
	for ( pos=begin; pos<comma; pos++ ) {
		if ( pos!=suffixpos ) intlist_add( family, pos );
	}

	/* ...the comma-bearing token closes the family name */
	str_trimend( slist_str( tokens, comma ), 1 );
	intlist_add( family, comma );

	/* ...given names: everything after it */
	for ( pos=comma+1; pos<end; pos++ ) {
		if ( pos!=suffixpos ) intlist_add( given, pos );
	}

	return 1;
}
/* name_mutlielement_build()
 *
 * Append a name to 'name' from the token indices collected in the
 * family and given lists: the family tokens joined by single spaces,
 * then each given token preceded by '|'.
 *
 * When the combined family classification has every bit of
 * UNICODE_MIXEDCASE set while the combined given classification,
 * masked with UNICODE_MIXEDCASE, equals UNICODE_UPPER, the given
 * tokens are handed to add_given_split() instead of being copied
 * whole.  (Presumably this catches run-together initials such as
 * "Author HF" -- confirm against the UNICODE_* definitions.)
 *
 * Always returns 1.
 */
static int
name_mutlielement_build( str *name, intlist *given, intlist *family, slist *tokens )
{
	unsigned short family_case = 0, given_case = 0;
	int k, split;
	str *tok;

	/* ...write the family name and record its character classes */
	for ( k=0; k<family->n; k++ ) {
		tok = slist_str( tokens, intlist_get( family, k ) );
		if ( k>0 ) str_addchar( name, ' ' );
		str_strcat( name, tok );
		family_case |= unicode_utf8_classify_str( tok );
	}

	/* ...record the character classes of the given names */
	for ( k=0; k<given->n; k++ ) {
		tok = slist_str( tokens, intlist_get( given, k ) );
		given_case |= unicode_utf8_classify_str( tok );
	}

	split = ( ( family_case & UNICODE_MIXEDCASE ) == UNICODE_MIXEDCASE ) &&
	        ( ( given_case & UNICODE_MIXEDCASE ) == UNICODE_UPPER );

	/* ...write the given names, split into initials if required */
	for ( k=0; k<given->n; k++ ) {
		tok = slist_str( tokens, intlist_get( given, k ) );
		if ( split ) {
			add_given_split( name, tok );
		} else {
			str_addchar( name, '|' );
			str_strcat( name, tok );
		}
	}

	return 1;
}
/* name_construct_multi()
 *
 * Build a name in the internal format 'family|given|given||suffix'
 * from tokens[begin..end-1].
 *
 * outname - receives the result; emptied first
 * tokens  - tokenized name; tokens may have a trailing comma trimmed
 *           off in place
 *
 * A suffix is located with has_suffix().  The first token (other
 * than the suffix) ending in a comma selects the "Family, Given"
 * layout; without one the "Given Family" layout is used.  A comma on
 * the token directly before a suffix that itself has no comma
 * ("Author, Jr.") is only trimmed and does not select the comma
 * layout.
 *
 * Always returns 1.
 */
static int
name_construct_multi( str *outname, slist *tokens, int begin, int end )
{
	int i, suffix, suffixpos=-1, comma=-1;
	intlist given, family;
	str *s;

	intlist_init( &family );
	intlist_init( &given );

	str_empty( outname );

	suffix = has_suffix( tokens, begin, end, &suffixpos );

	/* ...find the first comma that separates family from given names */
	for ( i=begin; i<end && comma==-1; i++ ) {
		if ( i==suffixpos ) continue;
		s = slist_str( tokens, i );
		/* ...check the length first, as has_suffix() does: on an empty
		 * ...token s->len-1 would index outside the string */
		if ( s->len && s->data[ s->len -1 ] == ',' ) {
			if ( suffix && i==suffixpos-1 && !(suffix&WITHCOMMA) )
				str_trimend( s, 1 );
			else
				comma = i;
		}
	}

	if ( comma != -1 )
		name_multielement_comma( &given, &family, tokens, begin, end, comma, suffixpos );
	else
		name_multielement_nocomma( &given, &family, tokens, begin, end, suffixpos );

	name_mutlielement_build( outname, &given, &family, tokens );

	/* ...the suffix is written in one normalized spelling */
	if ( suffix ) {
		if ( suffix & JUNIOR ) str_strcatc( outname, "||Jr." );
		if ( suffix & SENIOR ) str_strcatc( outname, "||Sr." );
		if ( suffix & THIRD ) str_strcatc( outname, "||III" );
		if ( suffix & FOURTH ) str_strcatc( outname, "||IV" );
	}

	intlist_free( &given );
	intlist_free( &family );

	return 1;
}
/* name_addmultielement()
 *
 * Build a name from tokens[begin..end-1] with name_construct_multi()
 * and add it to info under tag at the given level using
 * fields_add_can_dup().
 *
 * Returns 1 when the field was added (FIELDS_OK), 0 otherwise.
 */
int
name_addmultielement( fields *info, char *tag, slist *tokens, int begin, int end, int level )
{
	str name;
	int added;

	str_init( &name );

	name_construct_multi( &name, tokens, begin, end );
	added = ( fields_add_can_dup( info, tag, name.data, level ) == FIELDS_OK ) ? 1 : 0;

	str_free( &name );

	return added;
}
/* name_addsingleelement()
 *
 * Add a name that must not be mangled, such as the single bibtex
 * token {Random Corporation, Inc.}.  The name is stored verbatim
 * under the tag with ":ASIS" appended, or ":CORP" appended when
 * corp is non-zero (e.g. AUTHOR:ASIS or AUTHOR:CORP).
 *
 * Returns 1 when the field was added (FIELDS_OK), 0 otherwise.
 */
int
name_addsingleelement( fields *info, char *tag, char *name, int level, int corp )
{
	str outtag;
	int added;

	str_init( &outtag );

	/* ...tag plus the qualifier that marks the name as unparsed */
	str_strcpyc( &outtag, tag );
	str_strcatc( &outtag, corp ? ":CORP" : ":ASIS" );

	added = ( fields_add_can_dup( info, outtag.data, name, level ) == FIELDS_OK ) ? 1 : 0;

	str_free( &outtag );

	return added;
}
/* name_parse()
 *
 * Takes a single name in a string and parses it.
 * Skipped by bibtex/biblatex names, which come pre-parsed.
 *
 * outname - receives the result; emptied first
 * inname  - name to parse; every "," in it is replaced by ", " in
 *           place before tokenizing
 * asis    - optional list of names to be kept as is (may be NULL)
 * corps   - optional list of corporate names (may be NULL)
 *
 * Returns 1 on ok, with outname in the internal 'family|given' format
 *           (also returned, with outname empty, for a NULL or empty
 *           inname)
 * Returns 2 on ok and the name copied verbatim: it is in the asis
 *           list, consists of a single token, or could not be
 *           tokenized
 * Returns 3 on ok and the name copied verbatim: it is in the corps
 *           list
 *
 * 0 is reserved for errors, although no path in this function
 * currently returns it.
 */
int
name_parse( str *outname, str *inname, slist *asis, slist *corps )
{
	slist tokens;
	int kind;

	str_empty( outname );
	if ( !inname || !inname->len ) return 1;

	slist_init( &tokens );

	/* ...decide how the name is to be treated */
	if ( asis && slist_find( asis, inname ) != -1 ) {
		kind = 2;
	} else if ( corps && slist_find( corps, inname ) != -1 ) {
		kind = 3;
	} else {
		/* ...make sure every comma is followed by a token break */
		str_findreplace( inname, ",", ", " );
		if ( slist_tokenize( &tokens, inname, " ", 1 ) != SLIST_OK )
			kind = 2;
		else if ( tokens.n==1 )
			kind = 2;
		else
			kind = 1;
	}

	/* ...only personal names with several tokens get rearranged */
	if ( kind==1 )
		name_construct_multi( outname, &tokens, 0, tokens.n );
	else
		str_strcpy( outname, inname );

	slist_free( &tokens );

	return kind;
}
/* name_copy()
 *
 * Copy the next name from a '|'-separated list into name, without
 * leading whitespace and without trailing whitespace or commas.
 *
 * name - receives the copy; emptied first
 * p    - current position in the NUL-terminated list
 *
 * Returns the position at which the following name starts: just
 * past the '|' that ended this name, or the terminating NUL.
 */
static char *
name_copy( str *name, char *p )
{
	char *start, *end, *q;

	str_empty( name );

	start = p = skip_ws( p );

	/* ...find the end of this name */
	while ( *p && *p!='|' ) p++;

	/* ...strip trailing whitespace and commas; end is one past the
	 * ...last character kept and is never moved before start, so an
	 * ...empty or all-comma name cannot make the scan leave the buffer */
	end = p;
	while ( end > start && ( is_ws( end[-1] ) || end[-1]==',' ) )
		end--;

	if ( *p=='|' ) p++;

	for ( q=start; q<end; q++ )
		str_addchar( name, *q );

	return p;
}
/*
 * name_add( info, tag, q, level, asis, corps )
 *
 * Take the name(s) in q and add each one to info.  Multiple names
 * are separated by '|' characters, e.g.
 *      "H. F. Author|W. G. Author|Q. X. Author"
 *
 * Each name is compared to the names in the "as is" and
 * "corporation" lists; matches are not personal names, so they are
 * added to the bibliography fields directly (under tag:ASIS or
 * tag:CORP) and are not mangled.
 *
 * Each personal name is parsed by name_parse(), which copes with
 * both "H. F. Author" and "Author, H. F.", and is added under tag.
 *
 * Returns 1 on success.  Returns 0 if q is NULL, if a name cannot be
 * parsed, or if a field cannot be added; names following the failing
 * one are not processed.
 */
int
name_add( fields *info, char *tag, char *q, int level, slist *asis, slist *corps )
{
	int ok, status, nametype, ret = 1;
	str inname, outname;

	if ( !q ) return 0;

	strs_init( &inname, &outname, NULL );

	while ( *q ) {

		/* ...peel off the next name and classify it */
		q = name_copy( &inname, q );

		nametype = name_parse( &outname, &inname, asis, corps );
		if ( !nametype ) { ret = 0; goto out; }

		/* ...1 = personal name, 2 = as is, anything else = corporation */
		if ( nametype==1 ) {
			status = fields_add_can_dup( info, tag, outname.data, level );
			ok = ( status==FIELDS_OK ) ? 1 : 0;
		}
		else if ( nametype==2 )
			ok = name_addsingleelement( info, tag, outname.data, level, 0 );
		else
			ok = name_addsingleelement( info, tag, outname.data, level, 1 );

		if ( !ok ) { ret = 0; goto out; }
	}

out:
	strs_free( &inname, &outname, NULL );
	return ret;
}