Blame bibutils/str_conv.c

Packit 89ede9
/*
Packit 89ede9
 * str_conv.c
Packit 89ede9
 *
Packit 89ede9
 * Copyright (c) Chris Putnam 1999-2018
Packit 89ede9
 *
Packit 89ede9
 * Source code released under the GPL version 2
Packit 89ede9
 *
Packit 89ede9
 * str routines for converting strs between character sets
Packit 89ede9
 *
Packit 89ede9
 */
Packit 89ede9
#include <stdio.h>
Packit 89ede9
#include <stdlib.h>
Packit 89ede9
#include <string.h>
Packit 89ede9
#include <ctype.h>
Packit 89ede9
#include <limits.h>
Packit 89ede9
#include "latex.h"
Packit 89ede9
#include "entities.h"
Packit 89ede9
#include "utf8.h"
Packit 89ede9
#include "gb18030.h"
Packit 89ede9
#include "charsets.h"
Packit 89ede9
#include "str_conv.h"
Packit 89ede9
Packit 89ede9
static void
Packit 89ede9
addentity( str *s, unsigned int ch )
Packit 89ede9
{
Packit 89ede9
	char buf[512];
Packit 89ede9
	sprintf( buf, "&#%u;", ch );
Packit 89ede9
	str_strcatc( s, buf );
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
/* These are the five minimal predefined entites in XML */
Packit 89ede9
static int
Packit 89ede9
minimalxmlchars( str *s, unsigned int ch )
Packit 89ede9
{
Packit 89ede9
	if ( ch==34 )      { str_strcatc( s, """ ); return 1; }
Packit 89ede9
	else if ( ch==38 ) { str_strcatc( s, "&" );  return 1; }
Packit 89ede9
	else if ( ch==39 ) { str_strcatc( s, "'" ); return 1; }
Packit 89ede9
	else if ( ch==60 ) { str_strcatc( s, "<" );   return 1; }
Packit 89ede9
	else if ( ch==62 ) { str_strcatc( s, ">" );   return 1; }
Packit 89ede9
	return 0;
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
static void
Packit 89ede9
addxmlchar( str *s, unsigned int ch )
Packit 89ede9
{
Packit 89ede9
	if ( minimalxmlchars( s, ch ) ) return;
Packit 89ede9
	if ( ch > 127 ) addentity( s, ch );
Packit 89ede9
	else str_addchar( s, ch );
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
static void
Packit 89ede9
addutf8char( str *s, unsigned int ch, int xmlout )
Packit 89ede9
{
Packit 89ede9
	unsigned char code[6];
Packit 89ede9
	int nc, i;
Packit 89ede9
	if ( xmlout ) {
Packit 89ede9
		if ( minimalxmlchars( s, ch ) ) return;
Packit 89ede9
		if ( ch > 127 && xmlout == STR_CONV_XMLOUT_ENTITIES )
Packit 89ede9
			{ addentity( s, ch ); return; }
Packit 89ede9
	}
Packit 89ede9
	nc = utf8_encode( ch, code );
Packit 89ede9
	for ( i=0; i
Packit 89ede9
		str_addchar( s, code[i] );
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
static void
Packit 89ede9
addgb18030char( str *s, unsigned int ch, int xmlout )
Packit 89ede9
{
Packit 89ede9
	unsigned char code[4];
Packit 89ede9
	int nc, i;
Packit 89ede9
	if ( xmlout ) {
Packit 89ede9
		if ( minimalxmlchars( s, ch ) ) return;
Packit 89ede9
		if ( ch > 127 && xmlout == STR_CONV_XMLOUT_ENTITIES )
Packit 89ede9
			{ addentity( s, ch ); return; }
Packit 89ede9
	}
Packit 89ede9
	nc = gb18030_encode( ch, code );
Packit 89ede9
	for ( i=0; i
Packit 89ede9
		str_addchar( s, code[i] );
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
static void
Packit 89ede9
addlatexchar( str *s, unsigned int ch, int xmlout, int utf8out )
Packit 89ede9
{
Packit 89ede9
	char buf[512];
Packit 89ede9
	uni2latex( ch, buf, sizeof( buf ) );
Packit 89ede9
	/* If the unicode character isn't recognized as latex output
Packit 89ede9
	 * a '?' unless the user has requested unicode output.  If so,
Packit 89ede9
	 * output the unicode.
Packit 89ede9
	 */
Packit 89ede9
	if ( utf8out && !strcmp( buf, "?" ) ) {
Packit 89ede9
		addutf8char( s, ch, xmlout );
Packit 89ede9
	} else {
Packit 89ede9
		str_strcatc( s, buf );
Packit 89ede9
	}
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
/*
Packit 89ede9
 * get_unicode()
Packit 89ede9
 * 
Packit 89ede9
 *   This can be a little tricky.  If the character is simply encoded
Packit 89ede9
 *   such as UTF8 for > 128 or by numeric xml entities such as "&#534;"
Packit 89ede9
 *   then the output of decode_entity() and utf8_decode will necessarily
Packit 89ede9
 *   be in the charsetin character set.  On the other hand, if it's a
Packit 89ede9
 *   fancy latex expression, such as "\alpha", or a non-numeric xml entity
Packit 89ede9
 *   like "&amp;", then we'll get the Unicode value (because our lists only
Packit 89ede9
 *   keep the Unicode equivalent).
Packit 89ede9
 *
Packit 89ede9
 *   The unicode variable indicates whether or not a Unicode-based listing
Packit 89ede9
 *   was used to convert the character (remember that charsetin could be
Packit 89ede9
 *   Unicode independently).
Packit 89ede9
 *
Packit 89ede9
 *   The charset variable is used to keep track of what character set
Packit 89ede9
 *   the character is in prior to conversion.
Packit 89ede9
 *
Packit 89ede9
 */
Packit 89ede9
Packit 89ede9
static unsigned int
Packit 89ede9
get_unicode( str *s, unsigned int *pi, int charsetin, int latexin, int utf8in, int xmlin )
Packit 89ede9
{
Packit 89ede9
	unsigned int ch;
Packit 89ede9
	int unicode = 0, err = 0;
Packit 89ede9
	if ( xmlin && s->data[*pi]=='&' ) {
Packit 89ede9
		ch = decode_entity( s->data, pi, &unicode, &err );
Packit 89ede9
	} else if ( charsetin==CHARSET_GB18030 ) {
Packit 89ede9
		ch = gb18030_decode( s->data, pi );
Packit 89ede9
		unicode = 1;
Packit 89ede9
	} else if ( latexin ) {
Packit 89ede9
		/* Must handle bibtex files in UTF8/Unicode */
Packit 89ede9
		if ( utf8in && ( s->data[*pi] & 128 ) ) {
Packit 89ede9
			ch = utf8_decode( s->data, pi );
Packit 89ede9
			unicode = 1;
Packit 89ede9
		} else ch = latex2char( s->data, pi, &unicode );
Packit 89ede9
	}
Packit 89ede9
	else if ( utf8in )
Packit 89ede9
		ch = utf8_decode( s->data, pi );
Packit 89ede9
	else {
Packit 89ede9
		ch = (unsigned int) s->data[*pi];
Packit 89ede9
		*pi = *pi + 1;
Packit 89ede9
	}
Packit 89ede9
	if ( !unicode && charsetin!=CHARSET_UNICODE )
Packit 89ede9
		ch = charset_lookupchar( charsetin, ch );
Packit 89ede9
	return ch;
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
static int
Packit 89ede9
write_unicode( str *s, unsigned int ch, int charsetout, int latexout,
Packit 89ede9
		int utf8out, int xmlout )
Packit 89ede9
{
Packit 89ede9
	unsigned int c;
Packit 89ede9
	if ( latexout ) {
Packit 89ede9
		addlatexchar( s, ch, xmlout, utf8out );
Packit 89ede9
	} else if ( utf8out ) {
Packit 89ede9
		addutf8char( s, ch, xmlout );
Packit 89ede9
	} else if ( charsetout==CHARSET_GB18030 ) {
Packit 89ede9
		addgb18030char( s, ch, xmlout );
Packit 89ede9
	} else {
Packit 89ede9
		c = charset_lookupuni( charsetout, ch );
Packit 89ede9
		if ( xmlout ) addxmlchar( s, c );
Packit 89ede9
		else str_addchar( s, c );
Packit 89ede9
	}
Packit 89ede9
	return 1;
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
/*
Packit 89ede9
 * Returns 1 on memory error condition
Packit 89ede9
 */
Packit 89ede9
int
Packit 89ede9
str_convert( str *s,
Packit 89ede9
	int charsetin,  int latexin,  int utf8in,  int xmlin,
Packit 89ede9
	int charsetout, int latexout, int utf8out, int xmlout )
Packit 89ede9
{
Packit 89ede9
	unsigned int pos = 0;
Packit 89ede9
	unsigned int ch;
Packit 89ede9
	str ns;
Packit 89ede9
	int ok = 1;
Packit 89ede9
Packit 89ede9
	if ( !s || s->len==0 ) return ok;
Packit 89ede9
Packit 89ede9
	/* Ensure that string is internally allocated.
Packit 89ede9
	 * This fixes NULL pointer derefernce in CVE-2018-10775 in bibutils
Packit 89ede9
	 * as a string with a valid data pointer is potentially replaced
Packit 89ede9
	 * by a string without a valid data pointer due to it being invalid
Packit 89ede9
	 * unicode.
Packit 89ede9
	 * This probably also fixes CVE-2018-10773 and CVE-2018-10774 which
Packit 89ede9
	 * are NULL dereferences also likely due to a fuzzer, but without
Packit 89ede9
	 * test cases in the report, I can't be completely sure.
Packit 89ede9
	 */
Packit 89ede9
	str_initstrc( &ns, "" );
Packit 89ede9
Packit 89ede9
	if ( charsetin==CHARSET_UNKNOWN ) charsetin = CHARSET_DEFAULT;
Packit 89ede9
	if ( charsetout==CHARSET_UNKNOWN ) charsetout = CHARSET_DEFAULT;
Packit 89ede9
Packit 89ede9
	while ( s->data[pos] ) {
Packit 89ede9
		ch = get_unicode( s, &pos, charsetin, latexin, utf8in, xmlin );
Packit 89ede9
		ok = write_unicode( &ns, ch, charsetout, latexout, utf8out, xmlout );
Packit 89ede9
		if ( !ok ) goto out;
Packit 89ede9
	}
Packit 89ede9
Packit 89ede9
	str_swapstrings( s, &ns );
Packit 89ede9
out:
Packit 89ede9
	str_free( &ns );
Packit 89ede9
Packit 89ede9
	return ok;
Packit 89ede9
}