|
Packit |
89ede9 |
/*
|
|
Packit |
89ede9 |
* str_conv.c
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Copyright (c) Chris Putnam 1999-2018
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Source code released under the GPL version 2
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* str routines for converting strs between character sets
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
#include <stdio.h>
|
|
Packit |
89ede9 |
#include <stdlib.h>
|
|
Packit |
89ede9 |
#include <string.h>
|
|
Packit |
89ede9 |
#include <ctype.h>
|
|
Packit |
89ede9 |
#include <limits.h>
|
|
Packit |
89ede9 |
#include "latex.h"
|
|
Packit |
89ede9 |
#include "entities.h"
|
|
Packit |
89ede9 |
#include "utf8.h"
|
|
Packit |
89ede9 |
#include "gb18030.h"
|
|
Packit |
89ede9 |
#include "charsets.h"
|
|
Packit |
89ede9 |
#include "str_conv.h"
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
addentity( str *s, unsigned int ch )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
char buf[512];
|
|
Packit |
89ede9 |
sprintf( buf, "&#%u;", ch );
|
|
Packit |
89ede9 |
str_strcatc( s, buf );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* These are the five minimal predefined entites in XML */
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
minimalxmlchars( str *s, unsigned int ch )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
if ( ch==34 ) { str_strcatc( s, """ ); return 1; }
|
|
Packit |
89ede9 |
else if ( ch==38 ) { str_strcatc( s, "&" ); return 1; }
|
|
Packit |
89ede9 |
else if ( ch==39 ) { str_strcatc( s, "'" ); return 1; }
|
|
Packit |
89ede9 |
else if ( ch==60 ) { str_strcatc( s, "<" ); return 1; }
|
|
Packit |
89ede9 |
else if ( ch==62 ) { str_strcatc( s, ">" ); return 1; }
|
|
Packit |
89ede9 |
return 0;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
addxmlchar( str *s, unsigned int ch )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
if ( minimalxmlchars( s, ch ) ) return;
|
|
Packit |
89ede9 |
if ( ch > 127 ) addentity( s, ch );
|
|
Packit |
89ede9 |
else str_addchar( s, ch );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
addutf8char( str *s, unsigned int ch, int xmlout )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned char code[6];
|
|
Packit |
89ede9 |
int nc, i;
|
|
Packit |
89ede9 |
if ( xmlout ) {
|
|
Packit |
89ede9 |
if ( minimalxmlchars( s, ch ) ) return;
|
|
Packit |
89ede9 |
if ( ch > 127 && xmlout == STR_CONV_XMLOUT_ENTITIES )
|
|
Packit |
89ede9 |
{ addentity( s, ch ); return; }
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
nc = utf8_encode( ch, code );
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
str_addchar( s, code[i] );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
addgb18030char( str *s, unsigned int ch, int xmlout )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned char code[4];
|
|
Packit |
89ede9 |
int nc, i;
|
|
Packit |
89ede9 |
if ( xmlout ) {
|
|
Packit |
89ede9 |
if ( minimalxmlchars( s, ch ) ) return;
|
|
Packit |
89ede9 |
if ( ch > 127 && xmlout == STR_CONV_XMLOUT_ENTITIES )
|
|
Packit |
89ede9 |
{ addentity( s, ch ); return; }
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
nc = gb18030_encode( ch, code );
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
str_addchar( s, code[i] );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static void
|
|
Packit |
89ede9 |
addlatexchar( str *s, unsigned int ch, int xmlout, int utf8out )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
char buf[512];
|
|
Packit |
89ede9 |
uni2latex( ch, buf, sizeof( buf ) );
|
|
Packit |
89ede9 |
/* If the unicode character isn't recognized as latex output
|
|
Packit |
89ede9 |
* a '?' unless the user has requested unicode output. If so,
|
|
Packit |
89ede9 |
* output the unicode.
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
if ( utf8out && !strcmp( buf, "?" ) ) {
|
|
Packit |
89ede9 |
addutf8char( s, ch, xmlout );
|
|
Packit |
89ede9 |
} else {
|
|
Packit |
89ede9 |
str_strcatc( s, buf );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
|
|
Packit |
89ede9 |
* get_unicode()
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* This can be a little tricky. If the character is simply encoded
|
|
Packit |
89ede9 |
* such as UTF8 for > 128 or by numeric xml entities such as "Ȗ"
|
|
Packit |
89ede9 |
* then the output of decode_entity() and utf8_decode will necessarily
|
|
Packit |
89ede9 |
* be in the charsetin character set. On the other hand, if it's a
|
|
Packit |
89ede9 |
* fancy latex expression, such as "\alpha", or a non-numeric xml entity
|
|
Packit |
89ede9 |
* like "&", then we'll get the Unicode value (because our lists only
|
|
Packit |
89ede9 |
* keep the Unicode equivalent).
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* The unicode variable indicates whether or not a Unicode-based listing
|
|
Packit |
89ede9 |
* was used to convert the character (remember that charsetin could be
|
|
Packit |
89ede9 |
* Unicode independently).
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* The charset variable is used to keep track of what character set
|
|
Packit |
89ede9 |
* the character is in prior to conversion.
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static unsigned int
|
|
Packit |
89ede9 |
get_unicode( str *s, unsigned int *pi, int charsetin, int latexin, int utf8in, int xmlin )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned int ch;
|
|
Packit |
89ede9 |
int unicode = 0, err = 0;
|
|
Packit |
89ede9 |
if ( xmlin && s->data[*pi]=='&' ) {
|
|
Packit |
89ede9 |
ch = decode_entity( s->data, pi, &unicode, &err );
|
|
Packit |
89ede9 |
} else if ( charsetin==CHARSET_GB18030 ) {
|
|
Packit |
89ede9 |
ch = gb18030_decode( s->data, pi );
|
|
Packit |
89ede9 |
unicode = 1;
|
|
Packit |
89ede9 |
} else if ( latexin ) {
|
|
Packit |
89ede9 |
/* Must handle bibtex files in UTF8/Unicode */
|
|
Packit |
89ede9 |
if ( utf8in && ( s->data[*pi] & 128 ) ) {
|
|
Packit |
89ede9 |
ch = utf8_decode( s->data, pi );
|
|
Packit |
89ede9 |
unicode = 1;
|
|
Packit |
89ede9 |
} else ch = latex2char( s->data, pi, &unicode );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
else if ( utf8in )
|
|
Packit |
89ede9 |
ch = utf8_decode( s->data, pi );
|
|
Packit |
89ede9 |
else {
|
|
Packit |
89ede9 |
ch = (unsigned int) s->data[*pi];
|
|
Packit |
89ede9 |
*pi = *pi + 1;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
if ( !unicode && charsetin!=CHARSET_UNICODE )
|
|
Packit |
89ede9 |
ch = charset_lookupchar( charsetin, ch );
|
|
Packit |
89ede9 |
return ch;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
write_unicode( str *s, unsigned int ch, int charsetout, int latexout,
|
|
Packit |
89ede9 |
int utf8out, int xmlout )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned int c;
|
|
Packit |
89ede9 |
if ( latexout ) {
|
|
Packit |
89ede9 |
addlatexchar( s, ch, xmlout, utf8out );
|
|
Packit |
89ede9 |
} else if ( utf8out ) {
|
|
Packit |
89ede9 |
addutf8char( s, ch, xmlout );
|
|
Packit |
89ede9 |
} else if ( charsetout==CHARSET_GB18030 ) {
|
|
Packit |
89ede9 |
addgb18030char( s, ch, xmlout );
|
|
Packit |
89ede9 |
} else {
|
|
Packit |
89ede9 |
c = charset_lookupuni( charsetout, ch );
|
|
Packit |
89ede9 |
if ( xmlout ) addxmlchar( s, c );
|
|
Packit |
89ede9 |
else str_addchar( s, c );
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
return 1;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
|
|
Packit |
89ede9 |
* Returns 1 on memory error condition
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
int
|
|
Packit |
89ede9 |
str_convert( str *s,
|
|
Packit |
89ede9 |
int charsetin, int latexin, int utf8in, int xmlin,
|
|
Packit |
89ede9 |
int charsetout, int latexout, int utf8out, int xmlout )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned int pos = 0;
|
|
Packit |
89ede9 |
unsigned int ch;
|
|
Packit |
89ede9 |
str ns;
|
|
Packit |
89ede9 |
int ok = 1;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
if ( !s || s->len==0 ) return ok;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* Ensure that string is internally allocated.
|
|
Packit |
89ede9 |
* This fixes NULL pointer derefernce in CVE-2018-10775 in bibutils
|
|
Packit |
89ede9 |
* as a string with a valid data pointer is potentially replaced
|
|
Packit |
89ede9 |
* by a string without a valid data pointer due to it being invalid
|
|
Packit |
89ede9 |
* unicode.
|
|
Packit |
89ede9 |
* This probably also fixes CVE-2018-10773 and CVE-2018-10774 which
|
|
Packit |
89ede9 |
* are NULL dereferences also likely due to a fuzzer, but without
|
|
Packit |
89ede9 |
* test cases in the report, I can't be completely sure.
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
str_initstrc( &ns, "" );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
if ( charsetin==CHARSET_UNKNOWN ) charsetin = CHARSET_DEFAULT;
|
|
Packit |
89ede9 |
if ( charsetout==CHARSET_UNKNOWN ) charsetout = CHARSET_DEFAULT;
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
while ( s->data[pos] ) {
|
|
Packit |
89ede9 |
ch = get_unicode( s, &pos, charsetin, latexin, utf8in, xmlin );
|
|
Packit |
89ede9 |
ok = write_unicode( &ns, ch, charsetout, latexout, utf8out, xmlout );
|
|
Packit |
89ede9 |
if ( !ok ) goto out;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
str_swapstrings( s, &ns );
|
|
Packit |
89ede9 |
out:
|
|
Packit |
89ede9 |
str_free( &ns );
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
return ok;
|
|
Packit |
89ede9 |
}
|