/*
* entities.c
*
* Copyright (c) Chris Putnam 2003-2018
*
* Source code released under the GPL version 2
*
*/
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include "entities.h"
/*
 * HTML 4.0 entities
 *
 * Each record pairs the literal entity text, including the leading '&'
 * and the trailing ';', with the numeric value that decode_html_entity()
 * returns for it.
 *
 * Entity names are case-sensitive: several pairs differ only in case
 * (e.g. "&Alpha;"/"&alpha;", "&dagger;"/"&Dagger;", "&larr;"/"&lArr;").
 */
typedef struct entities {
	char html[20];         /* entity text, e.g. "&amp;"; longest entry is "&thetasym;" */
	unsigned int unicode;  /* value returned when the entity is decoded */
} entities;

entities html_entities[] = {
	/* Special Entities */
	{ "&quot;",       34 },  /* quotation mark */
	{ "&amp;",        38 },  /* ampersand */
	{ "&apos;",       39 },  /* apostrophe (note not defined in HTML) */
	{ "&lpar;",       40 },  /* left parenthesis */
	{ "&rpar;",       41 },  /* right parenthesis */
	{ "&hyphen;",     45 },  /* hyphen */
	{ "&lt;",         60 },  /* less-than sign */
	{ "&gt;",         62 },  /* greater-than sign */
	{ "&quest;",      63 },  /* question mark */
	{ "&OElig;",     338 },  /* Latin cap ligature OE */
	{ "&oelig;",     339 },  /* Latin small ligature oe */
	{ "&Scaron;",    352 },  /* Latin cap S with caron */
	{ "&scaron;",    353 },  /* Latin small s with caron */
	{ "&Yuml;",      376 },  /* Latin cap Y with diaeresis */
	{ "&circ;",      710 },  /* modifier letter circumflex */
	{ "&tilde;",     732 },  /* small tilde */
	{ "&ensp;",     8194 },  /* en space */
	{ "&emsp;",     8195 },  /* em space */
	{ "&thinsp;",   8201 },  /* thin space */
	{ "&zwnj;",     8204 },  /* zero width non-joiner */
	{ "&zwj;",      8205 },  /* zero width joiner */
	{ "&lrm;",      8206 },  /* left-to-right mark */
	{ "&rlm;",      8207 },  /* right-to-left mark */
	{ "&ndash;",    8211 },  /* en dash */
	{ "&mdash;",    8212 },  /* em dash */
	{ "&lsquo;",    8216 },  /* left single quotation mark */
	{ "&rsquo;",    8217 },  /* right single quot. mark */
	{ "&sbquo;",    8218 },  /* single low-9 quot. mark */
	{ "&ldquo;",    8220 },  /* left double quot. mark */
	{ "&rdquo;",    8221 },  /* right double quot. mark */
	{ "&bdquo;",    8222 },  /* double low-9 quot. mark */
	{ "&dagger;",   8224 },  /* dagger */
	{ "&Dagger;",   8225 },  /* double dagger */
	{ "&permil;",   8240 },  /* per mille sign */
	{ "&lsaquo;",   8249 },  /* sin. left angle quot mark */
	{ "&rsaquo;",   8250 },  /* sin. right angle quot mark */
	{ "&euro;",     8364 },  /* euro sign */

	/* Symbols and Greek characters */
	{ "&fnof;",      402 },  /* small f with hook = function */
	{ "&Alpha;",     913 },  /* capital alpha */
	{ "&Beta;",      914 },  /* capital beta */
	{ "&Gamma;",     915 },  /* capital gamma */
	{ "&Delta;",     916 },  /* capital delta */
	{ "&Epsilon;",   917 },  /* capital epsilon */
	{ "&Zeta;",      918 },  /* capital zeta */
	{ "&Eta;",       919 },  /* capital eta */
	{ "&Theta;",     920 },  /* capital theta */
	{ "&Iota;",      921 },  /* capital iota */
	{ "&Kappa;",     922 },  /* capital kappa */
	{ "&Lambda;",    923 },  /* capital lambda */
	{ "&Mu;",        924 },  /* capital mu */
	{ "&Nu;",        925 },  /* capital nu */
	{ "&Xi;",        926 },  /* capital xi */
	{ "&Omicron;",   927 },  /* capital omicron */
	{ "&Pi;",        928 },  /* capital pi */
	{ "&Rho;",       929 },  /* capital rho */
	{ "&Sigma;",     931 },  /* capital sigma */
	{ "&Tau;",       932 },  /* capital tau */
	{ "&Upsilon;",   933 },  /* capital upsilon */
	{ "&Phi;",       934 },  /* capital phi */
	{ "&Chi;",       935 },  /* capital chi */
	{ "&Psi;",       936 },  /* capital psi */
	{ "&Omega;",     937 },  /* capital omega */
	{ "&alpha;",     945 },  /* small alpha */
	{ "&beta;",      946 },  /* small beta */
	{ "&gamma;",     947 },  /* small gamma */
	{ "&delta;",     948 },  /* small delta */
	{ "&epsilon;",   949 },  /* small epsilon */
	{ "&zeta;",      950 },  /* small zeta */
	{ "&eta;",       951 },  /* small eta */
	{ "&theta;",     952 },  /* small theta */
	{ "&iota;",      953 },  /* small iota */
	{ "&kappa;",     954 },  /* small kappa */
	{ "&lambda;",    955 },  /* small lambda */
	{ "&mu;",        956 },  /* small mu */
	{ "&nu;",        957 },  /* small nu */
	{ "&xi;",        958 },  /* small xi */
	{ "&omicron;",   959 },  /* small omicron */
	{ "&pi;",        960 },  /* small pi */
	{ "&rho;",       961 },  /* small rho */
	{ "&sigmaf;",    962 },  /* small final sigma */
	{ "&sigma;",     963 },  /* small sigma */
	{ "&tau;",       964 },  /* small tau */
	{ "&upsilon;",   965 },  /* small upsilon */
	{ "&phi;",       966 },  /* small phi */
	{ "&chi;",       967 },  /* small chi */
	{ "&psi;",       968 },  /* small psi */
	{ "&omega;",     969 },  /* small omega */
	{ "&thetasym;",  977 },  /* small theta symbol */
	{ "&upsih;",     978 },  /* upsilon with hook symbol */
	{ "&piv;",       982 },  /* pi symbol */
	{ "&bull;",     8226 },  /* bullet = small blk circle */
	{ "&hellip;",   8230 },  /* horizontal ellipsis */
	{ "&prime;",    8242 },  /* prime = minutes = feet */
	{ "&Prime;",    8243 },  /* double prime */
	{ "&oline;",    8254 },  /* overline */
	{ "&frasl;",    8260 },  /* fraction slash */
	{ "&weierp;",   8472 },  /* Weierstrass p = power set */
	{ "&image;",    8465 },  /* imaginary part-black cap I */
	{ "&real;",     8476 },  /* real part-black cap R */
	{ "&trade;",    8482 },  /* trademark sign */
	{ "&alefsym;",  8501 },  /* alef symbol */
	{ "&larr;",     8592 },  /* left arrow */
	{ "&uarr;",     8593 },  /* up arrow */
	{ "&rarr;",     8594 },  /* right arrow */
	{ "&darr;",     8595 },  /* down arrow */
	{ "&harr;",     8596 },  /* left/right arrow */
	{ "&crarr;",    8629 },  /* down arrow with corner left */
	{ "&lArr;",     8656 },  /* left double arrow */
	{ "&uArr;",     8657 },  /* up double arrow */
	{ "&rArr;",     8658 },  /* right double arrow */
	{ "&dArr;",     8659 },  /* down double arrow */
	{ "&hArr;",     8660 },  /* left/right double arrow */
	{ "&forall;",   8704 },  /* for all */
	{ "&part;",     8706 },  /* partial differential */
	{ "&exist;",    8707 },  /* there exists */
	{ "&empty;",    8709 },  /* empty set */
	{ "&nabla;",    8711 },  /* nabla=backwards difference */
	{ "&isin;",     8712 },  /* element of */
	{ "&notin;",    8713 },  /* not an element of */
	{ "&ni;",       8715 },  /* contains as member */
	{ "&prod;",     8719 },  /* n-ary product */
	{ "&sum;",      8721 },  /* n-ary summation */
	{ "&minus;",    8722 },  /* minus sign */
	{ "&lowast;",   8727 },  /* asterisk operator */
	{ "&radic;",    8730 },  /* square root */
	{ "&prop;",     8733 },  /* proportional to */
	{ "&infin;",    8734 },  /* infinity */
	{ "&ang;",      8736 },  /* angle */
	{ "&and;",      8743 },  /* logical and */
	{ "&or;",       8744 },  /* logical or */
	{ "&cap;",      8745 },  /* intersection */
	{ "&cup;",      8746 },  /* union */
	{ "&int;",      8747 },  /* integral */
	{ "&there4;",   8756 },  /* therefore */
	{ "&sim;",      8764 },  /* tilde operator */
	{ "&cong;",     8773 },  /* approximately equal to */
	{ "&asymp;",    8776 },  /* asymptotic to */
	{ "&ne;",       8800 },  /* not equal to */
	{ "&equiv;",    8801 },  /* identical to */
	{ "&le;",       8804 },  /* less-than or equal to */
	{ "&ge;",       8805 },  /* greater-than or equal to */
	{ "&sub;",      8834 },  /* subset of */
	{ "&sup;",      8835 },  /* superset of */
	{ "&nsub;",     8836 },  /* not a subset of */
	{ "&sube;",     8838 },  /* subset of or equal to */
	{ "&supe;",     8839 },  /* superset of or equal to */
	{ "&oplus;",    8853 },  /* circled plus = direct sum */
	{ "&otimes;",   8855 },  /* circled times = vec prod */
	{ "&perp;",     8869 },  /* perpendicular */
	{ "&sdot;",     8901 },  /* dot operator */
	{ "&lceil;",    8968 },  /* left ceiling */
	{ "&rceil;",    8969 },  /* right ceiling */
	{ "&lfloor;",   8970 },  /* left floor */
	{ "&rfloor;",   8971 },  /* right floor */
	{ "&lang;",     9001 },  /* left angle bracket */
	{ "&rang;",     9002 },  /* right angle bracket */
	{ "&loz;",      9674 },  /* lozenge */
	{ "&spades;",   9824 },  /* spades */
	{ "&clubs;",    9827 },  /* clubs */
	{ "&hearts;",   9829 },  /* hearts */
	{ "&diams;",    9830 },  /* diamonds */

	/* Latin-1 */
	/* NOTE(review): &nbsp; is stored as a plain space (32), not U+00A0 (160);
	 * presumably deliberate -- confirm before changing. */
	{ "&nbsp;",       32 },  /* non-breaking space */
	{ "&iexcl;",     161 },  /* inverted exclamation mark */
	{ "&cent;",      162 },  /* cent sign */
	{ "&pound;",     163 },  /* pound sign */
	{ "&curren;",    164 },  /* currency sign */
	{ "&yen;",       165 },  /* yen sign */
	{ "&brvbar;",    166 },  /* broken vertical bar */
	{ "&sect;",      167 },  /* section sign */
	{ "&uml;",       168 },  /* diaeresis - spacing diaeresis */
	{ "&copy;",      169 },  /* copyright sign */
	{ "&ordf;",      170 },  /* feminine ordinal indicator */
	{ "&laquo;",     171 },  /* left-pointing guillemet */
	{ "&not;",       172 },  /* not sign */
	{ "&shy;",       173 },  /* soft (discretionary) hyphen */
	{ "&reg;",       174 },  /* registered sign */
	{ "&macr;",      175 },  /* macron = overline */
	{ "&deg;",       176 },  /* degree sign */
	{ "&plusmn;",    177 },  /* plus-minus sign */
	{ "&sup2;",      178 },  /* superscript two */
	{ "&sup3;",      179 },  /* superscript three */
	{ "&acute;",     180 },  /* acute accent = spacing acute */
	{ "&micro;",     181 },  /* micro sign */
	{ "&para;",      182 },  /* pilcrow (paragraph) sign */
	{ "&middot;",    183 },  /* middle dot (georgian comma) */
	{ "&cedil;",     184 },  /* cedilla = spacing cedilla */
	{ "&sup1;",      185 },  /* superscript one */
	{ "&ordm;",      186 },  /* masculine ordinal indicator */
	{ "&raquo;",     187 },  /* right pointing guillemet */
	{ "&frac14;",    188 },  /* 1/4 */
	{ "&frac12;",    189 },  /* 1/2 */
	{ "&frac34;",    190 },  /* 3/4 */
	{ "&iquest;",    191 },  /* inverted question mark */
	{ "&Agrave;",    192 },  /* cap A with grave */
	{ "&Aacute;",    193 },  /* cap A with acute */
	{ "&Acirc;",     194 },  /* cap A with circumflex */
	{ "&Atilde;",    195 },  /* cap A with tilde */
	{ "&Auml;",      196 },  /* cap A with diaeresis */
	{ "&Aring;",     197 },  /* cap A with ring */
	{ "&AElig;",     198 },  /* cap AE ligature */
	{ "&Ccedil;",    199 },  /* cap C with cedilla */
	{ "&Egrave;",    200 },  /* cap E with grave */
	{ "&Eacute;",    201 },  /* cap E with acute */
	{ "&Ecirc;",     202 },  /* cap E with circumflex */
	{ "&Euml;",      203 },  /* cap E with diaeresis */
	{ "&Igrave;",    204 },  /* cap I with grave */
	{ "&Iacute;",    205 },  /* cap I with acute */
	{ "&Icirc;",     206 },  /* cap I with circumflex */
	{ "&Iuml;",      207 },  /* cap I with diaeresis */
	{ "&ETH;",       208 },  /* cap letter ETH */
	{ "&Ntilde;",    209 },  /* cap N with tilde */
	{ "&Ograve;",    210 },  /* cap O with grave */
	{ "&Oacute;",    211 },  /* cap O with acute */
	{ "&Ocirc;",     212 },  /* cap O with circumflex */
	{ "&Otilde;",    213 },  /* cap O with tilde */
	{ "&Ouml;",      214 },  /* cap O with diaeresis */
	{ "&times;",     215 },  /* multiplication sign */
	{ "&Oslash;",    216 },  /* cap O with stroke */
	{ "&Ugrave;",    217 },  /* cap U with grave */
	{ "&Uacute;",    218 },  /* cap U with acute */
	{ "&Ucirc;",     219 },  /* cap U with circumflex */
	{ "&Uuml;",      220 },  /* cap U with diaeresis */
	{ "&Yacute;",    221 },  /* cap Y with acute */
	{ "&THORN;",     222 },  /* cap letter THORN */
	{ "&szlig;",     223 },  /* small sharp s = ess-zed */
	{ "&agrave;",    224 },  /* small a with grave */
	{ "&aacute;",    225 },  /* small a with acute */
	{ "&acirc;",     226 },  /* small a with circumflex */
	{ "&atilde;",    227 },  /* small a with tilde */
	{ "&auml;",      228 },  /* small a with diaeresis */
	{ "&aring;",     229 },  /* small a with ring */
	{ "&aelig;",     230 },  /* small ligature ae */
	{ "&ccedil;",    231 },  /* small c with cedilla */
	{ "&egrave;",    232 },  /* small e with grave */
	{ "&eacute;",    233 },  /* small e with acute */
	{ "&ecirc;",     234 },  /* small e with circumflex */
	{ "&euml;",      235 },  /* small e with diaeresis */
	{ "&igrave;",    236 },  /* small i with grave */
	{ "&iacute;",    237 },  /* small i with acute */
	{ "&icirc;",     238 },  /* small i with circumflex */
	{ "&iuml;",      239 },  /* small i with diaeresis */
	{ "&eth;",       240 },  /* latin small letter eth */
	{ "&ntilde;",    241 },  /* small n with tilde */
	{ "&ograve;",    242 },  /* small o with grave */
	{ "&oacute;",    243 },  /* small o with acute */
	{ "&ocirc;",     244 },  /* small o with circumflex */
	{ "&otilde;",    245 },  /* small o with tilde */
	{ "&ouml;",      246 },  /* small o with diaeresis */
	{ "&divide;",    247 },  /* division sign */
	{ "&oslash;",    248 },  /* small o with slash */
	{ "&ugrave;",    249 },  /* small u with grave */
	{ "&uacute;",    250 },  /* small u with acute */
	{ "&ucirc;",     251 },  /* small u with circumflex */
	{ "&uuml;",      252 },  /* small u with diaeresis */
	{ "&yacute;",    253 },  /* small y with acute */
	{ "&thorn;",     254 },  /* latin small letter thorn */
	{ "&yuml;",      255 },  /* small y with diaeresis */
};
/*
 * decode html entity
 *
 * Look up the named entity that starts at s[*pi] (the '&' character)
 * in html_entities[].
 *
 * Entity names are case-sensitive ("&alpha;" and "&Alpha;" are different
 * characters), so an exact match is tried first.  Only when no entry
 * matches exactly is a case-insensitive comparison used, so that
 * non-standard spellings such as "&AMP;" are still accepted.
 *
 * On success: *err = 0, *pi is advanced past the terminating ';' and the
 *             table value is returned.
 * On failure: *err = 1, *pi is left unchanged and '&' is returned.
 */
static unsigned int
decode_html_entity( char *s, unsigned int *pi, int *err )
{
	int nhtml_entities = sizeof( html_entities ) / sizeof( html_entities[0] );
	const char *p = &(s[*pi]);
	const char *e;
	size_t len;
	int i, pass, diff, n = -1;

	/* pass 0: exact match, pass 1: case-insensitive fallback */
	for ( pass=0; pass<2 && n==-1; ++pass ) {
		for ( i=0; i<nhtml_entities && n==-1; ++i ) {
			e = html_entities[i].html;
			len = strlen( e );
			if ( pass==0 ) diff = strncmp( p, e, len );
			else diff = strncasecmp( p, e, len );
			if ( diff==0 ) n = i;
		}
	}

	if ( n==-1 ) {
		*err = 1;
		return '&';
	}

	*err = 0;
	*pi += (unsigned int) strlen( html_entities[n].html );
	return html_entities[n].unicode;
}
/*
 * decode decimal entity
 *
 * extract a decimal entity from &#NNNN;
 * s[*pi] points to the '&' character
 *
 * On success *pi is advanced past the terminating ';' and the value is
 * returned; *err is not modified (the caller clears it beforehand).
 *
 * On failure *err is set to 1 and *pi is left unchanged.  Failures are:
 *   - no digits at all ("&#;")
 *   - missing terminating ';'
 *   - a value larger than 0x10FFFF, the highest Unicode code point
 *     (this also keeps the accumulator from wrapping around)
 */
static unsigned int
decode_decimal_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int max_code_point = 0x10FFFF;
	unsigned int c = 0, ndigits = 0;
	unsigned int pos = *pi + 2;  /* skip over "&#" */

	while ( isdigit( (unsigned char) s[pos] ) ) {
		c = 10 * c + (unsigned int)( s[pos] - '0' );
		if ( c > max_code_point ) {
			*err = 1;
			return c;
		}
		ndigits++;
		pos++;
	}

	if ( ndigits==0 || s[pos]!=';' ) *err = 1;
	else *pi = pos + 1;

	return c;
}
/*
 * decode hex entity
 *
 * extract a hex entity from &#xNNNN;
 * s[*pi] points to the '&' character
 *
 * Hex digits are accepted in either case.
 *
 * On success *pi is advanced past the terminating ';' and the value is
 * returned; *err is not modified (the caller clears it beforehand).
 *
 * On failure *err is set to 1 and *pi is left unchanged.  Failures are:
 *   - no hex digits at all ("&#x;")
 *   - missing terminating ';'
 *   - a value larger than 0x10FFFF, the highest Unicode code point
 *     (this also keeps the accumulator from wrapping around)
 */
static unsigned int
decode_hex_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int max_code_point = 0x10FFFF;
	unsigned int c = 0, d, ndigits = 0;
	unsigned int pos = *pi + 3;  /* skip over "&#x" */
	int ch;

	while ( isxdigit( (unsigned char) s[pos] ) ) {
		ch = (unsigned char) s[pos];
		if ( isdigit( ch ) ) d = (unsigned int)( ch - '0' );
		else d = (unsigned int)( toupper( ch ) - 'A' + 10 );
		c = 16 * c + d;
		if ( c > max_code_point ) {
			*err = 1;
			return c;
		}
		ndigits++;
		pos++;
	}

	if ( ndigits==0 || s[pos]!=';' ) *err = 1;
	else *pi = pos + 1;

	return c;
}
/*
 * decode numeric entity
 *
 * Dispatch a numeric character reference, &#NNN; or &#xNNNN;, to the
 * decimal or hexadecimal decoder.  s[*pi] is the '&' and s[*pi+1] the
 * '#'; the character after that selects the decoder.
 *
 * XML requires the hexadecimal marker to be a lowercase "x"; an
 * uppercase "X" is tolerated here as well.
 *
 * *err is cleared before decoding.  If the selected decoder reports an
 * error, '&' is returned and *pi is moved forward by one position.
 */
static unsigned int
decode_numeric_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int value;
	char marker;

	*err = 0;
	marker = s[*pi+2];

	if ( marker=='x' || marker=='X' )
		value = decode_hex_entity( s, pi, err );
	else
		value = decode_decimal_entity( s, pi, err );

	if ( !*err ) return value;

	/* malformed reference: hand back the literal ampersand */
	*pi += 1;
	return '&';
}
/*
 * decode entity
 * extract entity from &mmmm;
 *
 * where &mmmm; is one of
 * - &#nnnn; is code point in decimal form
 * - &#xhhhh; is code point in hexadecimal form (note "x" is lowercase in XML)
 * - &mmmm; corresponds to a pre-defined entity, e.g. &quot; for quotations
 *
 * s        string being decoded
 * pi       in: index of the character to decode; out: index of the next
 *          unread character
 * unicode  set to 1 when the named-entity lookup was used, 0 otherwise
 * err      set to 0 on success, 1 on failure
 *
 * On success the decoded value is returned and *pi points just past the
 * terminating ';'.
 *
 * On failure exactly one character is consumed (*pi ends up one past its
 * starting value).  If s[*pi] was not '&' that character is returned;
 * otherwise the literal '&' is returned so the rest of the malformed
 * entity is processed as ordinary text.
 */
unsigned int
decode_entity( char *s, unsigned int *pi, int *unicode, int *err )
{
	unsigned int start = *pi;
	unsigned int c;

	*unicode = 0;
	*err = 0;

	if ( s[start]!='&' ) {
		*err = 1;  /* need to start with ampersand */
		/* go through unsigned char so bytes >= 0x80 are not sign-extended */
		c = (unsigned char) s[start];
	} else if ( s[start+1]=='#' ) {
		c = decode_numeric_entity( s, pi, err );
	} else {
		c = decode_html_entity( s, pi, err );
		*unicode = 1;
	}

	/*
	 * Anchor the failure position to the starting index rather than
	 * incrementing *pi: a helper may already have moved it, and a second
	 * increment would silently drop the character after the '&'.
	 */
	if ( *err ) *pi = start + 1;

	return c;
}