Blame bibutils/entities.c

Packit 89ede9
/*
Packit 89ede9
 * entities.c
Packit 89ede9
 *
Packit 89ede9
 * Copyright (c) Chris Putnam 2003-2018
Packit 89ede9
 *
Packit 89ede9
 * Source code released under the GPL version 2
Packit 89ede9
 *
Packit 89ede9
 */
Packit 89ede9
#include <stdio.h>
Packit 89ede9
#include <string.h>
Packit 89ede9
#include <ctype.h>
Packit 89ede9
#include "entities.h"
Packit 89ede9
Packit 89ede9
/* HTML 4.0 entities */
Packit 89ede9
Packit 89ede9
/*
 * One row of the named-entity lookup table.
 *
 *    html    - entity exactly as it appears in text, including the
 *              leading '&' and the terminating ';'
 *    unicode - code point the entity is decoded to
 */
typedef struct entities {
	char html[20];
	unsigned int unicode;
} entities;

/*
 * Named entities recognized by decode_html_entity().
 *
 * Several names differ only in letter case (&Alpha; / &alpha;,
 * &Prime; / &prime;, &lArr; / &larr;, ...), so the spelling of each
 * entry must be kept exact.
 */
entities html_entities[] = {
	/* Special Entities */
	{ "&quot;",     34 },  /* quotation mark */
	{ "&amp;",      38 },  /* ampersand */
	{ "&apos;",     39 },  /* apostrophe (note not defined in HTML) */
	{ "&lpar;",     40 },  /* left parenthesis */
	{ "&rpar;",     41 },  /* right parenthesis */
	{ "&hyphen;",   45 },  /* hyphen */
	{ "&lt;",       60 },  /* less-than sign */
	{ "&gt;",       62 },  /* greater-than sign */
	{ "&quest;",    63 },  /* question mark */
	{ "&OElig;",   338 },  /* Latin cap ligature OE */
	{ "&oelig;",   339 },  /* Latin small ligature OE */
	{ "&Scaron;",  352 },  /* Latin cap S with caron */
	{ "&scaron;",  353 },  /* Latin small s with caron */
	{ "&Yuml;",    376 },  /* Latin cap Y with diaeresis */
	{ "&circ;",    710 },  /* modifier letter circumflex */
	{ "&tilde;",   732 },  /* small tilde */
	{ "&ensp;",   8194 }, /* en space */
	{ "&emsp;",   8195 }, /* em space */
	{ "&thinsp;", 8201 }, /* thin space */
	{ "&zwnj;",   8204 }, /* zero width non-joiner */
	{ "&zwj;",    8205 }, /* zero width joiner */
	{ "&lrm;",    8206 }, /* left-to-right mark */
	{ "&rlm;",    8207 }, /* right-to-left mark */
	{ "&ndash;",  8211 }, /* en dash */
	{ "&mdash;",  8212 }, /* em dash */
	{ "&lsquo;",  8216 }, /* left single quotation mark */
	{ "&rsquo;",  8217 }, /* right single quot. mark */
	{ "&sbquo;",  8218 }, /* single low-9 quot. mark */
	{ "&ldquo;",  8220 }, /* left double quot. mark */
	{ "&rdquo;",  8221 }, /* right double quot. mark */
	{ "&bdquo;",  8222 }, /* double low-9 quot. mark */
	{ "&dagger;", 8224 }, /* dagger */
	{ "&Dagger;", 8225 }, /* double dagger */
	{ "&permil;", 8240 }, /* per mille sign */
	{ "&lsaquo;", 8249 }, /* sin. left angle quot mark */
	{ "&rsaquo;", 8250 }, /* sin. right angle quot mark */
	{ "&euro;",   8364 }, /* euro sign */
	/* Symbols and Greek characters */
	{ "&fnof;",    402 }, /* small f with hook = function */
	{ "&Alpha;",   913 }, /* capital alpha */
	{ "&Beta;",    914 }, /* capital beta */
	{ "&Gamma;",   915 }, /* capital gamma */
	{ "&Delta;",   916 }, /* capital delta */
	{ "&Epsilon;", 917 }, /* capital epsilon */
	{ "&Zeta;",    918 }, /* capital zeta */
	{ "&Eta;",     919 }, /* capital eta */
	{ "&Theta;",   920 }, /* capital theta */
	{ "&Iota;",    921 }, /* capital iota */
	{ "&Kappa;",   922 }, /* capital kappa */
	{ "&Lambda;",  923 }, /* capital lambda */
	{ "&Mu;",      924 }, /* capital mu */
	{ "&Nu;",      925 }, /* capital nu */
	{ "&Xi;",      926 }, /* capital xi */
	{ "&Omicron;", 927 }, /* capital omicron */
	{ "&Pi;",      928 }, /* capital pi */
	{ "&Rho;",     929 }, /* capital rho */
	{ "&Sigma;",   931 }, /* capital sigma */
	{ "&Tau;",     932 }, /* capital tau */
	{ "&Upsilon;", 933 }, /* capital upsilon */
	{ "&Phi;",     934 }, /* capital phi */
	{ "&Chi;",     935 }, /* capital chi */
	{ "&Psi;",     936 }, /* capital psi */
	{ "&Omega;",   937 }, /* capital omega */
	{ "&alpha;",   945 }, /* small alpha */
	{ "&beta;",    946 }, /* small beta */
	{ "&gamma;",   947 }, /* small gamma */
	{ "&delta;",   948 }, /* small delta */
	{ "&epsilon;", 949 }, /* small epsilon */
	{ "&zeta;",    950 }, /* small zeta */
	{ "&eta;",     951 }, /* small eta */
	{ "&theta;",   952 }, /* small theta */
	{ "&iota;",    953 }, /* small iota */
	{ "&kappa;",   954 }, /* small kappa */
	{ "&lambda;",  955 }, /* small lambda */
	{ "&mu;",      956 }, /* small mu */
	{ "&nu;",      957 }, /* small nu */
	{ "&xi;",      958 }, /* small xi */
	{ "&omicron;", 959 }, /* small omicron */
	{ "&pi;",      960 }, /* small pi */
	{ "&rho;",     961 }, /* small rho */
	{ "&sigmaf;",  962 }, /* small final sigma */
	{ "&sigma;",   963 }, /* small sigma */
	{ "&tau;",     964 }, /* small tau */
	{ "&upsilon;", 965 }, /* small upsilon */
	{ "&phi;",     966 }, /* small phi */
	{ "&chi;",     967 }, /* small chi */
	{ "&psi;",     968 }, /* small psi */
	{ "&omega;",   969 }, /* small omega */
	{ "&thetasym;",977 }, /* small theta symbol */
	{ "&upsih;",   978 }, /* upsilon with hook symbol */
	{ "&piv;",     982 }, /* pi symbol */
	{ "&bull;",   8226 }, /* bullet = small blk circle */
	{ "&hellip;", 8230 }, /* horizontal ellipsis */
	{ "&prime;",  8242 }, /* prime = minutes = feet */
	{ "&Prime;",  8243 }, /* double prime */
	{ "&oline;",  8254 }, /* overline */
	{ "&frasl;",  8260 }, /* fraction slash */
	{ "&weierp;", 8472 }, /* Weierstrass p = power set */
	{ "&image;",  8465 }, /* imaginary part-black cap I */
	{ "&real;",   8476 }, /* real part-black cap R */
	{ "&trade;",  8482 }, /* trademark sign */
	{ "&alefsym;",8501 }, /* alef symbol */
	{ "&larr;",   8592 }, /* left arrow */
	{ "&uarr;",   8593 }, /* up arrow */
	{ "&rarr;",   8594 }, /* right arrow */
	{ "&darr;",   8595 }, /* down arrow */
	{ "&harr;",   8596 }, /* left/right arrow */
	{ "&crarr;",  8629 }, /* down arrow with corner left */
	{ "&lArr;",   8656 }, /* left double arrow */
	{ "&uArr;",   8657 }, /* up double arrow */
	{ "&rArr;",   8658 }, /* right double arrow */
	{ "&dArr;",   8659 }, /* down double arrow */
	{ "&hArr;",   8660 }, /* left/right double arrow */
	{ "&forall;", 8704 }, /* for all */
	{ "&part;",   8706 }, /* partial differential */
	{ "&exist;",  8707 }, /* there exists */
	{ "&empty;",  8709 }, /* empty set */
	{ "&nabla;",  8711 }, /* nabla=backwards difference */
	{ "&isin;",   8712 }, /* element of */
	{ "&notin;",  8713 }, /* not an element of */
	{ "&ni;",     8715 }, /* contains as member */
	{ "&prod;",   8719 }, /* n-ary product */
	{ "&sum;",    8721 }, /* n-ary summation */
	{ "&minus;",  8722 }, /* minus sign */
	{ "&lowast;", 8727 }, /* asterisk operator */
	{ "&radic;",  8730 }, /* square root */
	{ "&prop;",   8733 }, /* proportional to */
	{ "&infin;",  8734 }, /* infinity */
	{ "&ang;",    8736 }, /* angle */
	{ "&and;",    8743 }, /* logical and */
	{ "&or;",     8744 }, /* logical or */
	{ "&cap;",    8745 }, /* intersection */
	{ "&cup;",    8746 }, /* union */
	{ "&int;",    8747 }, /* integral */
	{ "&there4;", 8756 }, /* therefore */
	{ "&sim;",    8764 }, /* tilde operator */
	{ "&cong;",   8773 }, /* approximately equal to */
	{ "&asymp;",  8776 }, /* asymptotic to */
	{ "&ne;",     8800 }, /* not equal to */
	{ "&equiv;",  8801 }, /* identical to */
	{ "&le;",     8804 }, /* less-than or equal to */
	{ "&ge;",     8805 }, /* greater-than or equal to */
	{ "&sub;",    8834 }, /* subset of */
	{ "&sup;",    8835 }, /* superset of */
	{ "&nsub;",   8836 }, /* not a subset of */
	{ "&sube;",   8838 }, /* subset of or equal to */
	{ "&supe;",   8839 }, /* superset of or equal to */
	{ "&oplus;",  8853 }, /* circled plus = direct sum */
	{ "&otimes;", 8855 }, /* circled times = vec prod */
	{ "&perp;",   8869 }, /* perpendicular */
	{ "&sdot;",   8901 }, /* dot operator */
	{ "&lceil;",  8968 }, /* left ceiling */
	{ "&rceil;",  8969 }, /* right ceiling */
	{ "&lfloor;", 8970 }, /* left floor */
	{ "&rfloor;", 8971 }, /* right floor */
	{ "&lang;",   9001 }, /* left angle bracket */
	{ "&rang;",   9002 }, /* right angle bracket */
	{ "&loz;",    9674 }, /* lozenge */
	{ "&spades;", 9824 }, /* spades */
	{ "&clubs;",  9827 }, /* clubs */
	{ "&hearts;", 9829 }, /* hearts */
	{ "&diams;",  9830 }, /* diamonds */
	/* Latin-1 */
	/* NOTE(review): &nbsp; is mapped to a plain space (32), not to
	 * U+00A0 (160); this looks deliberate -- confirm before changing. */
	{ "&nbsp;",    32 },  /* non-breaking space */
	{ "&iexcl;",  161 },  /* inverted exclamation mark */
	{ "&cent;",   162 },  /* cent sign */
	{ "&pound;",  163 },  /* pound sign */
	{ "&curren;", 164 },  /* currency sign */
	{ "&yen;",    165 },  /* yen sign */
	{ "&brvbar;", 166 },  /* broken vertical bar */
	{ "&sect;",   167 },  /* section sign */
	{ "&uml;",    168 },  /* diaeresis - spacing diaeresis */
	{ "&copy;",   169 },  /* copyright sign */
	{ "&ordf;",   170 },  /* feminine ordinal indicator */
	{ "&laquo;",  171 },  /* left-pointing guillemet */
	{ "&not;",    172 },  /* not sign */
	{ "&shy;",    173 },  /* soft (discretionary) hyphen */
	{ "&reg;",    174 },  /* registered sign */
	{ "&macr;",   175 },  /* macron = overline */
	{ "&deg;",    176 },  /* degree sign */
	{ "&plusmn;", 177 },  /* plus-minus sign */
	{ "&sup2;",   178 },  /* superscript two */
	{ "&sup3;",   179 },  /* superscript three */
	{ "&acute;",  180 },  /* acute accent = spacing acute */
	{ "&micro;",  181 },  /* micro sign */
	{ "&para;",   182 },  /* pilcrow (paragraph) sign */
	{ "&middot;", 183 },  /* middle dot (georgian comma) */
	{ "&cedil;",  184 },  /* cedilla = spacing cedilla */
	{ "&sup1;",   185 },  /* superscript one */
	{ "&ordm;",   186 },  /* masculine ordinal indicator */
	{ "&raquo;",  187 },  /* right pointing guillemet */
	{ "&frac14;", 188 },  /* 1/4 */
	{ "&frac12;", 189 },  /* 1/2 */
	{ "&frac34;", 190 },  /* 3/4 */
	{ "&iquest;", 191 },  /* inverted question mark */
	{ "&Agrave;", 192 },  /* cap A with grave */
	{ "&Aacute;", 193 },  /* cap A with acute */
	{ "&Acirc;",  194 },  /* cap A with circumflex */
	{ "&Atilde;", 195 },  /* cap A with tilde */
	{ "&Auml;",   196 },  /* cap A with diaeresis */
	{ "&Aring;",  197 },  /* cap A with ring */
	{ "&AElig;",  198 },  /* cap AE ligature */
	{ "&Ccedil;", 199 },  /* cap C with cedilla */
	{ "&Egrave;", 200 },  /* cap E with grave */
	{ "&Eacute;", 201 },  /* cap E with acute */
	{ "&Ecirc;",  202 },  /* cap E with circumflex */
	{ "&Euml;",   203 },  /* cap E with diaeresis */
	{ "&Igrave;", 204 },  /* cap I with grave */
	{ "&Iacute;", 205 },  /* cap I with acute */
	{ "&Icirc;",  206 },  /* cap I with circumflex */
	{ "&Iuml;",   207 },  /* cap I with diaeresis */
	{ "&ETH;",    208 },  /* cap letter ETH */
	{ "&Ntilde;", 209 },  /* cap N with tilde */
	{ "&Ograve;", 210 },  /* cap O with grave */
	{ "&Oacute;", 211 },  /* cap O with acute */
	{ "&Ocirc;",  212 },  /* cap O with circumflex */
	{ "&Otilde;", 213 },  /* cap O with tilde */
	{ "&Ouml;",   214 },  /* cap O with diaeresis */
	{ "&times;",  215 },  /* multiplication sign */
	{ "&Oslash;", 216 },  /* cap O with stroke */
	{ "&Ugrave;", 217 },  /* cap U with grave */
	{ "&Uacute;", 218 },  /* cap U with acute */
	{ "&Ucirc;",  219 },  /* cap U with circumflex */
	{ "&Uuml;",   220 },  /* cap U with diaeresis */
	{ "&Yacute;", 221 },  /* cap Y with acute */
	{ "&THORN;",  222 },  /* cap letter THORN */
	{ "&szlig;",  223 },  /* small sharp s = ess-zed */
	{ "&agrave;", 224 },  /* small a with grave */
	{ "&aacute;", 225 },  /* small a with acute */
	{ "&acirc;",  226 },  /* small a with circumflex */
	{ "&atilde;", 227 },  /* small a with tilde */
	{ "&auml;",   228 },  /* small a with diaeresis */
	{ "&aring;",  229 },  /* small a with ring */
	{ "&aelig;",  230 },  /* small ligature ae */
	{ "&ccedil;", 231 },  /* small c with cedilla */
	{ "&egrave;", 232 },  /* small e with grave */
	{ "&eacute;", 233 },  /* small e with acute */
	{ "&ecirc;",  234 },  /* small e with circumflex */
	{ "&euml;",   235 },  /* small e with diaeresis */
	{ "&igrave;", 236 },  /* small i with grave */
	{ "&iacute;", 237 },  /* small i with acute */
	{ "&icirc;",  238 },  /* small i with circumflex */
	{ "&iuml;",   239 },  /* small i with diaeresis */
	{ "&eth;",    240 },  /* latin small letter eth */
	{ "&ntilde;", 241 },  /* small n with tilde */
	{ "&ograve;", 242 },  /* small o with grave */
	{ "&oacute;", 243 },  /* small o with acute */
	{ "&ocirc;",  244 },  /* small o with circumflex */
	{ "&otilde;", 245 },  /* small o with tilde */
	{ "&ouml;",   246 },  /* small o with diaeresis */
	{ "&divide;", 247 },  /* division sign */
	{ "&oslash;", 248 },  /* small o with slash */
	{ "&ugrave;", 249 },  /* small u with grave */
	{ "&uacute;", 250 },  /* small u with acute */
	{ "&ucirc;",  251 },  /* small u with circumflex */
	{ "&uuml;",   252 },  /* small u with diaeresis */
	{ "&yacute;", 253 },  /* small y with acute */
	{ "&thorn;",  254 },  /* latin small letter thorn */
	{ "&yuml;",   255 },  /* small y with diaeresis */
};
Packit 89ede9
Packit 89ede9
Packit 89ede9
static unsigned int
Packit 89ede9
decode_html_entity( char *s, unsigned int *pi, int *err )
Packit 89ede9
{
Packit 89ede9
	int nhtml_entities = sizeof( html_entities ) / sizeof( entities );
Packit 89ede9
	char *e;
Packit 89ede9
	int i, n=-1, len;
Packit 89ede9
	for ( i=0; i
Packit 89ede9
		e = &(html_entities[i].html[0]);
Packit 89ede9
		len = strlen( e );
Packit 89ede9
		if ( !strncasecmp( &(s[*pi]), e, len ) ) {
Packit 89ede9
			n = i;
Packit 89ede9
			*pi += len;
Packit 89ede9
		}
Packit 89ede9
	}
Packit 89ede9
	if ( n==-1 ) {
Packit 89ede9
		*err = 1;
Packit 89ede9
		return '&';
Packit 89ede9
	} else {
Packit 89ede9
		*err = 0;
Packit 89ede9
		return html_entities[n].unicode;
Packit 89ede9
	}
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
Packit 89ede9
/*
 * decode decimal entity
 *
 *    extract a decimal entity from &#NNNN;
 *    s[*pi] points to the '&' character
 *
 * On success *pi is moved just past the ';' and the accumulated
 * value is returned.  If there are no digits at all ("&#;") or the
 * terminating ';' is missing, *err is set to 1 and *pi is left
 * unchanged.  *err is never cleared here; the caller initializes it.
 */
static unsigned int
decode_decimal_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int c = 0, d;
	unsigned int i = *pi, j = 2;  /* j=2 skips over "&#" */
	while ( isdigit( (unsigned char)s[i+j] ) ) {
		d = s[i+j] - '0';
		c = 10 * c + d;
		j++;
	}
	/* j==2: no digits were consumed; don't decode "&#;" as code point 0 */
	if ( j==2 || s[i+j]!=';' ) *err = 1;
	else *pi = i+j+1;
	return c;
}
Packit 89ede9
Packit 89ede9
/*
 * decode hex entity
 *
 *    extract a hex entity from &#xNNNN;
 *    s[*pi] points to the '&' character
 *
 * Hex digits are accepted in either case.  On success *pi is moved
 * just past the ';' and the accumulated value is returned.  If there
 * are no hex digits at all ("&#x;") or the terminating ';' is
 * missing, *err is set to 1 and *pi is left unchanged.  *err is
 * never cleared here; the caller initializes it.
 */
static unsigned int
decode_hex_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int c = 0, d;
	unsigned int i = *pi, j = 3;  /* j=3 skips over "&#x" */
	while ( isxdigit( (unsigned char)s[i+j] ) ) {
		if ( isdigit( (unsigned char)s[i+j] ) ) d = s[i+j]-'0';
		else d = toupper((unsigned char)s[i+j])-'A' + 10;
		c = 16 * c + d;
		j++;
	}
	/* j==3: no digits were consumed; don't decode "&#x;" as code point 0 */
	if ( j==3 || s[i+j]!=';' ) *err = 1;
	else *pi = i+j+1;
	return c;
}
Packit 89ede9
Packit 89ede9
/*
 * decode numeric entity
 *
 *    extract a numeric entity from &#NNN; or &#xNNNN;
 *    s[*pi] points to the '&' character
 *
 *    In XML, the "x" in hexadecimal entries should be lowercase,
 *    but we'll be generous and accept "X" as well.
 *
 * *err is cleared on entry and set by the decimal/hex helper when the
 * entity is malformed.  In that case '&' is returned and *pi is left
 * on the '&' so that no input characters are lost.
 */
static unsigned int
decode_numeric_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int c;
	*err = 0;
	if ( s[*pi+2]=='x' || s[*pi+2]=='X' ) c = decode_hex_entity( s, pi, err );
	else c = decode_decimal_entity( s, pi, err );
	/*
	 * Do not advance *pi on failure: decode_entity() already steps
	 * over the '&' when *err is set, and advancing here as well
	 * would silently swallow the '#' that follows it.
	 */
	if ( *err ) c = '&';
	return c;
}
Packit 89ede9
Packit 89ede9
/*
 * decode entity
 *    extract entity from  &mmmm;
 *
 * where &mmmm; is one of
 * - &#nnnn; is code point in decimal form
 * - &#xhhhh; is code point in hexadecimal form (note "x" is lowercase in XML)
 * - &mmmm; corresponds to a pre-defined named entity, e.g. &quot; for quotations
 *
 *    s[*pi]   must be the '&' that starts the entity
 *    *unicode is set to 1 when the named-entity lookup was used,
 *             0 otherwise
 *    *err     is set to 1 if s[*pi] is not '&' or if the decoder that
 *             was called reports a failure, else 0
 *
 * Returns the decoded value.  If s[*pi] is not '&', that character is
 * returned instead.  Whenever *err ends up set, *pi is advanced by one
 * before returning.
 */
unsigned int
decode_entity( char *s, unsigned int *pi, int *unicode, int *err )
{
	unsigned int result;

	*unicode = 0;

	if ( s[*pi]=='&' ) {
		*err = 0;
		if ( s[*pi+1]=='#' ) {
			/* "&#..." introduces a numeric entity */
			result = decode_numeric_entity( s, pi, err );
		} else {
			/* anything else is looked up by name */
			result = decode_html_entity( s, pi, err );
			*unicode = 1;
		}
	} else {
		/* need to start with ampersand */
		*err = 1;
		result = s[*pi];
	}

	/* on any failure step over the current character */
	if ( *err ) *pi += 1;

	return result;
}