/*
Wrapper interface to XML parser
Copyright (C) 1999-2007, 2009, Joe Orton <joe@manyfish.co.uk>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA
*/
#include "config.h"
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#include "ne_internal.h"
#include "ne_alloc.h"
#include "ne_xml.h"
#include "ne_utils.h"
#include "ne_string.h"
#if defined(HAVE_EXPAT)
/* expat support: */
#ifdef HAVE_XMLPARSE_H
#include "xmlparse.h"
#else
#include <expat.h>
#endif
typedef XML_Char ne_xml_char;
#if !defined(XML_MAJOR_VERSION)
#define NEED_BOM_HANDLING
#elif XML_MAJOR_VERSION < 2 && XML_MINOR_VERSION == 95 && XML_MICRO_VERSION < 2
#define NEED_BOM_HANDLING
#endif
#elif defined(HAVE_LIBXML)
/* libxml2 support: */
#include <libxml/xmlversion.h>
#include <libxml/parser.h>
typedef xmlChar ne_xml_char;
#if LIBXML_VERSION < 20619
/* 2.6.19 and earlier have broken BOM handling */
#define NEED_BOM_HANDLING
#endif
#else /* not HAVE_LIBXML */
# error need an XML parser
#endif /* not HAVE_EXPAT */
/* Approx. one screen of text: */
#define ERR_SIZE (2048)
/* One registered set of callbacks.  Handlers are linked in the order
 * they were registered via ne_xml_push_handler(); start_element()
 * offers each new element to the handlers in turn, starting from the
 * parent element's handler, until one does not decline it. */
struct handler {
ne_xml_startelm_cb *startelm_cb; /* start-element callback; called unconditionally */
ne_xml_endelm_cb *endelm_cb; /* end-element callback; skipped if NULL */
ne_xml_cdata_cb *cdata_cb; /* character-data callback; skipped if NULL */
void *userdata; /* userdata passed to each of the above. */
struct handler *next; /* next handler in the list, or NULL. */
};
#ifdef HAVE_LIBXML
static void sax_error(void *ctx, const char *msg, ...);
#endif
/* One element on the branch currently being parsed.  Allocated by
 * start_element() and released by destroy_element(); the placeholder
 * root element is allocated and freed by the parser itself. */
struct element {
const ne_xml_char *nspace; /* namespace URI; points into a declaration
                            * owned by this element or an ancestor, so
                            * it is not freed with the element */
ne_xml_char *name; /* local part of the element name (duplicated) */
int state; /* opaque state integer, as returned by the handler which
            * accepted the element */
/* Namespaces declared in this element */
ne_xml_char *default_ns; /* A default namespace, or NULL if none was
                          * declared on this element */
struct namespace *nspaces; /* List of other namespace scopes */
struct handler *handler; /* Handler for this element */
struct element *parent; /* parent element, or NULL */
};
/* We pass around a ne_xml_parser as the userdata in the parsing
* library. This maintains the current state of the parse and various
* other bits and bobs. Within the parse, we store the current branch
* of the tree, i.e., the current element and all its parents, up to
* the root, but nothing other than that. */
struct ne_xml_parser_s {
struct element *root; /* placeholder element sitting above the
                       * document element; carries the first handler
                       * and the default namespace "" */
struct element *current; /* current element in the branch */
struct handler *top_handlers; /* always points at the most recently
* pushed handler (the tail of the handler list). */
int failure; /* zero whilst parse should continue; otherwise the value
              * reported by ne_xml_parse() and ne_xml_failed() */
int prune; /* if non-zero, depth within a dead branch */
#ifdef NEED_BOM_HANDLING
int bom_pos; /* number of UTF-8 BOM bytes matched so far; set to 3 once
              * BOM detection has finished */
#endif
#ifdef HAVE_EXPAT
XML_Parser parser;
char *encoding; /* encoding named in the XML declaration (duplicated),
                 * or NULL if none has been seen */
#else
xmlParserCtxtPtr parser;
#endif
char error[ERR_SIZE]; /* human-readable error string */
};
/* The callback handlers */
static void start_element(void *userdata, const ne_xml_char *name, const ne_xml_char **atts);
static void end_element(void *userdata, const ne_xml_char *name);
static void char_data(void *userdata, const ne_xml_char *cdata, int len);
static const char *resolve_nspace(const struct element *elm,
const char *prefix, size_t pfxlen);
/* Linked list of namespace scopes: one node for each "xmlns:prefix"
 * attribute found on an element.  Both strings are duplicated when the
 * node is created and freed together with the owning element. */
struct namespace {
ne_xml_char *name; /* the prefix, i.e. the text following "xmlns:" */
ne_xml_char *uri; /* the namespace URI bound to that prefix */
struct namespace *next; /* next declaration on the same element, or NULL */
};
#ifdef HAVE_LIBXML
/* SAX callback table handed to libxml2 when the push parser context is
 * created.  Only the element start/end, character data, CDATA block and
 * error slots are filled in; every other event is ignored.  The
 * initializer is positional, so the order of entries must not change.
 * Could be const as far as we care, but libxml doesn't want that */
static xmlSAXHandler sax_handler = {
NULL, /* internalSubset */
NULL, /* isStandalone */
NULL, /* hasInternalSubset */
NULL, /* hasExternalSubset */
NULL, /* resolveEntity */
NULL, /* getEntity */
NULL, /* entityDecl */
NULL, /* notationDecl */
NULL, /* attributeDecl */
NULL, /* elementDecl */
NULL, /* unparsedEntityDecl */
NULL, /* setDocumentLocator */
NULL, /* startDocument */
NULL, /* endDocument */
start_element, /* startElement */
end_element, /* endElement */
NULL, /* reference */
char_data, /* characters */
NULL, /* ignorableWhitespace */
NULL, /* processingInstruction */
NULL, /* comment */
NULL, /* xmlParserWarning */
sax_error, /* xmlParserError */
sax_error, /* fatal error (never called by libxml2?) */
NULL, /* getParameterEntity */
char_data /* cdataBlock: CDATA sections are delivered like ordinary text */
};
/* empty attributes array to mimic expat behaviour */
static const char *const empty_atts[] = {NULL, NULL};
/* macro for determining the attributes array to pass */
#define PASS_ATTS(atts) (atts ? (const char **)(atts) : empty_atts)
#else
#define PASS_ATTS(atts) ((const char **)(atts))
/* expat callback invoked for the XML declaration.  Only the declared
 * encoding is of interest: when one is present, a private copy is
 * stored in the parser so that ne_xml_doc_encoding() can report it.
 * 'version' and 'standalone' are ignored. */
static void decl_handler(void *userdata,
                         const XML_Char *version, const XML_Char *encoding,
                         int standalone)
{
    ne_xml_parser *parser = userdata;

    if (encoding == NULL)
        return;

    parser->encoding = ne_strdup(encoding);
}
#endif /* HAVE_LIBXML */
/* Return the line number which the underlying XML parser has currently
 * reached in the input. */
int ne_xml_currentline(ne_xml_parser *p)
{
    int line;

#ifdef HAVE_EXPAT
    line = XML_GetCurrentLineNumber(p->parser);
#else
    line = p->parser->input->line;
#endif

    return line;
}
/* Return the document encoding as recorded by the underlying parser:
 * with expat, the value captured from the XML declaration (NULL if no
 * encoding was declared); with libxml2, the parser context's own
 * 'encoding' field. */
const char *ne_xml_doc_encoding(const ne_xml_parser *p)
{
#ifndef HAVE_LIBXML
    return p->encoding;
#else
    return p->parser->encoding;
#endif
}
/* The REC-xml-names "NCName" rule requires the first character to be
 * a Letter or '_', which excludes "Digit | '.' | '-' | CombiningChar |
 * Extender"; the XML parser will not enforce this rule in a namespace
 * declaration since it treats the entire attribute name as a REC-xml
 * "Name" rule. It's too hard to check for all of CombiningChar | Digit
 * | Extender here, but the invalid_ncname_ch1 macro catches the ASCII
 * digits, '.' and '-', as well as the empty name. */
/* Return non-zero if 'ch' is an invalid start character for an NCName: */
#define invalid_ncname_ch1(ch) ((ch) == '\0' || strchr("-.0123456789", (ch)) != NULL)
/* Subversion repositories have been deployed which use property names
 * marshalled as NCNames including a colon character; these should
 * also be rejected but will be allowed for the time being.  Hence
 * only the first character of 'xn' is checked. */
#define invalid_ncname(xn) (invalid_ncname_ch1((xn)[0]))
/* Scan the attribute list 'atts' (NULL-terminated name/value pairs; the
 * list itself may be NULL) for namespace declarations and record them
 * on 'elm'.  An "xmlns" attribute sets the element's default namespace;
 * an "xmlns:prefix" attribute adds a prefix binding to the front of the
 * element's namespace list.  Returns zero on success.  If a prefix is
 * not acceptable as an NCName, or a prefix is bound to the empty URI,
 * an error message is written to p->error and -1 is returned. */
static int declare_nspaces(ne_xml_parser *p, struct element *elm,
                           const ne_xml_char **atts)
{
    const ne_xml_char **pair;

    if (atts == NULL)
        return 0;

    for (pair = atts; pair[0] != NULL; pair += 2) {
        const ne_xml_char *attname = pair[0];
        const ne_xml_char *attval = pair[1];
        const ne_xml_char *prefix;
        struct namespace *scope;

        if (strcmp(attname, "xmlns") == 0) {
            /* Default namespace declaration. */
            elm->default_ns = ne_strdup(attval);
            continue;
        }

        if (strncmp(attname, "xmlns:", 6) != 0)
            continue; /* an ordinary attribute */

        /* The prefix is whatever follows the "xmlns:" part. */
        prefix = attname + 6;

        if (invalid_ncname(prefix) || attval[0] == '\0') {
            ne_snprintf(p->error, ERR_SIZE,
                        ("XML parse error at line %d: invalid namespace "
                         "declaration"), ne_xml_currentline(p));
            return -1;
        }

        /* Push the new binding onto the front of the element's list. */
        scope = ne_calloc(sizeof(*scope));
        scope->name = ne_strdup(prefix);
        scope->uri = ne_strdup(attval);
        scope->next = elm->nspaces;
        elm->nspaces = scope;
    }

    return 0;
}
/* Split the qualified name 'qname' into namespace URI and local part,
 * storing a copy of the local part in elm->name and the URI in
 * elm->nspace.  A name without a colon takes the nearest default
 * namespace found on 'elm' or its ancestors; a prefixed name is
 * resolved against the prefixes in scope.  Returns zero on success; on
 * an empty prefix, an invalid local part, or an undeclared prefix, an
 * error is written to p->error and -1 is returned. */
static int expand_qname(ne_xml_parser *p, struct element *elm,
                        const ne_xml_char *qname)
{
    const ne_xml_char *colon = strchr(qname, ':');
    const char *uri;

    if (colon == NULL) {
        const struct element *scope;

        /* The walk always stops: the root element carries
         * default_ns="". */
        for (scope = elm; scope->default_ns == NULL; scope = scope->parent)
            ;

        elm->name = ne_strdup(qname);
        elm->nspace = scope->default_ns;
        return 0;
    }

    if (colon == qname || invalid_ncname(colon + 1)) {
        ne_snprintf(p->error, ERR_SIZE,
                    _("XML parse error at line %d: invalid element name"),
                    ne_xml_currentline(p));
        return -1;
    }

    uri = resolve_nspace(elm, qname, colon - qname);
    if (uri == NULL) {
        ne_snprintf(p->error, ERR_SIZE,
                    ("XML parse error at line %d: undeclared namespace prefix"),
                    ne_xml_currentline(p));
        return -1;
    }

    elm->name = ne_strdup(colon + 1);
    elm->nspace = uri;
    return 0;
}
/* Start-element callback from the XML parser.  Does nothing once the
 * parse has failed, and only counts depth while inside a pruned
 * branch.  Otherwise a new element is pushed onto the current branch,
 * its namespace declarations and qualified name are processed, and the
 * element is offered to each handler in turn (starting with the
 * parent's handler) until one does not decline it.  A positive result
 * becomes the element's state, a decline by every handler prunes the
 * branch, and any other result aborts the parse. */
static void start_element(void *userdata, const ne_xml_char *name,
                          const ne_xml_char **atts)
{
    ne_xml_parser *p = userdata;
    struct element *elm;
    struct handler *hand;
    int state;

    if (p->failure)
        return;

    if (p->prune) {
        p->prune++;
        return;
    }

    /* Push a fresh element onto the branch. */
    elm = ne_calloc(sizeof *elm);
    elm->parent = p->current;
    p->current = elm;

    if (declare_nspaces(p, elm, atts) || expand_qname(p, elm, name)) {
        p->failure = 1;
        return;
    }

    /* Offer the element to the handlers until one takes it or fails. */
    state = NE_XML_DECLINE;
    hand = elm->parent->handler;
    while (hand && state == NE_XML_DECLINE) {
        elm->handler = hand;
        state = hand->startelm_cb(hand->userdata, elm->parent->state,
                                  elm->nspace, elm->name, PASS_ATTS(atts));
        hand = hand->next;
    }

    NE_DEBUG(NE_DBG_XML, "XML: start-element (%d, {%s, %s}) => %d\n",
             elm->parent->state, elm->nspace, elm->name, state);

    if (state > 0) {
        elm->state = state;
    } else if (state == NE_XML_DECLINE) {
        /* Nobody wants this element: ignore everything beneath it. */
        p->prune++;
    } else {
        /* Negative result: a handler asked for the parse to abort. */
        p->failure = state;
    }
}
/* Release an element together with everything it owns: its name, each
 * prefix binding declared on it, and its default namespace if one was
 * declared.  elm->nspace is not freed because it only points into a
 * declaration owned by this element or an ancestor. */
static void destroy_element(struct element *elm)
{
    struct namespace *ns = elm->nspaces;

    ne_free(elm->name);

    /* Walk the list of prefix bindings, freeing each node. */
    while (ns != NULL) {
        struct namespace *following = ns->next;

        ne_free(ns->name);
        ne_free(ns->uri);
        ne_free(ns);
        ns = following;
    }

    if (elm->default_ns)
        ne_free(elm->default_ns);

    ne_free(elm);
}
/* cdata SAX callback */
static void char_data(void *userdata, const ne_xml_char *data, int len)
{
ne_xml_parser *p = userdata;
struct element *elm = p->current;
if (p->failure || p->prune) return;
if (elm->handler->cdata_cb) {
p->failure = elm->handler->cdata_cb(elm->handler->userdata, elm->state, data, len);
NE_DEBUG(NE_DBG_XML, "XML: char-data (%d) returns %d\n",
elm->state, p->failure);
}
}
/* Called with the end of an element */
static void end_element(void *userdata, const ne_xml_char *name)
{
ne_xml_parser *p = userdata;
struct element *elm = p->current;
if (p->failure) return;
if (p->prune) {
if (p->prune-- > 1) return;
} else if (elm->handler->endelm_cb) {
p->failure = elm->handler->endelm_cb(elm->handler->userdata, elm->state,
elm->nspace, elm->name);
if (p->failure) {
NE_DEBUG(NE_DBG_XML, "XML: end-element for %d failed with %d.\n",
elm->state, p->failure);
}
}
NE_DEBUG(NE_DBG_XML, "XML: end-element (%d, {%s, %s})\n",
elm->state, elm->nspace, elm->name);
/* move back up the tree */
p->current = elm->parent;
p->prune = 0;
destroy_element(elm);
}
#if defined(HAVE_EXPAT) && XML_MAJOR_VERSION > 1
/* expat entity-declaration callback: any entity declaration causes the
 * parser to be stopped (non-resumably).  All arguments other than the
 * userdata and the entity name are ignored. */
static void entity_declaration(void *userData, const XML_Char *entityName,
                               int is_parameter_entity, const XML_Char *value,
                               int value_length, const XML_Char *base,
                               const XML_Char *systemId, const XML_Char *publicId,
                               const XML_Char *notationName)
{
    ne_xml_parser *p = userData;

    NE_DEBUG(NE_DBG_XMLPARSE, "XML: entity declaration [%s]. Failing.\n",
             entityName);

    XML_StopParser(p->parser, XML_FALSE);
}
#elif defined(HAVE_EXPAT)
/* A noop default_handler.  Deliberately does nothing with its
 * arguments; it exists only so that it can be installed as expat's
 * default handler by ne_xml_create() on expat 1.x. */
static void default_handler(void *userData, const XML_Char *s, int len)
{
}
#endif
/* Look up the namespace URI bound to the first 'pfxlen' bytes of
 * 'prefix', searching the declarations on 'elm' and then on each
 * ancestor in turn, so that the innermost binding wins.  'prefix' need
 * not be NUL-terminated at 'pfxlen'.  Returns the URI, or NULL if the
 * prefix is not declared anywhere on the branch. */
static const char *resolve_nspace(const struct element *elm,
                                  const char *prefix, size_t pfxlen)
{
    const struct element *scope = elm;

    while (scope != NULL) {
        const struct namespace *decl = scope->nspaces;

        /* Check each prefix declared on this element. */
        while (decl != NULL) {
            if (strlen(decl->name) == pfxlen &&
                memcmp(decl->name, prefix, pfxlen) == 0)
                return decl->uri;
            decl = decl->next;
        }

        /* Not found here; try the enclosing element. */
        scope = scope->parent;
    }

    return NULL;
}
/* Resolve a namespace prefix in the context of the element currently
 * being parsed.  With a non-NULL 'prefix', the first 'length' bytes of
 * it are looked up and the bound URI (or NULL if undeclared) is
 * returned.  With a NULL 'prefix', the default namespace in effect is
 * returned; that lookup always succeeds because the root element
 * carries the default namespace "". */
const char *ne_xml_resolve_nspace(ne_xml_parser *parser,
                                  const char *prefix, size_t length)
{
    const struct element *scope;

    if (prefix != NULL)
        return resolve_nspace(parser->current, prefix, length);

    for (scope = parser->current; scope->default_ns == NULL;
         scope = scope->parent)
        ;

    return scope->default_ns;
}
ne_xml_parser *ne_xml_create(void)
{
ne_xml_parser *p = ne_calloc(sizeof *p);
/* Placeholder for the root element */
p->current = p->root = ne_calloc(sizeof *p->root);
p->root->default_ns = "";
p->root->state = 0;
strcpy(p->error, _("Unknown error"));
#ifdef HAVE_EXPAT
p->parser = XML_ParserCreate(NULL);
if (p->parser == NULL) {
abort();
}
XML_SetElementHandler(p->parser, start_element, end_element);
XML_SetCharacterDataHandler(p->parser, char_data);
XML_SetUserData(p->parser, (void *) p);
XML_SetXmlDeclHandler(p->parser, decl_handler);
/* Prevent the "billion laughs" attack against expat by disabling
* internal entity expansion. With 2.x, forcibly stop the parser
* if an entity is declared - this is safer and a more obvious
* failure mode. With older versions, installing a noop
* DefaultHandler means that internal entities will be expanded as
* the empty string, which is also sufficient to prevent the
* attack. */
#if XML_MAJOR_VERSION > 1
XML_SetEntityDeclHandler(p->parser, entity_declaration);
#else
XML_SetDefaultHandler(p->parser, default_handler);
#endif
#else /* HAVE_LIBXML */
p->parser = xmlCreatePushParserCtxt(&sax_handler,
(void *)p, NULL, 0, NULL);
if (p->parser == NULL) {
abort();
}
#if LIBXML_VERSION < 20602
p->parser->replaceEntities = 1;
#else
/* Enable expansion of entities, and disable network access. */
xmlCtxtUseOptions(p->parser, XML_PARSE_NOENT | XML_PARSE_NONET);
#endif
#endif /* HAVE_LIBXML || HAVE_EXPAT */
return p;
}
/* Register a new handler (a set of start-element, cdata and
 * end-element callbacks sharing 'userdata') with the parser.  The new
 * handler is appended after the most recently registered one; the very
 * first handler registered is also recorded as the root element's
 * handler, which is where the search for a handler begins for
 * top-level elements. */
void ne_xml_push_handler(ne_xml_parser *p,
                         ne_xml_startelm_cb *startelm_cb,
                         ne_xml_cdata_cb *cdata_cb,
                         ne_xml_endelm_cb *endelm_cb,
                         void *userdata)
{
    struct handler *hand = ne_calloc(sizeof *hand);

    hand->userdata = userdata;
    hand->startelm_cb = startelm_cb;
    hand->endelm_cb = endelm_cb;
    hand->cdata_cb = cdata_cb;

    if (p->top_handlers != NULL) {
        /* Chain after the previously registered handler. */
        p->top_handlers->next = hand;
    } else {
        /* First registration: this handler is also the base. */
        p->root->handler = hand;
    }

    p->top_handlers = hand;
}
int ne_xml_parse_v(void *userdata, const char *block, size_t len)
{
ne_xml_parser *p = userdata;
return ne_xml_parse(p, (const ne_xml_char *)block, len);
}
#define BOM_UTF8 "\xEF\xBB\xBF" /* UTF-8 BOM */

/* Feed the next 'len' bytes at 'block' to the parser; a call with
 * len == 0 signals the end of the document.
 *
 * Returns zero if the parse may continue and non-zero once it has
 * failed; after a failure every further call returns the same failure
 * value without parsing anything.  When the failure was detected here
 * (rather than by a handler callback), a message is stored in
 * p->error.
 *
 * When NEED_BOM_HANDLING is defined, a leading UTF-8 byte order mark is
 * stripped before the data reaches the underlying parser; the mark may
 * be split across several calls. */
int ne_xml_parse(ne_xml_parser *p, const char *block, size_t len)
{
    int ret, flag;

    /* duck out if it's broken */
    if (p->failure) {
        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Failed; ignoring %" NE_FMT_SIZE_T
                 " bytes.\n", len);
        return p->failure;
    }

    if (len == 0) {
        /* End of document: pass a non-zero "final" flag with an empty
         * block. */
        flag = -1;
        block = "";
        NE_DEBUG(NE_DBG_XMLPARSE, "XML: End of document.\n");
    } else {
        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parsing %" NE_FMT_SIZE_T " bytes.\n", len);
        flag = 0;
    }

#ifdef NEED_BOM_HANDLING
    if (p->bom_pos < 3) {
        NE_DEBUG(NE_DBG_XMLPARSE, "Checking for UTF-8 BOM.\n");
        /* Consume as many bytes as continue to match the BOM. */
        while (len > 0 && p->bom_pos < 3 &&
               block[0] == BOM_UTF8[p->bom_pos]) {
            block++;
            len--;
            p->bom_pos++;
        }
        /* The whole block was BOM bytes; wait for more input.  This
         * shortcut must not be taken for the end-of-document call:
         * otherwise an empty document, or one ending inside the BOM,
         * would never be finalised and would be reported as a
         * successful parse. */
        if (len == 0 && flag == 0)
            return 0;
        if (p->bom_pos == 0) {
            p->bom_pos = 3; /* no BOM */
        } else if (p->bom_pos > 0 && p->bom_pos < 3) {
            /* Input diverged (or ended) part-way through the BOM. */
            ne_strnzcpy(p->error, _("Invalid Byte Order Mark"), sizeof p->error);
            return p->failure = 1;
        }
    }
#endif

    /* Note, don't write a parser error if p->failure, since an error
     * will already have been written in that case. */
#ifdef HAVE_EXPAT
    ret = XML_Parse(p->parser, block, len, flag);
    NE_DEBUG(NE_DBG_XMLPARSE, "XML: XML_Parse returned %d\n", ret);
    if (ret == 0 && p->failure == 0) {
        ne_snprintf(p->error, ERR_SIZE,
                    "XML parse error at line %" NE_FMT_XML_SIZE ": %s",
                    XML_GetCurrentLineNumber(p->parser),
                    XML_ErrorString(XML_GetErrorCode(p->parser)));
        p->failure = 1;
        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parse error: %s\n", p->error);
    }
#else
    ret = xmlParseChunk(p->parser, block, len, flag);
    NE_DEBUG(NE_DBG_XMLPARSE, "XML: xmlParseChunk returned %d\n", ret);
    /* Parse errors are normally caught by the sax_error() callback,
     * which sets p->failure; this catches any error which libxml2
     * recorded without invoking that callback. */
    if (p->parser->errNo && p->failure == 0) {
        ne_snprintf(p->error, ERR_SIZE, "XML parse error at line %d",
                    ne_xml_currentline(p));
        p->failure = 1;
        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parse error: %s\n", p->error);
    }
#endif

    return p->failure;
}
/* Return the parser's failure status: zero whilst the parse is still
 * good, otherwise the non-zero value recorded when it failed (1 for
 * errors detected by the parser itself, or the value returned by the
 * handler callback which aborted the parse). */
int ne_xml_failed(ne_xml_parser *p)
{
return p->failure;
}
/* Destroy a parser object: frees every registered handler, any
 * elements still open on the current branch (as happens when a parse
 * is abandoned part-way through), the placeholder root element, the
 * underlying expat/libxml2 parser, and finally the object itself. */
void ne_xml_destroy(ne_xml_parser *p)
{
    struct handler *hand = p->root->handler;
    struct element *elm = p->current;

    /* The root element holds the base of the handler list. */
    while (hand != NULL) {
        struct handler *following = hand->next;

        ne_free(hand);
        hand = following;
    }

    /* Unwind whatever is left of the branch, up to the root. */
    while (elm != p->root) {
        struct element *up = elm->parent;

        destroy_element(elm);
        elm = up;
    }

    /* The root's default namespace is a string literal, so only the
     * structure itself is freed. */
    ne_free(p->root);

#ifdef HAVE_EXPAT
    XML_ParserFree(p->parser);
    if (p->encoding)
        ne_free(p->encoding);
#else
    xmlFreeParserCtxt(p->parser);
#endif

    ne_free(p);
}
/* Replace the parser's error string with 'msg'.  The text is written
 * into the fixed-size p->error buffer through ne_snprintf() with a
 * limit of ERR_SIZE, and 'msg' is passed as a "%s" argument so that any
 * '%' characters it contains are not interpreted as format
 * directives. */
void ne_xml_set_error(ne_xml_parser *p, const char *msg)
{
ne_snprintf(p->error, ERR_SIZE, "%s", msg);
}
#ifdef HAVE_LIBXML
/* libxml2 error callback (printf-style).  The message is formatted
 * into a local buffer; if the parse has not already failed, it is
 * recorded in p->error together with the current input line and the
 * parse is marked as failed.  Later errors leave the first recorded
 * message untouched. */
static void sax_error(void *ctx, const char *msg, ...)
{
    ne_xml_parser *p = ctx;
    char text[1024];
    va_list args;

    va_start(args, msg);
    ne_vsnprintf(text, sizeof text, msg, args);
    va_end(args);

    if (p->failure != 0)
        return;

    ne_snprintf(p->error, ERR_SIZE,
                _("XML parse error at line %d: %s"),
                p->parser->input->line, text);
    p->failure = 1;
}
#endif
/* Return the parser's current error string.  The pointer refers to
 * the buffer embedded in the parser object, so it is only valid until
 * the parser is destroyed, and its contents change whenever a new
 * error is recorded. */
const char *ne_xml_get_error(ne_xml_parser *p)
{
return p->error;
}
/* Find an attribute value in 'attrs', a NULL-terminated array of
 * name/value pairs.
 *
 * With a NULL 'nspace', only attributes without a prefix are
 * considered and the whole attribute name must equal 'name'.  With a
 * non-NULL 'nspace', only prefixed attributes are considered: the local
 * part must equal 'name' and the prefix, resolved against the element
 * currently being parsed, must map to 'nspace'.
 *
 * Returns the value of the first matching attribute, or NULL if there
 * is none. */
const char *
ne_xml_get_attr(ne_xml_parser *p, const char **attrs,
                const char *nspace, const char *name)
{
    const char **pair;

    for (pair = attrs; pair[0] != NULL; pair += 2) {
        const char *qname = pair[0];
        const char *colon = strchr(qname, ':');

        if (nspace == NULL) {
            /* Looking for an attribute in no namespace. */
            if (colon == NULL && strcmp(qname, name) == 0)
                return pair[1];
        } else if (colon != NULL && strcmp(colon + 1, name) == 0) {
            /* Local part matches; now check what the prefix maps to. */
            const char *uri = resolve_nspace(p->current, qname,
                                             colon - qname);

            if (uri != NULL && strcmp(uri, nspace) == 0)
                return pair[1];
        }
    }

    return NULL;
}
int ne_xml_mapid(const struct ne_xml_idmap map[], size_t maplen,
const char *nspace, const char *name)
{
size_t n;
for (n = 0; n < maplen; n++)
if (strcmp(name, map[n].name) == 0 &&
strcmp(nspace, map[n].nspace) == 0)
return map[n].id;
return 0;
}