|
Packit |
423ecb |
# -*- coding: iso-8859-1 -*-
|
|
Packit |
423ecb |
""" A SAX2 driver for libxml2, on top of its XmlReader API
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
USAGE
|
|
Packit |
423ecb |
# put this file (drv_libxml2.py) in PYTHONPATH
|
|
Packit |
423ecb |
import xml.sax
|
|
Packit |
423ecb |
reader = xml.sax.make_parser(["drv_libxml2"])
|
|
Packit |
423ecb |
# ...and the rest is standard python sax.
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
CAVEATS
|
|
Packit |
423ecb |
- Lexical handlers are supported, except for start/endEntity
|
|
Packit |
423ecb |
(waiting for XmlReader.ResolveEntity) and start/endDTD
|
|
Packit |
423ecb |
- Error callbacks are not exactly synchronous, they tend
|
|
Packit |
423ecb |
to be invoked before the corresponding content callback,
|
|
Packit |
423ecb |
because the underlying reader interface parses
|
|
Packit |
423ecb |
data by chunks of 512 bytes
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
TODO
|
|
Packit |
423ecb |
- search for TODO
|
|
Packit |
423ecb |
- some ErrorHandler events (warning)
|
|
Packit |
423ecb |
- some ContentHandler events (setDocumentLocator, skippedEntity)
|
|
Packit |
423ecb |
- EntityResolver (using libxml2.?)
|
|
Packit |
423ecb |
- DTDHandler (if/when libxml2 exposes such node types)
|
|
Packit |
423ecb |
- DeclHandler (if/when libxml2 exposes such node types)
|
|
Packit |
423ecb |
- property_xml_string?
|
|
Packit |
423ecb |
- feature_string_interning?
|
|
Packit |
423ecb |
- Incremental parser
|
|
Packit |
423ecb |
- additional performance tuning:
|
|
Packit |
423ecb |
- one might cache callbacks to avoid some name lookups
|
|
Packit |
423ecb |
- one might implement a smarter way to pass attributes to startElement
|
|
Packit |
423ecb |
(some kind of lazy evaluation?)
|
|
Packit |
423ecb |
- there might be room for improvement in start/endPrefixMapping
|
|
Packit |
423ecb |
- other?
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
"""
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
__author__ = "Stéphane Bidoul <sbi@skynet.be>"
|
|
Packit |
423ecb |
__version__ = "0.3"
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
import sys
|
|
Packit |
423ecb |
import codecs
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
# String-handling shim: defines StringTypes (the types parse() treats as a
# file name) and _d() (converts a value returned by libxml2 to text).
if sys.version_info[0] >= 3:
    StringTypes = str

    # s is Unicode `str` already
    def _d(s):
        """Return s unchanged."""
        return s
else:
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)

    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode the UTF-8 byte string s to unicode; None passes through."""
        if s is not None:
            return _decoder(s)[0]
        return s
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
from xml.sax._exceptions import *
|
|
Packit |
423ecb |
from xml.sax import xmlreader, saxutils
|
|
Packit |
423ecb |
from xml.sax.handler import \
|
|
Packit |
423ecb |
feature_namespaces, \
|
|
Packit |
423ecb |
feature_namespace_prefixes, \
|
|
Packit |
423ecb |
feature_string_interning, \
|
|
Packit |
423ecb |
feature_validation, \
|
|
Packit |
423ecb |
feature_external_ges, \
|
|
Packit |
423ecb |
feature_external_pes, \
|
|
Packit |
423ecb |
property_lexical_handler, \
|
|
Packit |
423ecb |
property_declaration_handler, \
|
|
Packit |
423ecb |
property_dom_node, \
|
|
Packit |
423ecb |
property_xml_string
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
try:
|
|
Packit |
423ecb |
import libxml2
|
|
Packit |
423ecb |
except ImportError:
|
|
Packit |
423ecb |
raise SAXReaderNotAvailable("libxml2 not available: " \
|
|
Packit |
423ecb |
"import error was: %s" % sys.exc_info()[1])
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator

    Wraps the locator object handed to the reader's error callback and
    exposes it through the xml.sax Locator interface.
    """

    def __init__(self, locator):
        # Wrapped locator; must provide LineNumber() and BaseURI().
        self.__locator = locator

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: no column is read from the wrapped locator.
        """
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getPublicId(self):
        """Return the public identifier for the current event.

        Always None: no public id is read from the wrapped locator.
        """
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the wrapped locator's BaseURI().
        """
        wrapped = self.__locator
        return wrapped.BaseURI()
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() reads the document node by node and translates each node into
    the matching ContentHandler / lexical-handler / ErrorHandler call.
    Features and properties are stored on the instance and applied to the
    libxml2 reader when parse() starts.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    @staticmethod
    def _nsDeclPrefix(qname):
        """Classify an attribute qname as a namespace declaration.

        Returns a tuple (isDeclaration, prefix):
          - "xmlns"        -> (True, None)     default namespace
          - "xmlns:foo"    -> (True, "foo")    prefixed namespace
          - anything else  -> (False, None)

        Only the exact name "xmlns" or the "xmlns:" prefix counts, so an
        ordinary attribute whose name merely begins with "xmlns" (for
        example "xmlnsfoo") is not mistaken for a declaration.
        """
        if qname == "xmlns":
            return (True, None)
        if qname.startswith("xmlns:"):
            return (True, qname[6:])
        return (False, None)

    def _errorHandler(self,arg,msg,severity,locator):
        """Error callback registered on the libxml2 reader.

        Messages are only accumulated here; parse() forwards them to the
        SAX ErrorHandler after the Read() call that produced them returns.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg,None,
                                                Locator(locator))))

    def _reportErrors(self,fatal):
        """Forward accumulated errors to the ErrorHandler and clear them.

        fatal -- true when the parse is about to stop; the last
                 accumulated non-warning is then reported as fatalError.
        """
        for severity,exception in self.__errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is self.__errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)
        self.__errors = None

    def parse(self, source):
        """Parse source and emit SAX events to the registered handlers.

        source -- a string (passed to libxml2 as a file name) or anything
                  accepted by saxutils.prepare_input_source.

        Raises SAXException when the reader yields a node type this
        driver does not know about.
        """
        self.__parsing = 1
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # named inputBuffer (not "input") to avoid shadowing
                # the builtin
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(\
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),\
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            isDecl, newPrefix = self._nsDeclPrefix(qname)
                            if isDecl:
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(\
                                    newPrefix,value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS( \
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it (and its mappings) right away
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement( \
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS( \
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # The event belongs to the lexical handler, not to
                        # this reader (which has no startEntity method).
                        # The callback is optional on the handler.
                        startEntity = getattr(self.__lex_handler,
                                              "startEntity", None)
                        if startEntity is not None:
                            startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are "waiting for XmlReader.ResolveEntity" -- confirm
                    # the libxml2 reader actually provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        endEntity = getattr(self.__lex_handler,
                                            "endEntity", None)
                        if endEntity is not None:
                            endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction( \
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only signalled when the input was read to its
            # end (r == 0), not after a fatal read error
            if r == 0:
                self._cont_handler.endDocument()
            reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature name.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature name to state.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to switch feature_external_ges off; raises
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of property name.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set property name to value.

        Only the lexical handler can be set; the declaration handler
        raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: remove if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
            # intentionally unreachable until the raise above is removed
            self.__decl_handler = value
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
def create_parser():
    """Return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
|
|
Packit |
423ecb |
|