Blame python/drv_libxml2.py

Packit 21b7a2
# -*- coding: iso-8859-1 -*-
Packit 21b7a2
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data in chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""
Packit 21b7a2
Packit 21b7a2
__author__  = "Stéphane Bidoul <sbi@skynet.be>"
Packit 21b7a2
__version__ = "0.3"
Packit 21b7a2
Packit 21b7a2
import sys
Packit 21b7a2
import codecs
Packit 21b7a2
Packit 21b7a2
# StringTypes lists the types that parse() treats as a file name.
if sys.version_info[0] < 3:
    # Python 2: turn the byte-string author name into a unicode object.
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
else:
    StringTypes = str
Packit 21b7a2
Packit 21b7a2
from xml.sax._exceptions import *
Packit 21b7a2
from xml.sax import xmlreader, saxutils
Packit 21b7a2
from xml.sax.handler import \
Packit 21b7a2
     feature_namespaces, \
Packit 21b7a2
     feature_namespace_prefixes, \
Packit 21b7a2
     feature_string_interning, \
Packit 21b7a2
     feature_validation, \
Packit 21b7a2
     feature_external_ges, \
Packit 21b7a2
     feature_external_pes, \
Packit 21b7a2
     property_lexical_handler, \
Packit 21b7a2
     property_declaration_handler, \
Packit 21b7a2
     property_dom_node, \
Packit 21b7a2
     property_xml_string
Packit 21b7a2
Packit 21b7a2
# libxml2 returns strings as UTF8
_decoder = codecs.lookup("utf8")[1]
# Text type of the running interpreter, as produced by the decoder itself
# (unicode on Python 2, str on Python 3).
_text_type = type(_decoder(b"")[0])

def _d(s):
    """Return s as text, decoding it from UTF-8 when it is a byte string.

    None is returned unchanged so that missing values (no namespace URI,
    no value, ...) stay None.  A value that is already text is returned
    unchanged as well; passing it to the UTF-8 decoder would raise a
    TypeError.
    """
    if s is None or isinstance(s, _text_type):
        return s
    return _decoder(s)[0]
Packit 21b7a2
Packit 21b7a2
try:
Packit 21b7a2
    import libxml2
Packit 21b7a2
except ImportError:
Packit 21b7a2
    raise SAXReaderNotAvailable("libxml2 not available: " \
Packit 21b7a2
                                "import error was: %s" % sys.exc_info()[1])
Packit 21b7a2
Packit 21b7a2
class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator

    Line number and system id are read from the wrapped locator on each
    call; column number and public id are fixed values.
    """

    def __init__(self,locator):
        # the wrapped object must provide LineNumber() and BaseURI()
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number; always -1, as no column is looked up."
        return -1

    def getLineNumber(self):
        "Return the line number reported by the wrapped locator."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier; always None."
        return None

    def getSystemId(self):
        "Return the base URI of the wrapped locator as the system identifier."
        return self.__locator.BaseURI()
Packit 21b7a2
Packit 21b7a2
class LibXml2Reader(xmlreader.XMLReader):
Packit 21b7a2
Packit 21b7a2
    def __init__(self):
Packit 21b7a2
        xmlreader.XMLReader.__init__(self)
Packit 21b7a2
        # features
Packit 21b7a2
        self.__ns = 0
Packit 21b7a2
        self.__nspfx = 0
Packit 21b7a2
        self.__validate = 0
Packit 21b7a2
        self.__extparams = 1
Packit 21b7a2
        # parsing flag
Packit 21b7a2
        self.__parsing = 0
Packit 21b7a2
        # additional handlers
Packit 21b7a2
        self.__lex_handler = None
Packit 21b7a2
        self.__decl_handler = None
Packit 21b7a2
        # error messages accumulator
Packit 21b7a2
        self.__errors = None
Packit 21b7a2
Packit 21b7a2
    def _errorHandler(self,arg,msg,severity,locator):
        """Error callback registered on the reader by parse().

        Nothing is reported from here: the error is wrapped in a
        SAXParseException and queued, together with its severity, until
        _reportErrors() dispatches it.  arg is the user data given to
        SetErrorHandler (None) and is not used.
        """
        if self.__errors is None:
            self.__errors = []
        exception = SAXParseException(msg,None,Locator(locator))
        self.__errors.append((severity,exception))
Packit 21b7a2
Packit 21b7a2
    def _reportErrors(self,fatal):
Packit 21b7a2
        for severity,exception in self.__errors:
Packit 21b7a2
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
Packit 21b7a2
                            libxml2.PARSER_SEVERITY_WARNING):
Packit 21b7a2
                self._err_handler.warning(exception)
Packit 21b7a2
            else:
Packit 21b7a2
                # when fatal is set, the parse will stop;
Packit 21b7a2
                # we consider that the last error reported
Packit 21b7a2
                # is the fatal one.
Packit 21b7a2
                if fatal and exception is self.__errors[-1][1]:
Packit 21b7a2
                    self._err_handler.fatalError(exception)
Packit 21b7a2
                else:
Packit 21b7a2
                    self._err_handler.error(exception)
Packit 21b7a2
        self.__errors = None
Packit 21b7a2
Packit 21b7a2
    def parse(self, source):
        """Parse an XML document, firing SAX events on the registered handlers.

        source is either a string, which is handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source(), whose byte stream is then read.

        Errors queued by _errorHandler are forwarded to the error handler
        after each reader.Read() call.  endDocument() is only sent when the
        reader reached the end of the input; after a failed Read() a fatal
        error is reported instead.  The reader is closed and the parsing
        flag cleared on every exit path, including when a handler raises.

        Raises SAXException on a node type this driver does not know.
        """
        self.__parsing = 1
        # forget errors left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list of prefixes
            # per currently open, non-empty element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if not self.__errors is None:
                        self._reportErrors(0)
                elif r == 0:
                    if not self.__errors is None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if not self.__errors is None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(\
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),\
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; an ordinary attribute whose name
                            # merely starts with "xmlns" does not
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(\
                                    newPrefix,value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS( \
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it and its prefix mappings right away
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement( \
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS( \
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if not self.__lex_handler is None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if not self.__lex_handler is None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if not self.__lex_handler is None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says start/endEntity
                    # are waiting for XmlReader.ResolveEntity - confirm the
                    # reader in use actually provides this method
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if not self.__lex_handler is None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction( \
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if not self.__lex_handler is None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # release the reader even when a handler raised, and always
            # clear the parsing flag so that setFeature() works again
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0
Packit 21b7a2
Packit 21b7a2
    def setDTDHandler(self, handler):
Packit 21b7a2
        # TODO (when supported, the inherited method works just fine)
Packit 21b7a2
        raise SAXNotSupportedException("DTDHandler not supported")
Packit 21b7a2
Packit 21b7a2
    def setEntityResolver(self, resolver):
Packit 21b7a2
        # TODO (when supported, the inherited method works just fine)
Packit 21b7a2
        raise SAXNotSupportedException("EntityResolver not supported")
Packit 21b7a2
Packit 21b7a2
    def getFeature(self, name):
Packit 21b7a2
        if name == feature_namespaces:
Packit 21b7a2
            return self.__ns
Packit 21b7a2
        elif name == feature_namespace_prefixes:
Packit 21b7a2
            return self.__nspfx
Packit 21b7a2
        elif name == feature_validation:
Packit 21b7a2
            return self.__validate
Packit 21b7a2
        elif name == feature_external_ges:
Packit 21b7a2
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
Packit 21b7a2
        elif name == feature_external_pes:
Packit 21b7a2
            return self.__extparams
Packit 21b7a2
        else:
Packit 21b7a2
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
Packit 21b7a2
                                            name)
Packit 21b7a2
Packit 21b7a2
    def setFeature(self, name, state):
Packit 21b7a2
        if self.__parsing:
Packit 21b7a2
            raise SAXNotSupportedException("Cannot set feature %s " \
Packit 21b7a2
                                           "while parsing" % name)
Packit 21b7a2
        if name == feature_namespaces:
Packit 21b7a2
            self.__ns = state
Packit 21b7a2
        elif name == feature_namespace_prefixes:
Packit 21b7a2
            self.__nspfx = state
Packit 21b7a2
        elif name == feature_validation:
Packit 21b7a2
            self.__validate = state
Packit 21b7a2
        elif name == feature_external_ges:
Packit 21b7a2
            if state == 0:
Packit 21b7a2
                # TODO (does that relate to PARSER_LOADDTD)?
Packit 21b7a2
                raise SAXNotSupportedException("Feature '%s' not supported" % \
Packit 21b7a2
                                               name)
Packit 21b7a2
        elif name == feature_external_pes:
Packit 21b7a2
            self.__extparams = state
Packit 21b7a2
        else:
Packit 21b7a2
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
Packit 21b7a2
                                            name)
Packit 21b7a2
Packit 21b7a2
    def getProperty(self, name):
Packit 21b7a2
        if name == property_lexical_handler:
Packit 21b7a2
            return self.__lex_handler
Packit 21b7a2
        elif name == property_declaration_handler:
Packit 21b7a2
            return self.__decl_handler
Packit 21b7a2
        else:
Packit 21b7a2
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
Packit 21b7a2
                                            name)
Packit 21b7a2
Packit 21b7a2
    def setProperty(self, name, value):     
Packit 21b7a2
        if name == property_lexical_handler:
Packit 21b7a2
            self.__lex_handler = value
Packit 21b7a2
        elif name == property_declaration_handler:
Packit 21b7a2
            # TODO: remove if/when libxml2 supports dtd events
Packit 21b7a2
            raise SAXNotSupportedException("Property '%s' not supported" % \
Packit 21b7a2
                                           name)
Packit 21b7a2
            self.__decl_handler = value
Packit 21b7a2
        else:
Packit 21b7a2
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
Packit 21b7a2
                                            name)
Packit 21b7a2
Packit 21b7a2
def create_parser():
    """Return a new LibXml2Reader.

    This is the factory that xml.sax.make_parser() looks up when the module
    is named in its parser list (see USAGE in the module docstring).
    """
    return LibXml2Reader()
Packit 21b7a2