Blame python/drv_libxml2.py

Packit 423ecb
# -*- coding: iso-8859-1 -*-
Packit 423ecb
""" A SAX2 driver for libxml2, on top of it's XmlReader API
Packit 423ecb
Packit 423ecb
USAGE
Packit 423ecb
    # put this file (drv_libxml2.py) in PYTHONPATH
Packit 423ecb
    import xml.sax
Packit 423ecb
    reader = xml.sax.make_parser(["drv_libxml2"])
Packit 423ecb
    # ...and the rest is standard python sax.
Packit 423ecb
Packit 423ecb
CAVEATS
Packit 423ecb
    - Lexical handlers are supported, except for start/endEntity
Packit 423ecb
      (waiting for XmlReader.ResolveEntity) and start/endDTD
Packit 423ecb
    - Error callbacks are not exactly synchronous, they tend
Packit 423ecb
      to be invoked before the corresponding content callback,
Packit 423ecb
      because the underlying reader interface parses
Packit 423ecb
      data by chunks of 512 bytes
Packit 423ecb
    
Packit 423ecb
TODO
Packit 423ecb
    - search for TODO
Packit 423ecb
    - some ErrorHandler events (warning)
Packit 423ecb
    - some ContentHandler events (setDocumentLocator, skippedEntity)
Packit 423ecb
    - EntityResolver (using libxml2.?)
Packit 423ecb
    - DTDHandler (if/when libxml2 exposes such node types)
Packit 423ecb
    - DeclHandler (if/when libxml2 exposes such node types)
Packit 423ecb
    - property_xml_string?
Packit 423ecb
    - feature_string_interning?
Packit 423ecb
    - Incremental parser
Packit 423ecb
    - additional performance tuning:
Packit 423ecb
      - one might cache callbacks to avoid some name lookups
Packit 423ecb
      - one might implement a smarter way to pass attributes to startElement
Packit 423ecb
        (some kind of lazy evaluation?)
Packit 423ecb
      - there might be room for improvement in start/endPrefixMapping
Packit 423ecb
      - other?
Packit 423ecb
Packit 423ecb
"""
Packit 423ecb
Packit 423ecb
__author__  = "Stéphane Bidoul <sbi@skynet.be>"
Packit 423ecb
__version__ = "0.3"
Packit 423ecb
Packit 423ecb
import sys
Packit 423ecb
import codecs
Packit 423ecb
Packit 423ecb
if sys.version_info[0] < 3:
Packit 423ecb
    __author__  = codecs.unicode_escape_decode(__author__)[0]
Packit 423ecb
Packit 423ecb
    StringTypes = (str, unicode)
Packit 423ecb
    # libxml2 returns strings as UTF8
Packit 423ecb
    _decoder = codecs.lookup("utf8")[1]
Packit 423ecb
    def _d(s):
Packit 423ecb
        if s is None:
Packit 423ecb
            return s
Packit 423ecb
        else:
Packit 423ecb
            return _decoder(s)[0]
Packit 423ecb
else:
Packit 423ecb
    StringTypes = str
Packit 423ecb
    # s is Unicode `str` already
Packit 423ecb
    def _d(s):
Packit 423ecb
        return s
Packit 423ecb
Packit 423ecb
from xml.sax._exceptions import *
Packit 423ecb
from xml.sax import xmlreader, saxutils
Packit 423ecb
from xml.sax.handler import \
Packit 423ecb
     feature_namespaces, \
Packit 423ecb
     feature_namespace_prefixes, \
Packit 423ecb
     feature_string_interning, \
Packit 423ecb
     feature_validation, \
Packit 423ecb
     feature_external_ges, \
Packit 423ecb
     feature_external_pes, \
Packit 423ecb
     property_lexical_handler, \
Packit 423ecb
     property_declaration_handler, \
Packit 423ecb
     property_dom_node, \
Packit 423ecb
     property_xml_string
Packit 423ecb
Packit 423ecb
try:
Packit 423ecb
    import libxml2
Packit 423ecb
except ImportError:
Packit 423ecb
    raise SAXReaderNotAvailable("libxml2 not available: " \
Packit 423ecb
                                "import error was: %s" % sys.exc_info()[1])
Packit 423ecb
Packit 423ecb
class Locator(xmlreader.Locator):
    """Expose a libxml2 text-reader locator through the SAX Locator API.

    The wrapped object only needs to provide ``LineNumber()`` and
    ``BaseURI()``; the remaining SAX accessors return fixed values.
    """

    def __init__(self, locator):
        # Keep a reference to the underlying libxml2 locator object.
        self.__wrapped = locator

    def getColumnNumber(self):
        """Return the column number of the current event (always -1)."""
        return -1

    def getLineNumber(self):
        """Return the line number reported by the wrapped locator."""
        wrapped = self.__wrapped
        return wrapped.LineNumber()

    def getPublicId(self):
        """Return the public identifier of the current event (always None)."""
        return None

    def getSystemId(self):
        """Return the base URI of the wrapped locator as the system id."""
        wrapped = self.__wrapped
        return wrapped.BaseURI()
Packit 423ecb
Packit 423ecb
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives a libxml2 text reader.

    ``parse`` pulls nodes from a libxml2 text reader one at a time and
    translates each node type into the matching ContentHandler /
    LexicalHandler callback.  Errors signalled by libxml2 are collected
    by ``_errorHandler`` and forwarded to the SAX ErrorHandler after each
    ``Read()`` call.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Record one error reported by the libxml2 reader.

        Registered through ``reader.SetErrorHandler`` in ``parse``.  The
        error is only queued here as a ``(severity, SAXParseException)``
        pair; ``_reportErrors`` delivers it to the SAX ErrorHandler.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX ErrorHandler.

        fatal -- true when the parse is about to stop; the last queued
                 non-warning entry is then reported through fatalError().

        The queue is always emptied, even when an error handler raises,
        so that stale errors never leak into a later read or parse.
        """
        errors = self.__errors
        if errors is None:
            return
        try:
            for severity, exception in errors:
                if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                                libxml2.PARSER_SEVERITY_WARNING):
                    self._err_handler.warning(exception)
                else:
                    # when fatal is set, the parse will stop;
                    # we consider that the last error reported
                    # is the fatal one.
                    if fatal and exception is errors[-1][1]:
                        self._err_handler.fatalError(exception)
                    else:
                        self._err_handler.error(exception)
        finally:
            self.__errors = None

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        source -- a string (passed to libxml2 as a file name / URL) or
                  anything accepted by saxutils.prepare_input_source().

        Raises SAXException for an unexpected node type; any exception
        raised by a handler propagates to the caller.  The libxml2 reader
        is closed and the parsing flag cleared in all cases.
        """
        self.__parsing = 1
        # drop anything left over from a previously aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list per open element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:prefix" declare a
                            # namespace; other names that merely start
                            # with "xmlns" are ordinary attributes.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # entity events belong to the lexical handler,
                        # the reader itself has no startEntity method
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says start/endEntity
                    # are "waiting for XmlReader.ResolveEntity" -- confirm
                    # that the libxml2 binding actually provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # Close the reader even when a handler raised (the default
            # ErrorHandler raises from error() and fatalError()).
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Always raise SAXNotSupportedException (DTDHandler unsupported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raise SAXNotSupportedException (EntityResolver unsupported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to switch off external general entities, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler is
        recognized but raises SAXNotSupportedException; any other name
        raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
Packit 423ecb
Packit 423ecb
def create_parser():
    """Build and return a new LibXml2Reader instance.

    This is the module-level factory looked up by
    ``xml.sax.make_parser(["drv_libxml2"])``.
    """
    parser = LibXml2Reader()
    return parser
Packit 423ecb