Blame python/drv_libxml2.py

Packit Service a31ea6
# -*- coding: iso-8859-1 -*-
Packit Service a31ea6
""" A SAX2 driver for libxml2, on top of its XmlReader API
Packit Service a31ea6
Packit Service a31ea6
USAGE
Packit Service a31ea6
    # put this file (drv_libxml2.py) in PYTHONPATH
Packit Service a31ea6
    import xml.sax
Packit Service a31ea6
    reader = xml.sax.make_parser(["drv_libxml2"])
Packit Service a31ea6
    # ...and the rest is standard python sax.
Packit Service a31ea6
Packit Service a31ea6
CAVEATS
Packit Service a31ea6
    - Lexical handlers are supported, except for start/endEntity
Packit Service a31ea6
      (waiting for XmlReader.ResolveEntity) and start/endDTD
Packit Service a31ea6
    - Error callbacks are not exactly synchronous, they tend
Packit Service a31ea6
      to be invoked before the corresponding content callback,
Packit Service a31ea6
      because the underlying reader interface parses
Packit Service a31ea6
      data by chunks of 512 bytes
Packit Service a31ea6
    
Packit Service a31ea6
TODO
Packit Service a31ea6
    - search for TODO
Packit Service a31ea6
    - some ErrorHandler events (warning)
Packit Service a31ea6
    - some ContentHandler events (setDocumentLocator, skippedEntity)
Packit Service a31ea6
    - EntityResolver (using libxml2.?)
Packit Service a31ea6
    - DTDHandler (if/when libxml2 exposes such node types)
Packit Service a31ea6
    - DeclHandler (if/when libxml2 exposes such node types)
Packit Service a31ea6
    - property_xml_string?
Packit Service a31ea6
    - feature_string_interning?
Packit Service a31ea6
    - Incremental parser
Packit Service a31ea6
    - additional performance tuning:
Packit Service a31ea6
      - one might cache callbacks to avoid some name lookups
Packit Service a31ea6
      - one might implement a smarter way to pass attributes to startElement
Packit Service a31ea6
        (some kind of lazy evaluation?)
Packit Service a31ea6
      - there might be room for improvement in start/endPrefixMapping
Packit Service a31ea6
      - other?
Packit Service a31ea6
Packit Service a31ea6
"""
Packit Service a31ea6
Packit Service a31ea6
__author__  = "Stéphane Bidoul <sbi@skynet.be>"
Packit Service a31ea6
__version__ = "0.3"
Packit Service a31ea6
Packit Service a31ea6
import sys
Packit Service a31ea6
import codecs
Packit Service a31ea6
Packit Service a31ea6
if sys.version_info[0] >= 3:
    # Python 3: a single text type, and values are Unicode `str` already.
    StringTypes = str

    def _d(s):
        """Return *s* untouched (it is Unicode `str` already)."""
        return s
else:
    # Python 2: turn the byte-string author name into a unicode object.
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode the UTF8 string *s* to unicode; None is passed through."""
        if s is not None:
            return _decoder(s)[0]
        return s
Packit Service a31ea6
Packit Service a31ea6
from xml.sax._exceptions import *
Packit Service a31ea6
from xml.sax import xmlreader, saxutils
Packit Service a31ea6
from xml.sax.handler import \
Packit Service a31ea6
     feature_namespaces, \
Packit Service a31ea6
     feature_namespace_prefixes, \
Packit Service a31ea6
     feature_string_interning, \
Packit Service a31ea6
     feature_validation, \
Packit Service a31ea6
     feature_external_ges, \
Packit Service a31ea6
     feature_external_pes, \
Packit Service a31ea6
     property_lexical_handler, \
Packit Service a31ea6
     property_declaration_handler, \
Packit Service a31ea6
     property_dom_node, \
Packit Service a31ea6
     property_xml_string
Packit Service a31ea6
Packit Service a31ea6
# Turn a failed libxml2 import into SAXReaderNotAvailable, keeping the
# text of the original import error in the message.
try:
    import libxml2
except ImportError:
    _reason = sys.exc_info()[1]
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % _reason)
Packit Service a31ea6
Packit Service a31ea6
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # Only LineNumber() and BaseURI() are ever called on this object.
        self.__wrapped = locator

    def getColumnNumber(self):
        """Return -1; no column number is read from the wrapped locator."""
        return -1

    def getLineNumber(self):
        """Return the line number reported by the wrapped locator."""
        return self.__wrapped.LineNumber()

    def getPublicId(self):
        """Return None; no public identifier is provided."""
        return None

    def getSystemId(self):
        """Return the wrapped locator's base URI as the system identifier."""
        return self.__wrapped.BaseURI()
Packit Service a31ea6
Packit Service a31ea6
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() reads the document node by node and turns each node into
    the matching ContentHandler / lexical handler callback.  Messages
    emitted by libxml2 are collected by _errorHandler() and forwarded
    to the SAX ErrorHandler by _reportErrors() after each read.
    """

    def __init__(self):
        """Set up default features, no extra handlers and no pending errors."""
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (true while parse() is running)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader; queue one message.

        The message is wrapped in a SAXParseException and stored with
        its severity; nothing is reported until _reportErrors() runs.
        *arg* is the user argument given to SetErrorHandler() (unused).
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued messages to the SAX ErrorHandler.

        Warning severities go to warning(), everything else to error().
        When *fatal* is true the parse is about to stop, and the last
        queued message is considered the fatal one and goes to
        fatalError() (unless it is a warning).
        """
        # Detach the queue before dispatching: a handler is allowed to
        # raise, and the pending messages must not survive into the
        # next Read() or the next parse() when it does.
        pending, self.__errors = self.__errors, None
        if not pending:
            return
        last_exception = pending[-1][1]
        warning_levels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                          libxml2.PARSER_SEVERITY_WARNING)
        for severity, exception in pending:
            if severity in warning_levels:
                self._err_handler.warning(exception)
            elif fatal and exception is last_exception:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and send SAX events to the registered handlers.

        *source* is either a string, which is handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source(), whose byte stream is read
        through a libxml2 input buffer.

        Queued libxml2 messages are reported after every Read().  A
        Read() result other than 0 or 1 ends the parse with a fatal
        error and endDocument() is then not sent.  Exceptions raised
        by handlers propagate to the caller; the reader is closed and
        the parsing flag cleared in every case.

        Raises SAXException for a node type this driver does not know.
        """
        self.__parsing = 1
        # start from a clean slate, whatever happened in an earlier parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list per
            # currently open, non-empty element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; a plain attribute whose name merely
                            # begins with "xmlns" is an ordinary attribute.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it (and its prefixes) right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # entity events belong to the lexical handler;
                        # this reader has no startEntity() of its own
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are still "waiting for XmlReader.ResolveEntity" --
                    # confirm that the binding really provides this call.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO startDTD (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose, so they end up in the else below:
                #   2  Attribute (never in this loop)
                #   9  Document (not exposed)
                #   11 DocumentFragment (never returned by XmlReader)
                #   0  None
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # Close the reader even when a handler raised, and always
            # clear the parsing flag so features can be changed again.
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Always raise SAXNotSupportedException: DTD events are not produced."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raise SAXNotSupportedException: no entity resolver hook."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature *name*.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running, or
        when asked to switch feature_external_ges off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the property *name*.

        Only the lexical handler and declaration handler properties are
        known; anything else raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler
        property raises SAXNotSupportedException, any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
Packit Service a31ea6
Packit Service a31ea6
def create_parser():
    """Return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
Packit Service a31ea6