# XPath evaluation

class XPathSyntaxError(LxmlSyntaxError, XPathError):
    u"""Error raised for an XPath expression that could not be compiled.

    Instances are built from the evaluator's error log by
    ``_XPathEvaluatorBase._build_parse_error()``.
    """
    pass

################################################################################
# XPath

# libxml2 error types that are treated as XPath *syntax* problems.
# _build_parse_error() filters the error log by these types to find the
# exception message; _build_eval_error() falls back to them when the log
# holds no evaluation error.
cdef object _XPATH_SYNTAX_ERRORS
_XPATH_SYNTAX_ERRORS = (
    xmlerror.XML_XPATH_NUMBER_ERROR,
    xmlerror.XML_XPATH_UNFINISHED_LITERAL_ERROR,
    xmlerror.XML_XPATH_VARIABLE_REF_ERROR,
    xmlerror.XML_XPATH_INVALID_PREDICATE_ERROR,
    xmlerror.XML_XPATH_UNCLOSED_ERROR,
    xmlerror.XML_XPATH_INVALID_CHAR_ERROR
)

# libxml2 error types that are treated as XPath *evaluation* problems.
# _build_eval_error() looks for these first when it picks the message of
# the XPathEvalError it returns.
cdef object _XPATH_EVAL_ERRORS
_XPATH_EVAL_ERRORS = (
    xmlerror.XML_XPATH_UNDEF_VARIABLE_ERROR,
    xmlerror.XML_XPATH_UNDEF_PREFIX_ERROR,
    xmlerror.XML_XPATH_UNKNOWN_FUNC_ERROR,
    xmlerror.XML_XPATH_INVALID_OPERAND,
    xmlerror.XML_XPATH_INVALID_TYPE,
    xmlerror.XML_XPATH_INVALID_ARITY,
    xmlerror.XML_XPATH_INVALID_CTXT_SIZE,
    xmlerror.XML_XPATH_INVALID_CTXT_POSITION
)

cdef int _register_xpath_function(void* ctxt, name_utf, ns_utf):
    # Register `_xpath_function_call` in the XPath context `ctxt` under the
    # UTF-8 encoded name `name_utf`.  When `ns_utf` is None the function is
    # registered without a namespace, otherwise under that namespace URI.
    # Returns the status code of the libxml2 registration call.
    cdef xpath.xmlXPathContext* c_ctxt = <xpath.xmlXPathContext*>ctxt
    if ns_utf is not None:
        return xpath.xmlXPathRegisterFuncNS(
            c_ctxt, _xcstr(name_utf), _xcstr(ns_utf), _xpath_function_call)
    return xpath.xmlXPathRegisterFunc(
        c_ctxt, _xcstr(name_utf), _xpath_function_call)

cdef int _unregister_xpath_function(void* ctxt, name_utf, ns_utf):
    # Counterpart of _register_xpath_function(): re-registers the name with
    # a NULL function pointer, which makes libxml2 drop the entry.  `ns_utf`
    # is None for functions that live outside of any namespace.
    # Returns the status code of the libxml2 registration call.
    cdef xpath.xmlXPathContext* c_ctxt = <xpath.xmlXPathContext*>ctxt
    if ns_utf is not None:
        return xpath.xmlXPathRegisterFuncNS(
            c_ctxt, _xcstr(name_utf), _xcstr(ns_utf), NULL)
    return xpath.xmlXPathRegisterFunc(c_ctxt, _xcstr(name_utf), NULL)


@cython.final
@cython.internal
cdef class _XPathContext(_BaseContext):
    # Evaluation context used by the XPath evaluators in this file.  On top
    # of what _BaseContext provides, it keeps an optional mapping of XPath
    # variables that get registered whenever the context is registered for
    # a document.
    cdef object _variables

    def __init__(self, namespaces, extensions, error_log, enable_regexp, variables,
                 build_smart_strings):
        # `variables` may be None (no preset variables); every other
        # argument is handed to _BaseContext unchanged.
        self._variables = variables
        _BaseContext.__init__(self, namespaces, extensions, error_log, enable_regexp,
                              build_smart_strings)

    cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
        # Attach the libxml2 context, then register the evaluator-local
        # namespaces and extension functions on it.
        self._set_xpath_context(xpathCtxt)
        # This would be a good place to set up the XPath parser dict, but
        # we cannot use the current thread dict as we do not know which
        # thread will execute the XPath evaluator - so, no dict for now.
        self.registerLocalNamespaces()
        self.registerLocalFunctions(xpathCtxt, _register_xpath_function)

    cdef register_context(self, _Document doc):
        # Per-evaluation setup: global namespaces and functions, the EXSLT
        # functions and, if any were given to __init__, the preset variables.
        self._register_context(doc)
        self.registerGlobalNamespaces()
        self.registerGlobalFunctions(self._xpathCtxt, _register_xpath_function)
        self.registerExsltFunctions()
        preset_variables = self._variables
        if preset_variables is not None:
            self.registerVariables(preset_variables)

    cdef unregister_context(self):
        # Per-evaluation teardown: undo the global registrations, clear all
        # registered variables and clean up the base context.
        self.unregisterGlobalFunctions(
            self._xpathCtxt, _unregister_xpath_function)
        self.unregisterGlobalNamespaces()
        xpath.xmlXPathRegisteredVariablesCleanup(self._xpathCtxt)
        self._cleanup_context()

    cdef void registerExsltFunctions(self):
        # With older libxslt versions we'd only execute dummy functions
        # anyway, so the namespace scan is skipped for them.
        if xslt.LIBXSLT_VERSION >= 10125:
            tree.xmlHashScan(
                self._xpathCtxt.nsHash, _registerExsltFunctionsForNamespaces,
                self._xpathCtxt)

    cdef registerVariables(self, variable_dict):
        # Register each name -> value pair of the mapping as an XPath variable.
        for var_name, var_value in variable_dict.items():
            self.registerVariable(var_name, var_value)

    cdef registerVariable(self, name, value):
        # Register a single XPath variable; the value is wrapped into an
        # XPath object before it is passed on to libxml2.
        encoded_name = self._to_utf(name)
        xpath.xmlXPathRegisterVariable(
            self._xpathCtxt, _xcstr(encoded_name),
            _wrapXPathObject(value, None, None))


cdef void _registerExsltFunctionsForNamespaces(
        void* _c_href, void* _ctxt, xmlChar* c_prefix):
    # Callback passed to xmlHashScan() over the namespace hash of an XPath
    # context.  `_c_href` is the namespace URI of the entry, `_ctxt` the
    # XPath context, `c_prefix` the prefix the URI is registered under.
    # If the URI is one of the EXSLT date/sets/math/strings namespaces, the
    # matching EXSLT function set is registered for that prefix.
    cdef const_xmlChar* ns_uri = <const_xmlChar*> _c_href
    cdef xpath.xmlXPathContext* xpath_ctxt = <xpath.xmlXPathContext*> _ctxt

    if tree.xmlStrcmp(ns_uri, xslt.EXSLT_DATE_NAMESPACE) == 0:
        xslt.exsltDateXpathCtxtRegister(xpath_ctxt, c_prefix)
        return
    if tree.xmlStrcmp(ns_uri, xslt.EXSLT_SETS_NAMESPACE) == 0:
        xslt.exsltSetsXpathCtxtRegister(xpath_ctxt, c_prefix)
        return
    if tree.xmlStrcmp(ns_uri, xslt.EXSLT_MATH_NAMESPACE) == 0:
        xslt.exsltMathXpathCtxtRegister(xpath_ctxt, c_prefix)
        return
    if tree.xmlStrcmp(ns_uri, xslt.EXSLT_STRINGS_NAMESPACE) == 0:
        xslt.exsltStrXpathCtxtRegister(xpath_ctxt, c_prefix)


cdef class _XPathEvaluatorBase:
    # Shared base of the XPath evaluator classes.  It owns the libxml2
    # XPath context pointer (freed in __dealloc__), the _XPathContext that
    # carries namespaces and extensions, the error log, and - when threading
    # is enabled - a lock that subclasses take around each evaluation.
    cdef xpath.xmlXPathContext* _xpathCtxt
    cdef _XPathContext _context
    cdef python.PyThread_type_lock _eval_lock
    cdef _ErrorLog _error_log

    def __cinit__(self):
        self._xpathCtxt = NULL
        if config.ENABLE_THREADING:
            self._eval_lock = python.PyThread_allocate_lock()
            if self._eval_lock is NULL:
                raise MemoryError()
        self._error_log = _ErrorLog()

    def __init__(self, namespaces, extensions, enable_regexp,
                 smart_strings):
        # No preset variables (None): they are passed per call instead.
        self._context = _XPathContext(namespaces, extensions, self._error_log,
                                      enable_regexp, None, smart_strings)

    property error_log:
        # A copy of the evaluator's error log.
        def __get__(self):
            assert self._error_log is not None, "XPath evaluator not initialised"
            return self._error_log.copy()

    def __dealloc__(self):
        if self._xpathCtxt is not NULL:
            xpath.xmlXPathFreeContext(self._xpathCtxt)
        if config.ENABLE_THREADING:
            if self._eval_lock is not NULL:
                python.PyThread_free_lock(self._eval_lock)

    cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
        # Remember the libxml2 context and hand it on to the _XPathContext.
        self._xpathCtxt = xpathCtxt
        self._context.set_context(xpathCtxt)

    def evaluate(self, _eval_arg, **_variables):
        u"""evaluate(self, _eval_arg, **_variables)

        Evaluate an XPath expression.

        This simply calls the evaluator object itself, which you can
        also do directly.

        XPath variables may be passed as keyword arguments.  Namespaces
        are currently not supported for variables.

        :deprecated: call the object, not its method.
        """
        return self(_eval_arg, **_variables)

    cdef bint _checkAbsolutePath(self, char* path):
        # True if the first character after leading blanks/tabs is '/'.
        if path is NULL:
            return 0
        while path[0] == c' ' or path[0] == c'\t':
            path += 1
        return path[0] == c'/'

    @cython.final
    cdef int _lock(self) except -1:
        # Acquire the evaluation lock (waiting for it with the GIL released).
        # Does nothing without threading support or without a lock.
        cdef int acquired
        if not config.ENABLE_THREADING or self._eval_lock == NULL:
            return 0
        with nogil:
            acquired = python.PyThread_acquire_lock(
                self._eval_lock, python.WAIT_LOCK)
        if acquired == 0:
            raise XPathError(u"XPath evaluator locking failed")
        return 0

    @cython.final
    cdef void _unlock(self):
        # Release the evaluation lock taken by _lock().
        if config.ENABLE_THREADING and self._eval_lock != NULL:
            python.PyThread_release_lock(self._eval_lock)

    cdef _build_parse_error(self):
        # Build (not raise) an XPathSyntaxError.  The message comes from the
        # syntax errors in the log if there are any, otherwise from the
        # complete log with a generic default text.
        cdef _BaseErrorLog syntax_entries
        syntax_entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
        message = None
        if syntax_entries:
            message = syntax_entries._buildExceptionMessage(None)
        if message is None:
            message = self._error_log._buildExceptionMessage(
                u"Error in xpath expression")
        return XPathSyntaxError(message, self._error_log)

    cdef _build_eval_error(self):
        # Build (not raise) an XPathEvalError.  Evaluation errors in the log
        # are preferred for the message, then syntax errors, then the
        # complete log with a generic default text.
        cdef _BaseErrorLog relevant_entries
        relevant_entries = self._error_log.filter_types(_XPATH_EVAL_ERRORS)
        if not relevant_entries:
            relevant_entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
        message = None
        if relevant_entries:
            message = relevant_entries._buildExceptionMessage(None)
        if message is None:
            message = self._error_log._buildExceptionMessage(
                u"Error in xpath expression")
        return XPathEvalError(message, self._error_log)

    cdef object _handle_result(self, xpath.xmlXPathObject* xpathObj, _Document doc):
        # Turn the raw libxml2 result into a Python object.  The XPath
        # object is always freed here and the temporary references of the
        # context are always released, whichever path is taken.
        cdef _XPathContext context = self._context

        if context._exc._has_raised():
            # an exception was stored during evaluation: drop the result
            # and re-raise the stored exception
            if xpathObj is not NULL:
                _freeXPathObject(xpathObj)
                xpathObj = NULL
            context._release_temp_refs()
            context._exc._raise_if_stored()

        if xpathObj is NULL:
            context._release_temp_refs()
            raise self._build_eval_error()

        try:
            unwrapped = _unwrapXPathObject(xpathObj, doc, context)
        finally:
            _freeXPathObject(xpathObj)
            context._release_temp_refs()

        return unwrapped


cdef class XPathElementEvaluator(_XPathEvaluatorBase):
    u"""XPathElementEvaluator(self, element, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    Create an XPath evaluator for an element.

    Absolute XPath expressions (starting with '/') will be evaluated against
    the ElementTree as returned by getroottree().

    Additional namespace declarations can be passed with the
    'namespaces' keyword argument.  EXSLT regular expression support
    can be disabled with the 'regexp' boolean keyword (defaults to
    True).  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    # The element that expressions are evaluated against.
    cdef _Element _element
    def __init__(self, _Element element not None, *, namespaces=None,
                 extensions=None, regexp=True, smart_strings=True):
        cdef xpath.xmlXPathContext* xpathCtxt
        cdef _Document doc
        _assertValidNode(element)
        _assertValidDoc(element._doc)
        self._element = element
        doc = element._doc
        _XPathEvaluatorBase.__init__(self, namespaces, extensions,
                                     regexp, smart_strings)
        # The libxml2 context is created for the element's document and is
        # freed by the base class on deallocation.
        xpathCtxt = xpath.xmlXPathNewContext(doc._c_doc)
        if xpathCtxt is NULL:
            raise MemoryError()
        self.set_context(xpathCtxt)

    def register_namespace(self, prefix, uri):
        u"""Register a namespace with the XPath context.

        ``prefix`` is the prefix to use in expressions, ``uri`` the
        namespace URI it maps to.
        """
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        self._context.addNamespace(prefix, uri)

    def register_namespaces(self, namespaces):
        u"""Register a prefix -> uri dict.

        Each item of the mapping is registered like a call to
        ``register_namespace(prefix, uri)``.
        """
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        for prefix, uri in namespaces.items():
            self._context.addNamespace(prefix, uri)

    def __call__(self, _path, **_variables):
        u"""__call__(self, _path, **_variables)

        Evaluate an XPath expression on the document.

        Variables may be provided as keyword arguments.  Note that namespaces
        are currently not supported for variables.

        Absolute XPath expressions (starting with '/') will be evaluated
        against the ElementTree as returned by getroottree().
        """
        cdef xpath.xmlXPathObject*  xpathObj
        cdef _Document doc
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        path = _utf8(_path)
        doc = self._element._doc

        self._lock()
        self._xpathCtxt.node = self._element._c_node
        try:
            self._context.register_context(doc)
            self._context.registerVariables(_variables)
            c_path = _xcstr(path)
            # the actual evaluation runs with the GIL released
            with nogil:
                xpathObj = xpath.xmlXPathEvalExpression(
                    c_path, self._xpathCtxt)
            result = self._handle_result(xpathObj, doc)
        finally:
            # always undo the per-call registrations before unlocking
            self._context.unregister_context()
            self._unlock()

        return result


cdef class XPathDocumentEvaluator(XPathElementEvaluator):
    u"""XPathDocumentEvaluator(self, etree, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    Create an XPath evaluator for an ElementTree.

    Use the 'namespaces' keyword argument to pass additional namespace
    declarations.  The 'regexp' boolean keyword (defaults to True)
    switches EXSLT regular expression support on or off.  String
    results are returned as smart strings unless you pass
    ``smart_strings=False``.
    """
    def __init__(self, _ElementTree etree not None, *, namespaces=None,
                 extensions=None, regexp=True, smart_strings=True):
        # The context node of the tree plays the role of the element.
        XPathElementEvaluator.__init__(
            self, etree._context_node,
            namespaces=namespaces, extensions=extensions,
            regexp=regexp, smart_strings=smart_strings)

    def __call__(self, _path, **_variables):
        u"""__call__(self, _path, **_variables)

        Evaluate an XPath expression on the document.

        XPath variables may be passed as keyword arguments.  Namespaces
        are currently not supported for variables.
        """
        cdef xpath.xmlXPathObject* c_result
        cdef xmlDoc* c_eval_doc
        cdef _Document doc
        cdef const_xmlChar* c_expr
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        expr_utf = _utf8(_path)
        doc = self._element._doc

        self._lock()
        try:
            self._context.register_context(doc)
            # evaluate against a document in which the context node is the root
            c_eval_doc = _fakeRootDoc(doc._c_doc, self._element._c_node)
            try:
                self._context.registerVariables(_variables)
                c_expr = _xcstr(expr_utf)
                with nogil:
                    self._xpathCtxt.doc  = c_eval_doc
                    self._xpathCtxt.node = tree.xmlDocGetRootElement(c_eval_doc)
                    c_result = xpath.xmlXPathEvalExpression(
                        c_expr, self._xpathCtxt)
                value = self._handle_result(c_result, doc)
            finally:
                # drop the temporary document first, then the registrations
                _destroyFakeDoc(doc._c_doc, c_eval_doc)
                self._context.unregister_context()
        finally:
            self._unlock()

        return value


def XPathEvaluator(etree_or_element, *, namespaces=None, extensions=None,
                   regexp=True, smart_strings=True):
    u"""XPathEvaluator(etree_or_element, namespaces=None, extensions=None, regexp=True, smart_strings=True)

    Creates an XPath evaluator for an ElementTree or an Element.

    An ElementTree argument yields an ``XPathDocumentEvaluator``, anything
    else is passed to ``XPathElementEvaluator``.  Call the returned object
    with an XPath expression and, optionally, XPath variables given as
    keyword arguments.

    Use the 'namespaces' keyword argument to pass additional namespace
    declarations.  The 'regexp' boolean keyword (defaults to True)
    switches EXSLT regular expression support on or off.  String
    results are returned as smart strings unless you pass
    ``smart_strings=False``.
    """
    if isinstance(etree_or_element, _ElementTree):
        evaluator_type = XPathDocumentEvaluator
    else:
        evaluator_type = XPathElementEvaluator
    return evaluator_type(
        etree_or_element, namespaces=namespaces,
        extensions=extensions, regexp=regexp, smart_strings=smart_strings)


cdef class XPath(_XPathEvaluatorBase):
    u"""XPath(self, path, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    A compiled XPath expression that can be called on Elements and ElementTrees.

    In addition to the XPath expression, the constructor takes
    prefix-namespace mappings and extension functions through the
    keyword arguments ``namespaces`` and ``extensions``.  The 'regexp'
    boolean keyword (defaults to True) switches EXSLT regular
    expression support on or off.  String results are returned as
    smart strings unless you pass ``smart_strings=False``.
    """
    # compiled expression, freed in __dealloc__
    cdef xpath.xmlXPathCompExpr* _xpath
    # UTF-8 encoded expression text
    cdef bytes _path

    def __cinit__(self):
        self._xpath = NULL

    def __init__(self, path, *, namespaces=None, extensions=None,
                 regexp=True, smart_strings=True):
        cdef xpath.xmlXPathContext* c_ctxt
        _XPathEvaluatorBase.__init__(self, namespaces, extensions,
                                     regexp, smart_strings)
        self._path = _utf8(path)
        # the context is created without a document; __call__ sets one
        c_ctxt = xpath.xmlXPathNewContext(NULL)
        if c_ctxt is NULL:
            raise MemoryError()
        self.set_context(c_ctxt)
        self._xpath = xpath.xmlXPathCtxtCompile(c_ctxt, _xcstr(self._path))
        if self._xpath is NULL:
            # compilation failed: report it as an XPathSyntaxError
            raise self._build_parse_error()

    def __call__(self, _etree_or_element, **_variables):
        u"__call__(self, _etree_or_element, **_variables)"
        cdef xpath.xmlXPathObject* c_result
        cdef _Document doc
        cdef _Element context_node

        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        doc = _documentOrRaise(_etree_or_element)
        context_node = _rootNodeOrRaise(_etree_or_element)

        self._lock()
        # point the shared context at the input of this call
        self._xpathCtxt.doc  = doc._c_doc
        self._xpathCtxt.node = context_node._c_node

        try:
            self._context.register_context(doc)
            self._context.registerVariables(_variables)
            with nogil:
                c_result = xpath.xmlXPathCompiledEval(
                    self._xpath, self._xpathCtxt)
            value = self._handle_result(c_result, doc)
        finally:
            self._context.unregister_context()
            self._unlock()
        return value

    property path:
        u"""The literal XPath expression.
        """
        def __get__(self):
            return self._path.decode(u'UTF-8')

    def __dealloc__(self):
        if self._xpath is not NULL:
            xpath.xmlXPathFreeCompExpr(self._xpath)

    def __repr__(self):
        # the repr is the expression text itself
        return self.path


# Regex helpers for ETXPath._nsextract_path(), both working on byte strings:
# - _replace_strings(repl, path) substitutes every double- or single-quoted
#   string literal in the path (called with b'' to strip the literals)
# - _find_namespaces(path) lists all '{...}' spans found in the path
cdef object _replace_strings
cdef object _find_namespaces
_replace_strings = re.compile(b'("[^"]*")|(\'[^\']*\')').sub
_find_namespaces = re.compile(b'({[^}]+})').findall

cdef class ETXPath(XPath):
    u"""ETXPath(self, path, extensions=None, regexp=True, smart_strings=True)
    Special XPath class that supports the ElementTree {uri} notation for namespaces.

    Note that this class does not accept the ``namespaces`` keyword
    argument. All namespaces must be passed as part of the path
    string.  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    def __init__(self, path, *, extensions=None, regexp=True,
                 smart_strings=True):
        # Rewrite '{uri}name' steps to 'prefix:name' and compile the result
        # with the generated prefix -> uri mapping.
        path, namespaces = self._nsextract_path(path)
        XPath.__init__(self, path, namespaces=namespaces,
                       extensions=extensions, regexp=regexp,
                       smart_strings=smart_strings)

    cdef _nsextract_path(self, path):
        # Replace each distinct {namespace} in the path by a generated prefix.
        #
        # Returns a tuple (rewritten path as unicode string, dict mapping
        # the generated prefixes to their namespace URIs).
        #
        # Only text outside of XPath string literals is rewritten, so that
        # a literal such as "{uri}text" keeps its original content.
        cdef dict namespaces = {}
        cdef list namespace_defs = []
        cdef list segments
        cdef int i
        cdef Py_ssize_t pos
        path_utf = _utf8(path)
        # Splitting on a single capturing group keeps the string literals in
        # the result: even positions hold the text outside of literals, odd
        # positions hold the quoted literals themselves.
        segments = re.split(b'("[^"]*"|\'[^\']*\')', path_utf)
        stripped_path = _replace_strings(b'', path_utf) # remove string literals
        i = 1
        for namespace_def in _find_namespaces(stripped_path):
            if namespace_def not in namespace_defs:
                prefix = python.PyBytes_FromFormat("__xpp%02d", i)
                i += 1
                namespace_defs.append(namespace_def)
                namespace = namespace_def[1:-1] # remove '{}'
                namespace = (<bytes>namespace).decode('utf8')
                namespaces[prefix.decode('utf8')] = namespace
                prefix_str = prefix + b':'
                # rewrite the non-literal segments only
                for pos in range(0, len(segments), 2):
                    segments[pos] = (<bytes>segments[pos]).replace(
                        namespace_def, prefix_str)
        path = b''.join(segments).decode('utf8')
        return path, namespaces