import os
import imp
try:
from StringIO import StringIO
except ImportError: # python 3
from io import StringIO
import sys
import tempfile
import unittest
try:
    from unittest import skipUnless
except ImportError:
    # sys.version < (2, 7): unittest has no skip support, so emulate it.
    def skipUnless(condition, reason):
        """Minimal stand-in for ``unittest.skipUnless``.

        Returns a decorator that hands back the decorated object when
        ``condition`` is true and ``None`` otherwise.  ``reason`` is
        accepted for signature compatibility only; it is never used.
        """
        def decorate(f):
            if condition and f:
                return f
            return None
        return decorate
if sys.version_info >= (2, 6):
    # The standard library class is used unchanged on these versions.
    NamedTemporaryFile = tempfile.NamedTemporaryFile
else:
    class NamedTemporaryFile(object):
        """Thin wrapper around ``tempfile.NamedTemporaryFile`` for old Pythons.

        The ``delete`` keyword is accepted and discarded; every other
        keyword is forwarded to the wrapped class.
        NOTE(review): presumably the pre-2.6 class has no ``delete``
        parameter -- confirm against the 2.5 documentation.
        """

        def __init__(self, delete=True, **kwargs):
            self._tmpfile = tempfile.NamedTemporaryFile(**kwargs)

        def close(self):
            # Only flush: the wrapped file object is deliberately not closed.
            self._tmpfile.flush()

        def __getattr__(self, name):
            # Everything not defined here is delegated to the wrapped file.
            return getattr(self._tmpfile, name)
from lxml.builder import ElementMaker
from lxml.etree import Element, ElementTree, ParserError
from lxml.html import html_parser, XHTML_NAMESPACE
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
try:
from urllib import pathname2url
except ImportError:
from urllib.request import pathname2url
def path2url(path):
    """Turn a local filesystem path into a ``file:`` URL.

    The path is converted with ``pathname2url`` and the result is joined
    onto the bare ``file:`` scheme with ``urlparse.urljoin``.
    """
    url_path = pathname2url(path)
    return urlparse.urljoin('file:', url_path)
try:
import html5lib
except ImportError:
html5lib = None
class BogusModules(object):
# See PEP 302 for details on how this works
def __init__(self, mocks):
self.mocks = mocks
def find_module(self, fullname, path=None):
if fullname in self.mocks:
return self
return None
def load_module(self, fullname):
mod = sys.modules.setdefault(fullname, imp.new_module(fullname))
mod.__file__, mod.__loader__, mod.__path__ = "<dummy>", self, []
mod.__dict__.update(self.mocks[fullname])
return mod
# Fake just enough of html5lib so that html5parser.py is importable
# without errors.
sys.meta_path.append(BogusModules({
'html5lib': {
# A do-nothing HTMLParser class
'HTMLParser': type('HTMLParser', (object,), {
'__init__': lambda self, **kw: None,
}),
},
'html5lib.treebuilders': {
},
'html5lib.treebuilders.etree_lxml': {
'TreeBuilder': 'dummy treebuilder',
},
}))
class Test_HTMLParser(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.HTMLParser``."""

    def make_one(self, **kwargs):
        """Build the parser under test; the import happens at call time."""
        from lxml.html.html5parser import HTMLParser
        return HTMLParser(**kwargs)

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration(self):
        # Parse a real document and check the root is a namespaced <html>.
        document = self.make_one(strict=True).parse(XHTML_TEST_DOCUMENT)
        self.assertEqual(document.getroot().tag, xhtml_tag('html'))
class Test_XHTMLParser(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.XHTMLParser``."""

    def make_one(self, **kwargs):
        """Build the parser under test; the import happens at call time."""
        from lxml.html.html5parser import XHTMLParser
        return XHTMLParser(**kwargs)

    @skipUnless(hasattr(html5lib, 'XHTMLParser'),
                'xhtml5lib does not have XHTMLParser')
    def test_integration(self):
        # XXX: This test is untested. (html5lib no longer has an XHTMLParser)
        document = self.make_one(strict=True).parse(XHTML_TEST_DOCUMENT)
        self.assertEqual(document.getroot().tag, xhtml_tag('html'))
class Test_document_fromstring(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.document_fromstring``."""

    def call_it(self, *args, **kwargs):
        """Forward all arguments to the function under test."""
        from lxml.html.html5parser import document_fromstring
        return document_fromstring(*args, **kwargs)

    def test_basic(self):
        tree = DummyElementTree(root='dummy root')
        stub = DummyParser(doc=tree)
        result = self.call_it(b'dummy input', parser=stub)
        self.assertEqual(result, 'dummy root')
        self.assertEqual(stub.parse_args, (b'dummy input',))
        self.assertEqual(stub.parse_kwargs, {'useChardet': True})

    def test_guess_charset_not_used_for_unicode(self):
        stub = DummyParser()
        # Decoding an empty byte string yields an empty text string.
        text = b''.decode('ascii')
        self.call_it(text, parser=stub)
        self.assertEqual(stub.parse_kwargs, {})

    def test_guess_charset_arg_gets_passed_to_parser(self):
        stub = DummyParser()
        self.call_it(b'', guess_charset='gc_arg', parser=stub)
        self.assertEqual(stub.parse_kwargs, {'useChardet': 'gc_arg'})

    def test_raises_type_error_on_nonstring_input(self):
        self.assertRaises(TypeError, self.call_it, None)

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration(self):
        root = self.call_it(XHTML_TEST_DOCUMENT)
        self.assertEqual(root.tag, xhtml_tag('html'))
class Test_fragments_fromstring(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.fragments_fromstring``."""

    def call_it(self, *args, **kwargs):
        """Forward all arguments to the function under test."""
        from lxml.html.html5parser import fragments_fromstring
        return fragments_fromstring(*args, **kwargs)

    def test_basic(self):
        stub = DummyParser(fragments='fragments')
        result = self.call_it(b'dummy input', parser=stub)
        self.assertEqual(result, 'fragments')
        self.assertEqual(stub.parseFragment_kwargs, {'useChardet': False})

    def test_guess_charset_arg_gets_passed_to_parser(self):
        stub = DummyParser()
        self.call_it(b'', guess_charset='gc_arg', parser=stub)
        self.assertEqual(stub.parseFragment_kwargs,
                         {'useChardet': 'gc_arg'})

    def test_guess_charset_not_used_for_unicode(self):
        stub = DummyParser()
        # Decoding an empty byte string yields an empty text string.
        text = b''.decode('ascii')
        self.call_it(text, parser=stub)
        self.assertEqual(stub.parseFragment_kwargs, {})

    def test_raises_type_error_on_nonstring_input(self):
        self.assertRaises(TypeError, self.call_it, None)

    def test_no_leading_text_strips_empty_leading_text(self):
        stub = DummyParser(fragments=['', 'tail'])
        result = self.call_it('', parser=stub, no_leading_text=True)
        self.assertEqual(result, ['tail'])

    def test_no_leading_text_raises_error_if_leading_text(self):
        stub = DummyParser(fragments=['leading text', 'tail'])
        self.assertRaises(ParserError, self.call_it,
                          '', parser=stub, no_leading_text=True)

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration(self):
        result = self.call_it('a<b>c</b>')
        self.assertEqual(len(result), 2)
        leading_text, bold = result[0], result[1]
        self.assertEqual(leading_text, 'a')
        self.assertEqual(bold.tag, xhtml_tag('b'))
class Test_fragment_fromstring(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.fragment_fromstring``."""

    def call_it(self, *args, **kwargs):
        """Forward all arguments to the function under test."""
        from lxml.html.html5parser import fragment_fromstring
        return fragment_fromstring(*args, **kwargs)

    def test_basic(self):
        only_child = DummyElement()
        stub = DummyParser(fragments=[only_child])
        result = self.call_it('html', parser=stub)
        self.assertEqual(result, only_child)

    def test_raises_type_error_on_nonstring_input(self):
        self.assertRaises(TypeError, self.call_it, None)

    def test_create_parent(self):
        stub = DummyParser(fragments=['head', Element('child')])
        wrapper = self.call_it('html', parser=stub, create_parent='parent')
        self.assertEqual(wrapper.tag, 'parent')
        self.assertEqual(wrapper.text, 'head')
        self.assertEqual(wrapper[0].tag, 'child')

    def test_create_parent_default_type_no_ns(self):
        stub = DummyParser(fragments=[], namespaceHTMLElements=False)
        wrapper = self.call_it('html', parser=stub, create_parent=True)
        self.assertEqual(wrapper.tag, 'div')

    def test_raises_error_on_leading_text(self):
        stub = DummyParser(fragments=['leading text'])
        self.assertRaises(ParserError, self.call_it, 'html', parser=stub)

    def test_raises_error_if_no_elements_found(self):
        stub = DummyParser(fragments=[])
        self.assertRaises(ParserError, self.call_it, 'html', parser=stub)

    def test_raises_error_if_multiple_elements_found(self):
        pair = [DummyElement(), DummyElement()]
        stub = DummyParser(fragments=pair)
        self.assertRaises(ParserError, self.call_it, 'html', parser=stub)

    def test_raises_error_if_tail(self):
        stub = DummyParser(fragments=[DummyElement(tail='tail')])
        self.assertRaises(ParserError, self.call_it, 'html', parser=stub)
class Test_fromstring(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.fromstring``."""

    def call_it(self, *args, **kwargs):
        """Forward all arguments to the function under test."""
        from lxml.html.html5parser import fromstring
        return fromstring(*args, **kwargs)

    def test_returns_whole_doc_if_input_contains_html_tag(self):
        stub = DummyParser(root='the doc')
        result = self.call_it('<html></html>', parser=stub)
        self.assertEqual(result, 'the doc')

    def test_returns_whole_doc_if_input_contains_doctype(self):
        stub = DummyParser(root='the doc')
        result = self.call_it('<!DOCTYPE html>', parser=stub)
        self.assertEqual(result, 'the doc')

    def test_returns_whole_doc_if_input_is_encoded(self):
        stub = DummyParser(root='the doc')
        encoded = '<!DOCTYPE html>'.encode('ascii')
        result = self.call_it(encoded, parser=stub)
        self.assertEqual(result, 'the doc')

    def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
        E = HTMLElementMaker(namespaceHTMLElements=use_ns)
        html = E.html(E.head(E.title()))
        stub = DummyParser(root=html)
        self.assertEqual(self.call_it('', parser=stub), html)

    def test_returns_whole_doc_if_head_not_empty_no_ns(self):
        # Same scenario as above, run without namespaced elements.
        self.test_returns_whole_doc_if_head_not_empty(use_ns=False)

    def test_returns_unwraps_body_if_single_element(self):
        E = HTMLElementMaker()
        paragraph = E.p('test')
        html = E.html(E.head(), E.body(paragraph))
        stub = DummyParser(root=html)
        self.assertEqual(self.call_it('', parser=stub), paragraph)

    def test_returns_body_if_has_text(self):
        E = HTMLElementMaker()
        paragraph = E.p('test')
        body = E.body('text', paragraph)
        html = E.html(E.head(), body)
        stub = DummyParser(root=html)
        self.assertEqual(self.call_it('', parser=stub), body)

    def test_returns_body_if_single_element_has_tail(self):
        E = HTMLElementMaker()
        paragraph = E.p('test')
        paragraph.tail = 'tail'
        body = E.body(paragraph)
        html = E.html(E.head(), body)
        stub = DummyParser(root=html)
        self.assertEqual(self.call_it('', parser=stub), body)

    def test_wraps_multiple_fragments_in_div_no_ns(self):
        E = HTMLElementMaker(namespaceHTMLElements=False)
        html = E.html(E.head(), E.body(E.h1(), E.p()))
        stub = DummyParser(root=html, namespaceHTMLElements=False)
        wrapper = self.call_it('', parser=stub)
        self.assertEqual(wrapper.tag, 'div')

    def test_wraps_multiple_fragments_in_span_no_ns(self):
        E = HTMLElementMaker(namespaceHTMLElements=False)
        html = E.html(E.head(), E.body('foo', E.a('link')))
        stub = DummyParser(root=html, namespaceHTMLElements=False)
        wrapper = self.call_it('', parser=stub)
        self.assertEqual(wrapper.tag, 'span')

    def test_raises_type_error_on_nonstring_input(self):
        self.assertRaises(TypeError, self.call_it, None)

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration_whole_doc(self):
        root = self.call_it(XHTML_TEST_DOCUMENT)
        self.assertEqual(root.tag, xhtml_tag('html'))

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration_single_fragment(self):
        paragraph = self.call_it('<p></p>')
        self.assertEqual(paragraph.tag, xhtml_tag('p'))
class Test_parse(unittest.TestCase):
    """Tests for ``lxml.html.html5parser.parse``."""

    def call_it(self, *args, **kwargs):
        """Forward all arguments to the function under test."""
        from lxml.html.html5parser import parse
        return parse(*args, **kwargs)

    def make_temp_file(self, contents=''):
        """Create a named temporary file holding ``contents`` as UTF-8.

        The file is requested with ``delete=False`` and returned open and
        rewound to offset 0; the caller is responsible for closing and
        unlinking it.  If writing fails, the file is closed and removed
        and the original exception is re-raised.
        """
        tmpfile = NamedTemporaryFile(delete=False)
        try:
            tmpfile.write(contents.encode('utf8'))
            tmpfile.flush()
            tmpfile.seek(0)
            return tmpfile
        except Exception:
            try:
                tmpfile.close()
            finally:
                # Remove the file we created.  This must use the file
                # object, not the ``tempfile`` module, which has no
                # ``name`` attribute and would mask the real error.
                os.unlink(tmpfile.name)
            raise

    def test_with_file_object(self):
        parser = DummyParser(doc='the doc')
        fp = open(__file__)
        try:
            self.assertEqual(self.call_it(fp, parser=parser), 'the doc')
            # The very same file object must reach the parser.
            self.assertEqual(parser.parse_args, (fp,))
        finally:
            fp.close()

    def test_with_file_name(self):
        parser = DummyParser(doc='the doc')
        tmpfile = self.make_temp_file('data')
        try:
            data = tmpfile.read()
        finally:
            tmpfile.close()
        try:
            self.assertEqual(self.call_it(tmpfile.name, parser=parser),
                             'the doc')
            # parse() is expected to hand the parser exactly one argument:
            # a readable object yielding the file's contents.
            fp, = parser.parse_args
            try:
                self.assertEqual(fp.read(), data)
            finally:
                fp.close()
        finally:
            os.unlink(tmpfile.name)

    def test_with_url(self):
        parser = DummyParser(doc='the doc')
        tmpfile = self.make_temp_file('content')
        try:
            data = tmpfile.read()
        finally:
            tmpfile.close()
        try:
            url = path2url(tmpfile.name)
            self.assertEqual(self.call_it(url, parser=parser), 'the doc')
            # As above, but the input is addressed through a file: URL.
            fp, = parser.parse_args
            try:
                self.assertEqual(fp.read(), data)
            finally:
                fp.close()
        finally:
            os.unlink(tmpfile.name)

    @skipUnless(html5lib, 'html5lib is not installed')
    def test_integration(self):
        doc = self.call_it(StringIO(XHTML_TEST_DOCUMENT))
        root = doc.getroot()
        self.assertEqual(root.tag, xhtml_tag('html'))
def test_suite():
    """Return a suite holding every test the loader finds in this module."""
    this_module = sys.modules[__name__]
    return unittest.TestLoader().loadTestsFromModule(this_module)
class HTMLElementMaker(ElementMaker):
    """``ElementMaker`` that builds elements with ``html_parser.makeelement``.

    When ``namespaceHTMLElements`` is true, the XHTML namespace is passed
    as both the element namespace and the default (``None``) prefix.
    """

    def __init__(self, namespaceHTMLElements=True):
        options = {'makeelement': html_parser.makeelement}
        if namespaceHTMLElements:
            options['namespace'] = XHTML_NAMESPACE
            options['nsmap'] = {None: XHTML_NAMESPACE}
        ElementMaker.__init__(self, **options)
class DummyParser(object):
    """Parser stand-in that records its calls and returns canned results.

    ``parse`` returns ``doc`` (or a ``DummyElementTree`` wrapping ``root``
    when ``doc`` is falsy) and ``parseFragment`` returns ``fragments``.
    The positional and keyword arguments of the latest call are stored on
    the instance for later inspection.
    """

    def __init__(self, doc=None, root=None,
                 fragments=None, namespaceHTMLElements=True):
        if not doc:
            doc = DummyElementTree(root=root)
        self.doc = doc
        self.fragments = fragments
        self.tree = DummyTreeBuilder(namespaceHTMLElements)

    def parse(self, *args, **kwargs):
        """Record the call and return the canned document."""
        self.parse_args, self.parse_kwargs = args, kwargs
        return self.doc

    def parseFragment(self, *args, **kwargs):
        """Record the call and return the canned fragments."""
        self.parseFragment_args, self.parseFragment_kwargs = args, kwargs
        return self.fragments
class DummyTreeBuilder(object):
    """Tree-builder stand-in that only carries ``namespaceHTMLElements``."""

    def __init__(self, namespaceHTMLElements=True):
        # Stored as given; nothing else is modelled.
        setattr(self, 'namespaceHTMLElements', namespaceHTMLElements)
class DummyElementTree(object):
    """Element-tree stand-in that just hands back a preset root."""

    def __init__(self, root):
        # Whatever object is given becomes the result of getroot().
        self.root = root

    def getroot(self):
        """Return the root passed to the constructor."""
        preset = self.root
        return preset
class DummyElement(object):
    """Element stand-in exposing only ``tag`` and ``tail`` attributes."""

    def __init__(self, tag='tag', tail=None):
        self.tag, self.tail = tag, tail
def xhtml_tag(tag):
    """Return ``tag`` qualified with the XHTML namespace as ``{ns}tag``."""
    qualified = '{%s}%s' % (XHTML_NAMESPACE, tag)
    return qualified
# Small HTML document fed to the integration tests.  Note that the string
# starts with a newline, so the doctype is not its first character.
XHTML_TEST_DOCUMENT = '''
<!DOCTYPE html>
<html>
<head><title>TITLE</title></head>
<body></body>
</html>
'''