# -*- coding: utf-8 -*-
"""
HTML parser test cases for etree
"""
import unittest
import tempfile, os, os.path, sys
# Make sure the directory containing this file is on sys.path before the
# imports that follow (presumably so that `common_imports`, imported right
# after this, resolves when the tests are launched from another directory).
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
    sys.path.insert(0, this_dir) # needed for Py3
from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str
from common_imports import SillyFileLike, HelperTestCase, write_to_file, next
# Compatibility shim: if the `unicode` builtin does not exist (NameError),
# alias it to `str` so later code can refer to `unicode` unconditionally.
try:
    unicode
except NameError:
    unicode = str
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases.

    Exercises etree.HTML(), HTMLParser (one-shot, feed and target
    interfaces), iterparse(html=True) and the serialisation of HTML
    doctypes and boolean attributes.
    """
    etree = etree

    # Well-formed reference document and its pretty-printed serialisation.
    html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
    # Malformed variant: unclosed <title>, mismatched </h3>, stray </p>.
    broken_html_str = _bytes("<html><head><title>test"
                             "<body><h1>page title</h3></p></html>")
    # Same document structure with non-ASCII text, as a unicode string.
    uhtml_str = _bytes(
        "<html><head><title>test á</title></head>"
        "<body><h1>page á title</h1></body></html>").decode('utf8')

    def tearDown(self):
        super(HtmlParserTestCase, self).tearDown()
        # Some tests install an HTML parser as the default parser; always
        # restore the default so later tests are not affected.
        self.etree.set_default_parser()

    def test_module_HTML(self):
        # Parsing and re-serialising as HTML must round-trip the bytes.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_unicode(self):
        # Unicode input must round-trip and keep the non-ASCII text intact.
        element = self.etree.HTML(self.uhtml_str)
        self.assertEqual(
            self.etree.tostring(element, method="html", encoding='unicode'),
            self.uhtml_str)
        self.assertEqual(element.findtext('.//h1'),
                         _bytes("page á title").decode('utf8'))

    def test_wide_unicode_xml(self):
        # A character outside the BMP must come back as a single character;
        # this cannot hold on builds whose maxunicode is below 0x10FFFF.
        if sys.maxunicode < 1114111:
            return  # skip test
        element = self.etree.HTML(_bytes(
            '<html><body><p>\\U00026007</p></body></html>'
        ).decode('unicode_escape'))
        p_text = element.findtext('.//p')
        self.assertEqual(1, len(p_text))
        self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                         p_text)

    def test_html_ids(self):
        # Elements must be findable by their id attribute via XPath.
        parser = self.etree.HTMLParser(recover=False)
        fromstring = self.etree.fromstring
        # Named `root` rather than `html` to avoid shadowing the html module.
        root = fromstring('''
<html><body id="bodyID"><p id="pID"></p></body></html>
''', parser=parser)
        self.assertEqual(len(root.xpath('//p[@id="pID"]')), 1)

    def test_html_ids_no_collect_ids(self):
        # The same XPath id lookup must work with collect_ids=False.
        parser = self.etree.HTMLParser(recover=False, collect_ids=False)
        fromstring = self.etree.fromstring
        root = fromstring('''
<html><body id="bodyID"><p id="pID"></p></body></html>
''', parser=parser)
        self.assertEqual(len(root.xpath('//p[@id="pID"]')), 1)

    def test_module_HTML_pretty_print(self):
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True),
                         self.html_str_pretty)

    def test_module_parse_html_error(self):
        # With recover=False a mismatched end tag must raise.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = BytesIO("<html></body>")
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    def test_html_element_name_empty(self):
        # Tags with an empty local name are rejected, both on creation and
        # on assignment to .tag.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        el = Element('name')
        self.assertRaises(ValueError, Element, '{}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{}')

        self.assertRaises(ValueError, Element, '{test}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{test}')

    def test_html_element_name_colon(self):
        # Colons are accepted in tag names created through the HTML parser.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        pname = Element('p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = Element('{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

        pname = Element('name')
        pname.tag = 'p:name'
        self.assertEqual(pname.tag, 'p:name')

    def test_html_element_name_quote(self):
        # Quote characters in tag names are rejected.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, 'p"name')
        self.assertRaises(ValueError, Element, "na'me")
        self.assertRaises(ValueError, Element, '{test}"name')
        self.assertRaises(ValueError, Element, "{test}name'")

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
        self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
        # A rejected assignment must leave the original tag untouched.
        self.assertEqual(el.tag, "name")

    def test_html_element_name_space(self):
        # Whitespace in tag names is rejected.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, ' name ')
        self.assertRaises(ValueError, Element, 'na me')
        self.assertRaises(ValueError, Element, '{test} name')

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
        # A rejected assignment must leave the original tag untouched.
        self.assertEqual(el.tag, "name")

    def test_html_subelement_name_empty(self):
        # SubElement applies the same empty-name check as Element.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, '{}')
        self.assertRaises(ValueError, SubElement, el, '{test}')

    def test_html_subelement_name_colon(self):
        # SubElement accepts colons in tag names under an HTML element.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        pname = SubElement(el, 'p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = SubElement(el, '{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

    def test_html_subelement_name_quote(self):
        # SubElement rejects quote characters in tag names.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, "name'")
        self.assertRaises(ValueError, SubElement, el, 'na"me')
        self.assertRaises(ValueError, SubElement, el, "{test}na'me")
        self.assertRaises(ValueError, SubElement, el, '{test}"name')

    def test_html_subelement_name_space(self):
        # SubElement rejects whitespace in tag names.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, ' name ')
        self.assertRaises(ValueError, SubElement, el, 'na me')
        self.assertRaises(ValueError, SubElement, el, '{test} name')

    def test_module_parse_html_norecover(self):
        # Broken markup must raise when recovery is switched off.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = BytesIO(self.broken_html_str)
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    def test_module_parse_html_default_doctype(self):
        # With default_doctype=False a doctype is reported only if the
        # input actually declared one.
        parser = self.etree.HTMLParser(default_doctype=False)
        d = html.fromstring('<!DOCTYPE html><h1>S</h1></html>', parser=parser)
        self.assertEqual(d.getroottree().docinfo.doctype, '<!DOCTYPE html>')

        d = html.fromstring('<html><h1>S</h1></html>', parser=parser)
        self.assertEqual(d.getroottree().docinfo.doctype, '')

    def test_parse_encoding_8bit_explicit(self):
        # An explicit parser encoding is used to decode ISO-8859-1 input.
        text = _str('Søk på nettet')
        html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')

        tree = self.etree.parse(
            BytesIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_parse_encoding_8bit_override(self):
        # The <meta> tag claims UTF-8 but the bytes are ISO-8859-1; the
        # encoding passed to the parser must take precedence.
        text = _str('Søk på nettet')
        wrong_head = _str('''
<head>
<meta http-equiv="Content-Type"
content="text/html; charset=UTF-8" />
</head>''')
        html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
                                                                        text)
                       ).encode('iso-8859-1')

        # Parsing with the default parser must fail on this input.
        self.assertRaises(self.etree.ParseError,
                          self.etree.parse,
                          BytesIO(html_latin1))

        tree = self.etree.parse(
            BytesIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_module_HTML_broken(self):
        # The recovering parser must repair the broken document into the
        # well-formed reference document.
        element = self.etree.HTML(self.broken_html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_cdata(self):
        # by default, libxml2 generates CDATA nodes for <script> content
        # NOTE(review): the input below uses <style>, not <script> --
        # confirm whether the comment or the test data is the intended one.
        markup = _bytes('<html><head><style>foo</style></head></html>')
        element = self.etree.HTML(markup)
        self.assertEqual(element[0][0].text, "foo")

    def test_module_HTML_access(self):
        # Parsed elements are reachable through index access.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(element[0][0].tag, 'title')

    def test_module_parse_html(self):
        # Parse from a real file object opened in binary mode.
        parser = self.etree.HTMLParser()
        # mkstemp() creates the file atomically; the previously used
        # mktemp() is deprecated because another process can claim the
        # returned name before the file is created.
        fd, filename = tempfile.mkstemp(suffix=".html")
        os.close(fd)
        try:
            write_to_file(filename, self.html_str, 'wb')
            # The context manager closes the file even if parse() raises,
            # so the removal in `finally` cannot fail on an open handle.
            with open(filename, 'rb') as f:
                tree = self.etree.parse(f, parser)
            self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                             self.html_str)
        finally:
            os.remove(filename)

    def test_module_parse_html_filelike(self):
        # Parse from a minimal file-like object (SillyFileLike).
        parser = self.etree.HTMLParser()
        f = SillyFileLike(self.html_str)
        tree = self.etree.parse(f, parser)
        # Named `serialized` to avoid shadowing the imported html module.
        serialized = self.etree.tostring(tree.getroot(),
                                         method="html", encoding='UTF-8')
        self.assertEqual(serialized, self.html_str)

##     def test_module_parse_html_filelike_unicode(self):
##         parser = self.etree.HTMLParser()
##         f = SillyFileLike(self.uhtml_str)
##         tree = self.etree.parse(f, parser)
##         html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
##         self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

    def test_html_file_error(self):
        # A missing input file must surface as IOError.
        parser = self.etree.HTMLParser()
        parse = self.etree.parse
        self.assertRaises(IOError,
                          parse, "__some_hopefully_nonexisting_file__.html",
                          parser)

    def test_default_parser_HTML_broken(self):
        # The default parser rejects the broken document ...
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

        # ... an HTML parser installed as default accepts and repairs it ...
        self.etree.set_default_parser( self.etree.HTMLParser() )

        tree = self.etree.parse(BytesIO(self.broken_html_str))
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)

        # ... and resetting the default restores the original behaviour.
        self.etree.set_default_parser()

        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

    def test_html_iterparse(self):
        # Default events are 'end' events, delivered in document order of
        # the closing tags; the root is only available after iteration.
        iterparse = self.etree.iterparse
        f = BytesIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True)
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(
            [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
             ('end', root[1]), ('end', root)],
            events)

    def test_html_iterparse_stop_short(self):
        # Abandoning the iterator before it is exhausted must be safe, and
        # the root stays None until parsing has finished.
        iterparse = self.etree.iterparse
        f = BytesIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True)
        self.assertEqual(None, iterator.root)

        event, element = next(iterator)
        self.assertEqual('end', event)
        self.assertEqual('title', element.tag)
        self.assertEqual(None, iterator.root)
        del element

        event, element = next(iterator)
        self.assertEqual('end', event)
        self.assertEqual('head', element.tag)
        self.assertEqual(None, iterator.root)
        del element
        del iterator

    def test_html_iterparse_broken(self):
        # Broken input is repaired: <html>/<body> are supplied and events
        # are still delivered for every element.
        iterparse = self.etree.iterparse
        f = BytesIO('<head><title>TEST></head><p>P<br></div>')

        iterator = iterparse(f, html=True)
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual('html', root.tag)
        self.assertEqual('head', root[0].tag)
        self.assertEqual('body', root[1].tag)
        self.assertEqual('p', root[1][0].tag)
        self.assertEqual('br', root[1][0][0].tag)
        self.assertEqual(
            [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]),
             ('end', root[1][0]), ('end', root[1]), ('end', root)],
            events)

    def test_html_iterparse_broken_no_recover(self):
        # Without recovery, consuming the iterator must raise.
        iterparse = self.etree.iterparse
        f = BytesIO('<p>P<br></div>')
        iterator = iterparse(f, html=True, recover=False)
        self.assertRaises(self.etree.XMLSyntaxError, list, iterator)

    def test_html_iterparse_file(self):
        # iterparse also accepts a file name; only 'end' events are expected.
        iterparse = self.etree.iterparse
        iterator = iterparse(fileInTestDir("shakespeare.html"),
                             html=True)

        self.assertEqual(None, iterator.root)
        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(249, len(events))
        self.assertFalse(
            [event for (event, element) in events if event != 'end'])

    def test_html_iterparse_start(self):
        # Requesting only 'start' events yields elements in opening order.
        iterparse = self.etree.iterparse
        f = BytesIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True, events=('start',))
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertNotEqual(None, root)
        self.assertEqual(
            [('start', root), ('start', root[0]), ('start', root[0][0]),
             ('start', root[1]), ('start', root[1][0])],
            events)

    def test_html_feed_parser(self):
        # Feed interface: a tag may be split across feed() calls.
        parser = self.etree.HTMLParser()
        parser.feed("<html><body></")
        parser.feed("body></html>")
        root = parser.close()

        self.assertEqual('html', root.tag)
        # test that we find all names in the parser dict
        self.assertEqual([root], list(root.iter('html')))
        self.assertEqual([root[0]], list(root.iter('body')))

    def test_html_feed_parser_chunky(self):
        # Feed interface with many small chunks that cut through tag names.
        parser = self.etree.HTMLParser()
        parser.feed("<htm")
        parser.feed("l><body")
        parser.feed("><")
        parser.feed("p><")
        parser.feed("strong")
        parser.feed(">some ")
        parser.feed("text</strong></p><")
        parser.feed("/body></html>")
        root = parser.close()

        self.assertEqual('html', root.tag)
        # test that we find all names in the parser dict
        self.assertEqual([root], list(root.iter('html')))
        self.assertEqual([root[0]], list(root.iter('body')))
        self.assertEqual([root[0][0]], list(root.iter('p')))
        self.assertEqual([root[0][0][0]], list(root.iter('strong')))

    def test_html_feed_parser_more_tags(self):
        # Feed interface with a larger document including <head>/<title>.
        parser = self.etree.HTMLParser()
        parser.feed('<html><head>')
        parser.feed('<title>TITLE</title><body><p>P</p></body><')
        parser.feed("/html>")
        root = parser.close()

        self.assertEqual('html', root.tag)
        # test that we find all names in the parser dict
        self.assertEqual([root], list(root.iter('html')))
        self.assertEqual([root[0]], list(root.iter('head')))
        self.assertEqual([root[0][0]], list(root.iter('title')))
        self.assertEqual([root[1]], list(root.iter('body')))
        self.assertEqual([root[1][0]], list(root.iter('p')))

    def test_html_parser_target_tag(self):
        # A parser target receives start/end callbacks, and close() returns
        # whatever the target's close() returns.
        assertFalse = self.assertFalse
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)

            def end(self, tag):
                events.append(("end", tag))

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())

        parser.feed("<html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_empty(self):
        # An empty <!DOCTYPE> is reported to the target as three Nones.
        assertFalse = self.assertFalse
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)

            def end(self, tag):
                events.append(("end", tag))

            def doctype(self, *args):
                events.append(("doctype", args))

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed("<!DOCTYPE><html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", (None, None, None)),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_html(self):
        # <!DOCTYPE html> is reported with the name only.
        assertFalse = self.assertFalse
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)

            def end(self, tag):
                events.append(("end", tag))

            def doctype(self, *args):
                events.append(("doctype", args))

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed("<!DOCTYPE html><html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", ("html", None, None)),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_html_full(self):
        # A full doctype is reported as (name, public id, system url).
        assertFalse = self.assertFalse
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)

            def end(self, tag):
                events.append(("end", tag))

            def doctype(self, *args):
                events.append(("doctype", args))

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                    '<html><body></body></html>')
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_exceptions(self):
        # An exception raised in the target's start() must propagate out of
        # feed() and stop further callbacks.
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                raise ValueError("START")

            def end(self, tag):
                events.append(("end", tag))
                raise TypeError("END")

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        try:
            parser.feed('<html><body>')
            parser.feed('</body></html>')
        except ValueError as exc:
            # assertTrue rather than a bare assert, which -O would strip.
            self.assertTrue("START" in str(exc), str(exc))
        except TypeError as exc:
            self.assertTrue("END" in str(exc), str(exc))
            self.assertTrue(False, "wrong exception raised")
        else:
            self.assertTrue(False, "no exception raised")

        self.assertTrue(("start", "html") in events, events)
        self.assertTrue(("end", "html") not in events, events)

    def test_html_fromstring_target_exceptions(self):
        # Same as above, but driven through fromstring() instead of feed().
        events = []

        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                raise ValueError("START")

            def end(self, tag):
                events.append(("end", tag))
                raise TypeError("END")

            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        try:
            self.etree.fromstring('<html><body></body></html>', parser)
        except ValueError as exc:
            # assertTrue rather than a bare assert, which -O would strip.
            self.assertTrue("START" in str(exc), str(exc))
        except TypeError as exc:
            self.assertTrue("END" in str(exc), str(exc))
            self.assertTrue(False, "wrong exception raised")
        else:
            self.assertTrue(False, "no exception raised")

        self.assertTrue(("start", "html") in events, events)
        self.assertTrue(("end", "html") not in events, events)

    def test_set_decl_html(self):
        # Setting public id and system url on docinfo produces a full
        # doctype declaration, both in docinfo and in the serialisation.
        doc = html.Element('html').getroottree()
        doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
        doc.docinfo.system_url = \
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
        self.assertEqual(doc.docinfo.doctype,
                         '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
        self.assertEqual(self.etree.tostring(doc),
                         _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"></html>'''))

    def test_html5_doctype(self):
        # document type declaration with neither public id nor system url
        doc = html.Element('html').getroottree()
        doc.docinfo.public_id = None
        doc.docinfo.system_url = None
        self.assertEqual(doc.docinfo.doctype,
                         '<!DOCTYPE html>')
        self.assertTrue(doc.docinfo.public_id is None)
        self.assertEqual(self.etree.tostring(doc),
                         _bytes('<!DOCTYPE html>\n<html/>'))

    def test_ietf_decl(self):
        # legacy declaration with public id, no system url
        doc = html.Element('html').getroottree()
        doc.docinfo.public_id = '-//IETF//DTD HTML//EN'
        doc.docinfo.system_url = None
        self.assertEqual(doc.docinfo.doctype,
                         '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')
        self.assertEqual(self.etree.tostring(doc),
                         _bytes('<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">\n<html/>'))

    def test_boolean_attribute(self):
        # ability to serialize boolean attribute by setting value to None
        form = html.Element('form')
        form.set('novalidate', None)
        self.assertEqual(html.tostring(form),
                         _bytes('<form novalidate></form>'))
        # Omitting the value altogether must behave like passing None.
        form.set('custom')
        self.assertEqual(html.tostring(form),
                         _bytes('<form novalidate custom></form>'))

    def test_boolean_attribute_round_trip(self):
        # ability to pass boolean attributes unmodified
        fragment = '<tag attribute></tag>'
        self.assertEqual(html.tostring(html.fragment_fromstring(fragment)),
                         _bytes(fragment))

    def test_boolean_attribute_xml_adds_empty_string(self):
        # html serialized as xml converts boolean attributes to empty strings
        fragment = '<tag attribute></tag>'
        self.assertEqual(self.etree.tostring(html.fragment_fromstring(fragment)),
                         _bytes('<tag attribute=""/>'))
def test_suite():
    """Build the unittest suite for this module.

    Returns:
        A unittest.TestSuite holding the tests loaded from
        HtmlParserTestCase.
    """
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated and was removed in Python 3.13;
    # the default loader's loadTestsFromTestCase() is the supported
    # equivalent and is available on older versions as well.
    suite.addTests([
        unittest.defaultTestLoader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
if __name__ == '__main__':
    # Running this file directly executes no tests; it only prints a hint
    # naming test.py as the way to run them.
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)