Blob Blame History Raw
import unittest, sys
from lxml.tests.common_imports import make_doctest, HelperTestCase

try:
    import lxml.html.soupparser
    BS_INSTALLED = True
except ImportError:
    if 'bs4' in sys.modules or 'BeautifulSoup' in sys.modules:
        raise  # seems we managed to import BS but not soupparser
    BS_INSTALLED = False

from lxml.html import tostring


if BS_INSTALLED:
    class SoupParserTestCase(HelperTestCase):
        soupparser = lxml.html.soupparser

        def test_broken_attribute(self):
            html = """\
              <html><head></head><body>
                <form><input type='text' disabled size='10'></form>
              </body></html>
            """
            root = self.soupparser.fromstring(html)
            self.assertTrue(root.find('.//input').get('disabled') is not None)

        def test_empty(self):
            tree = self.soupparser.fromstring('')
            res = b'''<html></html>'''
            self.assertEqual(tostring(tree), res)

        def test_text(self):
            tree = self.soupparser.fromstring('huhu')
            res = b'''<html>huhu</html>'''
            self.assertEqual(tostring(tree), res)

        def test_body(self):
            html = '''<body><p>test</p></body>'''
            res = b'''<html><body><p>test</p></body></html>'''
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_head_body(self):
            # HTML tag missing, parser should fix that
            html = '<head><title>test</title></head><body><p>test</p></body>'
            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_wrap_html(self):
            # <head> outside <html>, parser should fix that
            html = '<head><title>title</test></head><html><body/></html>'
            res = b'<html><head><title>title</title></head><body></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_comment_hyphen(self):
            # These are really invalid XML as per specification
            # https://www.w3.org/TR/REC-xml/#sec-comments
            html = b'<html><!-- comment -- with double-hyphen --></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), html)

            html = b'<html><!-- comment ends with hyphen ---></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), html)

        def test_comment_pi(self):
            html = '''<!-- comment -->
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tostring(tree, method='html'), res)

        def test_doctype1(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
            self.assertEqual(tostring(tree), res)

        def test_doctype2(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//IETF//DTD HTML//EN")
            self.assertEqual(tostring(tree), res)

        def test_doctype_html5(self):
            # html 5 doctype declaration
            html = b'<!DOCTYPE html>\n<html lang="en"></html>'

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertTrue(tree.docinfo.public_id is None)
            self.assertEqual(tostring(tree), html)


def test_suite():
    suite = unittest.TestSuite()
    if BS_INSTALLED:
        suite.addTests([unittest.makeSuite(SoupParserTestCase)])
        if sys.version_info[0] < 3:
            suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
    return suite


if __name__ == '__main__':
    unittest.main()