Blob Blame History Raw
lxml.html adds a find_class method to elements::

    >>> from lxml.etree import Comment
    >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
    >>> from lxml.html import fragments_fromstring, fromstring
    >>> from lxml.html.clean import clean, clean_html
    >>> from lxml.html import usedoctest
    >>> try: unicode = unicode
    ... except NameError: unicode = str

    >>> h = document_fromstring('''
    ... <html><head></head>
    ... <body>
    ...   <a class="vcard
    ... fn   url" href="foobar">P1</a>
    ...   <a class="not-fn vcard" href="baz">P2</a>
    ... </body></html>''')
    >>> print(tostring(h, encoding=unicode))
    <html>
      <head></head>
      <body>
        <a class="vcard
    fn   url" href="foobar">P1</a>
        <a class="not-fn vcard" href="baz">P2</a>
      </body>
    </html>
    >>> print([e.text for e in h.find_class('fn')])
    ['P1']
    >>> print([e.text for e in h.find_class('vcard')])
    ['P1', 'P2']

as well as a ``classes`` attribute for manipulating an element's classes through a set-like interface::

    >>> el = fragment_fromstring('<span class="foo bar"></span>')
    >>> 'foo' in el.classes
    True
    >>> 'f00' in el.classes
    False
    >>> el.classes.update(('qux', 'quux'))
    >>> sorted(el.get('class').split())
    ['bar', 'foo', 'quux', 'qux']
    >>> el.classes.clear()
    >>> el.get('class')
    >>> list(el.classes)
    []
    >>> el.classes.add('a')
    >>> el.classes.add('b')
    >>> el.classes.remove('a')
    >>> el.classes.remove('c')
    Traceback (most recent call last):
    ...
    KeyError: 'c'
    >>> el.classes.discard('c')
    >>> el.get('class')
    'b'
    >>> el.classes.add('b')
    >>> el.get('class')
    'b'
    >>> el.classes |= ('a', 'b')
    >>> el.get('class')
    'b a'
    >>> el.classes -= ('b', 'c', 'd')
    >>> el.get('class')
    'a'

with an extra toggle method to switch the state of classes

    >>> el.get('class')
    'a'
    >>> el.classes.toggle('a')
    False
    >>> el.get('class')
    >>> el.classes.toggle('foo')
    True
    >>> el.get('class')
    'foo'
    >>> el.classes.toggle('foo')
    False
    >>> el.get('class')
    >>> el.classes.add("foo\n")
    Traceback (most recent call last):
    ...
    ValueError: Invalid class name: 'foo\n'
    >>> el.classes.remove("foo ")
    Traceback (most recent call last):
    ...
    ValueError: Invalid class name: 'foo '

Also added is a ``find_rel_links`` method, which you can use to search for
links like ``<a rel="$something">`` (the ``rel`` value is matched as a whole,
ignoring case)::

    >>> h = document_fromstring('''
    ... <a href="1">test 1</a>
    ... <a href="2" rel="tag">item 2</a>
    ... <a href="3" rel="tagging">item 3</a>
    ... <a href="4" rel="TAG">item 4</a>''')
    >>> print([e.attrib['href'] for e in h.find_rel_links('tag')])
    ['2', '4']
    >>> print([e.attrib['href'] for e in h.find_rel_links('nofollow')])
    []

Another method is ``get_element_by_id`` that does what it says::

    >>> print(tostring(fragment_fromstring('''
    ... <div>
    ...  <span id="test">stuff</span>
    ... </div>''').get_element_by_id('test'), encoding=unicode))
    <span id="test">stuff</span>

Or to get the content of an element without the tags, use text_content()::

    >>> el = fragment_fromstring('''
    ... <div>This is <a href="foo">a <b>bold</b> link</a></div>''')
    >>> el.text_content()
    'This is a bold link'

Or drop only an element's tag (leaving its content in place) with ``drop_tag()``, or drop the element together with its entire subtree with ``drop_tree()``::

    >>> doc = document_fromstring('''
    ... <html>
    ...  <body>
    ...   <div id="body">
    ...    This is a <a href="foo" id="link">test</a> of stuff.
    ...   </div>
    ...   <!-- a comment -->
    ...   <div>footer</div>
    ...  </body>
    ... </html>''')
    >>> doc.get_element_by_id('link').drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <div id="body">
       This is a test of stuff.
      </div>
      <!-- a comment -->
      <div>footer</div>
     </body>
    </html>
    >>> doc.get_element_by_id('body').drop_tree()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <!-- a comment -->
      <div>footer</div>
     </body>
    </html>

Note, however, that comment text will not be merged into the tree when you
drop the comment.  Here, ``drop_tag()`` behaves exactly like ``drop_tree()``:

    >>> for comment in doc.getiterator(Comment):
    ...     comment.drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <div>footer</div>
     </body>
    </html>

In Python 3, it should be possible to parse strings given as bytes objects, at
least if an encoding is given to the parser.

    >>> from lxml.html import HTMLParser
    >>> enc = 'utf-8'
    >>> html_parser = HTMLParser(encoding=enc)
    >>> src = '<html><body>Test</body></html>'.encode(enc)

    >>> doc = fromstring(src, parser=html_parser)
    >>> print(tostring(doc, encoding=unicode))
    <html><body>Test</body></html>

    >>> docs = fragments_fromstring(src, parser=html_parser)
    >>> len(docs)
    1
    >>> print(docs[0])
    Test

Bug 599318: Calling ``fromstring`` with a frameset fragment should not raise an
error; the whole document is returned.

    >>> import lxml.html
    >>> content='''
    ... <frameset>
    ...  <frame src="main.php" name="srcpg">
    ... </frameset>'''
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html><frameset><frame src="main.php" name="srcpg"></frameset></html>

Bug 599318: Calling ``fromstring`` with a div fragment should not raise an
error; only the element is returned.

    >>> import lxml.html
    >>> content='<div></div>'
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <div></div>

Bug 599318: Calling ``fromstring`` with a head fragment should not raise an
error; the whole document is returned.

    >>> import lxml.html
    >>> content='<head></head>'
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html><head></head></html>

Bug 690319: Leading whitespace before doctype declaration should not raise an error.

    >>> import lxml.html
    >>> content='''
    ...     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    ...     <html>
    ...     </html>'''
    >>> etree_document = lxml.html.fromstring(content)
    >>> print(tostring(etree_document, encoding=unicode))
    <html></html>

Feature https://github.com/lxml/lxml/pull/140: ensure_head_body option:

    >>> from lxml.html import document_fromstring, tostring
    >>> from functools import partial
    >>> tos = partial(tostring, encoding=unicode)
    >>> print(tos(document_fromstring('<p>test</p>')))
    <html><body><p>test</p></body></html>
    >>> print(tos(document_fromstring('<p>test</p>', ensure_head_body=True)))
    <html><head></head><body><p>test</p></body></html>
    >>> print(tos(document_fromstring('<meta>')))
    <html><head><meta></head></html>
    >>> print(tos(document_fromstring('<meta>', ensure_head_body=True)))
    <html><head><meta></head><body></body></html>
    >>> print(tos(document_fromstring('<html></html>')))
    <html></html>
    >>> print(tos(document_fromstring('<html></html>', ensure_head_body=True)))
    <html><head></head><body></body></html>