NOTE: the doctests in this file fail with libxml2 2.6.29 and 2.6.30.


>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> def tostring(el):  # work-around for Py3 'bytes' type
...     from lxml.html import tostring
...     s = tostring(el)
...     if not isinstance(s, str):
...         s = s.decode('UTF-8')
...     return s

Sample fragment: two <embed> elements and two <script> elements.  In each
pair, one URL points at www.youtube.com and the other does not.

>>> doc_embed = '''<div>
... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
... <embed src="http://anothersite.com/v/another"></embed>
... <script src="http://www.youtube.com/example.js"></script>
... <script src="/something-else.js"></script>
... </div>'''

Parsing the fragment and serialising it again leaves it unchanged.

>>> print(tostring(fromstring(doc_embed)))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<embed src="http://anothersite.com/v/another"></embed>
<script src="http://www.youtube.com/example.js"></script>
<script src="/something-else.js"></script>
</div>

A Cleaner with default settings removes all four elements.

>>> print(Cleaner().clean_html(doc_embed))
<div>
</div>

With www.youtube.com in host_whitelist, only the <embed> served from that
host is kept; the <script> from the same host is still removed.

>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
</div>

Passing whitelist_tags=None as well also keeps the <script> from the
whitelisted host.  The <embed> from the other host and the relative-URL
<script> are still removed.

>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<script src="http://www.youtube.com/example.js"></script>
</div>