>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> # Shared fixture used by every example that follows: a complete HTML page
>>> # whose head and body carry script, link, style, event-handler attribute,
>>> # javascript:/data: URL, object, iframe and form content.
>>> # The second <a> writes its href scheme with \x01-\x07 and \x0E escapes
>>> # interleaved between the letters, plus one literal space before "t:".
>>> doc = '''<html>
... <head>
... <script type="text/javascript" src="evil-site"></script>
... <link rel="alternate" type="text/rss" src="evil-rss">
... <link rel="alternate" type="text/rss" href="http://example.com">
... <link rel="stylesheet" type="text/rss" href="http://example.com">
... <style>
... body {background-image: url(javascript:do_evil)};
... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
... div {color: expression(evil)};
... </style>
... </head>
... <body onload="evil_function()">
... <!-- I am interpreted for EVIL! -->
... <a href="javascript:evil_function()">a link</a>
... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t:evil_function()">a control char link</a>
... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
... <a href="#" onclick="evil_function()">another link</a>
... <p onclick="evil_function()">a paragraph</p>
... <div style="display: none">secret EVIL!</div>
... <object> of EVIL! </object>
... <iframe src="evil-site"></iframe>
... <form action="evil-site">
... Password: <input type="password" name="password">
... </form>
... <a href="evil-site">spam spam SPAM!</a>
... <a href="http://example.com" rel="author">Author</a>
... <a href="http://example.com" rel="nofollow">Text</a>
... <img src="evil!">
... </body>
... </html>'''
>>> # Print the fixture with the characters \x00-\x07 and \x0E removed.  With
>>> # those stripped, the control-char link's href reads
>>> # "javascrip t:evil_function()"; everything else is the input unchanged.
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="javascrip t:evil_function()">a control char link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> # Parse the unmodified fixture with fromstring(), serialize it again with
>>> # tostring(), and decode the result as UTF-8.  No cleaning is applied here:
>>> # the expected output keeps every element and attribute of the input, and
>>> # the control-char href appears as "javascrip%20t:evil_function()".
>>> print(tostring(fromstring(doc)).decode("utf-8"))
<html>
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
div {color: expression(evil)};
</style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
<a href="javascrip%20t:evil_function()">a control char link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
Password: <input type="password" name="password">
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> # Clean the fixture with page_structure=False and safe_attrs_only=False.
>>> # In the expected output the html/head/body tags, the div's style
>>> # attribute and the rel attributes are still present.  The script and link
>>> # elements, the comment, the onload/onclick attributes, and the object,
>>> # iframe, form and input tags are gone (the text inside object and form is
>>> # kept).  The style element's content is replaced by "/* deleted */", and
>>> # the javascript:, control-char and data: hrefs are emptied.
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">a control char link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div style="display: none">secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> # Clean with style=True, inline_style=True, links=True and
>>> # add_nofollow=True (page_structure=False, safe_attrs_only=False).
>>> # In the expected output the head is empty (no style or link elements) and
>>> # the div has lost its style attribute.  The "evil-site" link gains
>>> # rel="nofollow", the rel="author" link becomes rel="author nofollow", and
>>> # the link that already had rel="nofollow" is unchanged.
>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
<a href="">a link</a>
<a href="">a control char link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site" rel="nofollow">spam spam SPAM!</a>
<a href="http://example.com" rel="author nofollow">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> # Clean with style=True but inline_style=False (links=True,
>>> # add_nofollow=True, page_structure=False, safe_attrs_only=False).
>>> # In the expected output the head is still empty, so the style element is
>>> # removed, but the div keeps its style="display: none" attribute.  The
>>> # rel values show the same nofollow additions: added to the "evil-site"
>>> # link and appended to rel="author".
>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
<body>
<a href="">a link</a>
<a href="">a control char link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div style="display: none">secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site" rel="nofollow">spam spam SPAM!</a>
<a href="http://example.com" rel="author nofollow">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
>>> # Clean with links=False, page_structure=False, javascript=True,
>>> # host_whitelist=['example.com'] and whitelist_tags=None; safe_attrs_only
>>> # is not passed in this call.
>>> # In the expected output all three link elements are kept, the style
>>> # element's content is replaced by "/* deleted */", and the div no longer
>>> # has its style attribute.  The rel attributes are unchanged, and the
>>> # iframe (src="evil-site") is still absent.
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
<html>
<head>
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>/* deleted */</style>
</head>
<body>
<a href="">a link</a>
<a href="">a control char link</a>
<a href="">data</a>
<a href="#">another link</a>
<p>a paragraph</p>
<div>secret EVIL!</div>
of EVIL!
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
<a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>