Doctests for lxml.html.clean: cleaning a hostile HTML document with Cleaner.
>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> # Hostile HTML fixture reused by every example in this file.  It packs
>>> # in: an external script, <link> elements, a <style> block using
>>> # javascript:/data: URLs and expression(), on* event-handler attributes,
>>> # an HTML comment, javascript:/data: hrefs (one obfuscated with control
>>> # characters), an inline style, <object>/<iframe>/<form> elements, and
>>> # links with and without a rel attribute.
>>> doc = '''<html>
...   <head>
...     <script type="text/javascript" src="evil-site"></script>
...     <link rel="alternate" type="text/rss" src="evil-rss">
...     <link rel="alternate" type="text/rss" href="http://example.com">
...     <link rel="stylesheet" type="text/rss" href="http://example.com">
...     <style>
...       body {background-image: url(javascript:do_evil)};
...       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
...       div {color: expression(evil)};
...     </style>
...   </head>
...   <body onload="evil_function()">
...     <!-- I am interpreted for EVIL! -->
...     <a href="javascript:evil_function()">a link</a>
...     <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t:evil_function()">a control char link</a>
...     <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
...     <a href="#" onclick="evil_function()">another link</a>
...     <p onclick="evil_function()">a paragraph</p>
...     <div style="display: none">secret EVIL!</div>
...     <object> of EVIL! </object>
...     <iframe src="evil-site"></iframe>
...     <form action="evil-site">
...       Password: <input type="password" name="password">
...     </form>
...     <a href="evil-site">spam spam SPAM!</a>
...     <a href="http://example.com" rel="author">Author</a>
...     <a href="http://example.com" rel="nofollow">Text</a>
...     <img src="evil!">
...   </body>
... </html>'''

>>> # Print the raw fixture with the control characters matched by the
>>> # pattern removed, so the obfuscated href is shown as printable text
>>> # ("javascrip t:...").  No lxml parsing or cleaning is involved here.
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
<html>
  <head>
    <script type="text/javascript" src="evil-site"></script>
    <link rel="alternate" type="text/rss" src="evil-rss">
    <link rel="alternate" type="text/rss" href="http://example.com">
    <link rel="stylesheet" type="text/rss" href="http://example.com">
    <style>
      body {background-image: url(javascript:do_evil)};
      div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
      div {color: expression(evil)};
    </style>
  </head>
  <body onload="evil_function()">
    <!-- I am interpreted for EVIL! -->
    <a href="javascript:evil_function()">a link</a>
    <a href="javascrip t:evil_function()">a control char link</a>
    <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
    <a href="#" onclick="evil_function()">another link</a>
    <p onclick="evil_function()">a paragraph</p>
    <div style="display: none">secret EVIL!</div>
    <object> of EVIL! </object>
    <iframe src="evil-site"></iframe>
    <form action="evil-site">
      Password: <input type="password" name="password">
    </form>
    <a href="evil-site">spam spam SPAM!</a>
    <a href="http://example.com" rel="author">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>

>>> # Baseline: parse and re-serialize the fixture without any cleaning.
>>> # Per the expected output every element and attribute survives; the
>>> # only visible change is the control-character href, which comes back
>>> # as "javascrip%20t:..." (control characters gone, space shown as %20).
>>> print(tostring(fromstring(doc)).decode("utf-8"))
<html>
  <head>
    <script type="text/javascript" src="evil-site"></script>
    <link rel="alternate" type="text/rss" src="evil-rss">
    <link rel="alternate" type="text/rss" href="http://example.com">
    <link rel="stylesheet" type="text/rss" href="http://example.com">
    <style>
      body {background-image: url(javascript:do_evil)};
      div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
      div {color: expression(evil)};
    </style>
  </head>
  <body onload="evil_function()">
    <!-- I am interpreted for EVIL! -->
    <a href="javascript:evil_function()">a link</a>
    <a href="javascrip%20t:evil_function()">a control char link</a>
    <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
    <a href="#" onclick="evil_function()">another link</a>
    <p onclick="evil_function()">a paragraph</p>
    <div style="display: none">secret EVIL!</div>
    <object> of EVIL! </object>
    <iframe src="evil-site"></iframe>
    <form action="evil-site">
      Password: <input type="password" name="password">
    </form>
    <a href="evil-site">spam spam SPAM!</a>
    <a href="http://example.com" rel="author">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>

>>> # Clean with only page_structure and safe_attrs_only switched off.
>>> # Per the expected output: the <script>, the <link> elements and the
>>> # comment are gone; on* handlers are stripped; javascript:/data: URLs
>>> # are blanked in both hrefs and CSS; expression() is removed from the
>>> # CSS; <object>, <iframe>, <form> and <input> tags are dropped while
>>> # their text ("of EVIL!", "Password:") stays.  The <style> element,
>>> # the inline style attribute and the rel attributes are kept.
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
  <head>
    <style>
      body {background-image: url()};
      div {background-image: url()};
      div {color: };
    </style>
  </head>
  <body>
    <a href="">a link</a>
    <a href="">a control char link</a>
    <a href="">data</a>
    <a href="#">another link</a>
    <p>a paragraph</p>
    <div style="display: none">secret EVIL!</div>
    of EVIL!
    Password:
    <a href="evil-site">spam spam SPAM!</a>
    <a href="http://example.com" rel="author">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>

>>> # Additionally request style=True, inline_style=True, links=True and
>>> # add_nofollow=True.  Per the expected output the <style> element and
>>> # the div's style attribute are both removed, and "nofollow" is added
>>> # to the rel of the evil-site and "author" links (the existing
>>> # rel="nofollow" is left as is).  The anchors whose href is empty or
>>> # "#" do not receive a rel attribute.
>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
  <head>
  </head>
  <body>
    <a href="">a link</a>
    <a href="">a control char link</a>
    <a href="">data</a>
    <a href="#">another link</a>
    <p>a paragraph</p>
    <div>secret EVIL!</div>
    of EVIL!
    Password:
    <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
    <a href="http://example.com" rel="author nofollow">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>

>>> # Same options as the previous example except inline_style=False:
>>> # the <style> element is still removed, but the div keeps its
>>> # style="display: none" attribute.  This shows that style=True does
>>> # not by itself strip inline styles when inline_style is set to False.
>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
  <head>
  </head>
  <body>
    <a href="">a link</a>
    <a href="">a control char link</a>
    <a href="">data</a>
    <a href="#">another link</a>
    <p>a paragraph</p>
    <div style="display: none">secret EVIL!</div>
    of EVIL!
    Password:
    <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
    <a href="http://example.com" rel="author nofollow">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>

>>> # links=False with javascript=True, a host whitelist and
>>> # whitelist_tags=None.  Per the expected output the three <link>
>>> # elements are kept, the <script> is removed, and the <style> block
>>> # stays with its dangerous URLs and expression() blanked.
>>> # NOTE(review): safe_attrs_only is not overridden in this call and the
>>> # div loses its style attribute here; presumably that is the default
>>> # safe-attribute filtering at work -- confirm against Cleaner defaults.
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
<html>
  <head>
    <link rel="alternate" type="text/rss" src="evil-rss">
    <link rel="alternate" type="text/rss" href="http://example.com">
    <link rel="stylesheet" type="text/rss" href="http://example.com">
    <style>
      body {background-image: url()};
      div {background-image: url()};
      div {color: };
    </style>
  </head>
  <body>
    <a href="">a link</a>
    <a href="">a control char link</a>
    <a href="">data</a>
    <a href="#">another link</a>
    <p>a paragraph</p>
    <div>secret EVIL!</div>
    of EVIL!
    Password:
    <a href="evil-site">spam spam SPAM!</a>
    <a href="http://example.com" rel="author">Author</a>
    <a href="http://example.com" rel="nofollow">Text</a>
    <img src="evil!">
  </body>
</html>