Blame src/lxml/html/tests/test_clean.txt

Packit Service b74dd5
>>> import re
Packit Service b74dd5
>>> from lxml.html import fromstring, tostring
Packit Service b74dd5
>>> from lxml.html.clean import clean, clean_html, Cleaner
Packit Service b74dd5
>>> from lxml.html import usedoctest
Packit Service b74dd5
Packit Service b74dd5
>>> doc = '''<html>
Packit Service b74dd5
...   <head>
Packit Service b74dd5
...     <script type="text/javascript" src="evil-site"></script>
Packit Service b74dd5
...     <link rel="alternate" type="text/rss" src="evil-rss">
Packit Service b74dd5
...     <link rel="alternate" type="text/rss" href="http://example.com">
Packit Service b74dd5
...     <link rel="stylesheet" type="text/rss" href="http://example.com">
Packit Service b74dd5
...     <style>
Packit Service b74dd5
...       body {background-image: url(javascript:do_evil)};
Packit Service b74dd5
...       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
Packit Service b74dd5
...       div {color: expression(evil)};
Packit Service b74dd5
...     </style>
Packit Service b74dd5
...   </head>
Packit Service b74dd5
...   <body onload="evil_function()">
Packit Service b74dd5
...     
Packit Service b74dd5
...     a link
Packit Service b74dd5
...     a control char link
Packit Service b74dd5
...     data
Packit Service b74dd5
...     another link
Packit Service b74dd5
...     

a paragraph

Packit Service b74dd5
...     
secret EVIL!
Packit Service b74dd5
...     <object> of EVIL! </object>
Packit Service b74dd5
...     <iframe src="evil-site"></iframe>
Packit Service b74dd5
...     <form action="evil-site">
Packit Service b74dd5
...       Password: <input type="password" name="password">
Packit Service b74dd5
...     </form>
Packit Service b74dd5
...     spam spam SPAM!
Packit Service b74dd5
...     Author
Packit Service b74dd5
...     Text
Packit Service b74dd5
...     
Packit Service b74dd5
...   </body>
Packit Service b74dd5
... </html>'''  # fixture page: <script>, three <link>s, <style> rules using url(javascript:...), url(data:...) and expression(...), a body onload handler, <object>/<iframe>/<form>; NOTE(review): the bare text lines (a link, data, Author, ...) appear to have lost their surrounding markup -- confirm against the original file
Packit Service b74dd5
Packit Service b74dd5
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))  # print the fixture with the control characters in the class (0x00-0x07 and 0x0E) removed; NOTE(review): no such characters are visible in doc as shown here -- confirm
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service b74dd5
    <script type="text/javascript" src="evil-site"></script>
Packit Service b74dd5
    <link rel="alternate" type="text/rss" src="evil-rss">
Packit Service b74dd5
    <link rel="alternate" type="text/rss" href="http://example.com">
Packit Service b74dd5
    <link rel="stylesheet" type="text/rss" href="http://example.com">
Packit Service b74dd5
    <style>
Packit Service b74dd5
      body {background-image: url(javascript:do_evil)};
Packit Service b74dd5
      div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
Packit Service b74dd5
      div {color: expression(evil)};
Packit Service b74dd5
    </style>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body onload="evil_function()">
Packit Service b74dd5
    
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    <object> of EVIL! </object>
Packit Service b74dd5
    <iframe src="evil-site"></iframe>
Packit Service b74dd5
    <form action="evil-site">
Packit Service b74dd5
      Password: <input type="password" name="password">
Packit Service b74dd5
    </form>
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>
Packit Service b74dd5
Packit Service b74dd5
>>> print(tostring(fromstring(doc)).decode("utf-8"))  # parse and re-serialize without any Cleaner; the expected output below is the same text as the control-char-stripped fixture printed above
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service b74dd5
    <script type="text/javascript" src="evil-site"></script>
Packit Service b74dd5
    <link rel="alternate" type="text/rss" src="evil-rss">
Packit Service b74dd5
    <link rel="alternate" type="text/rss" href="http://example.com">
Packit Service b74dd5
    <link rel="stylesheet" type="text/rss" href="http://example.com">
Packit Service b74dd5
    <style>
Packit Service b74dd5
      body {background-image: url(javascript:do_evil)};
Packit Service b74dd5
      div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
Packit Service b74dd5
      div {color: expression(evil)};
Packit Service b74dd5
    </style>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body onload="evil_function()">
Packit Service b74dd5
    
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    <object> of EVIL! </object>
Packit Service b74dd5
    <iframe src="evil-site"></iframe>
Packit Service b74dd5
    <form action="evil-site">
Packit Service b74dd5
      Password: <input type="password" name="password">
Packit Service b74dd5
    </form>
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>
Packit Service b74dd5
Packit Service b74dd5
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))  # expected below: <script> and all <link> elements gone, <style> kept but its url(...)/expression(...) contents emptied, body onload removed, <object>/<iframe>/<form>/<input> tags removed with their text kept
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service 1cab4c
    <style>
Packit Service 1cab4c
      body {background-image: url()};
Packit Service 1cab4c
      div {background-image: url()};
Packit Service 1cab4c
      div {color: };
Packit Service 1cab4c
    </style>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body>
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    of EVIL!
Packit Service b74dd5
    Password:
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>
Packit Service b74dd5
Packit Service b74dd5
>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))  # expected below: unlike the first Cleaner example, the <style> element is removed too, leaving <head> empty; the body text is unchanged
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body>
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    of EVIL!
Packit Service b74dd5
    Password:
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>
Packit Service b74dd5
Packit Service b74dd5
>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))  # same call with inline_style=False; as shown here the expected output is identical to the inline_style=True example -- NOTE(review): confirm against the original file
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body>
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    of EVIL!
Packit Service b74dd5
    Password:
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>
Packit Service b74dd5
Packit Service b74dd5
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))  # expected below: the three <link> elements stay in <head> (presumably because links=False -- confirm), <script> is still removed and the <style> rules are emptied as in the first Cleaner example
Packit Service b74dd5
<html>
Packit Service b74dd5
  <head>
Packit Service b74dd5
    <link rel="alternate" type="text/rss" src="evil-rss">
Packit Service b74dd5
    <link rel="alternate" type="text/rss" href="http://example.com">
Packit Service b74dd5
    <link rel="stylesheet" type="text/rss" href="http://example.com">
Packit Service 1cab4c
    <style>
Packit Service 1cab4c
      body {background-image: url()};
Packit Service 1cab4c
      div {background-image: url()};
Packit Service 1cab4c
      div {color: };
Packit Service 1cab4c
    </style>
Packit Service b74dd5
  </head>
Packit Service b74dd5
  <body>
Packit Service b74dd5
    a link
Packit Service b74dd5
    a control char link
Packit Service b74dd5
    data
Packit Service b74dd5
    another link
Packit Service b74dd5
    

a paragraph

Packit Service b74dd5
    
secret EVIL!
Packit Service b74dd5
    of EVIL!
Packit Service b74dd5
    Password:
Packit Service b74dd5
    spam spam SPAM!
Packit Service b74dd5
    Author
Packit Service b74dd5
    Text
Packit Service b74dd5
    
Packit Service b74dd5
  </body>
Packit Service b74dd5
</html>