|
Packit Service |
b74dd5 |
>>> import re
|
|
Packit Service |
b74dd5 |
>>> from lxml.html import fromstring, tostring
|
|
Packit Service |
b74dd5 |
>>> from lxml.html.clean import clean, clean_html, Cleaner
|
|
Packit Service |
b74dd5 |
>>> from lxml.html import usedoctest
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> doc = '''<html>
|
|
Packit Service |
b74dd5 |
... <head>
|
|
Packit Service |
b74dd5 |
... <script type="text/javascript" src="evil-site"></script>
|
|
Packit Service |
b74dd5 |
... <link rel="alternate" type="text/rss" src="evil-rss">
|
|
Packit Service |
b74dd5 |
... <link rel="alternate" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
... <link rel="stylesheet" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
... <style>
|
|
Packit Service |
b74dd5 |
... body {background-image: url(javascript:do_evil)};
|
|
Packit Service |
b74dd5 |
... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
|
|
Packit Service |
b74dd5 |
... div {color: expression(evil)};
|
|
Packit Service |
b74dd5 |
... </style>
|
|
Packit Service |
b74dd5 |
... </head>
|
|
Packit Service |
b74dd5 |
... <body onload="evil_function()">
|
|
Packit Service |
b74dd5 |
...
|
|
Packit Service |
b74dd5 |
... a link
|
|
Packit Service |
b74dd5 |
... a control char link
|
|
Packit Service |
b74dd5 |
... data
|
|
Packit Service |
b74dd5 |
... another link
|
|
Packit Service |
b74dd5 |
... a paragraph
|
|
Packit Service |
b74dd5 |
... secret EVIL!
|
|
Packit Service |
b74dd5 |
... <object> of EVIL! </object>
|
|
Packit Service |
b74dd5 |
... <iframe src="evil-site"></iframe>
|
|
Packit Service |
b74dd5 |
... <form action="evil-site">
|
|
Packit Service |
b74dd5 |
... Password: <input type="password" name="password">
|
|
Packit Service |
b74dd5 |
... </form>
|
|
Packit Service |
b74dd5 |
... spam spam SPAM!
|
|
Packit Service |
b74dd5 |
... Author
|
|
Packit Service |
b74dd5 |
... Text
|
|
Packit Service |
b74dd5 |
... ![](evil!)
|
|
Packit Service |
b74dd5 |
... </body>
|
|
Packit Service |
b74dd5 |
... </html>'''
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
b74dd5 |
<script type="text/javascript" src="evil-site"></script>
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" src="evil-rss">
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
<link rel="stylesheet" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
<style>
|
|
Packit Service |
b74dd5 |
body {background-image: url(javascript:do_evil)};
|
|
Packit Service |
b74dd5 |
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
|
|
Packit Service |
b74dd5 |
div {color: expression(evil)};
|
|
Packit Service |
b74dd5 |
</style>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body onload="evil_function()">
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
<object> of EVIL! </object>
|
|
Packit Service |
b74dd5 |
<iframe src="evil-site"></iframe>
|
|
Packit Service |
b74dd5 |
<form action="evil-site">
|
|
Packit Service |
b74dd5 |
Password: <input type="password" name="password">
|
|
Packit Service |
b74dd5 |
</form>
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(tostring(fromstring(doc)).decode("utf-8"))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
b74dd5 |
<script type="text/javascript" src="evil-site"></script>
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" src="evil-rss">
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
<link rel="stylesheet" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
<style>
|
|
Packit Service |
b74dd5 |
body {background-image: url(javascript:do_evil)};
|
|
Packit Service |
b74dd5 |
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
|
|
Packit Service |
b74dd5 |
div {color: expression(evil)};
|
|
Packit Service |
b74dd5 |
</style>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body onload="evil_function()">
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
<object> of EVIL! </object>
|
|
Packit Service |
b74dd5 |
<iframe src="evil-site"></iframe>
|
|
Packit Service |
b74dd5 |
<form action="evil-site">
|
|
Packit Service |
b74dd5 |
Password: <input type="password" name="password">
|
|
Packit Service |
b74dd5 |
</form>
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
1cab4c |
<style>
|
|
Packit Service |
1cab4c |
body {background-image: url()};
|
|
Packit Service |
1cab4c |
div {background-image: url()};
|
|
Packit Service |
1cab4c |
div {color: };
|
|
Packit Service |
1cab4c |
</style>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body>
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
of EVIL!
|
|
Packit Service |
b74dd5 |
Password:
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body>
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
of EVIL!
|
|
Packit Service |
b74dd5 |
Password:
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body>
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
of EVIL!
|
|
Packit Service |
b74dd5 |
Password:
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|
|
Packit Service |
b74dd5 |
|
|
Packit Service |
b74dd5 |
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
|
|
Packit Service |
b74dd5 |
<html>
|
|
Packit Service |
b74dd5 |
<head>
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" src="evil-rss">
|
|
Packit Service |
b74dd5 |
<link rel="alternate" type="text/rss" href="http://example.com">
|
|
Packit Service |
b74dd5 |
<link rel="stylesheet" type="text/rss" href="http://example.com">
|
|
Packit Service |
1cab4c |
<style>
|
|
Packit Service |
1cab4c |
body {background-image: url()};
|
|
Packit Service |
1cab4c |
div {background-image: url()};
|
|
Packit Service |
1cab4c |
div {color: };
|
|
Packit Service |
1cab4c |
</style>
|
|
Packit Service |
b74dd5 |
</head>
|
|
Packit Service |
b74dd5 |
<body>
|
|
Packit Service |
b74dd5 |
a link
|
|
Packit Service |
b74dd5 |
a control char link
|
|
Packit Service |
b74dd5 |
data
|
|
Packit Service |
b74dd5 |
another link
|
|
Packit Service |
b74dd5 |
a paragraph
|
|
Packit Service |
b74dd5 |
secret EVIL!
|
|
Packit Service |
b74dd5 |
of EVIL!
|
|
Packit Service |
b74dd5 |
Password:
|
|
Packit Service |
b74dd5 |
spam spam SPAM!
|
|
Packit Service |
b74dd5 |
Author
|
|
Packit Service |
b74dd5 |
Text
|
|
Packit Service |
b74dd5 |
![](evil!)
|
|
Packit Service |
b74dd5 |
</body>
|
|
Packit Service |
b74dd5 |
</html>
|