Tree - source-git/python-lxml - CentOS Git server

source-git / python-lxml

Files

Commit: b74dd5f56bf76def5aa8ea34e7d986ca0d582af2
Blob Blame History Raw

Setup::

    >>> import lxml.html

We'll define a link translation function::

    >>> base_href = 'http://old/base/path.html'
    >>> try: import urlparse
    ... except ImportError: import urllib.parse as urlparse
    >>> def relocate_href(link):
    ...     link = urlparse.urljoin(base_href, link)
    ...     if link.startswith('http://old'):
    ...         return 'https://new' + link[len('http://old'):]
    ...     else:
    ...         return link

Now for content.  First, to make it easier on us, the examples below
import ``usedoctest``, so that the normalized HTML we get from these
functions is compared as HTML rather than as an exact string.

Some basics::

    >>> from lxml.html import usedoctest, tostring
    >>> from lxml.html import rewrite_links
    >>> print(rewrite_links(
    ...     '<a href="http://old/blah/blah.html">link</a>', relocate_href))
    <a href="https://new/blah/blah.html">link</a>
    >>> print(rewrite_links(
    ...     '<script src="http://old/foo.js"></script>', relocate_href))
    <script src="https://new/foo.js"></script>
    >>> print(rewrite_links(
    ...     '<link href="foo.css">', relocate_href))
    <link href="https://new/base/foo.css">
    >>> print(rewrite_links('''\
    ... <base href="http://blah/stuff/index.html">
    ... <link href="foo.css">
    ... <a href="http://old/bar.html">x</a>\
    ... ''', relocate_href))
    <link href="http://blah/stuff/foo.css">
    <a href="https://new/bar.html">x</a>

Links in CSS are also handled::

    >>> print(rewrite_links('''
    ... <style>
    ...   body {background-image: url(http://old/image.gif)};
    ...   @import "http://old/other-style.css";
    ... </style>''', relocate_href))
    <html><head><style>
      body {background-image: url(https://new/image.gif)};
      @import "https://new/other-style.css";
    </style></head></html>
    >>> print(rewrite_links('''
    ... <style>
    ...   body {background-image: url("http://old/image.gif")};
    ...   @import "http://old/other-style.css";
    ... </style>''', relocate_href))
    <html><head><style>
      body {background-image: url("https://new/image.gif")};
      @import "https://new/other-style.css";
    </style></head></html>

Links in style attributes are also rewritten::

    >>> print(rewrite_links('''
    ... <div style="background-image: url(http://old/image.gif)">text</div>
    ... ''', relocate_href))
    <div style="background-image: url(https://new/image.gif)">text</div>

The ``<base href>`` tag is also respected (and is then removed from the output)::

    >>> print(rewrite_links('''
    ... <html><head>
    ...  <base href="http://old/">
    ... </head>
    ... <body>
    ...  <a href="foo.html">link</a>
    ... </body></html>''', relocate_href))
    <html>
     <head></head>
     <body>
      <a href="https://new/foo.html">link</a>
     </body>
    </html>

The ``iterlinks`` method (and function) gives you all the links in
the document, along with the element and attribute the link comes
from.  This makes it fairly easy to see what resources the document
references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
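is something embedded).  It returns a generator of ``(element, attrib,
link, pos)`` tuples, where ``pos`` is the offset of the link within the
attribute value or element text (``attrib`` is ``None`` when the link
comes from the element's text).  This is awkward to test here, so we'll
make a printer::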

    >>> from lxml.html import iterlinks, document_fromstring, tostring
    >>> def print_iter(seq):
    ...     for element, attrib, link, pos in seq:
    ...         if pos:
    ...             extra = '@%s' % pos
    ...         else:
    ...             extra = ''
    ...         print('%s %s="%s"%s' % (element.tag, attrib, link, extra))
    >>> print_iter(iterlinks('''
    ... <html>
    ...  <head>
    ...   <meta http-equiv="refresh" content="0;url=/redirect">
    ...   <meta http-equiv="refresh" content="10;url='/quoted_url'">
    ...   <link rel="stylesheet" href="style.css">
    ...   <style type="text/css">
    ...     body {
    ...       background-image: url(/bg.gif);
    ...     }
    ...     @import "/other-styles.css";
    ...   </style>
    ...   <script src="/js-funcs.js"></script>
    ...  </head>
    ...  <body>
    ...   <table>
    ...    <tr><td><ul>
    ...     <li><a href="/test.html">Test stuff</a></li>
    ...     <li><a href="/other.html">Other stuff</a></li>
    ...    </td></tr>
    ...    <td style="background-image: url(/td-bg.png)">
    ...      <img src="/logo.gif">
    ...      Hi world!
    ...    </td>
    ...    <td style="background-image: url('/quoted.png')">
    ...    </td></tr>
    ...   </table>
    ...  </body></html>'''))
    meta content="/redirect"@6
    meta content="/quoted_url"@8
    link href="style.css"
    style None="/other-styles.css"@69
    style None="/bg.gif"@40
    script src="/js-funcs.js"
    a href="/test.html"
    a href="/other.html"
    td style="/td-bg.png"@22
    img src="/logo.gif"
    td style="/quoted.png"@23

An application of ``iterlinks()`` is ``make_links_absolute()``::

    >>> from lxml.html import make_links_absolute
    >>> print(make_links_absolute('''
    ... <html>
    ...  <head>
    ...   <meta http-equiv="refresh" content=" broken ">
    ...   <meta http-equiv="refresh" content="0; url =  ">
    ...   <meta http-equiv="refresh" content="0;url=/redirect">
    ...   <meta http-equiv="refresh" content="5;   url='/quoted_url'">
    ...   <meta http-equiv="refresh" content="10;url='http://example.com/absolute'">
    ...   <meta http-equiv="refresh" content="15; url=http://example.com/">
    ...   <link rel="stylesheet" href="style.css">
    ...   <style type="text/css">
    ...     body {
    ...       background-image: url(/bg.gif);
    ...     }
    ...     @import "/other-styles.css";
    ...   </style>
    ...   <script src="/js-funcs.js"></script>
    ...  </head>
    ...  <body>
    ...   <table>
    ...    <tr><td><ul>
    ...     <li><a href=" /test.html">Test stuff</a></li>
    ...     <li><a href="/other.html ">Other stuff</a></li>
    ...    </td></tr>
    ...    <tr><td style="background-image: url( /td-bg.png )">
    ...      <img src="logo.gif">
    ...      Hi world!
    ...    </td></tr>
    ...   </table>
    ...  </body></html>''',
    ... base_url="http://my.little.server/url/"))
    <html>
     <head>
      <meta http-equiv="refresh" content=" http://my.little.server/url/broken ">
      <meta http-equiv="refresh" content="0; url =  ">
      <meta http-equiv="refresh" content="0;url=http://my.little.server/redirect">
      <meta http-equiv="refresh" content="5;   url='http://my.little.server/quoted_url'">
      <meta http-equiv="refresh" content="10;url='http://example.com/absolute'">
      <meta http-equiv="refresh" content="15; url=http://example.com/">
      <link rel="stylesheet" href="http://my.little.server/url/style.css">
      <style type="text/css">
        body {
          background-image: url(http://my.little.server/bg.gif);
        }
        @import "http://my.little.server/other-styles.css";
      </style>
      <script src="http://my.little.server/js-funcs.js"></script>
     </head>
     <body>
      <table>
       <tr><td><ul>
        <li><a href="http://my.little.server/test.html">Test stuff</a></li>
        <li><a href="http://my.little.server/other.html">Other stuff</a></li>
       </ul></td></tr>
       <tr>
         <td style="background-image: url(http://my.little.server/td-bg.png)">
          <img src="http://my.little.server/url/logo.gif">
          Hi world!
       </td></tr>
      </table>
     </body>
    </html>

### Test disabled to support Py2.6 and earlier
#If the document contains invalid links, you may choose to "discard" or "ignore"
#them by passing the respective option into the ``handle_failures`` argument::
#
#    >>> html = lxml.html.fromstring ('''\
#    ... <html><body><div>
#    ...     <a href="http://fancybase.com]Buy">test2</a>
#    ... </div></body></html>''')
#
#    >>> html.make_links_absolute(base_url="http://my.little.server/url/",
#    ...                          handle_failures="discard")
#
#    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
#    <html><body><div>
#        <a>test2</a>
#    </div></body></html>

Check if we can replace multiple links inside of the same text string::

    >>> html = lxml.html.fromstring ("""\
    ... <html>
    ...   <head>
    ...      <title>Test</title>
    ...      <style type='text/css'>
    ...        .bg1 {
    ...            background: url(images/bg1.png);
    ...        }
    ...        .bg2 {
    ...            background: url(images/bg2.png);
    ...        }
    ...      </style>
    ...   </head>
    ...   <body>
    ...      <p>Hi</p>
    ...   </body>
    ... </html>
    ... """,
    ... base_url = 'http://www.example.com/')

    >>> html.make_links_absolute ()

    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
    <html>
      <head>
        <title>Test</title>
        <style type="text/css">
          .bg1 {
            background: url(http://www.example.com/images/bg1.png);
          }
          .bg2 {
            background: url(http://www.example.com/images/bg2.png);
          }
        </style>
      </head>
      <body>
        <p>Hi</p>
      </body>
    </html>
source-git / python-lxml

Source Code

Files