<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <title>lxml.html.html5parser</title>
  <link rel="stylesheet" href="epydoc.css" type="text/css" />
  <script type="text/javascript" src="epydoc.js"></script>
</head>

<body bgcolor="white" text="black" link="blue" vlink="#204080"
      alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="/">lxml API</a></th>
          </tr></table></th>
  </tr>
</table>
<table width="100%" cellpadding="0" cellspacing="0">
  <tr valign="top">
    <td width="100%">
      <span class="breadcrumbs">
        <a href="lxml-module.html">Package&nbsp;lxml</a> ::
        <a href="lxml.html-module.html">Package&nbsp;html</a> ::
        Module&nbsp;html5parser
      </span>
    </td>
    <td>
      <table cellpadding="0" cellspacing="0">
        <!-- hide/show private -->
        <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
    onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
        <tr><td align="right"><span class="options"
            >[<a href="frames.html" target="_top">frames</a
            >&nbsp;|&nbsp;<a href="lxml.html.html5parser-module.html"
            target="_top">no&nbsp;frames</a>]</span></td></tr>
      </table>
    </td>
  </tr>
</table>
<!-- ==================== MODULE DESCRIPTION ==================== -->
<h1 class="epydoc">Module html5parser</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.html5parser-pysrc.html">source&nbsp;code</a></span></p>
An interface to html5lib that mimics the lxml.html interface.

<!-- ==================== CLASSES ==================== -->
<a name="section-Classes"></a>
<table class="summary" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td colspan="2" class="table-header">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
      <tr valign="top">
        <td align="left"><span class="table-header">Classes</span></td>
        <td align="right" valign="top"
         ><span class="options">[<a href="#section-Classes"
         class="privatelink" onclick="toggle_private();"
         >hide private</a>]</span></td>
      </tr>
    </table>
  </td>
</tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
        <a href="lxml.html.html5parser.HTMLParser-class.html" class="summary-name">HTMLParser</a><br />
      An html5lib HTML parser with lxml as tree.
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
        <a href="lxml.html.html5parser.XHTMLParser-class.html" class="summary-name">XHTMLParser</a><br />
      An html5lib XHTML parser with lxml as tree.
    </td>
  </tr>
</table>
<!-- ==================== FUNCTIONS ==================== -->
<a name="section-Functions"></a>
<table class="summary" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td colspan="2" class="table-header">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
      <tr valign="top">
        <td align="left"><span class="table-header">Functions</span></td>
        <td align="right" valign="top"
         ><span class="options">[<a href="#section-Functions"
         class="privatelink" onclick="toggle_private();"
         >hide private</a>]</span></td>
      </tr>
    </table>
  </td>
</tr>
<tr class="private">
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="_find_tag"></a><span class="summary-sig-name">_find_tag</span>(<span class="summary-sig-arg">tree</span>,
        <span class="summary-sig-arg">tag</span>)</span></td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_find_tag">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#document_fromstring" class="summary-sig-name">document_fromstring</a>(<span class="summary-sig-arg">html</span>,
        <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
      Parse a whole document from a string.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragments_fromstring" class="summary-sig-name">fragments_fromstring</a>(<span class="summary-sig-arg">html</span>,
        <span class="summary-sig-arg">no_leading_text</span>=<span class="summary-sig-default">False</span>,
        <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
      Parses several HTML elements, returning a list of elements.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragment_fromstring" class="summary-sig-name">fragment_fromstring</a>(<span class="summary-sig-arg">html</span>,
        <span class="summary-sig-arg">create_parent</span>=<span class="summary-sig-default">False</span>,
        <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
      Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fromstring" class="summary-sig-name">fromstring</a>(<span class="summary-sig-arg">html</span>,
        <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
      Parse the html, returning a single element/document.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#parse" class="summary-sig-name">parse</a>(<span class="summary-sig-arg">filename_url_or_file</span>,
        <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
      Parse a filename, URL, or file-like object into an HTML document
tree.  Note: this returns a tree, not an element.  Use
<tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr class="private">
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="_looks_like_url"></a><span class="summary-sig-name">_looks_like_url</span>(<span class="summary-sig-arg">str</span>)</span></td>
          <td align="right" valign="top">
            <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_looks_like_url">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
</table>
<!-- ==================== VARIABLES ==================== -->
<a name="section-Variables"></a>
<table class="summary" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td colspan="2" class="table-header">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
      <tr valign="top">
        <td align="left"><span class="table-header">Variables</span></td>
        <td align="right" valign="top"
         ><span class="options">[<a href="#section-Variables"
         class="privatelink" onclick="toggle_private();"
         >hide private</a>]</span></td>
      </tr>
    </table>
  </td>
</tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
        <a name="xhtml_parser"></a><span class="summary-name">xhtml_parser</span> = <code title="XHTMLParser()">XHTMLParser()</code>
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
        <a name="html_parser"></a><span class="summary-name">html_parser</span> = <code title="&lt;lxml.html.html5parser.HTMLParser object&gt;">&lt;lxml.html.html5parser.HTMLParser object&gt;</code>
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
        <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="'lxml.html'"><code class="variable-quote">'</code><code class="variable-string">lxml.html</code><code class="variable-quote">'</code></code>
    </td>
  </tr>
</table>
<!-- ==================== FUNCTION DETAILS ==================== -->
<a name="section-FunctionDetails"></a>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td colspan="2" class="table-header">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
      <tr valign="top">
        <td align="left"><span class="table-header">Function Details</span></td>
        <td align="right" valign="top"
         ><span class="options">[<a href="#section-FunctionDetails"
         class="privatelink" onclick="toggle_private();"
         >hide private</a>]</span></td>
      </tr>
    </table>
  </td>
</tr>
</table>
<a name="document_fromstring"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">document_fromstring</span>(<span class="sig-arg">html</span>,
        <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Parse a whole document from a string.</p>
<p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
byte string, the <code class="link">chardet</code> library will perform charset guessing
on the string.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="fragments_fromstring"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">fragments_fromstring</span>(<span class="sig-arg">html</span>,
        <span class="sig-arg">no_leading_text</span>=<span class="sig-default">False</span>,
        <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Parses several HTML elements, returning a list of elements.</p>
<p>The first item in the list may be a string.  If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.</p>
<p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
guessing on the string.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="fragment_fromstring"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">fragment_fromstring</span>(<span class="sig-arg">html</span>,
        <span class="sig-arg">create_parent</span>=<span class="sig-default">False</span>,
        <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.</p>
<p>If 'create_parent' is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element.  In
this case, leading or trailing text is allowed.</p>
<p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
guessing on the string.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="fromstring"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">fromstring</span>(<span class="sig-arg">html</span>,
        <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Parse the html, returning a single element/document.</p>
<p>This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.</p>
<p>'base_url' will set the document's base_url attribute (and the tree's
docinfo.URL)</p>
<p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
byte string, the <code class="link">chardet</code> library will perform charset guessing
on the string.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="parse"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">parse</span>(<span class="sig-arg">filename_url_or_file</span>,
        <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Parse a filename, URL, or file-like object into an HTML document
tree.  Note: this returns a tree, not an element.  Use
<tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</p>
<p>If <tt class="rst-docutils literal">guess_charset</tt> is true, the <tt class="rst-docutils literal">useChardet</tt> option is passed into
html5lib to enable character detection.  This option is on by default
when parsing from URLs, off by default when parsing from file(-like)
objects (which tend to return Unicode more often than not), and on by
default when parsing from a file path (which is read in binary mode).</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<br />
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="/">lxml API</a></th>
          </tr></table></th>
  </tr>
</table>
<table border="0" cellpadding="0" cellspacing="0" width="100%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1
    on Wed Jun 27 16:05:05 2018
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>

<script type="text/javascript">
  <!--
  // Private objects are initially displayed (because if
  // javascript is turned off then we want them to be
  // visible); but by default, we want to hide them.  So hide
  // them unless we have a cookie that says to show them.
  checkCookie();
  // -->
</script>
</body>
</html>