Blame doc/devhelp/libxml2-HTMLparser.html

Packit 423ecb
Packit 423ecb
<html>
Packit 423ecb
  <head>
Packit 423ecb
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
Packit 423ecb
    <title>HTMLparser: interface for an HTML 4.0 non-verifying parser</title>
Packit 423ecb
    <meta name="generator" content="Libxml2 devhelp stylesheet"/>
Packit 423ecb
    <link rel="start" href="index.html" title="libxml2 Reference Manual"/>
Packit 423ecb
    <link rel="up" href="general.html" title="API"/>
Packit 423ecb
    <link rel="stylesheet" href="style.css" type="text/css"/>
Packit 423ecb
    <link rel="chapter" href="general.html" title="API"/>
Packit 423ecb
  </head>
Packit 423ecb
  <body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
Packit 423ecb
    
Packit 423ecb
      
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Prev
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Up
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Home
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Next
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        libxml2 Reference Manual
Packit 423ecb
      
Packit 423ecb
    
Packit 423ecb
    

Packit 423ecb
      HTMLparser
Packit 423ecb
    
Packit 423ecb
    

HTMLparser - interface for an HTML 4.0 non-verifying parser

Packit 423ecb
    

This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.

Packit 423ecb
    

Author(s): Daniel Veillard

Packit 423ecb
    
Packit 423ecb
      

Synopsis

Packit 423ecb
      
#define htmlDefaultSubelement(elt);
Packit 423ecb
#define htmlElementAllowedHereDesc(parent, elt);
Packit 423ecb
#define htmlRequiredAttrs(elt);
Packit 423ecb
typedef xmlParserNodeInfo htmlParserNodeInfo;
Packit 423ecb
typedef xmlParserInput htmlParserInput;
Packit 423ecb
typedef xmlParserCtxtPtr htmlParserCtxtPtr;
Packit 423ecb
typedef struct _htmlEntityDesc htmlEntityDesc;
Packit 423ecb
typedef xmlDocPtr htmlDocPtr;
Packit 423ecb
typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
Packit 423ecb
typedef enum htmlStatus;
Packit 423ecb
typedef xmlNodePtr htmlNodePtr;
Packit 423ecb
typedef htmlElemDesc * htmlElemDescPtr;
Packit 423ecb
typedef struct _htmlElemDesc htmlElemDesc;
Packit 423ecb
typedef xmlSAXHandler htmlSAXHandler;
Packit 423ecb
typedef xmlParserInputPtr htmlParserInputPtr;
Packit 423ecb
typedef enum htmlParserOption;
Packit 423ecb
typedef htmlEntityDesc * htmlEntityDescPtr;
Packit 423ecb
typedef xmlParserCtxt htmlParserCtxt;
Packit 423ecb
int	htmlIsScriptAttribute		(const xmlChar * name);
Packit 423ecb
int	htmlHandleOmittedElem		(int val);
Packit 423ecb
htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options);
Packit 423ecb
htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding);
Packit 423ecb
htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * cur,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
int	htmlAutoCloseTag		(htmlDocPtr doc, 
const xmlChar * name,
htmlNodePtr elem);
Packit 423ecb
int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate);
Packit 423ecb
const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag);
Packit 423ecb
htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size);
Packit 423ecb
void	htmlCtxtReset			(htmlParserCtxtPtr ctxt);
Packit 423ecb
int	htmlElementAllowedHere		(const htmlElemDesc * parent, 
const xmlChar * elt);
Packit 423ecb
htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc);
Packit 423ecb
htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
int	htmlIsAutoClosed		(htmlDocPtr doc, 
htmlNodePtr elem);
Packit 423ecb
int	htmlParseCharRef		(htmlParserCtxtPtr ctxt);
Packit 423ecb
htmlDocPtr	htmlReadDoc		(const xmlChar * cur, 
const char * URL,
const char * encoding,
int options);
Packit 423ecb
int	htmlEncodeEntities		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar);
Packit 423ecb
htmlStatus	htmlNodeStatus		(const htmlNodePtr node, 
int legacy);
Packit 423ecb
htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy);
Packit 423ecb
htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData);
Packit 423ecb
const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str);
Packit 423ecb
htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt);
Packit 423ecb
const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value);
Packit 423ecb
void	htmlParseElement		(htmlParserCtxtPtr ctxt);
Packit 423ecb
int	UTF8ToHtml			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
Packit 423ecb
const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name);
Packit 423ecb
void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt);
Packit 423ecb
htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options);
Packit 423ecb
htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options);
Packit 423ecb
htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options);
Packit 423ecb
int	htmlParseDocument		(htmlParserCtxtPtr ctxt);
Packit 423ecb
htmlParserCtxtPtr	htmlNewParserCtxt	(void);
Packit 423ecb
htmlDocPtr	htmlSAXParseDoc		(const xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData);
Packit 423ecb
int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt, 
int options);
Packit 423ecb
htmlDocPtr	htmlParseDoc		(const xmlChar * cur, 
const char * encoding);
Packit 423ecb
Packit 423ecb
    
Packit 423ecb
    
Packit 423ecb
      

Description

Packit 423ecb
    
Packit 423ecb
    
Packit 423ecb
      

Details

Packit 423ecb
      
Packit 423ecb
        

Macro htmlDefaultSubelement

#define htmlDefaultSubelement(elt);
Packit 423ecb

Returns the default subelement for this element

<tt>elt</tt>:HTML element
Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Macro htmlElementAllowedHereDesc

#define htmlElementAllowedHereDesc(parent, elt);
Packit 423ecb

Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.

<tt>parent</tt>:HTML parent element
<tt>elt</tt>:HTML element
Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Macro htmlRequiredAttrs

#define htmlRequiredAttrs(elt);
Packit 423ecb

Returns the attributes required for the specified element.

<tt>elt</tt>:HTML element
Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlDocPtr

xmlDocPtr htmlDocPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Structure htmlElemDesc

struct _htmlElemDesc {
Packit 423ecb
    const char *	name	: The tag name
Packit 423ecb
    char	startTag	: Whether the start tag can be implied
Packit 423ecb
    char	endTag	: Whether the end tag can be implied
Packit 423ecb
    char	saveEndTag	: Whether the end tag should be saved
Packit 423ecb
    char	empty	: Is this an empty element ?
Packit 423ecb
    char	depr	: Is this a deprecated element ?
Packit 423ecb
    char	dtd	: 1: only in Loose DTD, 2: only Frameset one
Packit 423ecb
    char	isinline	: is this a block 0 or inline 1 element
Packit 423ecb
    const char *	desc	: the description NRK Jan.2003 * New fields encapsulating HTML structure
Packit 423ecb
    const char **	subelts	: allowed sub-elements of this element
Packit 423ecb
    const char *	defaultsubelt	: subelement for suggested auto-repair if necessary or NULL
Packit 423ecb
    const char **	attrs_opt	: Optional Attributes
Packit 423ecb
    const char **	attrs_depr	: Additional deprecated attributes
Packit 423ecb
    const char **	attrs_req	: Required attributes
Packit 423ecb
} htmlElemDesc;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlElemDescPtr

htmlElemDesc * htmlElemDescPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Structure htmlEntityDesc

struct _htmlEntityDesc {
Packit 423ecb
    unsigned int	value	: the UNICODE value for the character
Packit 423ecb
    const char *	name	: The entity name
Packit 423ecb
    const char *	desc	: the description
Packit 423ecb
} htmlEntityDesc;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlEntityDescPtr

htmlEntityDesc * htmlEntityDescPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlNodePtr

xmlNodePtr htmlNodePtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlParserCtxt

xmlParserCtxt htmlParserCtxt;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlParserCtxtPtr

xmlParserCtxtPtr htmlParserCtxtPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlParserInput

xmlParserInput htmlParserInput;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlParserInputPtr

xmlParserInputPtr htmlParserInputPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlParserNodeInfo

xmlParserNodeInfo htmlParserNodeInfo;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Enum htmlParserOption

enum htmlParserOption {
Packit 423ecb
    HTML_PARSE_RECOVER = 1 /* Relaxed parsing */
Packit 423ecb
    HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */
Packit 423ecb
    HTML_PARSE_NOERROR = 32 /* suppress error reports */
Packit 423ecb
    HTML_PARSE_NOWARNING = 64 /* suppress warning reports */
Packit 423ecb
    HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */
Packit 423ecb
    HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */
Packit 423ecb
    HTML_PARSE_NONET = 2048 /* Forbid network access */
Packit 423ecb
    HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */
Packit 423ecb
    HTML_PARSE_COMPACT = 65536 /* compact small text nodes */
Packit 423ecb
    HTML_PARSE_IGNORE_ENC = 2097152 /*  ignore internal document encoding hint */
Packit 423ecb
};
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlSAXHandler

xmlSAXHandler htmlSAXHandler;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef htmlSAXHandlerPtr

xmlSAXHandlerPtr htmlSAXHandlerPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Enum htmlStatus

enum htmlStatus {
Packit 423ecb
    HTML_NA = 0 /* something we don't check at all */
Packit 423ecb
    HTML_INVALID = 1
Packit 423ecb
    HTML_DEPRECATED = 2
Packit 423ecb
    HTML_VALID = 4
Packit 423ecb
    HTML_REQUIRED = 12 /*  VALID bit set so ( & HTML_VALID ) is TRUE */
Packit 423ecb
};
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of UTF-8 chars
<tt>inlen</tt>:the length of @in
<tt>Returns</tt>:0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        

htmlAttrAllowed ()

htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy)
Packit 423ecb

Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.

Packit 423ecb
<tt>elt</tt>:HTML element
<tt>attr</tt>:HTML attribute
<tt>legacy</tt>:whether to allow deprecated attributes
<tt>Returns</tt>:one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.

Packit 423ecb
<tt>doc</tt>:the HTML document
<tt>name</tt>:The tag name
<tt>elem</tt>:the HTML element
<tt>Returns</tt>:1 if autoclose, 0 otherwise
Packit 423ecb
        
Packit 423ecb
        

htmlCreateMemoryParserCtxt ()

htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size)
Packit 423ecb

Create a parser context for an HTML in-memory document.

Packit 423ecb
<tt>buffer</tt>:a pointer to a char array
<tt>size</tt>:the size of the array
<tt>Returns</tt>:the new parser context or NULL
Packit 423ecb
        
Packit 423ecb
        

htmlCreatePushParserCtxt ()

htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
Packit 423ecb

Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and for error/warning reports.

Packit 423ecb
<tt>sax</tt>:a SAX handler
<tt>user_data</tt>:The user data returned on SAX callbacks
<tt>chunk</tt>:a pointer to an array of chars
<tt>size</tt>:number of chars in the array
<tt>filename</tt>:an optional file name or URI
<tt>enc</tt>:an optional encoding
<tt>Returns</tt>:the new parser context or NULL
Packit 423ecb
        
Packit 423ecb
        

htmlCtxtReadDoc ()

htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>cur</tt>:a pointer to a zero terminated string
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlCtxtReadFd ()

htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>fd</tt>:an open file descriptor
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlCtxtReadFile ()

htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>filename</tt>:a file or URL
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlCtxtReadIO ()

htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>ioread</tt>:an I/O read function
<tt>ioclose</tt>:an I/O close function
<tt>ioctx</tt>:an I/O handler
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlCtxtReadMemory ()

htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>buffer</tt>:a pointer to a char array
<tt>size</tt>:the size of the array
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Reset a parser context

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Applies the options to the parser context

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:0 in case of success, the set of unknown or unimplemented options in case of error.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements

Packit 423ecb
<tt>parent</tt>:HTML parent element
<tt>elt</tt>:HTML element
<tt>Returns</tt>:1 if allowed; 0 otherwise.
Packit 423ecb
        
Packit 423ecb
        

htmlElementStatusHere ()

htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt)
Packit 423ecb

Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.

Packit 423ecb
<tt>parent</tt>:HTML parent element
<tt>elt</tt>:HTML element
<tt>Returns</tt>:one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of UTF-8 chars
<tt>inlen</tt>:the length of @in
<tt>quoteChar</tt>:the quote character to escape (' or ") or zero.
<tt>Returns</tt>:0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        

htmlEntityLookup ()

const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name)
Packit 423ecb

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

Packit 423ecb
<tt>name</tt>:the entity name
<tt>Returns</tt>:the associated htmlEntityDescPtr if found, NULL otherwise.
Packit 423ecb
        
Packit 423ecb
        

htmlEntityValueLookup ()

const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value)
Packit 423ecb

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

Packit 423ecb
<tt>value</tt>:the entity's unicode value
<tt>Returns</tt>:the associated htmlEntityDescPtr if found, NULL otherwise.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Set and return the previous value for handling HTML omitted tags.

Packit 423ecb
<tt>val</tt>:int 0 or 1
<tt>Returns</tt>:the previous value: 0 for no handling, 1 for auto insertion.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.

Packit 423ecb
<tt>doc</tt>:the HTML document
<tt>elem</tt>:the HTML element
<tt>Returns</tt>:1 if autoclosed, 0 otherwise
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Check if an attribute is of content type Script

Packit 423ecb
<tt>name</tt>:an attribute name
<tt>Returns</tt>:1 if the attribute is a script, 0 otherwise
Packit 423ecb
        
Packit 423ecb
        

htmlNewParserCtxt ()

htmlParserCtxtPtr	htmlNewParserCtxt	(void)
Packit 423ecb

Allocate and initialize a new parser context.

Packit 423ecb
<tt>Returns</tt>:the htmlParserCtxtPtr or NULL in case of allocation error
Packit 423ecb
        
Packit 423ecb
        

htmlNodeStatus ()

htmlStatus	htmlNodeStatus		(const htmlNodePtr node, 
int legacy)
Packit 423ecb

Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)

Packit 423ecb
<tt>node</tt>:an htmlNodePtr in a tree
<tt>legacy</tt>:whether to allow deprecated elements (YES is faster here for Element nodes)
<tt>Returns</tt>:for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed)
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>Returns</tt>:the value parsed (as an int)
Packit 423ecb
        
Packit 423ecb
        

htmlParseChunk ()

int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate)
Packit 423ecb

Parse a Chunk of memory

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>chunk</tt>:a char array
<tt>size</tt>:the size in bytes of the chunk
<tt>terminate</tt>:last chunk indicator
<tt>Returns</tt>:zero if no error, the xmlParserErrors otherwise.
Packit 423ecb
        
Packit 423ecb
        

htmlParseDoc ()

htmlDocPtr	htmlParseDoc		(const xmlChar * cur, 
const char * encoding)
Packit 423ecb

parse an HTML in-memory document and build a tree.

Packit 423ecb
<tt>cur</tt>:a pointer to an array of xmlChar
<tt>encoding</tt>:a free form C string describing the HTML document encoding, or NULL
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

parse an HTML document (and build a tree if using the standard SAX interface).

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>Returns</tt>:0 on success, -1 in case of error. The parser context is augmented as a result of the parsing.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

parse an HTML element, this is highly recursive this is kept for compatibility with previous code versions [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
Packit 423ecb
        
Packit 423ecb
        

htmlParseEntityRef ()

const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str)
Packit 423ecb

Parse an HTML entity reference [68] EntityRef ::= '&' Name ';'

Packit 423ecb
<tt>ctxt</tt>:an HTML parser context
<tt>str</tt>:location to store the entity name
<tt>Returns</tt>:the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller.
Packit 423ecb
        
Packit 423ecb
        

htmlParseFile ()

htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding)
Packit 423ecb

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.

Packit 423ecb
<tt>filename</tt>:the filename
<tt>encoding</tt>:a free form C string describing the HTML document encoding, or NULL
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlReadDoc ()

htmlDocPtr	htmlReadDoc		(const xmlChar * cur, 
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML in-memory document and build a tree.

Packit 423ecb
<tt>cur</tt>:a pointer to a zero terminated string
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlReadFd ()

htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML document from a file descriptor and build a tree.

Packit 423ecb
<tt>fd</tt>:an open file descriptor
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlReadFile ()

htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options)
Packit 423ecb

Parse an HTML file from the filesystem or the network.

Packit 423ecb
<tt>filename</tt>:a file or URL
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlReadIO ()

htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

parse an HTML document from I/O functions and source and build a tree.

Packit 423ecb
<tt>ioread</tt>:an I/O read function
<tt>ioclose</tt>:an I/O close function
<tt>ioctx</tt>:an I/O handler
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlReadMemory ()

htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * URL,
const char * encoding,
int options)
Packit 423ecb

Parse an HTML in-memory document and build a tree.

Packit 423ecb
<tt>buffer</tt>:a pointer to a char array
<tt>size</tt>:the size of the array
<tt>URL</tt>:the base URL to use for the document
<tt>encoding</tt>:the document encoding, or NULL
<tt>options</tt>:a combination of htmlParserOption(s)
<tt>Returns</tt>:the resulting document tree
Packit 423ecb
        
Packit 423ecb
        

htmlSAXParseDoc ()

htmlDocPtr	htmlSAXParseDoc		(const xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
Packit 423ecb

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.

Packit 423ecb
<tt>cur</tt>:a pointer to an array of xmlChar
<tt>encoding</tt>:a free form C string describing the HTML document encoding, or NULL
<tt>sax</tt>:the SAX handler block
<tt>userData</tt>:if using SAX, this pointer will be provided on callbacks.
<tt>Returns</tt>:the resulting document tree unless SAX is NULL or the document is not well formed.
Packit 423ecb
        
Packit 423ecb
        

htmlSAXParseFile ()

htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
Packit 423ecb

Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

Packit 423ecb
<tt>filename</tt>:the filename
<tt>encoding</tt>:a free form C string describing the HTML document encoding, or NULL
<tt>sax</tt>:the SAX handler block
<tt>userData</tt>:if using SAX, this pointer will be provided on callbacks.
<tt>Returns</tt>:the resulting document tree unless SAX is NULL or the document is not well formed.
Packit 423ecb
        
Packit 423ecb
        

htmlTagLookup ()

const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag)
Packit 423ecb

Lookup the HTML tag in the ElementTable

Packit 423ecb
<tt>tag</tt>:The tag name in lowercase
<tt>Returns</tt>:the related htmlElemDescPtr or NULL if not found.
Packit 423ecb
        
Packit 423ecb
      
Packit 423ecb
    
Packit 423ecb
  </body>
Packit 423ecb
</html>