|
Packit |
423ecb |
|
|
Packit |
423ecb |
<html>
|
|
Packit |
423ecb |
<head>
|
|
Packit |
423ecb |
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
|
Packit |
423ecb |
<title>HTMLparser: interface for an HTML 4.0 non-verifying parser</title>
|
|
Packit |
423ecb |
<meta name="generator" content="Libxml2 devhelp stylesheet"/>
|
|
Packit |
423ecb |
<link rel="start" href="index.html" title="libxml2 Reference Manual"/>
|
|
Packit |
423ecb |
<link rel="up" href="general.html" title="API"/>
|
|
Packit |
423ecb |
<link rel="stylesheet" href="style.css" type="text/css"/>
|
|
Packit |
423ecb |
<link rel="chapter" href="general.html" title="API"/>
|
|
Packit |
423ecb |
</head>
|
|
Packit |
423ecb |
<body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
libxml2 Reference Manual
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
HTMLparser
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
HTMLparser - interface for an HTML 4.0 non-verifying parser
|
|
Packit |
423ecb |
This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.
|
|
Packit |
423ecb |
Author(s): Daniel Veillard
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Synopsis
|
|
Packit |
423ecb |
#define htmlDefaultSubelement(elt);
|
|
Packit |
423ecb |
#define htmlElementAllowedHereDesc(parent, elt);
|
|
Packit |
423ecb |
#define htmlRequiredAttrs(elt);
|
|
Packit |
423ecb |
typedef xmlParserNodeInfo htmlParserNodeInfo;
|
|
Packit |
423ecb |
typedef xmlParserInput htmlParserInput;
|
|
Packit |
423ecb |
typedef xmlParserCtxtPtr htmlParserCtxtPtr;
|
|
Packit |
423ecb |
typedef struct _htmlEntityDesc htmlEntityDesc;
|
|
Packit |
423ecb |
typedef xmlDocPtr htmlDocPtr;
|
|
Packit |
423ecb |
typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
|
|
Packit |
423ecb |
typedef enum htmlStatus;
|
|
Packit |
423ecb |
typedef xmlNodePtr htmlNodePtr;
|
|
Packit |
423ecb |
typedef htmlElemDesc * htmlElemDescPtr;
|
|
Packit |
423ecb |
typedef struct _htmlElemDesc htmlElemDesc;
|
|
Packit |
423ecb |
typedef xmlSAXHandler htmlSAXHandler;
|
|
Packit |
423ecb |
typedef xmlParserInputPtr htmlParserInputPtr;
|
|
Packit |
423ecb |
typedef enum htmlParserOption;
|
|
Packit |
423ecb |
typedef htmlEntityDesc * htmlEntityDescPtr;
|
|
Packit |
423ecb |
typedef xmlParserCtxt htmlParserCtxt;
|
|
Packit |
423ecb |
int htmlIsScriptAttribute (const xmlChar * name);
|
|
Packit |
423ecb |
int htmlHandleOmittedElem (int val);
|
|
Packit |
423ecb |
htmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlParseFile (const char * filename, const char * encoding);
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt, const xmlChar * cur, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar * name, htmlNodePtr elem);
|
|
Packit |
423ecb |
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char * chunk, int size, int terminate);
|
|
Packit |
423ecb |
const htmlElemDesc * htmlTagLookup (const xmlChar * tag);
|
|
Packit |
423ecb |
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer, int size);
|
|
Packit |
423ecb |
void htmlCtxtReset (htmlParserCtxtPtr ctxt);
|
|
Packit |
423ecb |
int htmlElementAllowedHere (const htmlElemDesc * parent, const xmlChar * elt);
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void * user_data, const char * chunk, int size, const char * filename, xmlCharEncoding enc);
|
|
Packit |
423ecb |
htmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem);
|
|
Packit |
423ecb |
int htmlParseCharRef (htmlParserCtxtPtr ctxt);
|
|
Packit |
423ecb |
htmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
int htmlEncodeEntities (unsigned char * out, int * outlen, const unsigned char * in, int * inlen, int quoteChar);
|
|
Packit |
423ecb |
htmlStatus htmlNodeStatus (const htmlNodePtr node, int legacy);
|
|
Packit |
423ecb |
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt, const xmlChar * attr, int legacy);
|
|
Packit |
423ecb |
htmlDocPtr htmlSAXParseFile (const char * filename, const char * encoding, htmlSAXHandlerPtr sax, void * userData);
|
|
Packit |
423ecb |
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar ** str);
|
|
Packit |
423ecb |
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent, const htmlElemDesc * elt);
|
|
Packit |
423ecb |
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value);
|
|
Packit |
423ecb |
void htmlParseElement (htmlParserCtxtPtr ctxt);
|
|
Packit |
423ecb |
int UTF8ToHtml (unsigned char * out, int * outlen, const unsigned char * in, int * inlen);
|
|
Packit |
423ecb |
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name);
|
|
Packit |
423ecb |
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt, int fd, const char * URL, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt, const char * filename, const char * encoding, int options);
|
|
Packit |
423ecb |
int htmlParseDocument (htmlParserCtxtPtr ctxt);
|
|
Packit |
423ecb |
htmlParserCtxtPtr htmlNewParserCtxt (void);
|
|
Packit |
423ecb |
htmlDocPtr htmlSAXParseDoc (const xmlChar * cur, const char * encoding, htmlSAXHandlerPtr sax, void * userData);
|
|
Packit |
423ecb |
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options);
|
|
Packit |
423ecb |
htmlDocPtr htmlParseDoc (const xmlChar * cur, const char * encoding);
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Description
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Details
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Returns the default subelement for this element <tt>elt</tt>: | HTML element |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise. <tt>parent</tt>: | HTML parent element | <tt>elt</tt>: | HTML element |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Returns the attributes required for the specified element. <tt>elt</tt>: | HTML element |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
const char * name : The tag name
|
|
Packit |
423ecb |
char startTag : Whether the start tag can be implied
|
|
Packit |
423ecb |
char endTag : Whether the end tag can be implied
|
|
Packit |
423ecb |
char saveEndTag : Whether the end tag should be saved
|
|
Packit |
423ecb |
char empty : Is this an empty element ?
|
|
Packit |
423ecb |
char depr : Is this a deprecated element ?
|
|
Packit |
423ecb |
char dtd : 1: only in Loose DTD, 2: only Frameset one
|
|
Packit |
423ecb |
char isinline : is this a block 0 or inline 1 element
|
|
Packit |
423ecb |
const char * desc : the description NRK Jan.2003 * New fields encapsulating HTML structure
|
|
Packit |
423ecb |
const char ** subelts : allowed sub-elements of this element
|
|
Packit |
423ecb |
const char * defaultsubelt : subelement for suggested auto-repair if necessary or NULL
|
|
Packit |
423ecb |
const char ** attrs_opt : Optional Attributes
|
|
Packit |
423ecb |
const char ** attrs_depr : Additional deprecated attributes
|
|
Packit |
423ecb |
const char ** attrs_req : Required attributes
|
|
Packit |
423ecb |
} htmlElemDesc;
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
unsigned int value : the UNICODE value for the character
|
|
Packit |
423ecb |
const char * name : The entity name
|
|
Packit |
423ecb |
const char * desc : the description
|
|
Packit |
423ecb |
} htmlEntityDesc;
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
HTML_PARSE_RECOVER = 1 /* Relaxed parsing */
|
|
Packit |
423ecb |
HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */
|
|
Packit |
423ecb |
HTML_PARSE_NOERROR = 32 /* suppress error reports */
|
|
Packit |
423ecb |
HTML_PARSE_NOWARNING = 64 /* suppress warning reports */
|
|
Packit |
423ecb |
HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */
|
|
Packit |
423ecb |
HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */
|
|
Packit |
423ecb |
HTML_PARSE_NONET = 2048 /* Forbid network access */
|
|
Packit |
423ecb |
HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */
|
|
Packit |
423ecb |
HTML_PARSE_COMPACT = 65536 /* compact small text nodes */
|
|
Packit |
423ecb |
HTML_PARSE_IGNORE_ENC = 2097152 /* ignore internal document encoding hint */
|
|
Packit |
423ecb |
};
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
HTML_NA = 0 /* something we don't check at all */
|
|
Packit |
423ecb |
HTML_INVALID = 1
|
|
Packit |
423ecb |
HTML_DEPRECATED = 2
|
|
Packit |
423ecb |
HTML_VALID = 4
|
|
Packit |
423ecb |
HTML_REQUIRED = 12 /* VALID bit set so ( & HTML_VALID ) is TRUE */
|
|
Packit |
423ecb |
};
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
|
|
Packit |
423ecb |
<tt>out</tt>: | a pointer to an array of bytes to store the result | <tt>outlen</tt>: | the length of @out | <tt>in</tt>: | a pointer to an array of UTF-8 chars | <tt>inlen</tt>: | the length of @in | <tt>Returns</tt>: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.
|
|
Packit |
423ecb |
<tt>elt</tt>: | HTML element | <tt>attr</tt>: | HTML attribute | <tt>legacy</tt>: | whether to allow deprecated attributes | <tt>Returns</tt>: | one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
|
|
Packit |
423ecb |
<tt>doc</tt>: | the HTML document | <tt>name</tt>: | The tag name | <tt>elem</tt>: | the HTML element | <tt>Returns</tt>: | 1 if autoclose, 0 otherwise |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Create a parser context for an HTML in-memory document.
|
|
Packit |
423ecb |
<tt>buffer</tt>: | a pointer to a char array | <tt>size</tt>: | the size of the array | <tt>Returns</tt>: | the new parser context or NULL |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and error/warning reports.
|
|
Packit |
423ecb |
<tt>sax</tt>: | a SAX handler | <tt>user_data</tt>: | The user data returned on SAX callbacks | <tt>chunk</tt>: | a pointer to an array of chars | <tt>size</tt>: | number of chars in the array | <tt>filename</tt>: | an optional file name or URI | <tt>enc</tt>: | an optional encoding | <tt>Returns</tt>: | the new parser context or NULL |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>cur</tt>: | a pointer to a zero terminated string | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>fd</tt>: | an open file descriptor | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>filename</tt>: | a file or URL | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>ioread</tt>: | an I/O read function | <tt>ioclose</tt>: | an I/O close function | <tt>ioctx</tt>: | an I/O handler | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options)
|
|
Packit |
423ecb |
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>buffer</tt>: | a pointer to a char array | <tt>size</tt>: | the size of the array | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Reset a parser context
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Applies the options to the parser context
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | 0 in case of success, the set of unknown or unimplemented options in case of error. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements
|
|
Packit |
423ecb |
<tt>parent</tt>: | HTML parent element | <tt>elt</tt>: | HTML element | <tt>Returns</tt>: | 1 if allowed; 0 otherwise. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.
|
|
Packit |
423ecb |
<tt>parent</tt>: | HTML parent element | <tt>elt</tt>: | HTML element | <tt>Returns</tt>: | one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
|
|
Packit |
423ecb |
<tt>out</tt>: | a pointer to an array of bytes to store the result | <tt>outlen</tt>: | the length of @out | <tt>in</tt>: | a pointer to an array of UTF-8 chars | <tt>inlen</tt>: | the length of @in | <tt>quoteChar</tt>: | the quote character to escape (' or ") or zero. | <tt>Returns</tt>: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
|
|
Packit |
423ecb |
<tt>name</tt>: | the entity name | <tt>Returns</tt>: | the associated htmlEntityDescPtr if found, NULL otherwise. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
|
|
Packit |
423ecb |
<tt>value</tt>: | the entity's unicode value | <tt>Returns</tt>: | the associated htmlEntityDescPtr if found, NULL otherwise. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Set and return the previous value for handling HTML omitted tags.
|
|
Packit |
423ecb |
<tt>val</tt>: | int 0 or 1 | <tt>Returns</tt>: | the previous value: 0 for no handling, 1 for auto insertion. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
|
|
Packit |
423ecb |
<tt>doc</tt>: | the HTML document | <tt>elem</tt>: | the HTML element | <tt>Returns</tt>: | 1 if autoclosed, 0 otherwise |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Check if an attribute is of content type Script
|
|
Packit |
423ecb |
<tt>name</tt>: | an attribute name | <tt>Returns</tt>: | 1 if the attribute is a script, 0 otherwise |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Allocate and initialize a new parser context.
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>Returns</tt>: | the value parsed (as an int) |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse a Chunk of memory
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>chunk</tt>: | a char array | <tt>size</tt>: | the size in bytes of the chunk | <tt>terminate</tt>: | last chunk indicator | <tt>Returns</tt>: | zero if no error, the xmlParserErrors otherwise. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
parse an HTML in-memory document and build a tree.
|
|
Packit |
423ecb |
<tt>cur</tt>: | a pointer to an array of xmlChar | <tt>encoding</tt>: | a free form C string describing the HTML document encoding, or NULL | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
parse an HTML document (and build a tree if using the standard SAX interface).
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>Returns</tt>: | 0, or -1 in case of error. The parser context is augmented as a result of the parsing. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML element; this is highly recursive. This is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML entity reference. [68] EntityRef ::= '&' Name ';'
|
|
Packit |
423ecb |
<tt>ctxt</tt>: | an HTML parser context | <tt>str</tt>: | location to store the entity name | <tt>Returns</tt>: | the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlParseFile (const char * filename, const char * encoding)
|
|
Packit |
423ecb |
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
|
|
Packit |
423ecb |
<tt>filename</tt>: | the filename | <tt>encoding</tt>: | a free form C string describing the HTML document encoding, or NULL | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options)
|
|
Packit |
423ecb |
Parse an HTML in-memory document and build a tree.
|
|
Packit |
423ecb |
<tt>cur</tt>: | a pointer to a zero terminated string | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options)
|
|
Packit |
423ecb |
Parse an HTML document from a file descriptor and build a tree.
|
|
Packit |
423ecb |
<tt>fd</tt>: | an open file descriptor | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options)
|
|
Packit |
423ecb |
Parse an HTML file from the filesystem or the network.
|
|
Packit |
423ecb |
<tt>filename</tt>: | a file or URL | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
parse an HTML document from I/O functions and source and build a tree.
|
|
Packit |
423ecb |
<tt>ioread</tt>: | an I/O read function | <tt>ioclose</tt>: | an I/O close function | <tt>ioctx</tt>: | an I/O handler | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
htmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options)
|
|
Packit |
423ecb |
Parse an HTML in-memory document and build a tree.
|
|
Packit |
423ecb |
<tt>buffer</tt>: | a pointer to a char array | <tt>size</tt>: | the size of the array | <tt>URL</tt>: | the base URL to use for the document | <tt>encoding</tt>: | the document encoding, or NULL | <tt>options</tt>: | a combination of htmlParserOption(s) | <tt>Returns</tt>: | the resulting document tree |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
|
|
Packit |
423ecb |
<tt>cur</tt>: | a pointer to an array of xmlChar | <tt>encoding</tt>: | a free form C string describing the HTML document encoding, or NULL | <tt>sax</tt>: | the SAX handler block | <tt>userData</tt>: | if using SAX, this pointer will be provided on callbacks. | <tt>Returns</tt>: | the resulting document tree unless SAX is NULL or the document is not well formed. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callback. If sax is NULL, fallback to the default DOM tree building routines.
|
|
Packit |
423ecb |
<tt>filename</tt>: | the filename | <tt>encoding</tt>: | a free form C string describing the HTML document encoding, or NULL | <tt>sax</tt>: | the SAX handler block | <tt>userData</tt>: | if using SAX, this pointer will be provided on callbacks. | <tt>Returns</tt>: | the resulting document tree unless SAX is NULL or the document is not well formed. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
Lookup the HTML tag in the ElementTable
|
|
Packit |
423ecb |
<tt>tag</tt>: | The tag name in lowercase | <tt>Returns</tt>: | the related htmlElemDescPtr or NULL if not found. |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
|
|
Packit |
423ecb |
</body>
|
|
Packit |
423ecb |
</html>
|