Blame doc/devhelp/libxml2-encoding.html

Packit 423ecb
Packit 423ecb
<html>
Packit 423ecb
  <head>
Packit 423ecb
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
Packit 423ecb
    <title>encoding: interface for the encoding conversion functions</title>
Packit 423ecb
    <meta name="generator" content="Libxml2 devhelp stylesheet"/>
Packit 423ecb
    <link rel="start" href="index.html" title="libxml2 Reference Manual"/>
Packit 423ecb
    <link rel="up" href="general.html" title="API"/>
Packit 423ecb
    <link rel="stylesheet" href="style.css" type="text/css"/>
Packit 423ecb
    <link rel="chapter" href="general.html" title="API"/>
Packit 423ecb
  </head>
Packit 423ecb
  <body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
Packit 423ecb
    
Packit 423ecb
      
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Prev
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Up
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Home
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb
          
Packit 423ecb
            Next
Packit 423ecb
          
Packit 423ecb
        
Packit 423ecb
        libxml2 Reference Manual
Packit 423ecb
      
Packit 423ecb
    
Packit 423ecb
    

Packit 423ecb
      encoding
Packit 423ecb
    
Packit 423ecb
    

encoding - interface for the encoding conversion functions

Packit 423ecb
    

interface for the encoding conversion functions needed for XML basic encoding and iconv() support. Related specs are rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies [ISO-10646] UTF-8 and UTF-16 in Annexes [ISO-8859-1] ISO Latin-1 character codes. [UNICODE] The Unicode Consortium, "The Unicode Standard -- Worldwide Character Encoding -- Version 1.0", Addison-Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is described in Unicode Technical Report #4. [US-ASCII] Coded Character Set--7-bit American Standard Code for Information Interchange, ANSI X3.4-1986.

Packit 423ecb
    

Author(s): Daniel Veillard

Packit 423ecb
    
Packit 423ecb
      

Synopsis

Packit 423ecb
      
typedef struct _uconv_t uconv_t;
Packit 423ecb
typedef enum xmlCharEncoding;
Packit 423ecb
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
Packit 423ecb
typedef xmlCharEncodingHandler * xmlCharEncodingHandlerPtr;
Packit 423ecb
int	xmlDelEncodingAlias		(const char * alias);
Packit 423ecb
const char *	xmlGetEncodingAlias	(const char * alias);
Packit 423ecb
void	xmlRegisterCharEncodingHandler	(xmlCharEncodingHandlerPtr handler);
Packit 423ecb
int	UTF8Toisolat1			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
Packit 423ecb
void	xmlInitCharEncodingHandlers	(void);
Packit 423ecb
int	xmlAddEncodingAlias		(const char * name, 
const char * alias);
Packit 423ecb
void	xmlCleanupEncodingAliases	(void);
Packit 423ecb
int	xmlCharEncOutFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in);
Packit 423ecb
xmlCharEncoding	xmlParseCharEncoding	(const char * name);
Packit 423ecb
typedef int xmlCharEncodingInputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
Packit 423ecb
void	xmlCleanupCharEncodingHandlers	(void);
Packit 423ecb
xmlCharEncodingHandlerPtr	xmlNewCharEncodingHandler	(const char * name, 
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output);
Packit 423ecb
typedef int xmlCharEncodingOutputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
Packit 423ecb
int	isolat1ToUTF8			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
Packit 423ecb
xmlCharEncodingHandlerPtr	xmlFindCharEncodingHandler	(const char * name);
Packit 423ecb
int	xmlCharEncInFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in);
Packit 423ecb
xmlCharEncodingHandlerPtr	xmlGetCharEncodingHandler	(xmlCharEncoding enc);
Packit 423ecb
int	xmlCharEncFirstLine		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in);
Packit 423ecb
xmlCharEncoding	xmlDetectCharEncoding	(const unsigned char * in, 
int len);
Packit 423ecb
int	xmlCharEncCloseFunc		(xmlCharEncodingHandler * handler);
Packit 423ecb
const char *	xmlGetCharEncodingName	(xmlCharEncoding enc);
Packit 423ecb
Packit 423ecb
    
Packit 423ecb
    
Packit 423ecb
      

Description

Packit 423ecb
    
Packit 423ecb
    
Packit 423ecb
      

Details

Packit 423ecb
      
Packit 423ecb
        

Structure uconv_t

struct _uconv_t {
Packit 423ecb
    UConverter *	uconv	: for conversion between an encoding and UTF-16
Packit 423ecb
    UConverter *	utf8	: for conversion between UTF-8 and UTF-16
Packit 423ecb
} uconv_t;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Enum xmlCharEncoding

enum xmlCharEncoding {
Packit 423ecb
    XML_CHAR_ENCODING_ERROR = -1 /* No char encoding detected */
Packit 423ecb
    XML_CHAR_ENCODING_NONE = 0 /* No char encoding detected */
Packit 423ecb
    XML_CHAR_ENCODING_UTF8 = 1 /* UTF-8 */
Packit 423ecb
    XML_CHAR_ENCODING_UTF16LE = 2 /* UTF-16 little endian */
Packit 423ecb
    XML_CHAR_ENCODING_UTF16BE = 3 /* UTF-16 big endian */
Packit 423ecb
    XML_CHAR_ENCODING_UCS4LE = 4 /* UCS-4 little endian */
Packit 423ecb
    XML_CHAR_ENCODING_UCS4BE = 5 /* UCS-4 big endian */
Packit 423ecb
    XML_CHAR_ENCODING_EBCDIC = 6 /* EBCDIC uh! */
Packit 423ecb
    XML_CHAR_ENCODING_UCS4_2143 = 7 /* UCS-4 unusual ordering */
Packit 423ecb
    XML_CHAR_ENCODING_UCS4_3412 = 8 /* UCS-4 unusual ordering */
Packit 423ecb
    XML_CHAR_ENCODING_UCS2 = 9 /* UCS-2 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_1 = 10 /* ISO-8859-1 ISO Latin 1 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_2 = 11 /* ISO-8859-2 ISO Latin 2 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_3 = 12 /* ISO-8859-3 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_4 = 13 /* ISO-8859-4 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_5 = 14 /* ISO-8859-5 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_6 = 15 /* ISO-8859-6 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_7 = 16 /* ISO-8859-7 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_8 = 17 /* ISO-8859-8 */
Packit 423ecb
    XML_CHAR_ENCODING_8859_9 = 18 /* ISO-8859-9 */
Packit 423ecb
    XML_CHAR_ENCODING_2022_JP = 19 /* ISO-2022-JP */
Packit 423ecb
    XML_CHAR_ENCODING_SHIFT_JIS = 20 /* Shift_JIS */
Packit 423ecb
    XML_CHAR_ENCODING_EUC_JP = 21 /* EUC-JP */
Packit 423ecb
    XML_CHAR_ENCODING_ASCII = 22 /*  pure ASCII */
Packit 423ecb
};
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Structure xmlCharEncodingHandler

struct _xmlCharEncodingHandler {
Packit 423ecb
    char *	name
Packit 423ecb
    xmlCharEncodingInputFunc	input
Packit 423ecb
    xmlCharEncodingOutputFunc	output
Packit 423ecb
    iconv_t	iconv_in
Packit 423ecb
    iconv_t	iconv_out
Packit 423ecb
    uconv_t *	uconv_in
Packit 423ecb
    uconv_t *	uconv_out
Packit 423ecb
} xmlCharEncodingHandler;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        

Typedef xmlCharEncodingHandlerPtr

xmlCharEncodingHandler * xmlCharEncodingHandlerPtr;
Packit 423ecb

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of chars in the original encoding and try to convert it to an UTF-8 block of chars out.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the UTF-8 result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of chars in the original encoding
<tt>inlen</tt>:the length of @in
<tt>Returns</tt>:the number of bytes written, -1 if lack of space, or -2 if the transcoding failed. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of UTF-8 chars in and try to convert it to another encoding. Note: a first call designed to produce heading info is called with in = NULL. If stateful this should also initialize the encoder state.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of UTF-8 chars
<tt>inlen</tt>:the length of @in
<tt>Returns</tt>:the number of bytes written, -1 if lack of space, or -2 if the transcoding failed. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 block of chars out.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of UTF-8 chars
<tt>inlen</tt>:the length of @in
<tt>Returns</tt>:the number of bytes written if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 block of chars out.

Packit 423ecb
<tt>out</tt>:a pointer to an array of bytes to store the result
<tt>outlen</tt>:the length of @out
<tt>in</tt>:a pointer to an array of ISO Latin 1 chars
<tt>inlen</tt>:the length of @in
<tt>Returns</tt>:the number of bytes written if success, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Registers an alias @alias for an encoding named @name. Existing alias will be overwritten.

Packit 423ecb
<tt>name</tt>:the encoding name as parsed, in UTF-8 format (ASCII actually)
<tt>alias</tt>:the alias name as parsed, in UTF-8 format (ASCII actually)
<tt>Returns</tt>:0 in case of success, -1 in case of error
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Generic front-end for encoding handler close function

Packit 423ecb
<tt>handler</tt>:char encoding transformation data structure
<tt>Returns</tt>:0 if success, or -1 in case of error
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Front-end for the encoding handler input function, but handle only the very first line, i.e. limit itself to 45 chars.

Packit 423ecb
<tt>handler</tt>:char encoding transformation data structure
<tt>out</tt>:an xmlBuffer for the output.
<tt>in</tt>:an xmlBuffer for the input
<tt>Returns</tt>:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Generic front-end for the encoding handler input function

Packit 423ecb
<tt>handler</tt>:char encoding transformation data structure
<tt>out</tt>:an xmlBuffer for the output.
<tt>in</tt>:an xmlBuffer for the input
<tt>Returns</tt>:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Generic front-end for the encoding handler output function. A first call with @in == NULL has to be made first to initiate the output in case of non-stateless encoding needing to initiate their state or the output (like the BOM in UTF16). In case of UTF8 sequence conversion errors for the given encoder, the content will be automatically remapped to a CharRef sequence.

Packit 423ecb
<tt>handler</tt>:char encoding transformation data structure
<tt>out</tt>:an xmlBuffer for the output.
<tt>in</tt>:an xmlBuffer for the input
<tt>Returns</tt>:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Cleanup the memory allocated for the char encoding support, it unregisters all the encoding handlers and the aliases.

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Unregisters all aliases

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Unregisters an encoding alias @alias

Packit 423ecb
<tt>alias</tt>:the alias name as parsed, in UTF-8 format (ASCII actually)
<tt>Returns</tt>:0 in case of success, -1 in case of error
Packit 423ecb
        
Packit 423ecb
        

xmlDetectCharEncoding ()

xmlCharEncoding	xmlDetectCharEncoding	(const unsigned char * in, 
int len)
Packit 423ecb

Guess the encoding of the entity using the first bytes of the entity content according to the non-normative appendix F of the XML-1.0 recommendation.

Packit 423ecb
<tt>in</tt>:a pointer to the first bytes of the XML entity, must be at least 2 bytes long (at least 4 if encoding is a UCS4 variant).
<tt>len</tt>:the length of the buffer
<tt>Returns</tt>:one of the XML_CHAR_ENCODING_... values.
Packit 423ecb
        
Packit 423ecb
        

xmlFindCharEncodingHandler ()

xmlCharEncodingHandlerPtr	xmlFindCharEncodingHandler	(const char * name)
Packit 423ecb

Search in the registered set the handler able to read/write that encoding.

Packit 423ecb
<tt>name</tt>:a string describing the char encoding.
<tt>Returns</tt>:the handler or NULL if not found
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Search in the registered set the handler able to read/write that encoding.

Packit 423ecb
<tt>enc</tt>:an xmlCharEncoding value.
<tt>Returns</tt>:the handler or NULL if not found
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

The "canonical" name for XML encoding. C.f. http://www.w3.org/TR/REC-xml#charencoding Section 4.3.3 Character Encoding in Entities

Packit 423ecb
<tt>enc</tt>:the encoding
<tt>Returns</tt>:the canonical name for the given encoding
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Lookup an encoding name for the given alias.

Packit 423ecb
<tt>alias</tt>:the alias name as parsed, in UTF-8 format (ASCII actually)
<tt>Returns</tt>:NULL if not found, otherwise the original name
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Initialize the char encoding support; it registers the default encodings supported. NOTE: while public, this function usually doesn't need to be called in normal processing.

Packit 423ecb
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Create and register an xmlCharEncodingHandler.

Packit 423ecb
<tt>name</tt>:the encoding name, in UTF-8 format (ASCII actually)
<tt>input</tt>:the xmlCharEncodingInputFunc to read that encoding
<tt>output</tt>:the xmlCharEncodingOutputFunc to write that encoding
<tt>Returns</tt>:the xmlCharEncodingHandlerPtr created (or NULL in case of error).
Packit 423ecb
        
Packit 423ecb
        

xmlParseCharEncoding ()

xmlCharEncoding	xmlParseCharEncoding	(const char * name)
Packit 423ecb

Compare the string to the encoding schemes already known. Note that the comparison is case insensitive, according to section [XML] 4.3.3 Character Encoding in Entities.

Packit 423ecb
<tt>name</tt>:the encoding name as parsed, in UTF-8 format (ASCII actually)
<tt>Returns</tt>:one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE if not recognized.
Packit 423ecb
        
Packit 423ecb
        
Packit 423ecb

Register the char encoding handler, surprising, isn't it ?

Packit 423ecb
<tt>handler</tt>:the xmlCharEncodingHandlerPtr handler block
Packit 423ecb
        
Packit 423ecb
      
Packit 423ecb
    
Packit 423ecb
  </body>
Packit 423ecb
</html>