|
Packit |
d0f5c2 |
/*
|
|
Packit |
d0f5c2 |
$Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
|
|
Packit |
d0f5c2 |
*/
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define PERL_NO_GET_CONTEXT
|
|
Packit |
d0f5c2 |
#include "EXTERN.h"
|
|
Packit |
d0f5c2 |
#include "perl.h"
|
|
Packit |
d0f5c2 |
#include "XSUB.h"
|
|
Packit |
d0f5c2 |
#include "../Encode/encode.h"
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define FBCHAR 0xFFFd
|
|
Packit |
d0f5c2 |
#define BOM_BE 0xFeFF
|
|
Packit |
d0f5c2 |
#define BOM16LE 0xFFFe
|
|
Packit |
d0f5c2 |
#define BOM32LE 0xFFFe0000
|
|
Packit |
d0f5c2 |
#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
|
|
Packit |
d0f5c2 |
#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
|
|
Packit |
d0f5c2 |
#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
|
|
Packit |
d0f5c2 |
#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
/* For pre-5.14 source compatibility */
|
|
Packit |
d0f5c2 |
#ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
|
|
Packit |
d0f5c2 |
# define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
|
|
Packit |
d0f5c2 |
# define UTF8_DISALLOW_SURROGATE 0
|
|
Packit |
d0f5c2 |
# define UTF8_WARN_SURROGATE 0
|
|
Packit |
d0f5c2 |
# define UTF8_DISALLOW_FE_FF 0
|
|
Packit |
d0f5c2 |
# define UTF8_WARN_FE_FF 0
|
|
Packit |
d0f5c2 |
# define UTF8_WARN_NONCHAR 0
|
|
Packit |
d0f5c2 |
#endif
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
/* Avoid wasting too much space in the result buffer */
|
|
Packit |
d0f5c2 |
/* static void */
|
|
Packit |
d0f5c2 |
/* shrink_buffer(SV *result) */
|
|
Packit |
d0f5c2 |
/* { */
|
|
Packit |
d0f5c2 |
/* if (SvLEN(result) > 42 + SvCUR(result)) { */
|
|
Packit |
d0f5c2 |
/* char *buf; */
|
|
Packit |
d0f5c2 |
/* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
|
|
Packit |
d0f5c2 |
/* New(0, buf, len, char); */
|
|
Packit |
d0f5c2 |
/* Copy(SvPVX(result), buf, len, char); */
|
|
Packit |
d0f5c2 |
/* Safefree(SvPVX(result)); */
|
|
Packit |
d0f5c2 |
/* SvPV_set(result, buf); */
|
|
Packit |
d0f5c2 |
/* SvLEN_set(result, len); */
|
|
Packit |
d0f5c2 |
/* } */
|
|
Packit |
d0f5c2 |
/* } */
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define shrink_buffer(result) { \
|
|
Packit |
d0f5c2 |
if (SvLEN(result) > 42 + SvCUR(result)) { \
|
|
Packit |
d0f5c2 |
char *newpv; \
|
|
Packit |
d0f5c2 |
STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
|
|
Packit |
d0f5c2 |
New(0, newpv, newlen, char); \
|
|
Packit |
d0f5c2 |
Copy(SvPVX(result), newpv, newlen, char); \
|
|
Packit |
d0f5c2 |
Safefree(SvPVX(result)); \
|
|
Packit |
d0f5c2 |
SvPV_set(result, newpv); \
|
|
Packit |
d0f5c2 |
SvLEN_set(result, newlen); \
|
|
Packit |
d0f5c2 |
} \
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
static UV
|
|
Packit |
d0f5c2 |
enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
U8 *s = *sp;
|
|
Packit |
d0f5c2 |
UV v = 0;
|
|
Packit |
d0f5c2 |
if (s+size > e) {
|
|
Packit |
d0f5c2 |
croak("Partial character %c",(char) endian);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
switch(endian) {
|
|
Packit |
d0f5c2 |
case 'N':
|
|
Packit |
d0f5c2 |
v = *s++;
|
|
Packit |
d0f5c2 |
v = (v << 8) | *s++;
|
|
Packit |
d0f5c2 |
case 'n':
|
|
Packit |
d0f5c2 |
v = (v << 8) | *s++;
|
|
Packit |
d0f5c2 |
v = (v << 8) | *s++;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
case 'V':
|
|
Packit |
d0f5c2 |
case 'v':
|
|
Packit |
d0f5c2 |
v |= *s++;
|
|
Packit |
d0f5c2 |
v |= (*s++ << 8);
|
|
Packit |
d0f5c2 |
if (endian == 'v')
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
v |= (*s++ << 16);
|
|
Packit |
d0f5c2 |
v |= ((UV)*s++ << 24);
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
default:
|
|
Packit |
d0f5c2 |
croak("Unknown endian %c",(char) endian);
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
*sp = s;
|
|
Packit |
d0f5c2 |
return v;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
static void
|
|
Packit |
d0f5c2 |
enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
U8 *d = (U8 *) SvPV_nolen(result);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
switch(endian) {
|
|
Packit |
d0f5c2 |
case 'v':
|
|
Packit |
d0f5c2 |
case 'V':
|
|
Packit |
d0f5c2 |
d += SvCUR(result);
|
|
Packit |
d0f5c2 |
SvCUR_set(result,SvCUR(result)+size);
|
|
Packit |
d0f5c2 |
while (size--) {
|
|
Packit |
d0f5c2 |
*d++ = (U8)(value & 0xFF);
|
|
Packit |
d0f5c2 |
value >>= 8;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
case 'n':
|
|
Packit |
d0f5c2 |
case 'N':
|
|
Packit |
d0f5c2 |
SvCUR_set(result,SvCUR(result)+size);
|
|
Packit |
d0f5c2 |
d += SvCUR(result);
|
|
Packit |
d0f5c2 |
while (size--) {
|
|
Packit |
d0f5c2 |
*--d = (U8)(value & 0xFF);
|
|
Packit |
d0f5c2 |
value >>= 8;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
default:
|
|
Packit |
d0f5c2 |
croak("Unknown endian %c",(char) endian);
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MODULE = Encode::Unicode PACKAGE = Encode::Unicode
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
PROTOTYPES: DISABLE
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
void
|
|
Packit |
d0f5c2 |
decode(obj, str, check = 0)
|
|
Packit |
d0f5c2 |
SV * obj
|
|
Packit |
d0f5c2 |
SV * str
|
|
Packit |
d0f5c2 |
IV check
|
|
Packit |
d0f5c2 |
CODE:
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
SV *sve = attr("endian", 6);
|
|
Packit |
d0f5c2 |
U8 endian = *((U8 *)SvPV_nolen(sve));
|
|
Packit |
d0f5c2 |
SV *svs = attr("size", 4);
|
|
Packit |
d0f5c2 |
int size = SvIV(svs);
|
|
Packit |
d0f5c2 |
int ucs2 = -1; /* only needed in the event of surrogate pairs */
|
|
Packit |
d0f5c2 |
SV *result = newSVpvn("",0);
|
|
Packit |
d0f5c2 |
STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
|
|
Packit |
d0f5c2 |
STRLEN ulen;
|
|
Packit |
d0f5c2 |
STRLEN resultbuflen;
|
|
Packit |
d0f5c2 |
U8 *resultbuf;
|
|
Packit |
d0f5c2 |
U8 *s;
|
|
Packit |
d0f5c2 |
U8 *e;
|
|
Packit |
d0f5c2 |
bool modify = (check && !(check & ENCODE_LEAVE_SRC));
|
|
Packit |
d0f5c2 |
bool temp_result;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
SvGETMAGIC(str);
|
|
Packit |
d0f5c2 |
if (!SvOK(str))
|
|
Packit |
d0f5c2 |
XSRETURN_UNDEF;
|
|
Packit |
d0f5c2 |
s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
|
|
Packit |
d0f5c2 |
if (SvUTF8(str)) {
|
|
Packit |
d0f5c2 |
if (!modify) {
|
|
Packit |
d0f5c2 |
SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
|
|
Packit |
d0f5c2 |
SvUTF8_on(tmp);
|
|
Packit |
d0f5c2 |
if (SvTAINTED(str))
|
|
Packit |
d0f5c2 |
SvTAINTED_on(tmp);
|
|
Packit |
d0f5c2 |
str = tmp;
|
|
Packit |
d0f5c2 |
s = (U8 *)SvPVX(str);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (ulen) {
|
|
Packit |
d0f5c2 |
if (!utf8_to_bytes(s, &ulen))
|
|
Packit |
d0f5c2 |
croak("Wide character");
|
|
Packit |
d0f5c2 |
SvCUR_set(str, ulen);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
SvUTF8_off(str);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
e = s+ulen;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
/* Optimise for the common case of being called from PerlIOEncode_fill()
|
|
Packit |
d0f5c2 |
with a standard length buffer. In this case the result SV's buffer is
|
|
Packit |
d0f5c2 |
only used temporarily, so we can afford to allocate the maximum needed
|
|
Packit |
d0f5c2 |
and not care about unused space. */
|
|
Packit |
d0f5c2 |
temp_result = (ulen == PERLIO_BUFSIZ);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
ST(0) = sv_2mortal(result);
|
|
Packit |
d0f5c2 |
SvUTF8_on(result);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (!endian && s+size <= e) {
|
|
Packit |
d0f5c2 |
SV *sv;
|
|
Packit |
d0f5c2 |
UV bom;
|
|
Packit |
d0f5c2 |
endian = (size == 4) ? 'N' : 'n';
|
|
Packit |
d0f5c2 |
bom = enc_unpack(aTHX_ &s,e,size,endian);
|
|
Packit |
d0f5c2 |
if (bom != BOM_BE) {
|
|
Packit |
d0f5c2 |
if (bom == BOM16LE) {
|
|
Packit |
d0f5c2 |
endian = 'v';
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else if (bom == BOM32LE) {
|
|
Packit |
d0f5c2 |
endian = 'V';
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
/* No BOM found, use big-endian fallback as specified in
|
|
Packit |
d0f5c2 |
* RFC2781 and the Unicode Standard version 8.0:
|
|
Packit |
d0f5c2 |
*
|
|
Packit |
d0f5c2 |
* The UTF-16 encoding scheme may or may not begin with
|
|
Packit |
d0f5c2 |
* a BOM. However, when there is no BOM, and in the
|
|
Packit |
d0f5c2 |
* absence of a higher-level protocol, the byte order
|
|
Packit |
d0f5c2 |
* of the UTF-16 encoding scheme is big-endian.
|
|
Packit |
d0f5c2 |
*
|
|
Packit |
d0f5c2 |
* If the first two octets of the text is not 0xFE
|
|
Packit |
d0f5c2 |
* followed by 0xFF, and is not 0xFF followed by 0xFE,
|
|
Packit |
d0f5c2 |
* then the text SHOULD be interpreted as big-endian.
|
|
Packit |
d0f5c2 |
*/
|
|
Packit |
d0f5c2 |
s -= size;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
#if 1
|
|
Packit |
d0f5c2 |
/* Update endian for next sequence */
|
|
Packit |
d0f5c2 |
sv = attr("renewed", 7);
|
|
Packit |
d0f5c2 |
if (SvTRUE(sv)) {
|
|
Packit |
d0f5c2 |
(void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
#endif
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (temp_result) {
|
|
Packit |
d0f5c2 |
resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
|
|
Packit |
d0f5c2 |
} else {
|
|
Packit |
d0f5c2 |
/* Preallocate the buffer to the minimum possible space required. */
|
|
Packit |
d0f5c2 |
resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
resultbuf = (U8 *) SvGROW(result, resultbuflen);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
while (s < e && s+size <= e) {
|
|
Packit |
d0f5c2 |
UV ord = enc_unpack(aTHX_ &s,e,size,endian);
|
|
Packit |
d0f5c2 |
U8 *d;
|
|
Packit |
d0f5c2 |
if (issurrogate(ord)) {
|
|
Packit |
d0f5c2 |
if (ucs2 == -1) {
|
|
Packit |
d0f5c2 |
SV *sv = attr("ucs2", 4);
|
|
Packit |
d0f5c2 |
ucs2 = SvTRUE(sv);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (ucs2 || size == 4) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":no surrogates allowed %" UVxf,
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|
Packit |
d0f5c2 |
ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
ord = FBCHAR;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
UV lo;
|
|
Packit |
d0f5c2 |
if (!isHiSurrogate(ord)) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":Malformed HI surrogate %" UVxf,
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|
Packit |
d0f5c2 |
ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
ord = FBCHAR;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else if (s+size > e) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
if (check & ENCODE_STOP_AT_PARTIAL) {
|
|
Packit |
d0f5c2 |
s -= size;
|
|
Packit |
d0f5c2 |
break;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":Malformed HI surrogate %" UVxf,
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|
Packit |
d0f5c2 |
ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
ord = FBCHAR;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
lo = enc_unpack(aTHX_ &s,e,size,endian);
|
|
Packit |
d0f5c2 |
if (!isLoSurrogate(lo)) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":Malformed LO surrogate %" UVxf,
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|
Packit |
d0f5c2 |
ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
s -= size;
|
|
Packit |
d0f5c2 |
ord = FBCHAR;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":Unicode character %" UVxf " is illegal",
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|
Packit |
d0f5c2 |
ord);
|
|
Packit |
d0f5c2 |
} else {
|
|
Packit |
d0f5c2 |
ord = FBCHAR;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
|
|
Packit |
d0f5c2 |
/* Do not allocate >8Mb more than the minimum needed.
|
|
Packit |
d0f5c2 |
This prevents allocating too much in the rogue case of a large
|
|
Packit |
d0f5c2 |
input consisting initially of long sequence uft8-byte unicode
|
|
Packit |
d0f5c2 |
chars followed by single utf8-byte chars. */
|
|
Packit |
d0f5c2 |
/* +1
|
|
Packit |
d0f5c2 |
fixes Unicode.xs!decode_xs n-byte heap-overflow
|
|
Packit |
d0f5c2 |
*/
|
|
Packit |
d0f5c2 |
STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
|
|
Packit |
d0f5c2 |
STRLEN max_alloc = remaining + (8*1024*1024);
|
|
Packit |
d0f5c2 |
STRLEN est_alloc = remaining * UTF8_MAXLEN;
|
|
Packit |
d0f5c2 |
STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
|
|
Packit |
d0f5c2 |
(est_alloc > max_alloc ? max_alloc : est_alloc);
|
|
Packit |
d0f5c2 |
resultbuf = (U8 *) SvGROW(result, newlen);
|
|
Packit |
d0f5c2 |
resultbuflen = SvLEN(result);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
|
|
Packit |
d0f5c2 |
UNICODE_WARN_ILLEGAL_INTERCHANGE);
|
|
Packit |
d0f5c2 |
SvCUR_set(result, d - (U8 *)SvPVX(result));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (s < e) {
|
|
Packit |
d0f5c2 |
/* unlikely to happen because it's fixed-length -- dankogai */
|
|
Packit |
d0f5c2 |
if (check & ENCODE_WARN_ON_ERR) {
|
|
Packit |
d0f5c2 |
Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character",
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (check && !(check & ENCODE_LEAVE_SRC)) {
|
|
Packit |
d0f5c2 |
if (s < e) {
|
|
Packit |
d0f5c2 |
Move(s,SvPVX(str),e-s,U8);
|
|
Packit |
d0f5c2 |
SvCUR_set(str,(e-s));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
SvCUR_set(str,0);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
*SvEND(str) = '\0';
|
|
Packit |
d0f5c2 |
SvSETMAGIC(str);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (!temp_result) shrink_buffer(result);
|
|
Packit |
d0f5c2 |
if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
|
|
Packit |
d0f5c2 |
XSRETURN(1);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
void
|
|
Packit |
d0f5c2 |
encode(obj, utf8, check = 0)
|
|
Packit |
d0f5c2 |
SV * obj
|
|
Packit |
d0f5c2 |
SV * utf8
|
|
Packit |
d0f5c2 |
IV check
|
|
Packit |
d0f5c2 |
CODE:
|
|
Packit |
d0f5c2 |
{
|
|
Packit |
d0f5c2 |
SV *sve = attr("endian", 6);
|
|
Packit |
d0f5c2 |
U8 endian = *((U8 *)SvPV_nolen(sve));
|
|
Packit |
d0f5c2 |
SV *svs = attr("size", 4);
|
|
Packit |
d0f5c2 |
const int size = SvIV(svs);
|
|
Packit |
d0f5c2 |
int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
|
|
Packit |
d0f5c2 |
const STRLEN usize = (size > 0 ? size : 1);
|
|
Packit |
d0f5c2 |
SV *result = newSVpvn("", 0);
|
|
Packit |
d0f5c2 |
STRLEN ulen;
|
|
Packit |
d0f5c2 |
U8 *s;
|
|
Packit |
d0f5c2 |
U8 *e;
|
|
Packit |
d0f5c2 |
bool modify = (check && !(check & ENCODE_LEAVE_SRC));
|
|
Packit |
d0f5c2 |
bool temp_result;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
SvGETMAGIC(utf8);
|
|
Packit |
d0f5c2 |
if (!SvOK(utf8))
|
|
Packit |
d0f5c2 |
XSRETURN_UNDEF;
|
|
Packit |
d0f5c2 |
s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
|
|
Packit |
d0f5c2 |
if (!SvUTF8(utf8)) {
|
|
Packit |
d0f5c2 |
if (!modify) {
|
|
Packit |
d0f5c2 |
SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
|
|
Packit |
d0f5c2 |
if (SvTAINTED(utf8))
|
|
Packit |
d0f5c2 |
SvTAINTED_on(tmp);
|
|
Packit |
d0f5c2 |
utf8 = tmp;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
sv_utf8_upgrade_nomg(utf8);
|
|
Packit |
d0f5c2 |
s = (U8 *)SvPV_nomg(utf8, ulen);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
e = s+ulen;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
/* Optimise for the common case of being called from PerlIOEncode_flush()
|
|
Packit |
d0f5c2 |
with a standard length buffer. In this case the result SV's buffer is
|
|
Packit |
d0f5c2 |
only used temporarily, so we can afford to allocate the maximum needed
|
|
Packit |
d0f5c2 |
and not care about unused space. */
|
|
Packit |
d0f5c2 |
temp_result = (ulen == PERLIO_BUFSIZ);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
ST(0) = sv_2mortal(result);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
/* Preallocate the result buffer to the maximum possible size.
|
|
Packit |
d0f5c2 |
ie. assume each UTF8 byte is 1 character.
|
|
Packit |
d0f5c2 |
Then shrink the result's buffer if necesary at the end. */
|
|
Packit |
d0f5c2 |
SvGROW(result, ((ulen+1) * usize));
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (!endian) {
|
|
Packit |
d0f5c2 |
SV *sv;
|
|
Packit |
d0f5c2 |
endian = (size == 4) ? 'N' : 'n';
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,BOM_BE);
|
|
Packit |
d0f5c2 |
#if 1
|
|
Packit |
d0f5c2 |
/* Update endian for next sequence */
|
|
Packit |
d0f5c2 |
sv = attr("renewed", 7);
|
|
Packit |
d0f5c2 |
if (SvTRUE(sv)) {
|
|
Packit |
d0f5c2 |
(void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
#endif
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
while (s < e && s+UTF8SKIP(s) <= e) {
|
|
Packit |
d0f5c2 |
STRLEN len;
|
|
Packit |
d0f5c2 |
UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
|
|
Packit |
d0f5c2 |
|UTF8_WARN_SURROGATE
|
|
Packit |
d0f5c2 |
|UTF8_DISALLOW_FE_FF
|
|
Packit |
d0f5c2 |
|UTF8_WARN_FE_FF
|
|
Packit |
d0f5c2 |
|UTF8_WARN_NONCHAR));
|
|
Packit |
d0f5c2 |
s += len;
|
|
Packit |
d0f5c2 |
if (size != 4 && invalid_ucs2(ord)) {
|
|
Packit |
d0f5c2 |
if (!issurrogate(ord)) {
|
|
Packit |
d0f5c2 |
if (ucs2 == -1) {
|
|
Packit |
d0f5c2 |
SV *sv = attr("ucs2", 4);
|
|
Packit |
d0f5c2 |
ucs2 = SvTRUE(sv);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (ucs2 || ord > 0x10FFFF) {
|
|
Packit |
d0f5c2 |
if (check) {
|
|
Packit |
d0f5c2 |
croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,FBCHAR);
|
|
Packit |
d0f5c2 |
} else {
|
|
Packit |
d0f5c2 |
UV hi = ((ord - 0x10000) >> 10) + 0xD800;
|
|
Packit |
d0f5c2 |
UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,hi);
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,lo);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
/* not supposed to happen */
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,FBCHAR);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
enc_pack(aTHX_ result,size,endian,ord);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (s < e) {
|
|
Packit |
d0f5c2 |
/* UTF-8 partial char happens often on PerlIO.
|
|
Packit |
d0f5c2 |
Since this is okay and normal, we do not warn.
|
|
Packit |
d0f5c2 |
But this is critical when you choose to LEAVE_SRC
|
|
Packit |
d0f5c2 |
in which case we die */
|
|
Packit |
d0f5c2 |
if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
|
|
Packit |
d0f5c2 |
Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
|
|
Packit |
d0f5c2 |
"when CHECK = 0x%" UVuf,
|
|
Packit |
d0f5c2 |
*hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
if (check && !(check & ENCODE_LEAVE_SRC)) {
|
|
Packit |
d0f5c2 |
if (s < e) {
|
|
Packit |
d0f5c2 |
Move(s,SvPVX(utf8),e-s,U8);
|
|
Packit |
d0f5c2 |
SvCUR_set(utf8,(e-s));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
else {
|
|
Packit |
d0f5c2 |
SvCUR_set(utf8,0);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
*SvEND(utf8) = '\0';
|
|
Packit |
d0f5c2 |
SvSETMAGIC(utf8);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
if (!temp_result) shrink_buffer(result);
|
|
Packit |
d0f5c2 |
if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
XSRETURN(1);
|
|
Packit |
d0f5c2 |
}
|