|
Packit Service |
8264ee |
/*
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Embedded Linux library
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Copyright (C) 2011-2014 Intel Corporation. All rights reserved.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* This library is free software; you can redistribute it and/or
|
|
Packit Service |
8264ee |
* modify it under the terms of the GNU Lesser General Public
|
|
Packit Service |
8264ee |
* License as published by the Free Software Foundation; either
|
|
Packit Service |
8264ee |
* version 2.1 of the License, or (at your option) any later version.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* This library is distributed in the hope that it will be useful,
|
|
Packit Service |
8264ee |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit Service |
8264ee |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Packit Service |
8264ee |
* Lesser General Public License for more details.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* You should have received a copy of the GNU Lesser General Public
|
|
Packit Service |
8264ee |
* License along with this library; if not, write to the Free Software
|
|
Packit Service |
8264ee |
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
*/
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
#ifdef HAVE_CONFIG_H
|
|
Packit Service |
8264ee |
#include <config.h>
|
|
Packit Service |
8264ee |
#endif
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
#include <stdio.h>
|
|
Packit Service |
8264ee |
#include <wchar.h>
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
#include "util.h"
|
|
Packit Service |
8264ee |
#include "strv.h"
|
|
Packit Service |
8264ee |
#include "utf8.h"
|
|
Packit Service |
8264ee |
#include "private.h"
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
 * SECTION:utf8
 * @short_description: UTF-8 utility functions
 *
 * UTF-8 string handling support
 */
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
LIB_EXPORT unsigned char l_ascii_table[256] = {
|
|
Packit Service |
8264ee |
[0x00 ... 0x08] = L_ASCII_CNTRL,
|
|
Packit Service |
8264ee |
[0x09 ... 0x0D] = L_ASCII_CNTRL | L_ASCII_SPACE,
|
|
Packit Service |
8264ee |
[0x0E ... 0x1F] = L_ASCII_CNTRL,
|
|
Packit Service |
8264ee |
[0x20] = L_ASCII_PRINT | L_ASCII_SPACE,
|
|
Packit Service |
8264ee |
[0x21 ... 0x2F] = L_ASCII_PRINT | L_ASCII_PUNCT,
|
|
Packit Service |
8264ee |
[0x30 ... 0x39] = L_ASCII_DIGIT | L_ASCII_XDIGIT | L_ASCII_PRINT,
|
|
Packit Service |
8264ee |
[0x3A ... 0x40] = L_ASCII_PRINT | L_ASCII_PUNCT,
|
|
Packit Service |
8264ee |
[0x41 ... 0x46] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_UPPER,
|
|
Packit Service |
8264ee |
[0x47 ... 0x5A] = L_ASCII_PRINT | L_ASCII_UPPER,
|
|
Packit Service |
8264ee |
[0x5B ... 0x60] = L_ASCII_PRINT | L_ASCII_PUNCT,
|
|
Packit Service |
8264ee |
[0x61 ... 0x66] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_LOWER,
|
|
Packit Service |
8264ee |
[0x67 ... 0x7A] = L_ASCII_PRINT | L_ASCII_LOWER,
|
|
Packit Service |
8264ee |
[0x7B ... 0x7E] = L_ASCII_PRINT | L_ASCII_PUNCT,
|
|
Packit Service |
8264ee |
[0x7F] = L_ASCII_CNTRL,
|
|
Packit Service |
8264ee |
[0x80 ... 0xFF] = 0,
|
|
Packit Service |
8264ee |
};
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/*
 * Returns true if @c may appear in well-formed text: it must lie in
 * 0..0x10ffff, must not be a UTF-16 surrogate (0xd800-0xdfff) and must not
 * be a noncharacter (0xfdd0-0xfdef, or any value whose low 16 bits are
 * 0xfffe or 0xffff).
 */
static inline bool __attribute__ ((always_inline))
valid_unicode(wchar_t c)
{
	/*
	 * wchar_t may be a signed type.  Compare as unsigned so that a
	 * negative value is treated as out of range instead of passing the
	 * first "<= 0xd7ff" test.
	 */
	uint32_t cp = (uint32_t) c;

	if (cp <= 0xd7ff)
		return true;

	/* Surrogate range, or beyond the last Unicode code point */
	if (cp < 0xe000 || cp > 0x10ffff)
		return false;

	/* Noncharacter block inside the BMP */
	if (cp >= 0xfdd0 && cp <= 0xfdef)
		return false;

	/* The last two code points of every plane are noncharacters */
	if ((cp & 0xfffe) == 0xfffe)
		return false;

	return true;
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_get_codepoint
|
|
Packit Service |
8264ee |
* @str: a pointer to codepoint data
|
|
Packit Service |
8264ee |
* @len: maximum bytes to read
|
|
Packit Service |
8264ee |
* @cp: destination for codepoint
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: number of bytes read, or -1 for invalid coddepoint
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT int l_utf8_get_codepoint(const char *str, size_t len, wchar_t *cp)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
static const wchar_t mins[3] = { 1 << 7, 1 << 11, 1 << 16 };
|
|
Packit Service |
8264ee |
unsigned int expect_bytes;
|
|
Packit Service |
8264ee |
wchar_t val;
|
|
Packit Service |
8264ee |
size_t i;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (len == 0)
|
|
Packit Service |
8264ee |
return 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if ((signed char) str[0] > 0) {
|
|
Packit Service |
8264ee |
*cp = str[0];
|
|
Packit Service |
8264ee |
return 1;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
expect_bytes = __builtin_clz(~((unsigned char)str[0] << 24));
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (expect_bytes < 2 || expect_bytes > 4)
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (expect_bytes > len)
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
val = str[0] & (0xff >> (expect_bytes + 1));
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
for (i = 1; i < expect_bytes; i++) {
|
|
Packit Service |
8264ee |
if ((str[i] & 0xc0) != 0x80)
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
val <<= 6;
|
|
Packit Service |
8264ee |
val |= str[i] & 0x3f;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (val < mins[expect_bytes - 2])
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (valid_unicode(val) == false)
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
*cp = val;
|
|
Packit Service |
8264ee |
return expect_bytes;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
error:
|
|
Packit Service |
8264ee |
return -1;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_validate:
|
|
Packit Service |
8264ee |
* @str: a pointer to character data
|
|
Packit Service |
8264ee |
* @len: max bytes to validate
|
|
Packit Service |
8264ee |
* @end: return location for end of valid data
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Validates UTF-8 encoded text. If @end is non-NULL, then the end of
|
|
Packit Service |
8264ee |
* the valid range will be stored there (i.e. the start of the first
|
|
Packit Service |
8264ee |
* invalid character if some bytes were invalid, or the end of the text
|
|
Packit Service |
8264ee |
* being validated otherwise).
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: Whether the text was valid UTF-8
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT bool l_utf8_validate(const char *str, size_t len, const char **end)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
size_t pos = 0;
|
|
Packit Service |
8264ee |
int ret;
|
|
Packit Service |
8264ee |
wchar_t val;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (pos < len && str[pos]) {
|
|
Packit Service |
8264ee |
ret = l_utf8_get_codepoint(str + pos, len - pos, &val;;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (ret < 0)
|
|
Packit Service |
8264ee |
goto error;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
pos += ret;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
error:
|
|
Packit Service |
8264ee |
if (end)
|
|
Packit Service |
8264ee |
*end = str + pos;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (pos != len)
|
|
Packit Service |
8264ee |
return false;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return true;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_strlen:
|
|
Packit Service |
8264ee |
* @str: a pointer to character data
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Computes the number of UTF-8 characters (not bytes) in the string given
|
|
Packit Service |
8264ee |
* by @str.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: The number of UTF-8 characters in the string
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT size_t l_utf8_strlen(const char *str)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
size_t l = 0;
|
|
Packit Service |
8264ee |
size_t i;
|
|
Packit Service |
8264ee |
unsigned char b;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
for (i = 0; str[i]; i++) {
|
|
Packit Service |
8264ee |
b = str[i];
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if ((b >> 6) == 2)
|
|
Packit Service |
8264ee |
l += 1;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return i - l;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/*
 * Returns the number of bytes (1 to 4) used to encode @c in UTF-8.
 * @c is not checked for validity.
 */
static inline int __attribute__ ((always_inline))
utf8_length(wchar_t c)
{
	if (c > 0xffff)
		return 4;

	if (c > 0x7ff)
		return 3;

	if (c > 0x7f)
		return 2;

	return 1;
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/*
 * Combines a UTF-16 high surrogate @h (0xd800-0xdbff) and low surrogate @l
 * (0xdc00-0xdfff) into the code point they encode.  The result is always
 * >= 0x10000, so it is returned as wchar_t; a 16-bit return type would
 * truncate it.
 */
static inline wchar_t __attribute__ ((always_inline))
surrogate_value(uint16_t h, uint16_t l)
{
	return 0x10000 + (h - 0xd800) * 0x400 + l - 0xdc00;
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/*
|
|
Packit Service |
8264ee |
* l_utf8_from_wchar:
|
|
Packit Service |
8264ee |
* @c: a wide-character to convert
|
|
Packit Service |
8264ee |
* @out_buf: Buffer to write out to
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Assumes c is valid unicode and out_buf contains enough space for a single
|
|
Packit Service |
8264ee |
* utf8 character (maximum 4 bytes)
|
|
Packit Service |
8264ee |
* Returns: number of characters written
|
|
Packit Service |
8264ee |
*/
|
|
Packit Service |
8264ee |
LIB_EXPORT size_t l_utf8_from_wchar(wchar_t c, char *out_buf)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
int len = utf8_length(c);
|
|
Packit Service |
8264ee |
int i;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (len == 1) {
|
|
Packit Service |
8264ee |
out_buf[0] = c;
|
|
Packit Service |
8264ee |
return 1;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
for (i = len - 1; i; i--) {
|
|
Packit Service |
8264ee |
out_buf[i] = (c & 0x3f) | 0x80;
|
|
Packit Service |
8264ee |
c >>= 6;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
out_buf[0] = (0xff << (8 - len)) | c;
|
|
Packit Service |
8264ee |
return len;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_from_utf16:
|
|
Packit Service |
8264ee |
* @utf16: Array of UTF16 characters
|
|
Packit Service |
8264ee |
* @utf16_size: The size of the @utf16 array in bytes. Must be a multiple of 2.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: A newly-allocated buffer containing UTF16 encoded string converted
|
|
Packit Service |
8264ee |
* to UTF8. The UTF8 string will always be null terminated, even if the
|
|
Packit Service |
8264ee |
* original UTF16 string was not.
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT char *l_utf8_from_utf16(const void *utf16, ssize_t utf16_size)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
char *utf8;
|
|
Packit Service |
8264ee |
size_t utf8_len = 0;
|
|
Packit Service |
8264ee |
wchar_t high_surrogate = 0;
|
|
Packit Service |
8264ee |
ssize_t i = 0;
|
|
Packit Service |
8264ee |
uint16_t in;
|
|
Packit Service |
8264ee |
wchar_t c;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (unlikely(utf16_size % 2))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (utf16_size < 0 || i < utf16_size) {
|
|
Packit Service |
8264ee |
in = l_get_u16(utf16 + i);
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!in)
|
|
Packit Service |
8264ee |
break;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (in >= 0xdc00 && in < 0xe000) {
|
|
Packit Service |
8264ee |
if (high_surrogate)
|
|
Packit Service |
8264ee |
c = surrogate_value(high_surrogate, in);
|
|
Packit Service |
8264ee |
else
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
high_surrogate = 0;
|
|
Packit Service |
8264ee |
} else {
|
|
Packit Service |
8264ee |
if (high_surrogate)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (in >= 0xd800 && in < 0xdc00) {
|
|
Packit Service |
8264ee |
high_surrogate = in;
|
|
Packit Service |
8264ee |
goto next;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
c = in;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!valid_unicode(c))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8_len += utf8_length(c);
|
|
Packit Service |
8264ee |
next:
|
|
Packit Service |
8264ee |
i += 2;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (high_surrogate)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8 = l_malloc(utf8_len + 1);
|
|
Packit Service |
8264ee |
utf8_len = 0;
|
|
Packit Service |
8264ee |
i = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (utf16_size < 0 || i < utf16_size) {
|
|
Packit Service |
8264ee |
in = l_get_u16(utf16 + i);
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!in)
|
|
Packit Service |
8264ee |
break;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (in >= 0xd800 && in < 0xdc00) {
|
|
Packit Service |
8264ee |
high_surrogate = in;
|
|
Packit Service |
8264ee |
i += 2;
|
|
Packit Service |
8264ee |
in = l_get_u16(utf16 + i);
|
|
Packit Service |
8264ee |
c = surrogate_value(high_surrogate, in);
|
|
Packit Service |
8264ee |
} else
|
|
Packit Service |
8264ee |
c = in;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8_len += l_utf8_from_wchar(c, utf8 + utf8_len);
|
|
Packit Service |
8264ee |
i += 2;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8[utf8_len] = '\0';
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return utf8;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_to_utf16:
|
|
Packit Service |
8264ee |
* @utf8: UTF8 formatted string
|
|
Packit Service |
8264ee |
* @out_size: The size in bytes of the converted utf16 string
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Converts a UTF8 formatted string to UTF16. It is assumed that the string
|
|
Packit Service |
8264ee |
* is valid UTF8 and no sanity checking is performed.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: A newly-allocated buffer containing UTF8 encoded string converted
|
|
Packit Service |
8264ee |
* to UTF16. The UTF16 string will always be null terminated.
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT void *l_utf8_to_utf16(const char *utf8, size_t *out_size)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
const char *c;
|
|
Packit Service |
8264ee |
wchar_t wc;
|
|
Packit Service |
8264ee |
int len;
|
|
Packit Service |
8264ee |
uint16_t *utf16;
|
|
Packit Service |
8264ee |
size_t n_utf16;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (unlikely(!utf8))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
c = utf8;
|
|
Packit Service |
8264ee |
n_utf16 = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (*c) {
|
|
Packit Service |
8264ee |
len = l_utf8_get_codepoint(c, 4, &wc);
|
|
Packit Service |
8264ee |
if (len < 0)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (wc < 0x10000)
|
|
Packit Service |
8264ee |
n_utf16 += 1;
|
|
Packit Service |
8264ee |
else
|
|
Packit Service |
8264ee |
n_utf16 += 2;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
c += len;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf16 = l_malloc((n_utf16 + 1) * 2);
|
|
Packit Service |
8264ee |
c = utf8;
|
|
Packit Service |
8264ee |
n_utf16 = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (*c) {
|
|
Packit Service |
8264ee |
len = l_utf8_get_codepoint(c, 4, &wc);
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (wc >= 0x10000) {
|
|
Packit Service |
8264ee |
utf16[n_utf16++] = (wc - 0x1000) / 0x400 + 0xd800;
|
|
Packit Service |
8264ee |
utf16[n_utf16++] = (wc - 0x1000) % 0x400 + 0xdc00;
|
|
Packit Service |
8264ee |
} else
|
|
Packit Service |
8264ee |
utf16[n_utf16++] = wc;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
c += len;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf16[n_utf16] = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (out_size)
|
|
Packit Service |
8264ee |
*out_size = (n_utf16 + 1) * 2;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return utf16;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_from_ucs2be:
|
|
Packit Service |
8264ee |
* @ucs2be: Array of UCS2 characters in big-endian format
|
|
Packit Service |
8264ee |
* @ucs2be_size: The size of the @ucs2 array in bytes. Must be a multiple of 2.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: A newly-allocated buffer containing UCS2BE encoded string converted
|
|
Packit Service |
8264ee |
* to UTF8. The UTF8 string will always be null terminated, even if the
|
|
Packit Service |
8264ee |
* original UCS2BE string was not.
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT char *l_utf8_from_ucs2be(const void *ucs2be, ssize_t ucs2be_size)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
char *utf8;
|
|
Packit Service |
8264ee |
size_t utf8_len = 0;
|
|
Packit Service |
8264ee |
ssize_t i = 0;
|
|
Packit Service |
8264ee |
uint16_t in;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (unlikely(ucs2be_size % 2))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (ucs2be_size < 0 || i < ucs2be_size) {
|
|
Packit Service |
8264ee |
in = l_get_be16(ucs2be + i);
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!in)
|
|
Packit Service |
8264ee |
break;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (in >= 0xd800 && in < 0xe000)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!valid_unicode(in))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8_len += utf8_length(in);
|
|
Packit Service |
8264ee |
i += 2;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8 = l_malloc(utf8_len + 1);
|
|
Packit Service |
8264ee |
utf8_len = 0;
|
|
Packit Service |
8264ee |
i = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (ucs2be_size < 0 || i < ucs2be_size) {
|
|
Packit Service |
8264ee |
in = l_get_be16(ucs2be + i);
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (!in)
|
|
Packit Service |
8264ee |
break;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8_len += l_utf8_from_wchar(in, utf8 + utf8_len);
|
|
Packit Service |
8264ee |
i += 2;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
utf8[utf8_len] = '\0';
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return utf8;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
/**
|
|
Packit Service |
8264ee |
* l_utf8_to_ucs2be:
|
|
Packit Service |
8264ee |
* @utf8: UTF8 formatted string
|
|
Packit Service |
8264ee |
* @out_size: The size in bytes of the converted ucs2be string
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Converts a UTF8 formatted string to UCS2BE. It is assumed that the string
|
|
Packit Service |
8264ee |
* is valid UTF8 and no sanity checking is performed.
|
|
Packit Service |
8264ee |
*
|
|
Packit Service |
8264ee |
* Returns: A newly-allocated buffer containing UTF8 encoded string converted
|
|
Packit Service |
8264ee |
* to UCS2BE. The UCS2BE string will always be null terminated.
|
|
Packit Service |
8264ee |
**/
|
|
Packit Service |
8264ee |
LIB_EXPORT void *l_utf8_to_ucs2be(const char *utf8, size_t *out_size)
|
|
Packit Service |
8264ee |
{
|
|
Packit Service |
8264ee |
const char *c;
|
|
Packit Service |
8264ee |
wchar_t wc;
|
|
Packit Service |
8264ee |
int len;
|
|
Packit Service |
8264ee |
uint16_t *ucs2be;
|
|
Packit Service |
8264ee |
size_t n_ucs2be;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (unlikely(!utf8))
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
c = utf8;
|
|
Packit Service |
8264ee |
n_ucs2be = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (*c) {
|
|
Packit Service |
8264ee |
len = l_utf8_get_codepoint(c, 4, &wc);
|
|
Packit Service |
8264ee |
if (len < 0)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (wc >= 0x10000)
|
|
Packit Service |
8264ee |
return NULL;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
n_ucs2be += 1;
|
|
Packit Service |
8264ee |
c += len;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
ucs2be = l_malloc((n_ucs2be + 1) * 2);
|
|
Packit Service |
8264ee |
c = utf8;
|
|
Packit Service |
8264ee |
n_ucs2be = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
while (*c) {
|
|
Packit Service |
8264ee |
len = l_utf8_get_codepoint(c, 4, &wc);
|
|
Packit Service |
8264ee |
ucs2be[n_ucs2be++] = L_CPU_TO_BE16(wc);
|
|
Packit Service |
8264ee |
c += len;
|
|
Packit Service |
8264ee |
}
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
ucs2be[n_ucs2be] = 0;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
if (out_size)
|
|
Packit Service |
8264ee |
*out_size = (n_ucs2be + 1) * 2;
|
|
Packit Service |
8264ee |
|
|
Packit Service |
8264ee |
return ucs2be;
|
|
Packit Service |
8264ee |
}
|