Blame winpr/libwinpr/crt/utf.c

Packit Service fa4841
/*
Packit Service fa4841
 * Copyright 2001-2004 Unicode, Inc.
Packit Service fa4841
 *
Packit Service fa4841
 * Disclaimer
Packit Service fa4841
 *
Packit Service fa4841
 * This source code is provided as is by Unicode, Inc. No claims are
Packit Service fa4841
 * made as to fitness for any particular purpose. No warranties of any
Packit Service fa4841
 * kind are expressed or implied. The recipient agrees to determine
Packit Service fa4841
 * applicability of information provided. If this file has been
Packit Service fa4841
 * purchased on magnetic or optical media from Unicode, Inc., the
Packit Service fa4841
 * sole remedy for any claim will be exchange of defective media
Packit Service fa4841
 * within 90 days of receipt.
Packit Service fa4841
 *
Packit Service fa4841
 * Limitations on Rights to Redistribute This Code
Packit Service fa4841
 *
Packit Service fa4841
 * Unicode, Inc. hereby grants the right to freely use the information
Packit Service fa4841
 * supplied in this file in the creation of products supporting the
Packit Service fa4841
 * Unicode Standard, and to make copies of this file in any form
Packit Service fa4841
 * for internal or external distribution as long as this notice
Packit Service fa4841
 * remains attached.
Packit Service fa4841
 */
Packit Service fa4841
Packit Service fa4841
/* ---------------------------------------------------------------------
Packit Service fa4841
Packit Service fa4841
    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Packit Service fa4841
    Author: Mark E. Davis, 1994.
Packit Service fa4841
    Rev History: Rick McGowan, fixes & updates May 2001.
Packit Service fa4841
    Sept 2001: fixed const & error conditions per
Packit Service fa4841
    mods suggested by S. Parent & A. Lillich.
Packit Service fa4841
    June 2002: Tim Dodd added detection and handling of incomplete
Packit Service fa4841
    source sequences, enhanced error detection, added casts
Packit Service fa4841
    to eliminate compiler warnings.
Packit Service fa4841
    July 2003: slight mods to back out aggressive FFFE detection.
Packit Service fa4841
    Jan 2004: updated switches in from-UTF8 conversions.
Packit Service fa4841
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
Packit Service fa4841
Packit Service fa4841
    See the header file "utf.h" for complete documentation.
Packit Service fa4841
Packit Service fa4841
------------------------------------------------------------------------ */
Packit Service fa4841
Packit Service fa4841
#include "utf.h"
Packit Service fa4841
#include <winpr/endian.h>
Packit Service fa4841
Packit Service fa4841
#pragma GCC diagnostic push
Packit Service fa4841
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
Packit Service fa4841
static const int halfShift = 10; /* used for shifting by 10 bits */
Packit Service fa4841
Packit Service fa4841
static const DWORD halfBase = 0x0010000UL;
Packit Service fa4841
static const DWORD halfMask = 0x3FFUL;
Packit Service fa4841
Packit Service fa4841
#define UNI_SUR_HIGH_START (DWORD)0xD800
Packit Service fa4841
#define UNI_SUR_HIGH_END (DWORD)0xDBFF
Packit Service fa4841
#define UNI_SUR_LOW_START (DWORD)0xDC00
Packit Service fa4841
#define UNI_SUR_LOW_END (DWORD)0xDFFF
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert a run of UTF-32 code points to UTF-16.
 *
 * sourceStart  in: first code point to convert;
 *              out: first code point that was not consumed.
 * sourceEnd    one past the last input code point.
 * targetStart  in: first output slot; out: one past the last WCHAR written.
 * targetEnd    one past the last writable output slot.
 * flags        with strictConversion, surrogate values and values above
 *              UNI_MAX_LEGAL_UTF32 stop the conversion; otherwise they are
 *              replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK when all input was converted, targetExhausted when
 * the output filled up, or sourceIllegal when strict conversion hit an
 * illegal value. On targetExhausted from a surrogate pair, and on
 * sourceIllegal, *sourceStart is left pointing at the offending code point.
 */
ConversionResult ConvertUTF32toUTF16(const DWORD** sourceStart, const DWORD* sourceEnd,
                                     WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags)
{
	ConversionResult result = conversionOK;
	const DWORD* source = *sourceStart;
	WCHAR* target = *targetStart;

	while (source < sourceEnd)
	{
		DWORD ch;

		if (target >= targetEnd)
		{
			result = targetExhausted;
			break;
		}

		ch = *source++;

		if (ch <= UNI_MAX_BMP) /* Target is a character <= 0xFFFF */
		{
			/* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				if (flags == strictConversion)
				{
					--source; /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				else
				{
					*target++ = UNI_REPLACEMENT_CHAR;
				}
			}
			else
			{
				*target++ = (WCHAR)ch; /* normal case */
			}
		}
		else if (ch > UNI_MAX_LEGAL_UTF32)
		{
			if (flags == strictConversion)
			{
				/*
				 * Stop at the illegal value, exactly like the surrogate case
				 * above. Continuing would silently drop the code point and
				 * allow a later targetExhausted to overwrite the error.
				 */
				--source; /* return to the illegal value itself */
				result = sourceIllegal;
				break;
			}
			else
			{
				*target++ = UNI_REPLACEMENT_CHAR;
			}
		}
		else
		{
			/* target is a character in range 0x10000 - 0x10FFFF: needs two WCHARs */
			if (target + 1 >= targetEnd)
			{
				--source; /* Back up source pointer! */
				result = targetExhausted;
				break;
			}

			ch -= halfBase;
			*target++ = (WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START);
			*target++ = (WCHAR)((ch & halfMask) + UNI_SUR_LOW_START);
		}
	}

	*sourceStart = source;
	*targetStart = target;
	return result;
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert a run of UTF-16 code units to UTF-32.
 *
 * sourceStart  in: first code unit to convert;
 *              out: first code unit that was not consumed.
 * sourceEnd    one past the last input code unit.
 * targetStart  in: first output slot; out: one past the last DWORD written.
 * targetEnd    one past the last writable output slot.
 * flags        with strictConversion, unpaired surrogates stop the
 *              conversion; otherwise they are copied through unchanged.
 *
 * Returns conversionOK, sourceExhausted (input ends on a high surrogate),
 * sourceIllegal (strict conversion hit an unpaired surrogate) or
 * targetExhausted. In every error case *sourceStart is left pointing at the
 * code unit that could not be converted.
 */
ConversionResult ConvertUTF16toUTF32(const WCHAR** sourceStart, const WCHAR* sourceEnd,
                                     DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags)
{
	ConversionResult result = conversionOK;
	const WCHAR* source = *sourceStart;
	DWORD* target = *targetStart;
	/* Initialized so the CVTUTF_DEBUG log below never reads an indeterminate value. */
	DWORD ch = 0;
	DWORD ch2 = 0;

	while (source < sourceEnd)
	{
		const WCHAR* oldSource =
		    source; /*  In case we have to back up because of target overflow. */
		ch = *source++;
		ch2 = 0; /* do not carry a low surrogate over from an earlier iteration */

		/* If we have a surrogate pair, convert to UTF32 first. */
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
		{
			/* If the 16 bits following the high surrogate are in the source buffer... */
			if (source < sourceEnd)
			{
				ch2 = *source;

				/* If it's a low surrogate, convert to UTF32. */
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
				{
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
					     halfBase;
					++source;
				}
				else if (flags == strictConversion) /* it's an unpaired high surrogate */
				{
					--source; /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
			}
			else /* We don't have the 16 bits following the high surrogate. */
			{
				--source; /* return to the high surrogate */
				result = sourceExhausted;
				break;
			}
		}
		else if (flags == strictConversion)
		{
			/* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
			{
				--source; /* return to the illegal value itself */
				result = sourceIllegal;
				break;
			}
		}

		if (target >= targetEnd)
		{
			source = oldSource; /* Back up source pointer! */
			result = targetExhausted;
			break;
		}

		*target++ = ch;
	}

	*sourceStart = source;
	*targetStart = target;
#ifdef CVTUTF_DEBUG

	if (result == sourceIllegal)
	{
		WLOG_WARN(TAG, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
	}

#endif
	return result;
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
Packit Service fa4841
 * Index into the table below with the first byte of a UTF-8 sequence to
Packit Service fa4841
 * get the number of trailing bytes that are supposed to follow it.
Packit Service fa4841
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
Packit Service fa4841
 * left as-is for anyone who may want to do such conversion, which was
Packit Service fa4841
 * allowed in earlier algorithms.
Packit Service fa4841
 */
Packit Service fa4841
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 - 0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
Packit Service fa4841
Packit Service fa4841
/*
Packit Service fa4841
 * Magic values subtracted from a buffer value during UTF8 conversion.
Packit Service fa4841
 * This table contains as many values as there might be trailing bytes
Packit Service fa4841
 * in a UTF-8 sequence.
Packit Service fa4841
 */
Packit Service fa4841
static const DWORD offsetsFromUTF8[6] = {
	0x00000000UL, /* no trailing bytes */
	0x00003080UL, /* 1 trailing byte */
	0x000E2080UL, /* 2 trailing bytes */
	0x03C82080UL, /* 3 trailing bytes */
	0xFA082080UL, /* 4 trailing bytes */
	0x82082080UL  /* 5 trailing bytes */
};
Packit Service fa4841
Packit Service fa4841
/*
Packit Service fa4841
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
Packit Service fa4841
 * into the first byte, depending on how many bytes follow.  There are
Packit Service fa4841
 * as many entries in this table as there are UTF-8 sequence types.
Packit Service fa4841
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
Packit Service fa4841
 * for *legal* UTF-8 will be 4 or fewer bytes total.
Packit Service fa4841
 */
Packit Service fa4841
static const BYTE firstByteMark[7] = {
	0x00, /* index 0 */
	0x00, /* 1-byte sequence */
	0xC0, /* 2-byte sequence */
	0xE0, /* 3-byte sequence */
	0xF0, /* 4-byte sequence */
	0xF8, /* 5-byte sequence */
	0xFC  /* 6-byte sequence */
};
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/* The interface converts a whole buffer to avoid function-call overhead.
Packit Service fa4841
 * Constants have been gathered. Loops & conditionals have been removed as
Packit Service fa4841
 * much as possible for efficiency, in favor of drop-through switches.
Packit Service fa4841
 * (See "Note A" at the bottom of the file for equivalent code.)
Packit Service fa4841
 * If your compiler supports it, the "isLegalUTF8" call can be turned
Packit Service fa4841
 * into an inline function.
Packit Service fa4841
 */
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert a run of UTF-16 code units (read with Data_Read_UINT16) to UTF-8.
 *
 * sourceStart  in: first code unit to convert;
 *              out: first code unit that was not consumed.
 * sourceEnd    one past the last input code unit.
 * targetStart  in: start of the output buffer. On return it is advanced by
 *              the number of bytes produced; if *targetStart is NULL the
 *              byte count itself is stored, cast to a pointer.
 * te           one past the last writable output byte, or NULL to only
 *              count the bytes that would be produced (nothing is written).
 * flags        with strictConversion, unpaired surrogates stop the
 *              conversion with sourceIllegal.
 *
 * Returns conversionOK, sourceExhausted, sourceIllegal or targetExhausted.
 * Returns sourceIllegal immediately, without touching the in/out pointers,
 * when *targetStart lies beyond te.
 */
ConversionResult ConvertUTF16toUTF8(const WCHAR** sourceStart, const WCHAR* sourceEnd,
                                    BYTE** targetStart, BYTE* te, ConversionFlags flags)
{
	const DWORD contMask = 0xBF;
	const DWORD contMark = 0x80;
	const BOOL lengthOnly = te ? FALSE : TRUE;
	ConversionResult status = conversionOK;
	const WCHAR* cursor;
	size_t written = 0;  /* bytes produced (or counted) so far */
	size_t capacity = 0; /* bytes available in the output buffer */

	if (targetStart && te)
	{
		const size_t first = (size_t)*targetStart;
		const size_t last = (size_t)te;

		if (first > last)
			return sourceIllegal;

		capacity = last - first;
	}

	cursor = *sourceStart;

	while (cursor < sourceEnd)
	{
		/* Where to rewind to if the output turns out to be too small. */
		const WCHAR* unitStart = cursor;
		unsigned short need = 0;
		DWORD cp;

		Data_Read_UINT16(cursor, cp);
		cursor++;

		if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_HIGH_END)
		{
			DWORD low;

			/* A high surrogate at the very end of the input cannot be completed. */
			if (cursor >= sourceEnd)
			{
				--cursor; /* return to the high surrogate */
				status = sourceExhausted;
				break;
			}

			Data_Read_UINT16(cursor, low);

			if (low >= UNI_SUR_LOW_START && low <= UNI_SUR_LOW_END)
			{
				/* Proper pair: combine both halves into one code point. */
				cp = ((cp - UNI_SUR_HIGH_START) << halfShift) + (low - UNI_SUR_LOW_START) +
				     halfBase;
				++cursor;
			}
			else if (flags == strictConversion)
			{
				--cursor; /* return to the unpaired high surrogate */
				status = sourceIllegal;
				break;
			}
		}
		else if ((flags == strictConversion) && (cp >= UNI_SUR_LOW_START) &&
		         (cp <= UNI_SUR_LOW_END))
		{
			--cursor; /* return to the unpaired low surrogate */
			status = sourceIllegal;
			break;
		}

		/* Size of the UTF-8 encoding of this code point. */
		if (cp < (DWORD)0x80)
			need = 1;
		else if (cp < (DWORD)0x800)
			need = 2;
		else if (cp < (DWORD)0x10000)
			need = 3;
		else if (cp < (DWORD)0x110000)
			need = 4;
		else
		{
			need = 3;
			cp = UNI_REPLACEMENT_CHAR;
		}

		if (!lengthOnly)
		{
			BYTE* out;
			unsigned short idx;

			if (written + need > capacity)
			{
				cursor = unitStart; /* Back up source pointer! */
				status = targetExhausted;
				break;
			}

			/* Fill continuation bytes from the last one backwards, then the lead byte. */
			out = &(*targetStart)[written];

			for (idx = need; idx > 1; idx--)
			{
				out[idx - 1] = (BYTE)((cp | contMark) & contMask);
				cp >>= 6;
			}

			out[0] = (BYTE)(cp | firstByteMark[need]);
		}

		written += need;
	}

	*sourceStart = cursor;

	if (targetStart)
	{
		if (*targetStart)
			*targetStart = &(*targetStart)[written];
		else
			*targetStart = (BYTE*)written;
	}

	return status;
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
Packit Service fa4841
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
Packit Service fa4841
 * This must be called with the length pre-determined by the first byte.
Packit Service fa4841
 * If not calling this from ConvertUTF8to*, then the length can be set by:
Packit Service fa4841
 *  length = trailingBytesForUTF8[*source]+1;
Packit Service fa4841
 * and the sequence is illegal right away if there aren't that many bytes
Packit Service fa4841
 * available.
Packit Service fa4841
 * If presented with a length > 4, this returns FALSE.  The Unicode
Packit Service fa4841
 * definition of UTF-8 goes up to 4-byte sequences.
Packit Service fa4841
 */
Packit Service fa4841
Packit Service fa4841
static BOOL isLegalUTF8(const BYTE* source, int length)
Packit Service fa4841
{
Packit Service fa4841
	BYTE a;
Packit Service fa4841
	const BYTE* srcptr = source + length;
Packit Service fa4841
Packit Service fa4841
	switch (length)
Packit Service fa4841
	{
Packit Service fa4841
		default:
Packit Service fa4841
			return FALSE;
Packit Service fa4841
Packit Service fa4841
		/* Everything else falls through when "TRUE"... */
Packit Service fa4841
		case 4:
Packit Service fa4841
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
Packit Service fa4841
				return FALSE;
Packit Service fa4841
Packit Service fa4841
		case 3:
Packit Service fa4841
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
Packit Service fa4841
				return FALSE;
Packit Service fa4841
Packit Service fa4841
		case 2:
Packit Service fa4841
			if ((a = (*--srcptr)) > 0xBF)
Packit Service fa4841
				return FALSE;
Packit Service fa4841
Packit Service fa4841
			switch (*source)
Packit Service fa4841
			{
Packit Service fa4841
				/* no fall-through in this inner switch */
Packit Service fa4841
				case 0xE0:
Packit Service fa4841
					if (a < 0xA0)
Packit Service fa4841
						return FALSE;
Packit Service fa4841
Packit Service fa4841
					break;
Packit Service fa4841
Packit Service fa4841
				case 0xED:
Packit Service fa4841
					if (a > 0x9F)
Packit Service fa4841
						return FALSE;
Packit Service fa4841
Packit Service fa4841
					break;
Packit Service fa4841
Packit Service fa4841
				case 0xF0:
Packit Service fa4841
					if (a < 0x90)
Packit Service fa4841
						return FALSE;
Packit Service fa4841
Packit Service fa4841
					break;
Packit Service fa4841
Packit Service fa4841
				case 0xF4:
Packit Service fa4841
					if (a > 0x8F)
Packit Service fa4841
						return FALSE;
Packit Service fa4841
Packit Service fa4841
					break;
Packit Service fa4841
Packit Service fa4841
				default:
Packit Service fa4841
					if (a < 0x80)
Packit Service fa4841
						return FALSE;
Packit Service fa4841
			}
Packit Service fa4841
Packit Service fa4841
		case 1:
Packit Service fa4841
			if (*source >= 0x80 && *source < 0xC2)
Packit Service fa4841
				return FALSE;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	if (*source > 0xF4)
Packit Service fa4841
		return FALSE;
Packit Service fa4841
Packit Service fa4841
	return TRUE;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
Packit Service fa4841
 * Exported function to return whether a UTF-8 sequence is legal or not.
Packit Service fa4841
 * This is not used here; it's just exported.
Packit Service fa4841
 */
Packit Service fa4841
BOOL isLegalUTF8Sequence(const BYTE* source, const BYTE* sourceEnd)
Packit Service fa4841
{
Packit Service fa4841
	int length = trailingBytesForUTF8[*source] + 1;
Packit Service fa4841
Packit Service fa4841
	if (source + length > sourceEnd)
Packit Service fa4841
		return FALSE;
Packit Service fa4841
Packit Service fa4841
	return isLegalUTF8(source, length);
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert a run of UTF-8 bytes to UTF-16 (written with Data_Write_UINT16).
 *
 * sourceStart  in: first byte to convert;
 *              out: first byte that was not consumed.
 * sourceEnd    one past the last input byte.
 * targetStart  in: start of the output buffer. On return it is advanced by
 *              the number of WCHARs produced; if *targetStart is NULL the
 *              count multiplied by sizeof(WCHAR) is stored, cast to a
 *              pointer.
 * targetEnd    one past the last writable WCHAR, or NULL to only count the
 *              WCHARs that would be produced (nothing is written).
 * flags        with strictConversion, surrogate values and values above
 *              UNI_MAX_UTF16 stop the conversion; otherwise they are
 *              replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK, sourceExhausted (truncated sequence), sourceIllegal
 * (malformed sequence, checked regardless of flags) or targetExhausted.
 * Returns sourceIllegal immediately, without touching the in/out pointers,
 * when *targetStart lies beyond targetEnd.
 */
ConversionResult ConvertUTF8toUTF16(const BYTE** sourceStart, const BYTE* sourceEnd,
                                    WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags)
{
	size_t target = 0; /* index of the next WCHAR to write (or count so far) */
	size_t end = 0;    /* capacity of the output buffer, in WCHARs */
	const BYTE* source;
	BOOL computeLength;
	ConversionResult result;
	computeLength = (!targetEnd) ? TRUE : FALSE;
	result = conversionOK;
	source = *sourceStart;

	if (targetStart && targetEnd)
	{
		const size_t s = (size_t)*targetStart;
		const size_t e = (size_t)targetEnd;

		if (s > e)
			return sourceIllegal;

		/*
		 * 'target' indexes WCHAR elements, so the limit has to be an element
		 * count. Using the raw byte distance would allow writes up to twice
		 * the size of the buffer.
		 */
		end = (e - s) / sizeof(WCHAR);
	}

	while (source < sourceEnd)
	{
		DWORD ch = 0;
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

		/*
		 * The whole sequence must be inside the input. Compare lengths
		 * instead of forming source + extraBytesToRead, which could point
		 * beyond the end of the buffer.
		 */
		if ((size_t)extraBytesToRead >= (size_t)(sourceEnd - source))
		{
			result = sourceExhausted;
			break;
		}

		/* Do this check whether lenient or strict */
		if (!isLegalUTF8(source, extraBytesToRead + 1))
		{
			result = sourceIllegal;
			break;
		}

		/*
		 * Accumulate the raw byte values, six bits apart; the per-length
		 * offset subtracted below removes the marker bits in one step.
		 * The cases all fall through.
		 */
		switch (extraBytesToRead)
		{
			case 5:
				ch += *source++;
				ch <<= 6; /* remember, illegal UTF-8 */
				/* fall through */

			case 4:
				ch += *source++;
				ch <<= 6; /* remember, illegal UTF-8 */
				/* fall through */

			case 3:
				ch += *source++;
				ch <<= 6;
				/* fall through */

			case 2:
				ch += *source++;
				ch <<= 6;
				/* fall through */

			case 1:
				ch += *source++;
				ch <<= 6;
				/* fall through */

			case 0:
				ch += *source++;
		}

		ch -= offsetsFromUTF8[extraBytesToRead];

		if ((target >= end) && (!computeLength))
		{
			source -= (extraBytesToRead + 1); /* Back up source pointer! */
			result = targetExhausted;
			break;
		}

		if (ch <= UNI_MAX_BMP)
		{
			/* Target is a character <= 0xFFFF */
			/* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				if (flags == strictConversion)
				{
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				else
				{
					if (!computeLength)
						Data_Write_UINT16(&(*targetStart)[target], UNI_REPLACEMENT_CHAR);
					target++;
				}
			}
			else
			{
				if (!computeLength)
					Data_Write_UINT16(&(*targetStart)[target], ch); /* normal case */
				target++;
			}
		}
		else if (ch > UNI_MAX_UTF16)
		{
			if (flags == strictConversion)
			{
				result = sourceIllegal;
				source -= (extraBytesToRead + 1); /* return to the start */
				break;                            /* Bail out; shouldn't continue */
			}
			else
			{
				if (!computeLength)
					Data_Write_UINT16(&(*targetStart)[target], UNI_REPLACEMENT_CHAR);
				target++;
			}
		}
		else
		{
			/* target is a character in range 0x10000 - 0x10FFFF: needs two WCHARs */
			if ((target + 1 >= end) && (!computeLength))
			{
				source -= (extraBytesToRead + 1); /* Back up source pointer! */
				result = targetExhausted;
				break;
			}

			ch -= halfBase;

			if (!computeLength)
			{
				WCHAR wchar;
				wchar = (ch >> halfShift) + UNI_SUR_HIGH_START;
				Data_Write_UINT16(&(*targetStart)[target++], wchar);
				wchar = (ch & halfMask) + UNI_SUR_LOW_START;
				Data_Write_UINT16(&(*targetStart)[target++], wchar);
			}
			else
			{
				target++;
				target++;
			}
		}
	}

	*sourceStart = source;
	if (targetStart && (*targetStart))
		*targetStart = &(*targetStart)[target];
	else if (targetStart)
		*targetStart = (WCHAR*)(target * sizeof(WCHAR));
	return result;
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert UTF-32 code units to UTF-8.
 *
 * sourceStart  in/out: on entry points at the first code unit to convert;
 *              on return points at the first code unit that was not consumed.
 * sourceEnd    one past the last code unit to convert.
 * targetStart  in/out: on entry points at the first output byte; on return
 *              points one past the last byte written.
 * targetEnd    one past the last writable output byte.
 * flags        when equal to strictConversion, values in the surrogate range
 *              (UNI_SUR_HIGH_START..UNI_SUR_LOW_END) abort the conversion.
 *
 * Returns conversionOK when all input was converted, sourceIllegal when a
 * surrogate was rejected in strict mode (the source pointer is left on the
 * offending value) or when a value above UNI_MAX_LEGAL_UTF32 was replaced by
 * UNI_REPLACEMENT_CHAR (conversion continues in that case), and
 * targetExhausted when the next character does not fit into the output (the
 * source pointer is left on that character).
 */
ConversionResult ConvertUTF32toUTF8(const DWORD** sourceStart, const DWORD* sourceEnd,
                                    BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags)
{
	ConversionResult result = conversionOK;
	const DWORD* source = *sourceStart;
	BYTE* target = *targetStart;

	while (source < sourceEnd)
	{
		DWORD ch;
		unsigned short bytesToWrite = 0;
		const DWORD byteMask = 0xBF;
		const DWORD byteMark = 0x80;
		ch = *source++;

		if (flags == strictConversion)
		{
			/* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				--source; /* return to the illegal value itself */
				result = sourceIllegal;
				break;
			}
		}

		/*
		 * Figure out how many bytes the result will require. Turn any
		 * illegally large UTF32 things (> Plane 17) into replacement chars.
		 */
		if (ch < (DWORD)0x80)
		{
			bytesToWrite = 1;
		}
		else if (ch < (DWORD)0x800)
		{
			bytesToWrite = 2;
		}
		else if (ch < (DWORD)0x10000)
		{
			bytesToWrite = 3;
		}
		else if (ch <= UNI_MAX_LEGAL_UTF32)
		{
			bytesToWrite = 4;
		}
		else
		{
			bytesToWrite = 3;
			ch = UNI_REPLACEMENT_CHAR;
			result = sourceIllegal;
		}

		/*
		 * Check the remaining space as a pointer difference instead of
		 * advancing target first: forming a pointer more than one past the
		 * end of the output buffer is undefined behaviour.
		 */
		if ((targetEnd - target) < bytesToWrite)
		{
			--source; /* Back up source pointer! */
			result = targetExhausted;
			break;
		}

		/*
		 * Emit the sequence from its last byte to its first, consuming six
		 * bits of ch per trailing byte. (ch | byteMark) & byteMask sets bit
		 * 0x80 and clears bit 0x40, i.e. produces a 10xxxxxx byte.
		 */
		switch (bytesToWrite) /* note: everything falls through. */
		{
			case 4:
				target[3] = (BYTE)((ch | byteMark) & byteMask);
				ch >>= 6;
				/* fallthrough */

			case 3:
				target[2] = (BYTE)((ch | byteMark) & byteMask);
				ch >>= 6;
				/* fallthrough */

			case 2:
				target[1] = (BYTE)((ch | byteMark) & byteMask);
				ch >>= 6;
				/* fallthrough */

			case 1:
				/* firstByteMark is defined elsewhere in this file; presumably it
				 * holds the lead-byte marker bits per sequence length. */
				target[0] = (BYTE)(ch | firstByteMark[bytesToWrite]);
				break;

			default:
				/* not reachable: bytesToWrite is always 1..4 here */
				break;
		}

		target += bytesToWrite;
	}

	*sourceStart = source;
	*targetStart = target;
	return result;
}
Packit Service fa4841
Packit Service fa4841
/* --------------------------------------------------------------------- */
Packit Service fa4841
Packit Service fa4841
/*
 * Convert UTF-8 bytes to UTF-32 code units.
 *
 * sourceStart  in/out: on entry points at the first byte to convert; on
 *              return points at the first byte that was not consumed.
 * sourceEnd    one past the last input byte.
 * targetStart  in/out: on entry points at the first output code unit; on
 *              return points one past the last code unit written.
 * targetEnd    one past the last writable output code unit.
 * flags        when equal to strictConversion, a decoded value in the
 *              surrogate range aborts the conversion; otherwise it is
 *              replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK when all input was converted, sourceExhausted when
 * the input ends in the middle of a sequence, sourceIllegal when
 * isLegalUTF8() rejects a sequence, a surrogate is rejected in strict mode
 * (the source pointer is left at the start of that sequence), or a decoded
 * value above UNI_MAX_LEGAL_UTF32 was replaced by UNI_REPLACEMENT_CHAR
 * (conversion continues in that case), and targetExhausted when there is no
 * room left in the output (the source pointer is left at the start of the
 * sequence that did not fit).
 */
ConversionResult ConvertUTF8toUTF32(const BYTE** sourceStart, const BYTE* sourceEnd,
                                    DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags)
{
	ConversionResult result = conversionOK;
	const BYTE* source = *sourceStart;
	DWORD* target = *targetStart;

	while (source < sourceEnd)
	{
		DWORD ch = 0;
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

		/*
		 * Compare against the number of bytes left rather than computing
		 * source + extraBytesToRead: that sum may point more than one past
		 * the end of the input buffer, which is undefined behaviour.
		 */
		if (extraBytesToRead >= (sourceEnd - source))
		{
			result = sourceExhausted;
			break;
		}

		/* Do this check whether lenient or strict */
		if (!isLegalUTF8(source, extraBytesToRead + 1))
		{
			result = sourceIllegal;
			break;
		}

		/*
		 * The cases all fall through. See "Note A" below.
		 */
		switch (extraBytesToRead)
		{
			case 5:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */

			case 4:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */

			case 3:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */

			case 2:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */

			case 1:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */

			case 0:
				ch += *source++;
				break;

			default:
				/* no bytes are consumed for any other value */
				break;
		}

		/*
		 * The raw bytes were summed including their marker bits; the table
		 * entry (defined elsewhere in this file) is subtracted to leave the
		 * code point value.
		 */
		ch -= offsetsFromUTF8[extraBytesToRead];

		if (target >= targetEnd)
		{
			source -= (extraBytesToRead + 1); /* Back up the source pointer! */
			result = targetExhausted;
			break;
		}

		if (ch <= UNI_MAX_LEGAL_UTF32)
		{
			/*
			 * UTF-16 surrogate values are illegal in UTF-32, and anything
			 * over Plane 17 (> 0x10FFFF) is illegal.
			 */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				if (flags == strictConversion)
				{
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				else
				{
					*target++ = UNI_REPLACEMENT_CHAR;
				}
			}
			else
			{
				*target++ = ch;
			}
		}
		else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
		{
			result = sourceIllegal;
			*target++ = UNI_REPLACEMENT_CHAR;
		}
	}

	*sourceStart = source;
	*targetStart = target;
	return result;
}
Packit Service fa4841
Packit Service fa4841
/* ---------------------------------------------------------------------
Packit Service fa4841
Packit Service fa4841
    Note A.
Packit Service fa4841
    The fall-through switches in UTF-8 reading code save a
Packit Service fa4841
    temp variable, some decrements & conditionals.  The switches
Packit Service fa4841
    are equivalent to the following loop:
Packit Service fa4841
    {
Packit Service fa4841
        int tmpBytesToRead = extraBytesToRead+1;
Packit Service fa4841
        do {
Packit Service fa4841
        ch += *source++;
Packit Service fa4841
        --tmpBytesToRead;
Packit Service fa4841
        if (tmpBytesToRead) ch <<= 6;
Packit Service fa4841
        } while (tmpBytesToRead > 0);
Packit Service fa4841
    }
Packit Service fa4841
    In UTF-8 writing code, the switches on "bytesToWrite" are
Packit Service fa4841
    similarly unrolled loops.
Packit Service fa4841
Packit Service fa4841
   --------------------------------------------------------------------- */
Packit Service fa4841
#pragma GCC diagnostic pop