Blame winpr/libwinpr/crt/utf.c

Packit 1fb8d4
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */
Packit 1fb8d4
Packit 1fb8d4
/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
    mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
    source sequences, enhanced error detection, added casts
    to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "utf.h" for complete documentation.

------------------------------------------------------------------------ */
Packit 1fb8d4
Packit 1fb8d4
#include "utf.h"
Packit 1fb8d4
#include <winpr/endian.h>
Packit 1fb8d4
Packit 1fb8d4
static const int halfShift  = 10; /* used for shifting by 10 bits */
Packit 1fb8d4
Packit 1fb8d4
static const DWORD halfBase = 0x0010000UL;
Packit 1fb8d4
static const DWORD halfMask = 0x3FFUL;
Packit 1fb8d4
Packit 1fb8d4
#define UNI_SUR_HIGH_START  (DWORD)0xD800
Packit 1fb8d4
#define UNI_SUR_HIGH_END    (DWORD)0xDBFF
Packit 1fb8d4
#define UNI_SUR_LOW_START   (DWORD)0xDC00
Packit 1fb8d4
#define UNI_SUR_LOW_END     (DWORD)0xDFFF
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF32toUTF16(
Packit 1fb8d4
    const DWORD** sourceStart, const DWORD* sourceEnd,
Packit 1fb8d4
    WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	ConversionResult result = conversionOK;
Packit 1fb8d4
	const DWORD* source = *sourceStart;
Packit 1fb8d4
	WCHAR* target = *targetStart;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		DWORD ch;
Packit 1fb8d4
Packit 1fb8d4
		if (target >= targetEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		ch = *source++;
Packit 1fb8d4
Packit 1fb8d4
		if (ch <= UNI_MAX_BMP)   /* Target is a character <= 0xFFFF */
Packit 1fb8d4
		{
Packit 1fb8d4
			/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
Packit 1fb8d4
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				if (flags == strictConversion)
Packit 1fb8d4
				{
Packit 1fb8d4
					--source; /* return to the illegal value itself */
Packit 1fb8d4
					result = sourceIllegal;
Packit 1fb8d4
					break;
Packit 1fb8d4
				}
Packit 1fb8d4
				else
Packit 1fb8d4
				{
Packit 1fb8d4
					*target++ = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				*target++ = (WCHAR)ch; /* normal case */
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch > UNI_MAX_LEGAL_UTF32)
Packit 1fb8d4
		{
Packit 1fb8d4
			if (flags == strictConversion)
Packit 1fb8d4
			{
Packit 1fb8d4
				result = sourceIllegal;
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				*target++ = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			/* target is a character in range 0xFFFF - 0x10FFFF. */
Packit 1fb8d4
			if (target + 1 >= targetEnd)
Packit 1fb8d4
			{
Packit 1fb8d4
				--source; /* Back up source pointer! */
Packit 1fb8d4
				result = targetExhausted;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			ch -= halfBase;
Packit 1fb8d4
			*target++ = (WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START);
Packit 1fb8d4
			*target++ = (WCHAR)((ch & halfMask) + UNI_SUR_LOW_START);
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF16toUTF32(
Packit 1fb8d4
    const WCHAR** sourceStart, const WCHAR* sourceEnd,
Packit 1fb8d4
    DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	ConversionResult result = conversionOK;
Packit 1fb8d4
	const WCHAR* source = *sourceStart;
Packit 1fb8d4
	DWORD* target = *targetStart;
Packit 1fb8d4
	DWORD ch, ch2;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		const WCHAR* oldSource = source; /*  In case we have to back up because of target overflow. */
Packit 1fb8d4
		ch = *source++;
Packit 1fb8d4
Packit 1fb8d4
		/* If we have a surrogate pair, convert to UTF32 first. */
Packit 1fb8d4
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* If the 16 bits following the high surrogate are in the source buffer... */
Packit 1fb8d4
			if (source < sourceEnd)
Packit 1fb8d4
			{
Packit 1fb8d4
				ch2 = *source;
Packit 1fb8d4
Packit 1fb8d4
				/* If it's a low surrogate, convert to UTF32. */
Packit 1fb8d4
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
Packit 1fb8d4
				{
Packit 1fb8d4
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
Packit 1fb8d4
					     + (ch2 - UNI_SUR_LOW_START) + halfBase;
Packit 1fb8d4
					++source;
Packit 1fb8d4
				}
Packit 1fb8d4
				else if (flags == strictConversion)     /* it's an unpaired high surrogate */
Packit 1fb8d4
				{
Packit 1fb8d4
					--source; /* return to the illegal value itself */
Packit 1fb8d4
					result = sourceIllegal;
Packit 1fb8d4
					break;
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
			else     /* We don't have the 16 bits following the high surrogate. */
Packit 1fb8d4
			{
Packit 1fb8d4
				--source; /* return to the high surrogate */
Packit 1fb8d4
				result = sourceExhausted;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (flags == strictConversion)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* UTF-16 surrogate values are illegal in UTF-32 */
Packit 1fb8d4
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				--source; /* return to the illegal value itself */
Packit 1fb8d4
				result = sourceIllegal;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		if (target >= targetEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			source = oldSource; /* Back up source pointer! */
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		*target++ = ch;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
#ifdef CVTUTF_DEBUG
Packit 1fb8d4
Packit 1fb8d4
	if (result == sourceIllegal)
Packit 1fb8d4
	{
Packit 1fb8d4
		WLOG_WARN(TAG, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
#endif
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] =
{
	/* 0x00 - 0xBF: no trailing bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xDF: one trailing byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF: two, 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
Packit 1fb8d4
Packit 1fb8d4
/*
Packit 1fb8d4
 * Magic values subtracted from a buffer value during UTF8 conversion.
Packit 1fb8d4
 * This table contains as many values as there might be trailing bytes
Packit 1fb8d4
 * in a UTF-8 sequence.
Packit 1fb8d4
 */
Packit 1fb8d4
static const DWORD offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
Packit 1fb8d4
                                          0x03C82080UL, 0xFA082080UL, 0x82082080UL
Packit 1fb8d4
                                        };
Packit 1fb8d4
Packit 1fb8d4
/*
Packit 1fb8d4
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
Packit 1fb8d4
 * into the first byte, depending on how many bytes follow.  There are
Packit 1fb8d4
 * as many entries in this table as there are UTF-8 sequence types.
Packit 1fb8d4
 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
Packit 1fb8d4
 * for *legal* UTF-8 will be 4 or fewer bytes total.
Packit 1fb8d4
 */
Packit 1fb8d4
static const BYTE firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF16toUTF8(
Packit 1fb8d4
    const WCHAR** sourceStart, const WCHAR* sourceEnd,
Packit 1fb8d4
    BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	BYTE* target;
Packit 1fb8d4
	const WCHAR* source;
Packit 1fb8d4
	BOOL computeLength;
Packit 1fb8d4
	ConversionResult result;
Packit 1fb8d4
	computeLength = (!targetEnd) ? TRUE : FALSE;
Packit 1fb8d4
	source = *sourceStart;
Packit 1fb8d4
	target = *targetStart;
Packit 1fb8d4
	result = conversionOK;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		DWORD ch;
Packit 1fb8d4
		unsigned short bytesToWrite = 0;
Packit 1fb8d4
		const DWORD byteMask = 0xBF;
Packit 1fb8d4
		const DWORD byteMark = 0x80;
Packit 1fb8d4
		const WCHAR* oldSource = source; /* In case we have to back up because of target overflow. */
Packit 1fb8d4
		Data_Read_UINT16(source, ch);
Packit 1fb8d4
		source++;
Packit 1fb8d4
Packit 1fb8d4
		/* If we have a surrogate pair, convert to UTF32 first. */
Packit 1fb8d4
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* If the 16 bits following the high surrogate are in the source buffer... */
Packit 1fb8d4
			if (source < sourceEnd)
Packit 1fb8d4
			{
Packit 1fb8d4
				DWORD ch2;
Packit 1fb8d4
				Data_Read_UINT16(source, ch2);
Packit 1fb8d4
Packit 1fb8d4
				/* If it's a low surrogate, convert to UTF32. */
Packit 1fb8d4
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
Packit 1fb8d4
				{
Packit 1fb8d4
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
Packit 1fb8d4
					     + (ch2 - UNI_SUR_LOW_START) + halfBase;
Packit 1fb8d4
					++source;
Packit 1fb8d4
				}
Packit 1fb8d4
				else if (flags == strictConversion)
Packit 1fb8d4
				{
Packit 1fb8d4
					/* it's an unpaired high surrogate */
Packit 1fb8d4
					--source; /* return to the illegal value itself */
Packit 1fb8d4
					result = sourceIllegal;
Packit 1fb8d4
					break;
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				/* We don't have the 16 bits following the high surrogate. */
Packit 1fb8d4
				--source; /* return to the high surrogate */
Packit 1fb8d4
				result = sourceExhausted;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (flags == strictConversion)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* UTF-16 surrogate values are illegal in UTF-32 */
Packit 1fb8d4
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				--source; /* return to the illegal value itself */
Packit 1fb8d4
				result = sourceIllegal;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Figure out how many bytes the result will require */
Packit 1fb8d4
		if (ch < (DWORD) 0x80)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 1;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch < (DWORD) 0x800)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 2;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch < (DWORD) 0x10000)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 3;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch < (DWORD) 0x110000)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 4;
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 3;
Packit 1fb8d4
			ch = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		target += bytesToWrite;
Packit 1fb8d4
Packit 1fb8d4
		if ((target > targetEnd) && (!computeLength))
Packit 1fb8d4
		{
Packit 1fb8d4
			source = oldSource; /* Back up source pointer! */
Packit 1fb8d4
			target -= bytesToWrite;
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		if (!computeLength)
Packit 1fb8d4
		{
Packit 1fb8d4
			switch (bytesToWrite)
Packit 1fb8d4
			{
Packit 1fb8d4
				/* note: everything falls through. */
Packit 1fb8d4
				case 4:
Packit 1fb8d4
					*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
					ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
				case 3:
Packit 1fb8d4
					*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
					ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
				case 2:
Packit 1fb8d4
					*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
					ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
				case 1:
Packit 1fb8d4
					*--target = (BYTE)(ch | firstByteMark[bytesToWrite]);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			switch (bytesToWrite)
Packit 1fb8d4
			{
Packit 1fb8d4
				/* note: everything falls through. */
Packit 1fb8d4
				case 4:
Packit 1fb8d4
					--target;
Packit 1fb8d4
Packit 1fb8d4
				case 3:
Packit 1fb8d4
					--target;
Packit 1fb8d4
Packit 1fb8d4
				case 2:
Packit 1fb8d4
					--target;
Packit 1fb8d4
Packit 1fb8d4
				case 1:
Packit 1fb8d4
					--target;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		target += bytesToWrite;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
/*
Packit 1fb8d4
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
Packit 1fb8d4
 * This must be called with the length pre-determined by the first byte.
Packit 1fb8d4
 * If not calling this from ConvertUTF8to*, then the length can be set by:
Packit 1fb8d4
 *  length = trailingBytesForUTF8[*source]+1;
Packit 1fb8d4
 * and the sequence is illegal right away if there aren't that many bytes
Packit 1fb8d4
 * available.
Packit 1fb8d4
 * If presented with a length > 4, this returns FALSE.  The Unicode
Packit 1fb8d4
 * definition of UTF-8 goes up to 4-byte sequences.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
static BOOL isLegalUTF8(const BYTE* source, int length)
Packit 1fb8d4
{
Packit 1fb8d4
	BYTE a;
Packit 1fb8d4
	const BYTE* srcptr = source + length;
Packit 1fb8d4
Packit 1fb8d4
	switch (length)
Packit 1fb8d4
	{
Packit 1fb8d4
		default:
Packit 1fb8d4
			return FALSE;
Packit 1fb8d4
Packit 1fb8d4
		/* Everything else falls through when "TRUE"... */
Packit 1fb8d4
		case 4:
Packit 1fb8d4
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
		case 3:
Packit 1fb8d4
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
		case 2:
Packit 1fb8d4
			if ((a = (*--srcptr)) > 0xBF) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
			switch (*source)
Packit 1fb8d4
			{
Packit 1fb8d4
				/* no fall-through in this inner switch */
Packit 1fb8d4
				case 0xE0:
Packit 1fb8d4
					if (a < 0xA0) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
					break;
Packit 1fb8d4
Packit 1fb8d4
				case 0xED:
Packit 1fb8d4
					if (a > 0x9F) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
					break;
Packit 1fb8d4
Packit 1fb8d4
				case 0xF0:
Packit 1fb8d4
					if (a < 0x90) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
					break;
Packit 1fb8d4
Packit 1fb8d4
				case 0xF4:
Packit 1fb8d4
					if (a > 0x8F) return FALSE;
Packit 1fb8d4
Packit 1fb8d4
					break;
Packit 1fb8d4
Packit 1fb8d4
				default:
Packit 1fb8d4
					if (a < 0x80) return FALSE;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
		case 1:
Packit 1fb8d4
			if (*source >= 0x80 && *source < 0xC2) return FALSE;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	if (*source > 0xF4)
Packit 1fb8d4
		return FALSE;
Packit 1fb8d4
Packit 1fb8d4
	return TRUE;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
/*
Packit 1fb8d4
 * Exported function to return whether a UTF-8 sequence is legal or not.
Packit 1fb8d4
 * This is not used here; it's just exported.
Packit 1fb8d4
 */
Packit 1fb8d4
BOOL isLegalUTF8Sequence(const BYTE* source, const BYTE* sourceEnd)
Packit 1fb8d4
{
Packit 1fb8d4
	int length = trailingBytesForUTF8[*source] + 1;
Packit 1fb8d4
Packit 1fb8d4
	if (source + length > sourceEnd)
Packit 1fb8d4
		return FALSE;
Packit 1fb8d4
Packit 1fb8d4
	return isLegalUTF8(source, length);
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF8toUTF16(
Packit 1fb8d4
    const BYTE** sourceStart, const BYTE* sourceEnd,
Packit 1fb8d4
    WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	WCHAR* target;
Packit 1fb8d4
	const BYTE* source;
Packit 1fb8d4
	BOOL computeLength;
Packit 1fb8d4
	ConversionResult result;
Packit 1fb8d4
	computeLength = (!targetEnd) ? TRUE : FALSE;
Packit 1fb8d4
	result = conversionOK;
Packit 1fb8d4
	source = *sourceStart;
Packit 1fb8d4
	target = *targetStart;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		DWORD ch = 0;
Packit 1fb8d4
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Packit 1fb8d4
Packit 1fb8d4
		if ((source + extraBytesToRead) >= sourceEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			result = sourceExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Do this check whether lenient or strict */
Packit 1fb8d4
		if (!isLegalUTF8(source, extraBytesToRead + 1))
Packit 1fb8d4
		{
Packit 1fb8d4
			result = sourceIllegal;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/*
Packit 1fb8d4
		 * The cases all fall through. See "Note A" below.
Packit 1fb8d4
		 */
Packit 1fb8d4
		switch (extraBytesToRead)
Packit 1fb8d4
		{
Packit 1fb8d4
			case 5:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6; /* remember, illegal UTF-8 */
Packit 1fb8d4
Packit 1fb8d4
			case 4:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6; /* remember, illegal UTF-8 */
Packit 1fb8d4
Packit 1fb8d4
			case 3:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 2:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 1:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 0:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		ch -= offsetsFromUTF8[extraBytesToRead];
Packit 1fb8d4
Packit 1fb8d4
		if ((target >= targetEnd) && (!computeLength))
Packit 1fb8d4
		{
Packit 1fb8d4
			source -= (extraBytesToRead + 1); /* Back up source pointer! */
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		if (ch <= UNI_MAX_BMP)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* Target is a character <= 0xFFFF */
Packit 1fb8d4
			/* UTF-16 surrogate values are illegal in UTF-32 */
Packit 1fb8d4
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				if (flags == strictConversion)
Packit 1fb8d4
				{
Packit 1fb8d4
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
Packit 1fb8d4
					result = sourceIllegal;
Packit 1fb8d4
					break;
Packit 1fb8d4
				}
Packit 1fb8d4
				else
Packit 1fb8d4
				{
Packit 1fb8d4
					if (!computeLength)
Packit 1fb8d4
					{
Packit 1fb8d4
						Data_Write_UINT16(target, UNI_REPLACEMENT_CHAR);
Packit 1fb8d4
						target++;
Packit 1fb8d4
					}
Packit 1fb8d4
					else
Packit 1fb8d4
						target++;
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				if (!computeLength)
Packit 1fb8d4
				{
Packit 1fb8d4
					Data_Write_UINT16(target, ch); /* normal case */
Packit 1fb8d4
					target++;
Packit 1fb8d4
				}
Packit 1fb8d4
				else
Packit 1fb8d4
					target++;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch > UNI_MAX_UTF16)
Packit 1fb8d4
		{
Packit 1fb8d4
			if (flags == strictConversion)
Packit 1fb8d4
			{
Packit 1fb8d4
				result = sourceIllegal;
Packit 1fb8d4
				source -= (extraBytesToRead + 1); /* return to the start */
Packit 1fb8d4
				break; /* Bail out; shouldn't continue */
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				if (!computeLength)
Packit 1fb8d4
				{
Packit 1fb8d4
					Data_Write_UINT16(target, UNI_REPLACEMENT_CHAR);
Packit 1fb8d4
					target++;
Packit 1fb8d4
				}
Packit 1fb8d4
				else
Packit 1fb8d4
					target++;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			/* target is a character in range 0xFFFF - 0x10FFFF. */
Packit 1fb8d4
			if ((target + 1 >= targetEnd) && (!computeLength))
Packit 1fb8d4
			{
Packit 1fb8d4
				source -= (extraBytesToRead + 1); /* Back up source pointer! */
Packit 1fb8d4
				result = targetExhausted;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			ch -= halfBase;
Packit 1fb8d4
Packit 1fb8d4
			if (!computeLength)
Packit 1fb8d4
			{
Packit 1fb8d4
				WCHAR wchar;
Packit 1fb8d4
				wchar = (ch >> halfShift) + UNI_SUR_HIGH_START;
Packit 1fb8d4
				Data_Write_UINT16(target, wchar);
Packit 1fb8d4
				target++;
Packit 1fb8d4
				wchar = (ch & halfMask) + UNI_SUR_LOW_START;
Packit 1fb8d4
				Data_Write_UINT16(target, wchar);
Packit 1fb8d4
				target++;
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				target++;
Packit 1fb8d4
				target++;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF32toUTF8(
Packit 1fb8d4
    const DWORD** sourceStart, const DWORD* sourceEnd,
Packit 1fb8d4
    BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	ConversionResult result = conversionOK;
Packit 1fb8d4
	const DWORD* source = *sourceStart;
Packit 1fb8d4
	BYTE* target = *targetStart;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		DWORD ch;
Packit 1fb8d4
		unsigned short bytesToWrite = 0;
Packit 1fb8d4
		const DWORD byteMask = 0xBF;
Packit 1fb8d4
		const DWORD byteMark = 0x80;
Packit 1fb8d4
		ch = *source++;
Packit 1fb8d4
Packit 1fb8d4
		if (flags == strictConversion)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* UTF-16 surrogate values are illegal in UTF-32 */
Packit 1fb8d4
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				--source; /* return to the illegal value itself */
Packit 1fb8d4
				result = sourceIllegal;
Packit 1fb8d4
				break;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/*
Packit 1fb8d4
		 * Figure out how many bytes the result will require. Turn any
Packit 1fb8d4
		 * illegally large UTF32 things (> Plane 17) into replacement chars.
Packit 1fb8d4
		 */
Packit 1fb8d4
		if (ch < (DWORD)0x80)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 1;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch < (DWORD)0x800)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 2;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch < (DWORD)0x10000)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 3;
Packit 1fb8d4
		}
Packit 1fb8d4
		else if (ch <= UNI_MAX_LEGAL_UTF32)
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 4;
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			bytesToWrite = 3;
Packit 1fb8d4
			ch = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
			result = sourceIllegal;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		target += bytesToWrite;
Packit 1fb8d4
Packit 1fb8d4
		if (target > targetEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			--source; /* Back up source pointer! */
Packit 1fb8d4
			target -= bytesToWrite;
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		switch (bytesToWrite)   /* note: everything falls through. */
Packit 1fb8d4
		{
Packit 1fb8d4
			case 4:
Packit 1fb8d4
				*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
				ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 3:
Packit 1fb8d4
				*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
				ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 2:
Packit 1fb8d4
				*--target = (BYTE)((ch | byteMark) & byteMask);
Packit 1fb8d4
				ch >>= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 1:
Packit 1fb8d4
				*--target = (BYTE)(ch | firstByteMark[bytesToWrite]);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		target += bytesToWrite;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* --------------------------------------------------------------------- */
Packit 1fb8d4
Packit 1fb8d4
ConversionResult ConvertUTF8toUTF32(
Packit 1fb8d4
    const BYTE** sourceStart, const BYTE* sourceEnd,
Packit 1fb8d4
    DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags)
Packit 1fb8d4
{
Packit 1fb8d4
	ConversionResult result = conversionOK;
Packit 1fb8d4
	const BYTE* source = *sourceStart;
Packit 1fb8d4
	DWORD* target = *targetStart;
Packit 1fb8d4
Packit 1fb8d4
	while (source < sourceEnd)
Packit 1fb8d4
	{
Packit 1fb8d4
		DWORD ch = 0;
Packit 1fb8d4
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Packit 1fb8d4
Packit 1fb8d4
		if (source + extraBytesToRead >= sourceEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			result = sourceExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Do this check whether lenient or strict */
Packit 1fb8d4
		if (! isLegalUTF8(source, extraBytesToRead + 1))
Packit 1fb8d4
		{
Packit 1fb8d4
			result = sourceIllegal;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/*
Packit 1fb8d4
		 * The cases all fall through. See "Note A" below.
Packit 1fb8d4
		 */
Packit 1fb8d4
		switch (extraBytesToRead)
Packit 1fb8d4
		{
Packit 1fb8d4
			case 5:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 4:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 3:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 2:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 1:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
				ch <<= 6;
Packit 1fb8d4
Packit 1fb8d4
			case 0:
Packit 1fb8d4
				ch += *source++;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		ch -= offsetsFromUTF8[extraBytesToRead];
Packit 1fb8d4
Packit 1fb8d4
		if (target >= targetEnd)
Packit 1fb8d4
		{
Packit 1fb8d4
			source -= (extraBytesToRead + 1); /* Back up the source pointer! */
Packit 1fb8d4
			result = targetExhausted;
Packit 1fb8d4
			break;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		if (ch <= UNI_MAX_LEGAL_UTF32)
Packit 1fb8d4
		{
Packit 1fb8d4
			/*
Packit 1fb8d4
			 * UTF-16 surrogate values are illegal in UTF-32, and anything
Packit 1fb8d4
			 * over Plane 17 (> 0x10FFFF) is illegal.
Packit 1fb8d4
			 */
Packit 1fb8d4
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
Packit 1fb8d4
			{
Packit 1fb8d4
				if (flags == strictConversion)
Packit 1fb8d4
				{
Packit 1fb8d4
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
Packit 1fb8d4
					result = sourceIllegal;
Packit 1fb8d4
					break;
Packit 1fb8d4
				}
Packit 1fb8d4
				else
Packit 1fb8d4
				{
Packit 1fb8d4
					*target++ = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				*target++ = ch;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		else     /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
Packit 1fb8d4
		{
Packit 1fb8d4
			result = sourceIllegal;
Packit 1fb8d4
			*target++ = UNI_REPLACEMENT_CHAR;
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	*sourceStart = source;
Packit 1fb8d4
	*targetStart = target;
Packit 1fb8d4
	return result;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
    {
        int tmpBytesToRead = extraBytesToRead+1;
        do {
            ch += *source++;
            --tmpBytesToRead;
            if (tmpBytesToRead) ch <<= 6;
        } while (tmpBytesToRead > 0);
    }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */