Blame libfreerdp/primitives/prim_colors_opt.c

/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized Color conversion operations.
 * vi:ts=4 sw=4:
 *
 * Copyright 2011 Stephen Erisman
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
Packit 1fb8d4
Packit 1fb8d4
#ifdef HAVE_CONFIG_H
Packit 1fb8d4
#include "config.h"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#include <freerdp/types.h>
Packit 1fb8d4
#include <freerdp/primitives.h>
Packit 1fb8d4
#include <winpr/sysinfo.h>
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
#include <emmintrin.h>
Packit 1fb8d4
#elif defined(WITH_NEON)
Packit 1fb8d4
#include <arm_neon.h>
Packit 1fb8d4
#endif /* WITH_SSE2 else WITH_NEON */
Packit 1fb8d4
Packit 1fb8d4
#include "prim_internal.h"
Packit 1fb8d4
#include "prim_templates.h"
Packit 1fb8d4
Packit 1fb8d4
/* Fallback primitives table: the SSE2 routines in this file call through it
 * when their alignment or size preconditions are not met. It is NULL here;
 * NOTE(review): presumably assigned during primitives initialisation outside
 * this view - confirm. */
static primitives_t* generic = NULL;
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
Packit 1fb8d4
#ifdef __GNUC__
/* Inlining attributes applied to the small static helpers in this file. */
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif

/* Stride, in bytes, between the prefetch hints issued by the prefetch loops. */
#define CACHE_LINE_BYTES 64

/* Clamp every signed 16-bit lane of _val to the range [_min, _max], in place.
 * _val is evaluated twice and must be a modifiable lvalue of type __m128i. */
#define _mm_between_epi16(_val, _min, _max)                              \
	do                                                                   \
	{                                                                    \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min)));   \
	} while (0)
Packit 1fb8d4
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit Service 5a9772
/**
 * Issue one non-temporal prefetch hint per CACHE_LINE_BYTES of a buffer.
 *
 * Only whole 16-byte units of the buffer are considered, matching the
 * __m128i-based indexing used by the callers' loops.
 *
 * @param buffer    start of the region to prefetch
 * @param num_bytes length of the region in bytes; values <= 0 do nothing
 */
static inline void GNU_INLINE _mm_prefetch_buffer(char* buffer, int num_bytes)
{
	const __m128i* units = (const __m128i*)buffer;
	const size_t stride = CACHE_LINE_BYTES / sizeof(__m128i);
	size_t count;
	size_t i;

	/* A negative length would otherwise be converted to a huge unsigned count. */
	if (num_bytes <= 0)
	{
		return;
	}

	count = (size_t)num_bytes / sizeof(__m128i);

	for (i = 0; i < count; i += stride)
	{
		_mm_prefetch((const char*)(&units[i]), _MM_HINT_NTA);
	}
}
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit Service 5a9772
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
Packit Service 5a9772
                                             INT16* pDst[3], int dstStep,
Packit Service 5a9772
                                             const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
Packit Service 5a9772
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
Packit Service 5a9772
	UINT32 yp;
Packit Service 5a9772
	int srcbump, dstbump, imax;
Packit Service 5a9772
Packit Service 5a9772
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service 5a9772
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
Packit Service 5a9772
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
Packit Service 5a9772
	    (srcStep & 127) || (dstStep & 127))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit Service 5a9772
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	zero = _mm_setzero_si128();
Packit 1fb8d4
	max = _mm_set1_epi16(255);
Packit Service 5a9772
	y_buf = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pSrc[2]);
Packit Service 5a9772
	r_buf = (__m128i*)(pDst[0]);
Packit Service 5a9772
	g_buf = (__m128i*)(pDst[1]);
Packit Service 5a9772
	b_buf = (__m128i*)(pDst[2]);
Packit Service 5a9772
	r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service 5a9772
	g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service 5a9772
	g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service 5a9772
	b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit 1fb8d4
	c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	srcbump = srcStep / sizeof(__m128i);
Packit 1fb8d4
	dstbump = dstStep / sizeof(__m128i);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit Service 5a9772
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit Service 5a9772
	y_buf = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y, cb, cr, r, g, b;
Packit 1fb8d4
			y = _mm_load_si128(y_buf + i);
Packit 1fb8d4
			y = _mm_add_epi16(y, c4096);
Packit 1fb8d4
			y = _mm_srai_epi16(y, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb = _mm_load_si128(cb_buf + i);
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr = _mm_load_si128(cr_buf + i);
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
Packit 1fb8d4
			r = _mm_srai_epi16(r, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r, zero, max);
Packit 1fb8d4
			_mm_store_si128(r_buf + i, r);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
Packit 1fb8d4
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
Packit 1fb8d4
			g = _mm_srai_epi16(g, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g, zero, max);
Packit 1fb8d4
			_mm_store_si128(g_buf + i, g);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
Packit 1fb8d4
			b = _mm_srai_epi16(b, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b, zero, max);
Packit 1fb8d4
			_mm_store_si128(b_buf + i, b);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
		r_buf += dstbump;
Packit 1fb8d4
		g_buf += dstbump;
Packit 1fb8d4
		b_buf += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit Service 5a9772
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service 5a9772
                                                   BYTE* pDst, UINT32 dstStep,
Packit Service 5a9772
                                                   const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i max = _mm_set1_epi16(255);
Packit Service 5a9772
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service 5a9772
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service 5a9772
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service 5a9772
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit 1fb8d4
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	const INT16* y_buf = (INT16*)pSrc[0];
Packit 1fb8d4
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit 1fb8d4
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
	BYTE* d_buf = pDst;
Packit Service 5a9772
	UINT32 yp;
Packit 1fb8d4
	const size_t dstPad = (dstStep - roi->width * 4);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit Service 5a9772
		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit Service 5a9772
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cb_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cr_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit Service 5a9772
	y_buf = (INT16*)pSrc[0];
Packit 1fb8d4
	cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i += 2)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit 1fb8d4
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y1 = _mm_add_epi16(y1, c4096);
Packit 1fb8d4
			y1 = _mm_srai_epi16(y1, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit 1fb8d4
			r1 = _mm_srai_epi16(r1, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit 1fb8d4
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit 1fb8d4
			g1 = _mm_srai_epi16(g1, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit 1fb8d4
			b1 = _mm_srai_epi16(b1, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b1, zero, max);
Packit 1fb8d4
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y2 = _mm_add_epi16(y2, c4096);
Packit 1fb8d4
			y2 = _mm_srai_epi16(y2, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit 1fb8d4
			r2 = _mm_srai_epi16(r2, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit 1fb8d4
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit 1fb8d4
			g2 = _mm_srai_epi16(g2, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit 1fb8d4
			b2 = _mm_srai_epi16(b2, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b2, zero, max);
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1, R2, R3, R4;
Packit 1fb8d4
				/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
				 * rather than 16-byte, for readability.
Packit 1fb8d4
				 */
Packit Service 5a9772
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
Packit Service 5a9772
				R1 = b2;                              /* R1 = 00B700B600B500B4 */
Packit Service 5a9772
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = B7B6B5B4B3B2B1B0 */
Packit Service 5a9772
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
Packit Service 5a9772
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = B3G3B2G2B1G1B0G0 */
Packit Service 5a9772
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = B7G7B6G6B5G5B4G4 */
Packit Service 5a9772
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
Packit Service 5a9772
				R3 = r2;                              /* R3 = 00R700R600R500R4 */
Packit Service 5a9772
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = R7R6R5R4R3R2R1R0 */
Packit Service 5a9772
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
Packit Service 5a9772
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
Packit Service 5a9772
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = R3FFR2FFR1FFR0FF */
Packit Service 5a9772
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = R7FFR6FFR5FFR4FF */
Packit Service 5a9772
				R0 = R4;                              /* R0 = R4               */
Packit Service 5a9772
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = B1G1R1FFB0G0R0FF */
Packit Service 5a9772
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = B3G3R3FFB2G2R2FF */
Packit Service 5a9772
				R2 = R3;                              /* R2 = R3               */
Packit Service 5a9772
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = B5G5R5FFB4G4R4FF */
Packit Service 5a9772
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = B7G7R7FFB6G6R6FF */
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < pad; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const INT32 divisor = 16;
Packit 1fb8d4
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit 1fb8d4
			const INT32 Cb = (*cb_buf++);
Packit 1fb8d4
			const INT32 Cr = (*cr_buf++);
Packit 1fb8d4
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit 1fb8d4
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit 1fb8d4
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit 1fb8d4
			*d_buf++ = CLIP(B);
Packit 1fb8d4
			*d_buf++ = CLIP(G);
Packit 1fb8d4
			*d_buf++ = CLIP(R);
Packit 1fb8d4
			*d_buf++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		d_buf += dstPad;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit Service 5a9772
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service 5a9772
                                                   BYTE* pDst, UINT32 dstStep,
Packit Service 5a9772
                                                   const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i max = _mm_set1_epi16(255);
Packit Service 5a9772
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service 5a9772
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service 5a9772
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service 5a9772
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit 1fb8d4
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	const INT16* y_buf = (INT16*)pSrc[0];
Packit 1fb8d4
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit 1fb8d4
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
	BYTE* d_buf = pDst;
Packit Service 5a9772
	UINT32 yp;
Packit 1fb8d4
	const size_t dstPad = (dstStep - roi->width * 4);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit Service 5a9772
		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit Service 5a9772
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cb_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cr_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit Service 5a9772
	y_buf = (INT16*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (INT16*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (INT16*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i += 2)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit 1fb8d4
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y1 = _mm_add_epi16(y1, c4096);
Packit 1fb8d4
			y1 = _mm_srai_epi16(y1, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit 1fb8d4
			r1 = _mm_srai_epi16(r1, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit 1fb8d4
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit 1fb8d4
			g1 = _mm_srai_epi16(g1, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit 1fb8d4
			b1 = _mm_srai_epi16(b1, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b1, zero, max);
Packit 1fb8d4
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y2 = _mm_add_epi16(y2, c4096);
Packit 1fb8d4
			y2 = _mm_srai_epi16(y2, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit 1fb8d4
			r2 = _mm_srai_epi16(r2, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit 1fb8d4
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit 1fb8d4
			g2 = _mm_srai_epi16(g2, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit 1fb8d4
			b2 = _mm_srai_epi16(b2, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b2, zero, max);
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1, R2, R3, R4;
Packit 1fb8d4
				/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
				 * rather than 16-byte, for readability.
Packit 1fb8d4
				 */
Packit Service 5a9772
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
Packit Service 5a9772
				R1 = r2;                              /* R1 = 00R700R600R500R4 */
Packit Service 5a9772
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = R7R6R5R4R3R2R1R0 */
Packit Service 5a9772
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
Packit Service 5a9772
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = R3G3R2G2R1G1R0G0 */
Packit Service 5a9772
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = R7G7R6G6R5G5R4G4 */
Packit Service 5a9772
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
Packit Service 5a9772
				R3 = b2;                              /* R3 = 00B700B600B500B4 */
Packit Service 5a9772
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = B7B6B5B4B3B2B1B0 */
Packit Service 5a9772
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
Packit Service 5a9772
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
Packit Service 5a9772
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = B3FFB2FFB1FFB0FF */
Packit Service 5a9772
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = B7FFB6FFB5FFB4FF */
Packit Service 5a9772
				R0 = R4;                              /* R0 = R4               */
Packit Service 5a9772
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = R1G1B1FFR0G0B0FF */
Packit Service 5a9772
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = R3G3B3FFR2G2B2FF */
Packit Service 5a9772
				R2 = R3;                              /* R2 = R3               */
Packit Service 5a9772
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = R5G5B5FFR4G4B4FF */
Packit Service 5a9772
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = R7G7B7FFR6G6B6FF */
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < pad; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const INT32 divisor = 16;
Packit 1fb8d4
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit 1fb8d4
			const INT32 Cb = (*cb_buf++);
Packit 1fb8d4
			const INT32 Cr = (*cr_buf++);
Packit 1fb8d4
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit 1fb8d4
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit 1fb8d4
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit 1fb8d4
			*d_buf++ = CLIP(R);
Packit 1fb8d4
			*d_buf++ = CLIP(G);
Packit 1fb8d4
			*d_buf++ = CLIP(B);
Packit 1fb8d4
			*d_buf++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		d_buf += dstPad;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service 5a9772
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit Service 5a9772
                                              const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit Service 5a9772
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service 5a9772
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
Packit Service 5a9772
	    (dstStep & 0x0f))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit Service 5a9772
		return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
Packit Service 5a9772
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
Packit Service 5a9772
                                             INT16* pDst[3], int dstStep,
Packit Service 5a9772
                                             const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
Packit Service 5a9772
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
Packit Service 5a9772
	UINT32 yp;
Packit Service 5a9772
	int srcbump, dstbump, imax;
Packit Service 5a9772
Packit Service 5a9772
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service 5a9772
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
Packit Service 5a9772
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
Packit Service 5a9772
	    (srcStep & 127) || (dstStep & 127))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit Service 5a9772
		return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	min = _mm_set1_epi16(-128 * 32);
Packit 1fb8d4
	max = _mm_set1_epi16(127 * 32);
Packit Service 5a9772
	r_buf = (__m128i*)(pSrc[0]);
Packit Service 5a9772
	g_buf = (__m128i*)(pSrc[1]);
Packit Service 5a9772
	b_buf = (__m128i*)(pSrc[2]);
Packit Service 5a9772
	y_buf = (__m128i*)(pDst[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pDst[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pDst[2]);
Packit Service 5a9772
	y_r = _mm_set1_epi16(9798);    /*  0.299000 << 15 */
Packit Service 5a9772
	y_g = _mm_set1_epi16(19235);   /*  0.587000 << 15 */
Packit Service 5a9772
	y_b = _mm_set1_epi16(3735);    /*  0.114000 << 15 */
Packit 1fb8d4
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
Packit 1fb8d4
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
Packit 1fb8d4
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
Packit 1fb8d4
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
Packit 1fb8d4
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
Packit 1fb8d4
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
Packit 1fb8d4
	srcbump = srcStep / sizeof(__m128i);
Packit 1fb8d4
	dstbump = dstStep / sizeof(__m128i);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch RGB's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit 1fb8d4
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		r_buf += srcbump;
Packit 1fb8d4
		g_buf += srcbump;
Packit 1fb8d4
		b_buf += srcbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	r_buf = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	g_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	b_buf = (__m128i*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication we
Packit 1fb8d4
			 * need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without loosing information.  The result of this multiplication
Packit 1fb8d4
			 * is 32 bit and using SSE2 we get either the product's hi or lo
Packit 1fb8d4
			 * word.  Thus we will multiply the factors by the highest
Packit 1fb8d4
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
Packit 1fb8d4
			 * result (_mm_mulhi_epi16).  Since the final result needs to
Packit 1fb8d4
			 * be scaled by << 5 and also in in order to keep the precision
Packit 1fb8d4
			 * within the upper 16 bits we will also have to scale the RGB
Packit 1fb8d4
			 * values used in the multiplication by << 5+(16-n).
Packit 1fb8d4
			 */
Packit 1fb8d4
			__m128i r, g, b, y, cb, cr;
Packit 1fb8d4
			r = _mm_load_si128(y_buf + i);
Packit 1fb8d4
			g = _mm_load_si128(g_buf + i);
Packit 1fb8d4
			b = _mm_load_si128(b_buf + i);
Packit 1fb8d4
			/* r<<6; g<<6; b<<6 */
Packit 1fb8d4
			r = _mm_slli_epi16(r, 6);
Packit 1fb8d4
			g = _mm_slli_epi16(g, 6);
Packit 1fb8d4
			b = _mm_slli_epi16(b, 6);
Packit 1fb8d4
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
Packit 1fb8d4
			y = _mm_mulhi_epi16(r, y_r);
Packit 1fb8d4
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
Packit 1fb8d4
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
Packit 1fb8d4
			y = _mm_add_epi16(y, min);
Packit 1fb8d4
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
Packit 1fb8d4
			_mm_between_epi16(y, min, max);
Packit 1fb8d4
			_mm_store_si128(y_buf + i, y);
Packit 1fb8d4
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
Packit 1fb8d4
			cb = _mm_mulhi_epi16(r, cb_r);
Packit 1fb8d4
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
Packit 1fb8d4
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
Packit 1fb8d4
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
Packit 1fb8d4
			_mm_between_epi16(cb, min, max);
Packit 1fb8d4
			_mm_store_si128(cb_buf + i, cb);
Packit 1fb8d4
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
Packit 1fb8d4
			cr = _mm_mulhi_epi16(r, cr_r);
Packit 1fb8d4
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
Packit 1fb8d4
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
Packit 1fb8d4
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
Packit 1fb8d4
			_mm_between_epi16(cr, min, max);
Packit 1fb8d4
			_mm_store_si128(cr_buf + i, cr);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
		r_buf += dstbump;
Packit 1fb8d4
		g_buf += dstbump;
Packit 1fb8d4
		b_buf += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
sse2_RGBToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service 5a9772
                                const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit Service 5a9772
	out = (BYTE*)pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service 5a9772
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service 5a9772
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit Service 5a9772
					gbLo = _mm_unpacklo_epi8(b, g); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
					gbHi = _mm_unpackhi_epi8(b, g); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service 5a9772
					arLo = _mm_unpacklo_epi8(r, a); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service 5a9772
					arHi = _mm_unpackhi_epi8(r, a); /* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
sse2_RGBToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service 5a9772
                                const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit Service 5a9772
	out = (BYTE*)pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service 5a9772
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service 5a9772
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit Service 5a9772
					gbLo = _mm_unpacklo_epi8(r, g); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
					gbHi = _mm_unpackhi_epi8(r, g); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service 5a9772
					arLo = _mm_unpacklo_epi8(b, a); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service 5a9772
					arHi = _mm_unpackhi_epi8(b, a); /* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
sse2_RGBToRGB_16s8u_P3AC4R_XBGR(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service 5a9772
                                const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit Service 5a9772
	out = (BYTE*)pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service 5a9772
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service 5a9772
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit Service 5a9772
					gbLo = _mm_unpacklo_epi8(a, b); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
					gbHi = _mm_unpackhi_epi8(a, b); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service 5a9772
					arLo = _mm_unpacklo_epi8(g, r); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service 5a9772
					arHi = _mm_unpackhi_epi8(g, r); /* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
sse2_RGBToRGB_16s8u_P3AC4R_XRGB(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service 5a9772
                                const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit Service 5a9772
	out = (BYTE*)pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service 5a9772
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service 5a9772
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service 5a9772
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service 5a9772
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service 5a9772
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service 5a9772
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit Service 5a9772
					gbLo = _mm_unpacklo_epi8(a, r); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service 5a9772
					gbHi = _mm_unpackhi_epi8(a, r); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service 5a9772
					arLo = _mm_unpacklo_epi8(g, b); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service 5a9772
					arHi = _mm_unpackhi_epi8(g, b); /* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service 5a9772
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                           UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                           UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service 5a9772
                           UINT32 DstFormat, const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
Packit 1fb8d4
	    (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
Packit 1fb8d4
		return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
#endif /* WITH_SSE2 */
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
#ifdef WITH_NEON
Packit Service 5a9772
static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], INT32 srcStep,
Packit Service 5a9772
                                             INT16* pDst[3], INT32 dstStep,
Packit Service 5a9772
                                             const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	/* TODO: If necessary, check alignments and call the general version. */
Packit 1fb8d4
	int16x8_t zero = vdupq_n_s16(0);
Packit 1fb8d4
	int16x8_t max = vdupq_n_s16(255);
Packit Service 5a9772
	int16x8_t r_cr = vdupq_n_s16(22986);  //  1.403 << 14
Packit Service 5a9772
	int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
Packit Service 5a9772
	int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
Packit Service 5a9772
	int16x8_t b_cb = vdupq_n_s16(28999);  //  1.770 << 14
Packit 1fb8d4
	int16x8_t c4096 = vdupq_n_s16(4096);
Packit Service 5a9772
	int16x8_t* y_buf = (int16x8_t*)pSrc[0];
Packit Service 5a9772
	int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
Packit Service 5a9772
	int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
Packit Service 5a9772
	int16x8_t* r_buf = (int16x8_t*)pDst[0];
Packit Service 5a9772
	int16x8_t* g_buf = (int16x8_t*)pDst[1];
Packit Service 5a9772
	int16x8_t* b_buf = (int16x8_t*)pDst[2];
Packit 1fb8d4
	int srcbump = srcStep / sizeof(int16x8_t);
Packit 1fb8d4
	int dstbump = dstStep / sizeof(int16x8_t);
Packit 1fb8d4
	int yp;
Packit 1fb8d4
	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			/*
Packit Service 5a9772
			    In order to use NEON signed 16-bit integer multiplication we need to convert
Packit Service 5a9772
			    the floating point factors to signed int without loosing information.
Packit Service 5a9772
			    The result of this multiplication is 32 bit and we have a NEON instruction
Packit Service 5a9772
			    that returns the hi word of the saturated double.
Packit Service 5a9772
			    Thus we will multiply the factors by the highest possible 2^n, take the
Packit Service 5a9772
			    upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
Packit Service 5a9772
			    shift by 1 to reverse the doubling) and correct	this result by multiplying it
Packit Service 5a9772
			    by 2^(16-n).
Packit Service 5a9772
			    For the given factors in the conversion matrix the best possible n is 14.
Packit Service 5a9772
Packit Service 5a9772
			    Example for calculating r:
Packit Service 5a9772
			    r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
Packit Service 5a9772
			    r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
Packit Service 5a9772
			    r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
Packit Service 5a9772
			    r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			*/
Packit 1fb8d4
			/* y = (y_buf[i] + 4096) >> 2 */
Packit Service 5a9772
			int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
Packit 1fb8d4
			y = vaddq_s16(y, c4096);
Packit 1fb8d4
			y = vshrq_n_s16(y, 2);
Packit 1fb8d4
			/* cb = cb_buf[i]; */
Packit 1fb8d4
			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
Packit 1fb8d4
			/* cr = cr_buf[i]; */
Packit Service 5a9772
			int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
Packit 1fb8d4
			r = vshrq_n_s16(r, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			r = vminq_s16(vmaxq_s16(r, zero), max);
Packit 1fb8d4
			vst1q_s16((INT16*)&r_buf[i], r);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
Packit 1fb8d4
			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
Packit 1fb8d4
			g = vshrq_n_s16(g, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			g = vminq_s16(vmaxq_s16(g, zero), max);
Packit 1fb8d4
			vst1q_s16((INT16*)&g_buf[i], g);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
Packit 1fb8d4
			b = vshrq_n_s16(b, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			b = vminq_s16(vmaxq_s16(b, zero), max);
Packit 1fb8d4
			vst1q_s16((INT16*)&b_buf[i], b);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		y_buf += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
		r_buf += dstbump;
Packit 1fb8d4
		g_buf += dstbump;
Packit 1fb8d4
		b_buf += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service 5a9772
                                                BYTE* pDst, UINT32 dstStep, const prim_size_t* roi,
Packit Service 5a9772
                                                uint8_t rPos, uint8_t gPos, uint8_t bPos,
Packit Service 5a9772
                                                uint8_t aPos)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	BYTE* pRGB = pDst;
Packit Service 5a9772
	const INT16* pY = pSrc[0];
Packit 1fb8d4
	const INT16* pCb = pSrc[1];
Packit 1fb8d4
	const INT16* pCr = pSrc[2];
Packit 1fb8d4
	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
Packit 1fb8d4
	const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
Packit 1fb8d4
	const size_t pad = roi->width % 8;
Packit 1fb8d4
	const int16x4_t c4096 = vdup_n_s16(4096);
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 8)
Packit 1fb8d4
		{
Packit 1fb8d4
			const int16x8_t Y = vld1q_s16(pY);
Packit 1fb8d4
			const int16x4_t Yh = vget_high_s16(Y);
Packit 1fb8d4
			const int16x4_t Yl = vget_low_s16(Y);
Packit 1fb8d4
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
Packit 1fb8d4
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
Packit 1fb8d4
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
Packit 1fb8d4
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
Packit 1fb8d4
			const int16x8_t Cr = vld1q_s16(pCr);
Packit 1fb8d4
			const int16x4_t Crh = vget_high_s16(Cr);
Packit 1fb8d4
			const int16x4_t Crl = vget_low_s16(Cr);
Packit 1fb8d4
			const int16x8_t Cb = vld1q_s16(pCb);
Packit 1fb8d4
			const int16x4_t Cbh = vget_high_s16(Cb);
Packit 1fb8d4
			const int16x4_t Cbl = vget_low_s16(Cb);
Packit 1fb8d4
			uint8x8x4_t bgrx;
Packit 1fb8d4
			{
Packit 1fb8d4
				/* R */
Packit 1fb8d4
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
Packit 1fb8d4
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
Packit 1fb8d4
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
Packit 1fb8d4
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
Packit 1fb8d4
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
Packit 1fb8d4
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
Packit 1fb8d4
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
Packit 1fb8d4
				bgrx.val[rPos] = vqmovun_s16(Rs);
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				/* G */
Packit Service 5a9772
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
Packit Service 5a9772
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
Packit 1fb8d4
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
Packit 1fb8d4
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
Packit 1fb8d4
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
Packit 1fb8d4
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
Packit 1fb8d4
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
Packit 1fb8d4
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
Packit 1fb8d4
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
Packit 1fb8d4
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
Packit 1fb8d4
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
Packit 1fb8d4
				const uint8x8_t G = vqmovun_s16(Gs);
Packit 1fb8d4
				bgrx.val[gPos] = G;
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				/* B */
Packit 1fb8d4
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
Packit 1fb8d4
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
Packit 1fb8d4
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
Packit 1fb8d4
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
Packit 1fb8d4
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
Packit 1fb8d4
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
Packit 1fb8d4
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
Packit 1fb8d4
				const uint8x8_t B = vqmovun_s16(Bs);
Packit 1fb8d4
				bgrx.val[bPos] = B;
Packit 1fb8d4
			}
Packit 1fb8d4
			/* A */
Packit 1fb8d4
			{
Packit 1fb8d4
				bgrx.val[aPos] = vdup_n_u8(0xFF);
Packit 1fb8d4
			}
Packit 1fb8d4
			vst4_u8(pRGB, bgrx);
Packit 1fb8d4
			pY += 8;
Packit 1fb8d4
			pCb += 8;
Packit 1fb8d4
			pCr += 8;
Packit 1fb8d4
			pRGB += 32;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const INT32 divisor = 16;
Packit 1fb8d4
			const INT32 Y = ((*pY++) + 4096) << divisor;
Packit 1fb8d4
			const INT32 Cb = (*pCb++);
Packit 1fb8d4
			const INT32 Cr = (*pCr++);
Packit 1fb8d4
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit 1fb8d4
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit 1fb8d4
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit 1fb8d4
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit 1fb8d4
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit 1fb8d4
			BYTE bgrx[4];
Packit 1fb8d4
			bgrx[bPos] = CLIP(B);
Packit 1fb8d4
			bgrx[gPos] = CLIP(G);
Packit 1fb8d4
			bgrx[rPos] = CLIP(R);
Packit 1fb8d4
			bgrx[aPos] = 0xFF;
Packit 1fb8d4
			*pRGB++ = bgrx[0];
Packit 1fb8d4
			*pRGB++ = bgrx[1];
Packit 1fb8d4
			*pRGB++ = bgrx[2];
Packit 1fb8d4
			*pRGB++ = bgrx[3];
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		pY += srcPad;
Packit 1fb8d4
		pCb += srcPad;
Packit 1fb8d4
		pCr += srcPad;
Packit 1fb8d4
		pRGB += dstPad;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service 5a9772
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit Service 5a9772
                                              const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                             UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                             BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                             UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service 5a9772
                             const prim_size_t* roi,     /* region of interest */
Packit Service 5a9772
                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	UINT32 pad = roi->width % 8;
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
Packit 1fb8d4
		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
Packit 1fb8d4
		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
Packit 1fb8d4
		BYTE* dst = pDst + y * dstStep;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 8)
Packit 1fb8d4
		{
Packit 1fb8d4
			int16x8_t r = vld1q_s16(pr);
Packit 1fb8d4
			int16x8_t g = vld1q_s16(pg);
Packit 1fb8d4
			int16x8_t b = vld1q_s16(pb);
Packit 1fb8d4
			uint8x8x4_t bgrx;
Packit 1fb8d4
			bgrx.val[aPos] = vdup_n_u8(0xFF);
Packit 1fb8d4
			bgrx.val[rPos] = vqmovun_s16(r);
Packit 1fb8d4
			bgrx.val[gPos] = vqmovun_s16(g);
Packit 1fb8d4
			bgrx.val[bPos] = vqmovun_s16(b);
Packit 1fb8d4
			vst4_u8(dst, bgrx);
Packit 1fb8d4
			pr += 8;
Packit 1fb8d4
			pg += 8;
Packit 1fb8d4
			pb += 8;
Packit 1fb8d4
			dst += 32;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit Service 5a9772
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			BYTE bgrx[4];
Packit 1fb8d4
			bgrx[bPos] = *pb++;
Packit 1fb8d4
			bgrx[gPos] = *pg++;
Packit 1fb8d4
			bgrx[rPos] = *pr++;
Packit 1fb8d4
			bgrx[aPos] = 0xFF;
Packit 1fb8d4
			*dst++ = bgrx[0];
Packit 1fb8d4
			*dst++ = bgrx[1];
Packit 1fb8d4
			*dst++ = bgrx[2];
Packit 1fb8d4
			*dst++ = bgrx[3];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit Service 5a9772
static pstatus_t
Packit Service 5a9772
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service 5a9772
                           UINT32 srcStep,             /* bytes between rows in source data */
Packit Service 5a9772
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service 5a9772
                           UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service 5a9772
                           UINT32 DstFormat, const prim_size_t* roi) /* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
#endif /* WITH_NEON */
Packit 1fb8d4
/* I don't see a direct IPP version of this, since the input is INT16
Packit 1fb8d4
 * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
Packit 1fb8d4
 * But that would likely be slower.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
/* ------------------------------------------------------------------------- */
Packit 1fb8d4
/**
 * Install the color-conversion primitives into prims.
 *
 * Runs primitives_init_colors() on prims first, then, if the build enables
 * SSE2 or NEON and the CPU reports the feature at run time, replaces the
 * corresponding entries with the optimized versions from this file.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
	/* The optimized dispatchers fall back to this table for pixel formats
	 * they do not handle themselves. */
	generic = primitives_get_generic();
	primitives_init_colors(prims);
#ifdef WITH_SSE2

	if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
#elif defined(WITH_NEON)

	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
#endif /* WITH_SSE2 else WITH_NEON */
}