Blame libfreerdp/primitives/prim_colors_opt.c

Packit Service fa4841
/* FreeRDP: A Remote Desktop Protocol Client
Packit Service fa4841
 * Optimized Color conversion operations.
Packit Service fa4841
 * vi:ts=4 sw=4:
Packit Service fa4841
 *
Packit Service fa4841
 * Copyright 2011 Stephen Erisman
Packit Service fa4841
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
Packit Service fa4841
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
Packit Service fa4841
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
Packit Service fa4841
 *
Packit Service fa4841
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
Packit Service fa4841
 * not use this file except in compliance with the License. You may obtain
Packit Service fa4841
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
Packit Service fa4841
 * Unless required by applicable law or agreed to in writing, software
Packit Service fa4841
 * distributed under the License is distributed on an "AS IS" BASIS,
Packit Service fa4841
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
Packit Service fa4841
 * or implied. See the License for the specific language governing
Packit Service fa4841
 * permissions and limitations under the License.
Packit Service fa4841
 */
Packit Service fa4841
Packit Service fa4841
#ifdef HAVE_CONFIG_H
Packit Service fa4841
#include "config.h"
Packit Service fa4841
#endif
Packit Service fa4841
Packit Service fa4841
#include <freerdp/types.h>
Packit Service fa4841
#include <freerdp/primitives.h>
Packit Service fa4841
#include <winpr/sysinfo.h>
Packit Service fa4841
Packit Service fa4841
#ifdef WITH_SSE2
Packit Service fa4841
#include <emmintrin.h>
Packit Service fa4841
#elif defined(WITH_NEON)
Packit Service fa4841
#include <arm_neon.h>
Packit Service fa4841
#endif /* WITH_SSE2 else WITH_NEON */
Packit Service fa4841
Packit Service fa4841
#include "prim_internal.h"
Packit Service fa4841
#include "prim_templates.h"
Packit Service fa4841
Packit Service fa4841
/* Table of fallback primitives. The SSE2 routines in this file delegate to it
 * (generic->...) whenever their alignment or pixel-format preconditions are
 * not met. It starts out NULL and is not assigned in this part of the file --
 * presumably the init code sets it before any routine runs; verify. */
static primitives_t* generic = NULL;
Packit Service fa4841
Packit Service fa4841
#ifdef WITH_SSE2
Packit Service fa4841
Packit Service fa4841
/* GNU_INLINE: extra inlining attributes for small helpers. Expands to the
 * gnu_inline/always_inline/artificial attribute set on GNU-compatible
 * compilers and to nothing elsewhere. */
#ifdef __GNUC__
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif

/* Stride in bytes between two consecutive prefetches. */
#define CACHE_LINE_BYTES 64

/* Clamp, in place, every signed 16-bit lane of _val to the range [_min, _max].
 * _val must be a modifiable __m128i lvalue; _min and _max are __m128i values.
 * All arguments are parenthesized so that arbitrary expressions can be passed
 * without operator-precedence surprises. */
#define _mm_between_epi16(_val, _min, _max)                            \
	do                                                                 \
	{                                                                  \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
	} while (0)
Packit Service fa4841
Packit Service fa4841
#ifdef DO_PREFETCH
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
/* Issue non-temporal prefetch hints for a buffer, one hint per
 * CACHE_LINE_BYTES, covering the whole 16-byte vectors contained in the first
 * num_bytes bytes of buffer.
 *
 * buffer    start of the memory to prefetch
 * num_bytes number of bytes to cover; zero or negative values are ignored
 */
static inline void GNU_INLINE _mm_prefetch_buffer(char* buffer, int num_bytes)
{
	const __m128i* buf = (const __m128i*)buffer;
	size_t count;
	size_t i;

	/* A negative length would otherwise be converted to a huge unsigned
	 * vector count and the loop below would effectively never finish. */
	if (num_bytes <= 0)
		return;

	count = (size_t)num_bytes / sizeof(__m128i);

	for (i = 0; i < count; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((const char*)(&buf[i]), _MM_HINT_NTA);
	}
}
Packit Service fa4841
#endif /* DO_PREFETCH */
Packit Service fa4841
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
Packit Service fa4841
                                             INT16* pDst[3], int dstStep,
Packit Service fa4841
                                             const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
Packit Service fa4841
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
Packit Service fa4841
	UINT32 yp;
Packit Service fa4841
	int srcbump, dstbump, imax;
Packit Service fa4841
Packit Service fa4841
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service fa4841
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
Packit Service fa4841
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
Packit Service fa4841
	    (srcStep & 127) || (dstStep & 127))
Packit Service fa4841
	{
Packit Service fa4841
		/* We can't maintain 16-byte alignment. */
Packit Service fa4841
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	zero = _mm_setzero_si128();
Packit Service fa4841
	max = _mm_set1_epi16(255);
Packit Service fa4841
	y_buf = (__m128i*)(pSrc[0]);
Packit Service fa4841
	cb_buf = (__m128i*)(pSrc[1]);
Packit Service fa4841
	cr_buf = (__m128i*)(pSrc[2]);
Packit Service fa4841
	r_buf = (__m128i*)(pDst[0]);
Packit Service fa4841
	g_buf = (__m128i*)(pDst[1]);
Packit Service fa4841
	b_buf = (__m128i*)(pDst[2]);
Packit Service fa4841
	r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service fa4841
	g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service fa4841
	g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service fa4841
	b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit Service fa4841
	c4096 = _mm_set1_epi16(4096);
Packit Service fa4841
	srcbump = srcStep / sizeof(__m128i);
Packit Service fa4841
	dstbump = dstStep / sizeof(__m128i);
Packit Service fa4841
#ifdef DO_PREFETCH
Packit Service fa4841
Packit Service fa4841
	/* Prefetch Y's, Cb's, and Cr's. */
Packit Service fa4841
	for (yp = 0; yp < roi->height; yp++)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit Service fa4841
		{
Packit Service fa4841
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcbump;
Packit Service fa4841
		cb_buf += srcbump;
Packit Service fa4841
		cr_buf += srcbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	y_buf = (__m128i*)(pSrc[0]);
Packit Service fa4841
	cb_buf = (__m128i*)(pSrc[1]);
Packit Service fa4841
	cr_buf = (__m128i*)(pSrc[2]);
Packit Service fa4841
#endif /* DO_PREFETCH */
Packit Service fa4841
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
Packit Service fa4841
	for (yp = 0; yp < roi->height; ++yp)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i++)
Packit Service fa4841
		{
Packit Service fa4841
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit Service fa4841
			 * we need to convert the floating point factors to signed int
Packit Service fa4841
			 * without losing information.
Packit Service fa4841
			 * The result of this multiplication is 32 bit and we have two
Packit Service fa4841
			 * SSE instructions that return either the hi or lo word.
Packit Service fa4841
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit Service fa4841
			 * take the upper 16 bits of the signed 32-bit result
Packit Service fa4841
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit Service fa4841
			 * it by 2^(16-n).
Packit Service fa4841
			 *
Packit Service fa4841
			 * For the given factors in the conversion matrix the best
Packit Service fa4841
			 * possible n is 14.
Packit Service fa4841
			 *
Packit Service fa4841
			 * Example for calculating r:
Packit Service fa4841
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit Service fa4841
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit Service fa4841
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit Service fa4841
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit Service fa4841
			 */
Packit Service fa4841
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit Service fa4841
			__m128i y, cb, cr, r, g, b;
Packit Service fa4841
			y = _mm_load_si128(y_buf + i);
Packit Service fa4841
			y = _mm_add_epi16(y, c4096);
Packit Service fa4841
			y = _mm_srai_epi16(y, 2);
Packit Service fa4841
			/* cb = cb_g_buf[i]; */
Packit Service fa4841
			cb = _mm_load_si128(cb_buf + i);
Packit Service fa4841
			/* cr = cr_b_buf[i]; */
Packit Service fa4841
			cr = _mm_load_si128(cr_buf + i);
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
Packit Service fa4841
			r = _mm_srai_epi16(r, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			_mm_between_epi16(r, zero, max);
Packit Service fa4841
			_mm_store_si128(r_buf + i, r);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
Packit Service fa4841
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
Packit Service fa4841
			g = _mm_srai_epi16(g, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			_mm_between_epi16(g, zero, max);
Packit Service fa4841
			_mm_store_si128(g_buf + i, g);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
Packit Service fa4841
			b = _mm_srai_epi16(b, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			_mm_between_epi16(b, zero, max);
Packit Service fa4841
			_mm_store_si128(b_buf + i, b);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcbump;
Packit Service fa4841
		cb_buf += srcbump;
Packit Service fa4841
		cr_buf += srcbump;
Packit Service fa4841
		r_buf += dstbump;
Packit Service fa4841
		g_buf += dstbump;
Packit Service fa4841
		b_buf += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service fa4841
                                                   BYTE* pDst, UINT32 dstStep,
Packit Service fa4841
                                                   const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const __m128i zero = _mm_setzero_si128();
Packit Service fa4841
	const __m128i max = _mm_set1_epi16(255);
Packit Service fa4841
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service fa4841
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service fa4841
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service fa4841
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit Service fa4841
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit Service fa4841
	const INT16* y_buf = (INT16*)pSrc[0];
Packit Service fa4841
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit Service fa4841
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit Service fa4841
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
	BYTE* d_buf = pDst;
Packit Service fa4841
	UINT32 yp;
Packit Service fa4841
	const size_t dstPad = (dstStep - roi->width * 4);
Packit Service fa4841
#ifdef DO_PREFETCH
Packit Service fa4841
Packit Service fa4841
	/* Prefetch Y's, Cb's, and Cr's. */
Packit Service fa4841
	for (yp = 0; yp < roi->height; yp++)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit Service fa4841
		{
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcStep / sizeof(INT16);
Packit Service fa4841
		cb_buf += srcStep / sizeof(INT16);
Packit Service fa4841
		cr_buf += srcStep / sizeof(INT16);
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	y_buf = (INT16*)pSrc[0];
Packit Service fa4841
	cb_buf = (INT16*)pSrc[1];
Packit Service fa4841
	cr_buf = (INT16*)pSrc[2];
Packit Service fa4841
#endif /* DO_PREFETCH */
Packit Service fa4841
Packit Service fa4841
	for (yp = 0; yp < roi->height; ++yp)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i += 2)
Packit Service fa4841
		{
Packit Service fa4841
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit Service fa4841
			 * we need to convert the floating point factors to signed int
Packit Service fa4841
			 * without losing information.
Packit Service fa4841
			 * The result of this multiplication is 32 bit and we have two
Packit Service fa4841
			 * SSE instructions that return either the hi or lo word.
Packit Service fa4841
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit Service fa4841
			 * take the upper 16 bits of the signed 32-bit result
Packit Service fa4841
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit Service fa4841
			 * it by 2^(16-n).
Packit Service fa4841
			 *
Packit Service fa4841
			 * For the given factors in the conversion matrix the best
Packit Service fa4841
			 * possible n is 14.
Packit Service fa4841
			 *
Packit Service fa4841
			 * Example for calculating r:
Packit Service fa4841
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit Service fa4841
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit Service fa4841
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit Service fa4841
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit Service fa4841
			 */
Packit Service fa4841
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit Service fa4841
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit Service fa4841
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit Service fa4841
			y_buf += step;
Packit Service fa4841
			y1 = _mm_add_epi16(y1, c4096);
Packit Service fa4841
			y1 = _mm_srai_epi16(y1, 2);
Packit Service fa4841
			/* cb = cb_g_buf[i]; */
Packit Service fa4841
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit Service fa4841
			cb_buf += step;
Packit Service fa4841
			/* cr = cr_b_buf[i]; */
Packit Service fa4841
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit Service fa4841
			cr_buf += step;
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit Service fa4841
			r1 = _mm_srai_epi16(r1, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			_mm_between_epi16(r1, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit Service fa4841
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit Service fa4841
			g1 = _mm_srai_epi16(g1, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			_mm_between_epi16(g1, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit Service fa4841
			b1 = _mm_srai_epi16(b1, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			_mm_between_epi16(b1, zero, max);
Packit Service fa4841
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit Service fa4841
			y_buf += step;
Packit Service fa4841
			y2 = _mm_add_epi16(y2, c4096);
Packit Service fa4841
			y2 = _mm_srai_epi16(y2, 2);
Packit Service fa4841
			/* cb = cb_g_buf[i]; */
Packit Service fa4841
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit Service fa4841
			cb_buf += step;
Packit Service fa4841
			/* cr = cr_b_buf[i]; */
Packit Service fa4841
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit Service fa4841
			cr_buf += step;
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit Service fa4841
			r2 = _mm_srai_epi16(r2, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			_mm_between_epi16(r2, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit Service fa4841
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit Service fa4841
			g2 = _mm_srai_epi16(g2, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			_mm_between_epi16(g2, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit Service fa4841
			b2 = _mm_srai_epi16(b2, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			_mm_between_epi16(b2, zero, max);
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1, R2, R3, R4;
Packit Service fa4841
				/* The comments below pretend these are 8-byte registers
Packit Service fa4841
				 * rather than 16-byte, for readability.
Packit Service fa4841
				 */
Packit Service fa4841
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R1 = b2;                              /* R1 = 00B700B600B500B4 */
Packit Service fa4841
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = B3G3B2G2B1G1B0G0 */
Packit Service fa4841
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = B7G7B6G6B5G5B4G4 */
Packit Service fa4841
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R3 = r2;                              /* R3 = 00R700R600R500R4 */
Packit Service fa4841
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
Packit Service fa4841
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
Packit Service fa4841
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = R3FFR2FFR1FFR0FF */
Packit Service fa4841
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = R7FFR6FFR5FFR4FF */
Packit Service fa4841
				R0 = R4;                              /* R0 = R4               */
Packit Service fa4841
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = B1G1R1FFB0G0R0FF */
Packit Service fa4841
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = B3G3R3FFB2G2R2FF */
Packit Service fa4841
				R2 = R3;                              /* R2 = R3               */
Packit Service fa4841
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = B5G5R5FFB4G4R4FF */
Packit Service fa4841
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = B7G7R7FFB6G6R6FF */
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < pad; i++)
Packit Service fa4841
		{
Packit Service fa4841
			const INT32 divisor = 16;
Packit Service fa4841
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit Service fa4841
			const INT32 Cb = (*cb_buf++);
Packit Service fa4841
			const INT32 Cr = (*cr_buf++);
Packit Service fa4841
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit Service fa4841
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit Service fa4841
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit Service fa4841
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit Service fa4841
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit Service fa4841
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit Service fa4841
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit Service fa4841
			*d_buf++ = CLIP(B);
Packit Service fa4841
			*d_buf++ = CLIP(G);
Packit Service fa4841
			*d_buf++ = CLIP(R);
Packit Service fa4841
			*d_buf++ = 0xFF;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		d_buf += dstPad;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service fa4841
                                                   BYTE* pDst, UINT32 dstStep,
Packit Service fa4841
                                                   const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const __m128i zero = _mm_setzero_si128();
Packit Service fa4841
	const __m128i max = _mm_set1_epi16(255);
Packit Service fa4841
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
Packit Service fa4841
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
Packit Service fa4841
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
Packit Service fa4841
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
Packit Service fa4841
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit Service fa4841
	const INT16* y_buf = (INT16*)pSrc[0];
Packit Service fa4841
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit Service fa4841
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit Service fa4841
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
	BYTE* d_buf = pDst;
Packit Service fa4841
	UINT32 yp;
Packit Service fa4841
	const size_t dstPad = (dstStep - roi->width * 4);
Packit Service fa4841
#ifdef DO_PREFETCH
Packit Service fa4841
Packit Service fa4841
	/* Prefetch Y's, Cb's, and Cr's. */
Packit Service fa4841
	for (yp = 0; yp < roi->height; yp++)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit Service fa4841
		{
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcStep / sizeof(INT16);
Packit Service fa4841
		cb_buf += srcStep / sizeof(INT16);
Packit Service fa4841
		cr_buf += srcStep / sizeof(INT16);
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	y_buf = (INT16*)(pSrc[0]);
Packit Service fa4841
	cb_buf = (INT16*)(pSrc[1]);
Packit Service fa4841
	cr_buf = (INT16*)(pSrc[2]);
Packit Service fa4841
#endif /* DO_PREFETCH */
Packit Service fa4841
Packit Service fa4841
	for (yp = 0; yp < roi->height; ++yp)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i += 2)
Packit Service fa4841
		{
Packit Service fa4841
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit Service fa4841
			 * we need to convert the floating point factors to signed int
Packit Service fa4841
			 * without losing information.
Packit Service fa4841
			 * The result of this multiplication is 32 bit and we have two
Packit Service fa4841
			 * SSE instructions that return either the hi or lo word.
Packit Service fa4841
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit Service fa4841
			 * take the upper 16 bits of the signed 32-bit result
Packit Service fa4841
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit Service fa4841
			 * it by 2^(16-n).
Packit Service fa4841
			 *
Packit Service fa4841
			 * For the given factors in the conversion matrix the best
Packit Service fa4841
			 * possible n is 14.
Packit Service fa4841
			 *
Packit Service fa4841
			 * Example for calculating r:
Packit Service fa4841
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit Service fa4841
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit Service fa4841
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit Service fa4841
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit Service fa4841
			 */
Packit Service fa4841
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit Service fa4841
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit Service fa4841
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit Service fa4841
			y_buf += step;
Packit Service fa4841
			y1 = _mm_add_epi16(y1, c4096);
Packit Service fa4841
			y1 = _mm_srai_epi16(y1, 2);
Packit Service fa4841
			/* cb = cb_g_buf[i]; */
Packit Service fa4841
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit Service fa4841
			cb_buf += step;
Packit Service fa4841
			/* cr = cr_b_buf[i]; */
Packit Service fa4841
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit Service fa4841
			cr_buf += step;
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit Service fa4841
			r1 = _mm_srai_epi16(r1, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			_mm_between_epi16(r1, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit Service fa4841
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit Service fa4841
			g1 = _mm_srai_epi16(g1, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			_mm_between_epi16(g1, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit Service fa4841
			b1 = _mm_srai_epi16(b1, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			_mm_between_epi16(b1, zero, max);
Packit Service fa4841
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit Service fa4841
			y_buf += step;
Packit Service fa4841
			y2 = _mm_add_epi16(y2, c4096);
Packit Service fa4841
			y2 = _mm_srai_epi16(y2, 2);
Packit Service fa4841
			/* cb = cb_g_buf[i]; */
Packit Service fa4841
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit Service fa4841
			cb_buf += step;
Packit Service fa4841
			/* cr = cr_b_buf[i]; */
Packit Service fa4841
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit Service fa4841
			cr_buf += step;
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit Service fa4841
			r2 = _mm_srai_epi16(r2, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			_mm_between_epi16(r2, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit Service fa4841
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit Service fa4841
			g2 = _mm_srai_epi16(g2, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			_mm_between_epi16(g2, zero, max);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit Service fa4841
			b2 = _mm_srai_epi16(b2, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			_mm_between_epi16(b2, zero, max);
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1, R2, R3, R4;
Packit Service fa4841
				/* The comments below pretend these are 8-byte registers
Packit Service fa4841
				 * rather than 16-byte, for readability.
Packit Service fa4841
				 */
Packit Service fa4841
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R1 = r2;                              /* R1 = 00R700R600R500R4 */
Packit Service fa4841
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = R3G3R2G2R1G1R0G0 */
Packit Service fa4841
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = R7G7R6G6R5G5R4G4 */
Packit Service fa4841
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R3 = b2;                              /* R3 = 00B700B600B500B4 */
Packit Service fa4841
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
Packit Service fa4841
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
Packit Service fa4841
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = B3FFB2FFB1FFB0FF */
Packit Service fa4841
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = B7FFB6FFB5FFB4FF */
Packit Service fa4841
				R0 = R4;                              /* R0 = R4               */
Packit Service fa4841
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = R1G1B1FFR0G0B0FF */
Packit Service fa4841
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = R3G3B3FFR2G2B2FF */
Packit Service fa4841
				R2 = R3;                              /* R2 = R3               */
Packit Service fa4841
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = R5G5B5FFR4G4B4FF */
Packit Service fa4841
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = R7G7B7FFR6G6B6FF */
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
				_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF      */
Packit Service fa4841
				d_buf += sizeof(__m128i);
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < pad; i++)
Packit Service fa4841
		{
Packit Service fa4841
			const INT32 divisor = 16;
Packit Service fa4841
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit Service fa4841
			const INT32 Cb = (*cb_buf++);
Packit Service fa4841
			const INT32 Cr = (*cr_buf++);
Packit Service fa4841
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit Service fa4841
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit Service fa4841
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit Service fa4841
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit Service fa4841
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit Service fa4841
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit Service fa4841
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit Service fa4841
			*d_buf++ = CLIP(R);
Packit Service fa4841
			*d_buf++ = CLIP(G);
Packit Service fa4841
			*d_buf++ = CLIP(B);
Packit Service fa4841
			*d_buf++ = 0xFF;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		d_buf += dstPad;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service fa4841
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit Service fa4841
                                              const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service fa4841
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
Packit Service fa4841
	    (dstStep & 0x0f))
Packit Service fa4841
	{
Packit Service fa4841
		/* We can't maintain 16-byte alignment. */
Packit Service fa4841
		return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	switch (DstFormat)
Packit Service fa4841
	{
Packit Service fa4841
		case PIXEL_FORMAT_BGRA32:
Packit Service fa4841
		case PIXEL_FORMAT_BGRX32:
Packit Service fa4841
			return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_RGBA32:
Packit Service fa4841
		case PIXEL_FORMAT_RGBX32:
Packit Service fa4841
			return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		default:
Packit Service fa4841
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
	}
Packit Service fa4841
}
Packit Service fa4841
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
Packit Service fa4841
 * numbers. See the general code above.
Packit Service fa4841
 */
Packit Service fa4841
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
Packit Service fa4841
                                             INT16* pDst[3], int dstStep,
Packit Service fa4841
                                             const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
Packit Service fa4841
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
Packit Service fa4841
	UINT32 yp;
Packit Service fa4841
	int srcbump, dstbump, imax;
Packit Service fa4841
Packit Service fa4841
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
Packit Service fa4841
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
Packit Service fa4841
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
Packit Service fa4841
	    (srcStep & 127) || (dstStep & 127))
Packit Service fa4841
	{
Packit Service fa4841
		/* We can't maintain 16-byte alignment. */
Packit Service fa4841
		return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	min = _mm_set1_epi16(-128 * 32);
Packit Service fa4841
	max = _mm_set1_epi16(127 * 32);
Packit Service fa4841
	r_buf = (__m128i*)(pSrc[0]);
Packit Service fa4841
	g_buf = (__m128i*)(pSrc[1]);
Packit Service fa4841
	b_buf = (__m128i*)(pSrc[2]);
Packit Service fa4841
	y_buf = (__m128i*)(pDst[0]);
Packit Service fa4841
	cb_buf = (__m128i*)(pDst[1]);
Packit Service fa4841
	cr_buf = (__m128i*)(pDst[2]);
Packit Service fa4841
	y_r = _mm_set1_epi16(9798);    /*  0.299000 << 15 */
Packit Service fa4841
	y_g = _mm_set1_epi16(19235);   /*  0.587000 << 15 */
Packit Service fa4841
	y_b = _mm_set1_epi16(3735);    /*  0.114000 << 15 */
Packit Service fa4841
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
Packit Service fa4841
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
Packit Service fa4841
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
Packit Service fa4841
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
Packit Service fa4841
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
Packit Service fa4841
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
Packit Service fa4841
	srcbump = srcStep / sizeof(__m128i);
Packit Service fa4841
	dstbump = dstStep / sizeof(__m128i);
Packit Service fa4841
#ifdef DO_PREFETCH
Packit Service fa4841
Packit Service fa4841
	/* Prefetch RGB's. */
Packit Service fa4841
	for (yp = 0; yp < roi->height; yp++)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit Service fa4841
		{
Packit Service fa4841
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		r_buf += srcbump;
Packit Service fa4841
		g_buf += srcbump;
Packit Service fa4841
		b_buf += srcbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	r_buf = (__m128i*)(pSrc[0]);
Packit Service fa4841
	g_buf = (__m128i*)(pSrc[1]);
Packit Service fa4841
	b_buf = (__m128i*)(pSrc[2]);
Packit Service fa4841
#endif /* DO_PREFETCH */
Packit Service fa4841
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit Service fa4841
Packit Service fa4841
	for (yp = 0; yp < roi->height; ++yp)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i++)
Packit Service fa4841
		{
Packit Service fa4841
			/* In order to use SSE2 signed 16-bit integer multiplication we
Packit Service fa4841
			 * need to convert the floating point factors to signed int
Packit Service fa4841
			 * without loosing information.  The result of this multiplication
Packit Service fa4841
			 * is 32 bit and using SSE2 we get either the product's hi or lo
Packit Service fa4841
			 * word.  Thus we will multiply the factors by the highest
Packit Service fa4841
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
Packit Service fa4841
			 * result (_mm_mulhi_epi16).  Since the final result needs to
Packit Service fa4841
			 * be scaled by << 5 and also in in order to keep the precision
Packit Service fa4841
			 * within the upper 16 bits we will also have to scale the RGB
Packit Service fa4841
			 * values used in the multiplication by << 5+(16-n).
Packit Service fa4841
			 */
Packit Service fa4841
			__m128i r, g, b, y, cb, cr;
Packit Service fa4841
			r = _mm_load_si128(y_buf + i);
Packit Service fa4841
			g = _mm_load_si128(g_buf + i);
Packit Service fa4841
			b = _mm_load_si128(b_buf + i);
Packit Service fa4841
			/* r<<6; g<<6; b<<6 */
Packit Service fa4841
			r = _mm_slli_epi16(r, 6);
Packit Service fa4841
			g = _mm_slli_epi16(g, 6);
Packit Service fa4841
			b = _mm_slli_epi16(b, 6);
Packit Service fa4841
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
Packit Service fa4841
			y = _mm_mulhi_epi16(r, y_r);
Packit Service fa4841
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
Packit Service fa4841
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
Packit Service fa4841
			y = _mm_add_epi16(y, min);
Packit Service fa4841
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
Packit Service fa4841
			_mm_between_epi16(y, min, max);
Packit Service fa4841
			_mm_store_si128(y_buf + i, y);
Packit Service fa4841
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
Packit Service fa4841
			cb = _mm_mulhi_epi16(r, cb_r);
Packit Service fa4841
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
Packit Service fa4841
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
Packit Service fa4841
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
Packit Service fa4841
			_mm_between_epi16(cb, min, max);
Packit Service fa4841
			_mm_store_si128(cb_buf + i, cb);
Packit Service fa4841
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
Packit Service fa4841
			cr = _mm_mulhi_epi16(r, cr_r);
Packit Service fa4841
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
Packit Service fa4841
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
Packit Service fa4841
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
Packit Service fa4841
			_mm_between_epi16(cr, min, max);
Packit Service fa4841
			_mm_store_si128(cr_buf + i, cr);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcbump;
Packit Service fa4841
		cb_buf += srcbump;
Packit Service fa4841
		cr_buf += srcbump;
Packit Service fa4841
		r_buf += dstbump;
Packit Service fa4841
		g_buf += dstbump;
Packit Service fa4841
		b_buf += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
static pstatus_t
Packit Service fa4841
sse2_RGBToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service fa4841
                                const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit Service fa4841
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit Service fa4841
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit Service fa4841
	BYTE* out;
Packit Service fa4841
	UINT32 srcbump, dstbump, y;
Packit Service fa4841
	out = (BYTE*)pDst;
Packit Service fa4841
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit Service fa4841
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; ++y)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 x;
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 16)
Packit Service fa4841
		{
Packit Service fa4841
			__m128i r, g, b;
Packit Service fa4841
			/* The comments below pretend these are 8-byte registers
Packit Service fa4841
			 * rather than 16-byte, for readability.
Packit Service fa4841
			 */
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service fa4841
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service fa4841
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i gbHi, gbLo, arHi, arLo;
Packit Service fa4841
				{
Packit Service fa4841
					gbLo = _mm_unpacklo_epi8(b, g); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
					gbHi = _mm_unpackhi_epi8(b, g); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service fa4841
					arLo = _mm_unpacklo_epi8(r, a); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service fa4841
					arHi = _mm_unpackhi_epi8(r, a); /* R3 = FFR7FFR6FFR5FFR4 */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit Service fa4841
				}
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			const BYTE R = CLIP(*pr++);
Packit Service fa4841
			const BYTE G = CLIP(*pg++);
Packit Service fa4841
			const BYTE B = CLIP(*pb++);
Packit Service fa4841
			*out++ = B;
Packit Service fa4841
			*out++ = G;
Packit Service fa4841
			*out++ = R;
Packit Service fa4841
			*out++ = 0xFF;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		/* Jump to next row. */
Packit Service fa4841
		pr += srcbump;
Packit Service fa4841
		pg += srcbump;
Packit Service fa4841
		pb += srcbump;
Packit Service fa4841
		out += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
sse2_RGBToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service fa4841
                                const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit Service fa4841
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit Service fa4841
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit Service fa4841
	BYTE* out;
Packit Service fa4841
	UINT32 srcbump, dstbump, y;
Packit Service fa4841
	out = (BYTE*)pDst;
Packit Service fa4841
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit Service fa4841
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; ++y)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 x;
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 16)
Packit Service fa4841
		{
Packit Service fa4841
			__m128i r, g, b;
Packit Service fa4841
			/* The comments below pretend these are 8-byte registers
Packit Service fa4841
			 * rather than 16-byte, for readability.
Packit Service fa4841
			 */
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service fa4841
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service fa4841
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i gbHi, gbLo, arHi, arLo;
Packit Service fa4841
				{
Packit Service fa4841
					gbLo = _mm_unpacklo_epi8(r, g); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
					gbHi = _mm_unpackhi_epi8(r, g); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service fa4841
					arLo = _mm_unpacklo_epi8(b, a); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service fa4841
					arHi = _mm_unpackhi_epi8(b, a); /* R3 = FFR7FFR6FFR5FFR4 */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit Service fa4841
				}
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			const BYTE R = CLIP(*pr++);
Packit Service fa4841
			const BYTE G = CLIP(*pg++);
Packit Service fa4841
			const BYTE B = CLIP(*pb++);
Packit Service fa4841
			*out++ = R;
Packit Service fa4841
			*out++ = G;
Packit Service fa4841
			*out++ = B;
Packit Service fa4841
			*out++ = 0xFF;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		/* Jump to next row. */
Packit Service fa4841
		pr += srcbump;
Packit Service fa4841
		pg += srcbump;
Packit Service fa4841
		pb += srcbump;
Packit Service fa4841
		out += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
sse2_RGBToRGB_16s8u_P3AC4R_XBGR(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service fa4841
                                const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit Service fa4841
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit Service fa4841
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit Service fa4841
	BYTE* out;
Packit Service fa4841
	UINT32 srcbump, dstbump, y;
Packit Service fa4841
	out = (BYTE*)pDst;
Packit Service fa4841
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit Service fa4841
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; ++y)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 x;
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 16)
Packit Service fa4841
		{
Packit Service fa4841
			__m128i r, g, b;
Packit Service fa4841
			/* The comments below pretend these are 8-byte registers
Packit Service fa4841
			 * rather than 16-byte, for readability.
Packit Service fa4841
			 */
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service fa4841
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service fa4841
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i gbHi, gbLo, arHi, arLo;
Packit Service fa4841
				{
Packit Service fa4841
					gbLo = _mm_unpacklo_epi8(a, b); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
					gbHi = _mm_unpackhi_epi8(a, b); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service fa4841
					arLo = _mm_unpacklo_epi8(g, r); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service fa4841
					arHi = _mm_unpackhi_epi8(g, r); /* R3 = FFR7FFR6FFR5FFR4 */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit Service fa4841
				}
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			const BYTE R = CLIP(*pr++);
Packit Service fa4841
			const BYTE G = CLIP(*pg++);
Packit Service fa4841
			const BYTE B = CLIP(*pb++);
Packit Service fa4841
			*out++ = 0xFF;
Packit Service fa4841
			*out++ = B;
Packit Service fa4841
			*out++ = G;
Packit Service fa4841
			*out++ = R;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		/* Jump to next row. */
Packit Service fa4841
		pr += srcbump;
Packit Service fa4841
		pg += srcbump;
Packit Service fa4841
		pb += srcbump;
Packit Service fa4841
		out += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
sse2_RGBToRGB_16s8u_P3AC4R_XRGB(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                                UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                                UINT32 dstStep,         /* bytes between rows in dest data */
Packit Service fa4841
                                const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit Service fa4841
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit Service fa4841
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit Service fa4841
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit Service fa4841
	const UINT32 pad = roi->width % 16;
Packit Service fa4841
	BYTE* out;
Packit Service fa4841
	UINT32 srcbump, dstbump, y;
Packit Service fa4841
	out = (BYTE*)pDst;
Packit Service fa4841
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit Service fa4841
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; ++y)
Packit Service fa4841
	{
Packit Service fa4841
		UINT32 x;
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 16)
Packit Service fa4841
		{
Packit Service fa4841
			__m128i r, g, b;
Packit Service fa4841
			/* The comments below pretend these are 8-byte registers
Packit Service fa4841
			 * rather than 16-byte, for readability.
Packit Service fa4841
			 */
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8; /* R0 = 00B300B200B100B0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pb);
Packit Service fa4841
				pb += 8;                      /* R1 = 00B700B600B500B4 */
Packit Service fa4841
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8; /* R1 = 00G300G200G100G0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pg);
Packit Service fa4841
				pg += 8;                      /* R2 = 00G700G600G500G4 */
Packit Service fa4841
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i R0, R1;
Packit Service fa4841
				R0 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8; /* R0 = 00R300R200R100R0 */
Packit Service fa4841
				R1 = _mm_load_si128((__m128i*)pr);
Packit Service fa4841
				pr += 8;                      /* R3 = 00R700R600R500R4 */
Packit Service fa4841
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				__m128i gbHi, gbLo, arHi, arLo;
Packit Service fa4841
				{
Packit Service fa4841
					gbLo = _mm_unpacklo_epi8(a, r); /* R0 = G7G6G5G4G3G2G1G0 */
Packit Service fa4841
					gbHi = _mm_unpackhi_epi8(a, r); /* R1 = G7B7G6B7G5B5G4B4 */
Packit Service fa4841
					arLo = _mm_unpacklo_epi8(g, b); /* R4 = FFR3FFR2FFR1FFR0 */
Packit Service fa4841
					arHi = _mm_unpackhi_epi8(g, b); /* R3 = FFR7FFR6FFR5FFR4 */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR1G1B1FFR0G0B0      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR3G3B3FFR2G2B2      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR5G5B5FFR4G4B4      */
Packit Service fa4841
				}
Packit Service fa4841
				{
Packit Service fa4841
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit Service fa4841
					_mm_store_si128((__m128i*)out, bgrx);
Packit Service fa4841
					out += 16; /* FFR7G7B7FFR6G6B6      */
Packit Service fa4841
				}
Packit Service fa4841
			}
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			const BYTE R = CLIP(*pr++);
Packit Service fa4841
			const BYTE G = CLIP(*pg++);
Packit Service fa4841
			const BYTE B = CLIP(*pb++);
Packit Service fa4841
			*out++ = 0xFF;
Packit Service fa4841
			*out++ = R;
Packit Service fa4841
			*out++ = G;
Packit Service fa4841
			*out++ = B;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		/* Jump to next row. */
Packit Service fa4841
		pr += srcbump;
Packit Service fa4841
		pg += srcbump;
Packit Service fa4841
		pb += srcbump;
Packit Service fa4841
		out += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                           UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                           UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service fa4841
                           UINT32 DstFormat, const prim_size_t* roi)
Packit Service fa4841
{
Packit Service fa4841
	if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
Packit Service fa4841
	    (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
Packit Service fa4841
		return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
Packit Service fa4841
	switch (DstFormat)
Packit Service fa4841
	{
Packit Service fa4841
		case PIXEL_FORMAT_BGRA32:
Packit Service fa4841
		case PIXEL_FORMAT_BGRX32:
Packit Service fa4841
			return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_RGBA32:
Packit Service fa4841
		case PIXEL_FORMAT_RGBX32:
Packit Service fa4841
			return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ABGR32:
Packit Service fa4841
		case PIXEL_FORMAT_XBGR32:
Packit Service fa4841
			return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ARGB32:
Packit Service fa4841
		case PIXEL_FORMAT_XRGB32:
Packit Service fa4841
			return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
Packit Service fa4841
Packit Service fa4841
		default:
Packit Service fa4841
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
	}
Packit Service fa4841
}
Packit Service fa4841
#endif /* WITH_SSE2 */
Packit Service fa4841
Packit Service fa4841
/*---------------------------------------------------------------------------*/
Packit Service fa4841
#ifdef WITH_NEON
Packit Service fa4841
static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], INT32 srcStep,
Packit Service fa4841
                                             INT16* pDst[3], INT32 dstStep,
Packit Service fa4841
                                             const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	/* TODO: If necessary, check alignments and call the general version. */
Packit Service fa4841
	int16x8_t zero = vdupq_n_s16(0);
Packit Service fa4841
	int16x8_t max = vdupq_n_s16(255);
Packit Service fa4841
	int16x8_t r_cr = vdupq_n_s16(22986);  //  1.403 << 14
Packit Service fa4841
	int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
Packit Service fa4841
	int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
Packit Service fa4841
	int16x8_t b_cb = vdupq_n_s16(28999);  //  1.770 << 14
Packit Service fa4841
	int16x8_t c4096 = vdupq_n_s16(4096);
Packit Service fa4841
	int16x8_t* y_buf = (int16x8_t*)pSrc[0];
Packit Service fa4841
	int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
Packit Service fa4841
	int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
Packit Service fa4841
	int16x8_t* r_buf = (int16x8_t*)pDst[0];
Packit Service fa4841
	int16x8_t* g_buf = (int16x8_t*)pDst[1];
Packit Service fa4841
	int16x8_t* b_buf = (int16x8_t*)pDst[2];
Packit Service fa4841
	int srcbump = srcStep / sizeof(int16x8_t);
Packit Service fa4841
	int dstbump = dstStep / sizeof(int16x8_t);
Packit Service fa4841
	int yp;
Packit Service fa4841
	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
Packit Service fa4841
Packit Service fa4841
	for (yp = 0; yp < roi->height; ++yp)
Packit Service fa4841
	{
Packit Service fa4841
		int i;
Packit Service fa4841
Packit Service fa4841
		for (i = 0; i < imax; i++)
Packit Service fa4841
		{
Packit Service fa4841
			/*
Packit Service fa4841
			    In order to use NEON signed 16-bit integer multiplication we need to convert
Packit Service fa4841
			    the floating point factors to signed int without loosing information.
Packit Service fa4841
			    The result of this multiplication is 32 bit and we have a NEON instruction
Packit Service fa4841
			    that returns the hi word of the saturated double.
Packit Service fa4841
			    Thus we will multiply the factors by the highest possible 2^n, take the
Packit Service fa4841
			    upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
Packit Service fa4841
			    shift by 1 to reverse the doubling) and correct	this result by multiplying it
Packit Service fa4841
			    by 2^(16-n).
Packit Service fa4841
			    For the given factors in the conversion matrix the best possible n is 14.
Packit Service fa4841
Packit Service fa4841
			    Example for calculating r:
Packit Service fa4841
			    r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
Packit Service fa4841
			    r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
Packit Service fa4841
			    r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
Packit Service fa4841
			    r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit Service fa4841
			*/
Packit Service fa4841
			/* y = (y_buf[i] + 4096) >> 2 */
Packit Service fa4841
			int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
Packit Service fa4841
			y = vaddq_s16(y, c4096);
Packit Service fa4841
			y = vshrq_n_s16(y, 2);
Packit Service fa4841
			/* cb = cb_buf[i]; */
Packit Service fa4841
			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
Packit Service fa4841
			/* cr = cr_buf[i]; */
Packit Service fa4841
			int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
Packit Service fa4841
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit Service fa4841
			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
Packit Service fa4841
			r = vshrq_n_s16(r, 3);
Packit Service fa4841
			/* r_buf[i] = CLIP(r); */
Packit Service fa4841
			r = vminq_s16(vmaxq_s16(r, zero), max);
Packit Service fa4841
			vst1q_s16((INT16*)&r_buf[i], r);
Packit Service fa4841
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit Service fa4841
			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
Packit Service fa4841
			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
Packit Service fa4841
			g = vshrq_n_s16(g, 3);
Packit Service fa4841
			/* g_buf[i] = CLIP(g); */
Packit Service fa4841
			g = vminq_s16(vmaxq_s16(g, zero), max);
Packit Service fa4841
			vst1q_s16((INT16*)&g_buf[i], g);
Packit Service fa4841
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit Service fa4841
			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
Packit Service fa4841
			b = vshrq_n_s16(b, 3);
Packit Service fa4841
			/* b_buf[i] = CLIP(b); */
Packit Service fa4841
			b = vminq_s16(vmaxq_s16(b, zero), max);
Packit Service fa4841
			vst1q_s16((INT16*)&b_buf[i], b);
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		y_buf += srcbump;
Packit Service fa4841
		cb_buf += srcbump;
Packit Service fa4841
		cr_buf += srcbump;
Packit Service fa4841
		r_buf += dstbump;
Packit Service fa4841
		g_buf += dstbump;
Packit Service fa4841
		b_buf += dstbump;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service fa4841
                                                BYTE* pDst, UINT32 dstStep, const prim_size_t* roi,
Packit Service fa4841
                                                uint8_t rPos, uint8_t gPos, uint8_t bPos,
Packit Service fa4841
                                                uint8_t aPos)
Packit Service fa4841
{
Packit Service fa4841
	UINT32 x, y;
Packit Service fa4841
	BYTE* pRGB = pDst;
Packit Service fa4841
	const INT16* pY = pSrc[0];
Packit Service fa4841
	const INT16* pCb = pSrc[1];
Packit Service fa4841
	const INT16* pCr = pSrc[2];
Packit Service fa4841
	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
Packit Service fa4841
	const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
Packit Service fa4841
	const size_t pad = roi->width % 8;
Packit Service fa4841
	const int16x4_t c4096 = vdup_n_s16(4096);
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; y++)
Packit Service fa4841
	{
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 8)
Packit Service fa4841
		{
Packit Service fa4841
			const int16x8_t Y = vld1q_s16(pY);
Packit Service fa4841
			const int16x4_t Yh = vget_high_s16(Y);
Packit Service fa4841
			const int16x4_t Yl = vget_low_s16(Y);
Packit Service fa4841
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
Packit Service fa4841
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
Packit Service fa4841
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
Packit Service fa4841
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
Packit Service fa4841
			const int16x8_t Cr = vld1q_s16(pCr);
Packit Service fa4841
			const int16x4_t Crh = vget_high_s16(Cr);
Packit Service fa4841
			const int16x4_t Crl = vget_low_s16(Cr);
Packit Service fa4841
			const int16x8_t Cb = vld1q_s16(pCb);
Packit Service fa4841
			const int16x4_t Cbh = vget_high_s16(Cb);
Packit Service fa4841
			const int16x4_t Cbl = vget_low_s16(Cb);
Packit Service fa4841
			uint8x8x4_t bgrx;
Packit Service fa4841
			{
Packit Service fa4841
				/* R */
Packit Service fa4841
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
Packit Service fa4841
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
Packit Service fa4841
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
Packit Service fa4841
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
Packit Service fa4841
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
Packit Service fa4841
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
Packit Service fa4841
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
Packit Service fa4841
				bgrx.val[rPos] = vqmovun_s16(Rs);
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				/* G */
Packit Service fa4841
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
Packit Service fa4841
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
Packit Service fa4841
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
Packit Service fa4841
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
Packit Service fa4841
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
Packit Service fa4841
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
Packit Service fa4841
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
Packit Service fa4841
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
Packit Service fa4841
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
Packit Service fa4841
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
Packit Service fa4841
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
Packit Service fa4841
				const uint8x8_t G = vqmovun_s16(Gs);
Packit Service fa4841
				bgrx.val[gPos] = G;
Packit Service fa4841
			}
Packit Service fa4841
			{
Packit Service fa4841
				/* B */
Packit Service fa4841
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
Packit Service fa4841
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
Packit Service fa4841
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
Packit Service fa4841
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
Packit Service fa4841
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
Packit Service fa4841
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
Packit Service fa4841
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
Packit Service fa4841
				const uint8x8_t B = vqmovun_s16(Bs);
Packit Service fa4841
				bgrx.val[bPos] = B;
Packit Service fa4841
			}
Packit Service fa4841
			/* A */
Packit Service fa4841
			{
Packit Service fa4841
				bgrx.val[aPos] = vdup_n_u8(0xFF);
Packit Service fa4841
			}
Packit Service fa4841
			vst4_u8(pRGB, bgrx);
Packit Service fa4841
			pY += 8;
Packit Service fa4841
			pCb += 8;
Packit Service fa4841
			pCr += 8;
Packit Service fa4841
			pRGB += 32;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			const INT32 divisor = 16;
Packit Service fa4841
			const INT32 Y = ((*pY++) + 4096) << divisor;
Packit Service fa4841
			const INT32 Cb = (*pCb++);
Packit Service fa4841
			const INT32 Cr = (*pCr++);
Packit Service fa4841
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit Service fa4841
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit Service fa4841
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit Service fa4841
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit Service fa4841
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit Service fa4841
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit Service fa4841
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit Service fa4841
			BYTE bgrx[4];
Packit Service fa4841
			bgrx[bPos] = CLIP(B);
Packit Service fa4841
			bgrx[gPos] = CLIP(G);
Packit Service fa4841
			bgrx[rPos] = CLIP(R);
Packit Service fa4841
			bgrx[aPos] = 0xFF;
Packit Service fa4841
			*pRGB++ = bgrx[0];
Packit Service fa4841
			*pRGB++ = bgrx[1];
Packit Service fa4841
			*pRGB++ = bgrx[2];
Packit Service fa4841
			*pRGB++ = bgrx[3];
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		pY += srcPad;
Packit Service fa4841
		pCb += srcPad;
Packit Service fa4841
		pCr += srcPad;
Packit Service fa4841
		pRGB += dstPad;
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
Packit Service fa4841
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit Service fa4841
                                              const prim_size_t* roi)
Packit Service fa4841
{
Packit Service fa4841
	switch (DstFormat)
Packit Service fa4841
	{
Packit Service fa4841
		case PIXEL_FORMAT_BGRA32:
Packit Service fa4841
		case PIXEL_FORMAT_BGRX32:
Packit Service fa4841
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_RGBA32:
Packit Service fa4841
		case PIXEL_FORMAT_RGBX32:
Packit Service fa4841
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ARGB32:
Packit Service fa4841
		case PIXEL_FORMAT_XRGB32:
Packit Service fa4841
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ABGR32:
Packit Service fa4841
		case PIXEL_FORMAT_XBGR32:
Packit Service fa4841
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit Service fa4841
Packit Service fa4841
		default:
Packit Service fa4841
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
	}
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                             UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                             BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                             UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service fa4841
                             const prim_size_t* roi,     /* region of interest */
Packit Service fa4841
                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
Packit Service fa4841
{
Packit Service fa4841
	UINT32 x, y;
Packit Service fa4841
	UINT32 pad = roi->width % 8;
Packit Service fa4841
Packit Service fa4841
	for (y = 0; y < roi->height; y++)
Packit Service fa4841
	{
Packit Service fa4841
		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
Packit Service fa4841
		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
Packit Service fa4841
		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
Packit Service fa4841
		BYTE* dst = pDst + y * dstStep;
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < roi->width - pad; x += 8)
Packit Service fa4841
		{
Packit Service fa4841
			int16x8_t r = vld1q_s16(pr);
Packit Service fa4841
			int16x8_t g = vld1q_s16(pg);
Packit Service fa4841
			int16x8_t b = vld1q_s16(pb);
Packit Service fa4841
			uint8x8x4_t bgrx;
Packit Service fa4841
			bgrx.val[aPos] = vdup_n_u8(0xFF);
Packit Service fa4841
			bgrx.val[rPos] = vqmovun_s16(r);
Packit Service fa4841
			bgrx.val[gPos] = vqmovun_s16(g);
Packit Service fa4841
			bgrx.val[bPos] = vqmovun_s16(b);
Packit Service fa4841
			vst4_u8(dst, bgrx);
Packit Service fa4841
			pr += 8;
Packit Service fa4841
			pg += 8;
Packit Service fa4841
			pb += 8;
Packit Service fa4841
			dst += 32;
Packit Service fa4841
		}
Packit Service fa4841
Packit Service fa4841
		for (x = 0; x < pad; x++)
Packit Service fa4841
		{
Packit Service fa4841
			BYTE bgrx[4];
Packit Service fa4841
			bgrx[bPos] = *pb++;
Packit Service fa4841
			bgrx[gPos] = *pg++;
Packit Service fa4841
			bgrx[rPos] = *pr++;
Packit Service fa4841
			bgrx[aPos] = 0xFF;
Packit Service fa4841
			*dst++ = bgrx[0];
Packit Service fa4841
			*dst++ = bgrx[1];
Packit Service fa4841
			*dst++ = bgrx[2];
Packit Service fa4841
			*dst++ = bgrx[3];
Packit Service fa4841
		}
Packit Service fa4841
	}
Packit Service fa4841
Packit Service fa4841
	return PRIMITIVES_SUCCESS;
Packit Service fa4841
}
Packit Service fa4841
Packit Service fa4841
static pstatus_t
Packit Service fa4841
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
Packit Service fa4841
                           UINT32 srcStep,             /* bytes between rows in source data */
Packit Service fa4841
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
Packit Service fa4841
                           UINT32 dstStep,             /* bytes between rows in dest data */
Packit Service fa4841
                           UINT32 DstFormat, const prim_size_t* roi) /* region of interest */
Packit Service fa4841
{
Packit Service fa4841
	switch (DstFormat)
Packit Service fa4841
	{
Packit Service fa4841
		case PIXEL_FORMAT_BGRA32:
Packit Service fa4841
		case PIXEL_FORMAT_BGRX32:
Packit Service fa4841
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_RGBA32:
Packit Service fa4841
		case PIXEL_FORMAT_RGBX32:
Packit Service fa4841
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ARGB32:
Packit Service fa4841
		case PIXEL_FORMAT_XRGB32:
Packit Service fa4841
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit Service fa4841
Packit Service fa4841
		case PIXEL_FORMAT_ABGR32:
Packit Service fa4841
		case PIXEL_FORMAT_XBGR32:
Packit Service fa4841
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit Service fa4841
Packit Service fa4841
		default:
Packit Service fa4841
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit Service fa4841
	}
Packit Service fa4841
}
Packit Service fa4841
#endif /* WITH_NEON */
Packit Service fa4841
/* I don't see a direct IPP version of this, since the input is INT16
Packit Service fa4841
 * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
Packit Service fa4841
 * But that would likely be slower.
Packit Service fa4841
 */
Packit Service fa4841
Packit Service fa4841
/* ------------------------------------------------------------------------- */
Packit Service fa4841
/* Install the color conversion primitives into prims.
 *
 * The generic table is cached in the file-level `generic` pointer and
 * primitives_init_colors() fills in prims first; afterwards, if the build has
 * SSE2 or NEON support and the CPU reports the feature at runtime, the
 * corresponding optimized entry points replace the ones set before.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_colors(prims);
#if defined(WITH_SSE2)

	/* Keep the entries set above when the CPU lacks SSE2. */
	if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
		return;

	prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
#elif defined(WITH_NEON)

	/* Keep the entries set above when the CPU lacks NEON. */
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
		return;

	prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
#endif /* WITH_SSE2 */
}