Blame libfreerdp/primitives/prim_colors_opt.c

Packit 1fb8d4
/* FreeRDP: A Remote Desktop Protocol Client
Packit 1fb8d4
 * Optimized Color conversion operations.
Packit 1fb8d4
 * vi:ts=4 sw=4:
Packit 1fb8d4
 *
Packit 1fb8d4
 * Copyright 2011 Stephen Erisman
Packit 1fb8d4
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
Packit 1fb8d4
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
Packit 1fb8d4
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
Packit 1fb8d4
 *
Packit 1fb8d4
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
Packit 1fb8d4
 * not use this file except in compliance with the License. You may obtain
Packit 1fb8d4
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
Packit 1fb8d4
 * Unless required by applicable law or agreed to in writing, software
Packit 1fb8d4
 * distributed under the License is distributed on an "AS IS" BASIS,
Packit 1fb8d4
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
Packit 1fb8d4
 * or implied. See the License for the specific language governing
Packit 1fb8d4
 * permissions and limitations under the License.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
#ifdef HAVE_CONFIG_H
Packit 1fb8d4
#include "config.h"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#include <freerdp/types.h>
Packit 1fb8d4
#include <freerdp/primitives.h>
Packit 1fb8d4
#include <winpr/sysinfo.h>
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
#include <emmintrin.h>
Packit 1fb8d4
#elif defined(WITH_NEON)
Packit 1fb8d4
#include <arm_neon.h>
Packit 1fb8d4
#endif /* WITH_SSE2 else WITH_NEON */
Packit 1fb8d4
Packit 1fb8d4
#include "prim_internal.h"
Packit 1fb8d4
#include "prim_templates.h"
Packit 1fb8d4
Packit 1fb8d4
/* Primitives table the SSE2 routines in this file call into as a fallback
 * when their alignment/size preconditions are not met.  It starts out NULL;
 * where it gets assigned is not visible in this part of the file.
 */
static primitives_t* generic = NULL;
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
Packit 1fb8d4
/* GNU_INLINE: on GNU-compatible compilers expands to the gnu_inline,
 * always_inline and artificial function attributes; expands to nothing
 * elsewhere.
 */
#ifdef __GNUC__
Packit 1fb8d4
# define GNU_INLINE \
Packit 1fb8d4
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
Packit 1fb8d4
#else
Packit 1fb8d4
# define GNU_INLINE
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
/* Assumed cache line size in bytes; the prefetch loops in this file use it
 * as the distance between two consecutive prefetch hints.
 */
#define CACHE_LINE_BYTES	64
Packit 1fb8d4
Packit 1fb8d4
/* Clamp every signed 16-bit lane of _val to the range [_min, _max] and
 * assign the result back to _val (so _val must be an lvalue).
 */
#define _mm_between_epi16(_val, _min, _max) \
Packit 1fb8d4
	do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
Packit 1fb8d4
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
/* Issue one _MM_HINT_NTA prefetch hint per CACHE_LINE_BYTES of a buffer.
 *
 * buffer    - start of the memory to prefetch
 * num_bytes - size of that memory in bytes; only whole 16-byte vectors
 *             are counted, a trailing partial vector is ignored
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
    char* buffer,
    int num_bytes)
{
	const size_t vectorCount = num_bytes / sizeof(__m128i);
	const size_t vectorsPerLine = CACHE_LINE_BYTES / sizeof(__m128i);
	unsigned int idx = 0;

	while (idx < vectorCount)
	{
		/* Address of the idx-th 16-byte vector inside the buffer. */
		_mm_prefetch((char*)(buffer + idx * sizeof(__m128i)), _MM_HINT_NTA);
		idx += vectorsPerLine;
	}
}
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
Packit 1fb8d4
    const INT16* pSrc[3],
Packit 1fb8d4
    int srcStep,
Packit 1fb8d4
    INT16* pDst[3],
Packit 1fb8d4
    int dstStep,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
Packit 1fb8d4
	__m128i* y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
Packit 1fb8d4
	int srcbump, dstbump, yp, imax;
Packit 1fb8d4
Packit 1fb8d4
	if (((ULONG_PTR)(pSrc[0]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[1]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[2]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[0]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[1]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[2]) & 0x0f)
Packit 1fb8d4
	    || (roi->width & 0x07)
Packit 1fb8d4
	    || (srcStep & 127)
Packit 1fb8d4
	    || (dstStep & 127))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit 1fb8d4
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
Packit 1fb8d4
		                                       pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	zero = _mm_setzero_si128();
Packit 1fb8d4
	max = _mm_set1_epi16(255);
Packit 1fb8d4
	y_buf  = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pSrc[2]);
Packit 1fb8d4
	r_buf  = (__m128i*)(pDst[0]);
Packit 1fb8d4
	g_buf  = (__m128i*)(pDst[1]);
Packit 1fb8d4
	b_buf  = (__m128i*)(pDst[2]);
Packit 1fb8d4
	r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
Packit 1fb8d4
	g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
Packit 1fb8d4
	g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
Packit 1fb8d4
	b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
Packit 1fb8d4
	c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	srcbump = srcStep / sizeof(__m128i);
Packit 1fb8d4
	dstbump = dstStep / sizeof(__m128i);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit 1fb8d4
			_mm_prefetch((char*)(&y_buf[i]),  _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		y_buf  += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	y_buf  = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y, cb, cr, r, g, b;
Packit 1fb8d4
			y = _mm_load_si128(y_buf + i);
Packit 1fb8d4
			y = _mm_add_epi16(y, c4096);
Packit 1fb8d4
			y = _mm_srai_epi16(y, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb = _mm_load_si128(cb_buf + i);
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr = _mm_load_si128(cr_buf + i);
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
Packit 1fb8d4
			r = _mm_srai_epi16(r, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r, zero, max);
Packit 1fb8d4
			_mm_store_si128(r_buf + i, r);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
Packit 1fb8d4
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
Packit 1fb8d4
			g = _mm_srai_epi16(g, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g, zero, max);
Packit 1fb8d4
			_mm_store_si128(g_buf + i, g);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
Packit 1fb8d4
			b = _mm_srai_epi16(b, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b, zero, max);
Packit 1fb8d4
			_mm_store_si128(b_buf + i, b);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		y_buf  += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
		r_buf += dstbump;
Packit 1fb8d4
		g_buf += dstbump;
Packit 1fb8d4
		b_buf += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(
Packit 1fb8d4
    const INT16* pSrc[3], UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i max = _mm_set1_epi16(255);
Packit 1fb8d4
	const __m128i r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
Packit 1fb8d4
	const __m128i g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
Packit 1fb8d4
	const __m128i g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
Packit 1fb8d4
	const __m128i b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
Packit 1fb8d4
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	const INT16* y_buf = (INT16*)pSrc[0];
Packit 1fb8d4
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit 1fb8d4
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
	BYTE* d_buf = pDst;
Packit 1fb8d4
	int yp;
Packit 1fb8d4
	const size_t dstPad = (dstStep - roi->width * 4);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax;
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]),  _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		y_buf  += srcStep / sizeof(INT16);
Packit 1fb8d4
		cb_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cr_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	y_buf  = (INT16*)pSrc[0];
Packit 1fb8d4
	cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i += 2)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit 1fb8d4
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y1 = _mm_add_epi16(y1, c4096);
Packit 1fb8d4
			y1 = _mm_srai_epi16(y1, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit 1fb8d4
			r1 = _mm_srai_epi16(r1, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit 1fb8d4
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit 1fb8d4
			g1 = _mm_srai_epi16(g1, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit 1fb8d4
			b1 = _mm_srai_epi16(b1, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b1, zero, max);
Packit 1fb8d4
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y2 = _mm_add_epi16(y2, c4096);
Packit 1fb8d4
			y2 = _mm_srai_epi16(y2, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit 1fb8d4
			r2 = _mm_srai_epi16(r2, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit 1fb8d4
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit 1fb8d4
			g2 = _mm_srai_epi16(g2, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit 1fb8d4
			b2 = _mm_srai_epi16(b2, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b2, zero, max);
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1, R2, R3, R4;
Packit 1fb8d4
				/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
				 * rather than 16-byte, for readability.
Packit 1fb8d4
				 */
Packit 1fb8d4
				R0 = b1; /* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = b2; /* R1 = 00B700B600B500B4 */
Packit 1fb8d4
				R0 = _mm_packus_epi16(R0, R1);	/* R0 = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
				R1 = g1;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R2 = g2;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				R1 = _mm_packus_epi16(R1, R2);				/* R1 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
				R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
				R2 = _mm_unpacklo_epi8(R0, R2);				/* R2 = B3G3B2G2B1G1B0G0 */
Packit 1fb8d4
				R1 = _mm_unpackhi_epi8(R0, R1);				/* R1 = B7G7B6G6B5G5B4G4 */
Packit 1fb8d4
				R0 = r1;		/* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R3 = r2;		/* R3 = 00R700R600R500R4 */
Packit 1fb8d4
				R0 = _mm_packus_epi16(R0, R3);				/* R0 = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
				R3 = _mm_set1_epi32(0xFFFFFFFFU);				/* R3 = FFFFFFFFFFFFFFFF */
Packit 1fb8d4
				R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
Packit 1fb8d4
				R4 = _mm_unpacklo_epi8(R0, R4);				/* R4 = R3FFR2FFR1FFR0FF */
Packit 1fb8d4
				R3 = _mm_unpackhi_epi8(R0, R3);				/* R3 = R7FFR6FFR5FFR4FF */
Packit 1fb8d4
				R0 = R4;						/* R0 = R4               */
Packit 1fb8d4
				R0 = _mm_unpacklo_epi16(R2, R0);				/* R0 = B1G1R1FFB0G0R0FF */
Packit 1fb8d4
				R4 = _mm_unpackhi_epi16(R2, R4);				/* R4 = B3G3R3FFB2G2R2FF */
Packit 1fb8d4
				R2 = R3;						/* R2 = R3               */
Packit 1fb8d4
				R2 = _mm_unpacklo_epi16(R1, R2);				/* R2 = B5G5R5FFB4G4R4FF */
Packit 1fb8d4
				R3 = _mm_unpackhi_epi16(R1, R3);				/* R3 = B7G7R7FFB6G6R6FF */
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < pad; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const INT32 divisor = 16;
Packit 1fb8d4
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit 1fb8d4
			const INT32 Cb = (*cb_buf++);
Packit 1fb8d4
			const INT32 Cr = (*cr_buf++);
Packit 1fb8d4
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit 1fb8d4
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit 1fb8d4
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit 1fb8d4
			*d_buf++ = CLIP(B);
Packit 1fb8d4
			*d_buf++ = CLIP(G);
Packit 1fb8d4
			*d_buf++ = CLIP(R);
Packit 1fb8d4
			*d_buf++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		d_buf += dstPad;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(
Packit 1fb8d4
    const INT16* pSrc[3], UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i max = _mm_set1_epi16(255);
Packit 1fb8d4
	const __m128i r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
Packit 1fb8d4
	const __m128i g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
Packit 1fb8d4
	const __m128i g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
Packit 1fb8d4
	const __m128i b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
Packit 1fb8d4
	const __m128i c4096 = _mm_set1_epi16(4096);
Packit 1fb8d4
	const INT16* y_buf = (INT16*)pSrc[0];
Packit 1fb8d4
	const INT16* cb_buf = (INT16*)pSrc[1];
Packit 1fb8d4
	const INT16* cr_buf = (INT16*)pSrc[2];
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
Packit 1fb8d4
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
	BYTE* d_buf = pDst;
Packit 1fb8d4
	int yp;
Packit 1fb8d4
	const size_t dstPad = (dstStep - roi->width * 4);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch Y's, Cb's, and Cr's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax;
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]),  _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		y_buf  += srcStep / sizeof(INT16);
Packit 1fb8d4
		cb_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
		cr_buf += srcStep / sizeof(INT16);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	y_buf  = (INT16*)(pSrc[0]);
Packit 1fb8d4
	cb_buf = (INT16*)(pSrc[1]);
Packit 1fb8d4
	cr_buf = (INT16*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i += 2)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication
Packit 1fb8d4
			 * we need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.
Packit 1fb8d4
			 * The result of this multiplication is 32 bit and we have two
Packit 1fb8d4
			 * SSE instructions that return either the hi or lo word.
Packit 1fb8d4
			 * Thus we will multiply the factors by the highest possible 2^n,
Packit 1fb8d4
			 * take the upper 16 bits of the signed 32-bit result
Packit 1fb8d4
			 * (_mm_mulhi_epi16) and correct this result by multiplying
Packit 1fb8d4
			 * it by 2^(16-n).
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * For the given factors in the conversion matrix the best
Packit 1fb8d4
			 * possible n is 14.
Packit 1fb8d4
			 *
Packit 1fb8d4
			 * Example for calculating r:
Packit 1fb8d4
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
Packit 1fb8d4
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
Packit 1fb8d4
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
Packit 1fb8d4
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
Packit 1fb8d4
			 */
Packit 1fb8d4
			/* y = (y_r_buf[i] + 4096) >> 2 */
Packit 1fb8d4
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
Packit 1fb8d4
			y1 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y1 = _mm_add_epi16(y1, c4096);
Packit 1fb8d4
			y1 = _mm_srai_epi16(y1, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb1 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr1 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
Packit 1fb8d4
			r1 = _mm_srai_epi16(r1, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
Packit 1fb8d4
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
Packit 1fb8d4
			g1 = _mm_srai_epi16(g1, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g1, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
Packit 1fb8d4
			b1 = _mm_srai_epi16(b1, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b1, zero, max);
Packit 1fb8d4
			y2 = _mm_load_si128((__m128i*)y_buf);
Packit 1fb8d4
			y_buf += step;
Packit 1fb8d4
			y2 = _mm_add_epi16(y2, c4096);
Packit 1fb8d4
			y2 = _mm_srai_epi16(y2, 2);
Packit 1fb8d4
			/* cb = cb_g_buf[i]; */
Packit 1fb8d4
			cb2 = _mm_load_si128((__m128i*)cb_buf);
Packit 1fb8d4
			cb_buf += step;
Packit 1fb8d4
			/* cr = cr_b_buf[i]; */
Packit 1fb8d4
			cr2 = _mm_load_si128((__m128i*)cr_buf);
Packit 1fb8d4
			cr_buf += step;
Packit 1fb8d4
			/* (y + HIWORD(cr*22986)) >> 3 */
Packit 1fb8d4
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
Packit 1fb8d4
			r2 = _mm_srai_epi16(r2, 3);
Packit 1fb8d4
			/* r_buf[i] = CLIP(r); */
Packit 1fb8d4
			_mm_between_epi16(r2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
Packit 1fb8d4
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
Packit 1fb8d4
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
Packit 1fb8d4
			g2 = _mm_srai_epi16(g2, 3);
Packit 1fb8d4
			/* g_buf[i] = CLIP(g); */
Packit 1fb8d4
			_mm_between_epi16(g2, zero, max);
Packit 1fb8d4
			/* (y + HIWORD(cb*28999)) >> 3 */
Packit 1fb8d4
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
Packit 1fb8d4
			b2 = _mm_srai_epi16(b2, 3);
Packit 1fb8d4
			/* b_buf[i] = CLIP(b); */
Packit 1fb8d4
			_mm_between_epi16(b2, zero, max);
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1, R2, R3, R4;
Packit 1fb8d4
				/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
				 * rather than 16-byte, for readability.
Packit 1fb8d4
				 */
Packit 1fb8d4
				R0 = r1; /* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = r2; /* R1 = 00R700R600R500R4 */
Packit 1fb8d4
				R0 = _mm_packus_epi16(R0, R1);	/* R0 = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
				R1 = g1;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R2 = g2;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				R1 = _mm_packus_epi16(R1, R2);				/* R1 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
				R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
				R2 = _mm_unpacklo_epi8(R0, R2);				/* R2 = R3G3R2G2R1G1R0G0 */
Packit 1fb8d4
				R1 = _mm_unpackhi_epi8(R0, R1);				/* R1 = R7G7R6G6R5G5R4G4 */
Packit 1fb8d4
				R0 = b1;		/* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R3 = b2;		/* R3 = 00B700B600B500B4 */
Packit 1fb8d4
				R0 = _mm_packus_epi16(R0, R3);				/* R0 = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
				R3 = _mm_set1_epi32(0xFFFFFFFFU);				/* R3 = FFFFFFFFFFFFFFFF */
Packit 1fb8d4
				R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
Packit 1fb8d4
				R4 = _mm_unpacklo_epi8(R0, R4);				/* R4 = B3FFB2FFB1FFB0FF */
Packit 1fb8d4
				R3 = _mm_unpackhi_epi8(R0, R3);				/* R3 = B7FFB6FFB5FFB4FF */
Packit 1fb8d4
				R0 = R4;						/* R0 = R4               */
Packit 1fb8d4
				R0 = _mm_unpacklo_epi16(R2, R0);				/* R0 = R1G1B1FFR0G0B0FF */
Packit 1fb8d4
				R4 = _mm_unpackhi_epi16(R2, R4);				/* R4 = R3G3B3FFR2G2B2FF */
Packit 1fb8d4
				R2 = R3;						/* R2 = R3               */
Packit 1fb8d4
				R2 = _mm_unpacklo_epi16(R1, R2);				/* R2 = R5G5B5FFR4G4B4FF */
Packit 1fb8d4
				R3 = _mm_unpackhi_epi16(R1, R3);				/* R3 = R7G7B7FFR6G6B6FF */
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
				_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF      */
Packit 1fb8d4
				d_buf += sizeof(__m128i);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < pad; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const INT32 divisor = 16;
Packit 1fb8d4
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
Packit 1fb8d4
			const INT32 Cb = (*cb_buf++);
Packit 1fb8d4
			const INT32 Cr = (*cr_buf++);
Packit 1fb8d4
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
Packit 1fb8d4
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
Packit 1fb8d4
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
Packit 1fb8d4
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
Packit 1fb8d4
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
Packit 1fb8d4
			*d_buf++ = CLIP(R);
Packit 1fb8d4
			*d_buf++ = CLIP(G);
Packit 1fb8d4
			*d_buf++ = CLIP(B);
Packit 1fb8d4
			*d_buf++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		d_buf += dstPad;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(
Packit 1fb8d4
    const INT16* pSrc[3], UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	if (((ULONG_PTR)(pSrc[0]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[1]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[2]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst) & 0x0f)
Packit 1fb8d4
	    || (srcStep & 0x0f)
Packit 1fb8d4
	    || (dstStep & 0x0f))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit 1fb8d4
		return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep,
Packit 1fb8d4
		                                        pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
Packit 1fb8d4
 * numbers. See the general code above.
Packit 1fb8d4
 */
Packit 1fb8d4
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
Packit 1fb8d4
    const INT16* pSrc[3],
Packit 1fb8d4
    int srcStep,
Packit 1fb8d4
    INT16* pDst[3],
Packit 1fb8d4
    int dstStep,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
Packit 1fb8d4
	__m128i* r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
Packit 1fb8d4
	int srcbump, dstbump, yp, imax;
Packit 1fb8d4
Packit 1fb8d4
	if (((ULONG_PTR)(pSrc[0]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[1]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pSrc[2]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[0]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[1]) & 0x0f)
Packit 1fb8d4
	    || ((ULONG_PTR)(pDst[2]) & 0x0f)
Packit 1fb8d4
	    || (roi->width & 0x07)
Packit 1fb8d4
	    || (srcStep & 127)
Packit 1fb8d4
	    || (dstStep & 127))
Packit 1fb8d4
	{
Packit 1fb8d4
		/* We can't maintain 16-byte alignment. */
Packit 1fb8d4
		return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
Packit 1fb8d4
		                                       pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	min = _mm_set1_epi16(-128 * 32);
Packit 1fb8d4
	max = _mm_set1_epi16(127 * 32);
Packit 1fb8d4
	r_buf  = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	g_buf  = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	b_buf  = (__m128i*)(pSrc[2]);
Packit 1fb8d4
	y_buf  = (__m128i*)(pDst[0]);
Packit 1fb8d4
	cb_buf = (__m128i*)(pDst[1]);
Packit 1fb8d4
	cr_buf = (__m128i*)(pDst[2]);
Packit 1fb8d4
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
Packit 1fb8d4
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
Packit 1fb8d4
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
Packit 1fb8d4
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
Packit 1fb8d4
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
Packit 1fb8d4
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
Packit 1fb8d4
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
Packit 1fb8d4
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
Packit 1fb8d4
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
Packit 1fb8d4
	srcbump = srcStep / sizeof(__m128i);
Packit 1fb8d4
	dstbump = dstStep / sizeof(__m128i);
Packit 1fb8d4
#ifdef DO_PREFETCH
Packit 1fb8d4
Packit 1fb8d4
	/* Prefetch RGB's. */
Packit 1fb8d4
	for (yp = 0; yp < roi->height; yp++)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
Packit 1fb8d4
		{
Packit 1fb8d4
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		r_buf += srcbump;
Packit 1fb8d4
		g_buf += srcbump;
Packit 1fb8d4
		b_buf += srcbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	r_buf = (__m128i*)(pSrc[0]);
Packit 1fb8d4
	g_buf = (__m128i*)(pSrc[1]);
Packit 1fb8d4
	b_buf = (__m128i*)(pSrc[2]);
Packit 1fb8d4
#endif /* DO_PREFETCH */
Packit 1fb8d4
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
Packit 1fb8d4
Packit 1fb8d4
	for (yp = 0; yp < roi->height; ++yp)
Packit 1fb8d4
	{
Packit 1fb8d4
		int i;
Packit 1fb8d4
Packit 1fb8d4
		for (i = 0; i < imax; i++)
Packit 1fb8d4
		{
Packit 1fb8d4
			/* In order to use SSE2 signed 16-bit integer multiplication we
Packit 1fb8d4
			 * need to convert the floating point factors to signed int
Packit 1fb8d4
			 * without losing information.  The result of this multiplication
Packit 1fb8d4
			 * is 32 bit and using SSE2 we get either the product's hi or lo
Packit 1fb8d4
			 * word.  Thus we will multiply the factors by the highest
Packit 1fb8d4
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
Packit 1fb8d4
			 * result (_mm_mulhi_epi16).  Since the final result needs to
Packit 1fb8d4
			 * be scaled by << 5 and also in order to keep the precision
Packit 1fb8d4
			 * within the upper 16 bits we will also have to scale the RGB
Packit 1fb8d4
			 * values used in the multiplication by << 5+(16-n).
Packit 1fb8d4
			 */
Packit 1fb8d4
			__m128i r, g, b, y, cb, cr;
Packit 1fb8d4
			r = _mm_load_si128(y_buf + i);
Packit 1fb8d4
			g = _mm_load_si128(g_buf + i);
Packit 1fb8d4
			b = _mm_load_si128(b_buf + i);
Packit 1fb8d4
			/* r<<6; g<<6; b<<6 */
Packit 1fb8d4
			r = _mm_slli_epi16(r, 6);
Packit 1fb8d4
			g = _mm_slli_epi16(g, 6);
Packit 1fb8d4
			b = _mm_slli_epi16(b, 6);
Packit 1fb8d4
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
Packit 1fb8d4
			y = _mm_mulhi_epi16(r, y_r);
Packit 1fb8d4
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
Packit 1fb8d4
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
Packit 1fb8d4
			y = _mm_add_epi16(y, min);
Packit 1fb8d4
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
Packit 1fb8d4
			_mm_between_epi16(y, min, max);
Packit 1fb8d4
			_mm_store_si128(y_buf + i, y);
Packit 1fb8d4
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
Packit 1fb8d4
			cb = _mm_mulhi_epi16(r, cb_r);
Packit 1fb8d4
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
Packit 1fb8d4
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
Packit 1fb8d4
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
Packit 1fb8d4
			_mm_between_epi16(cb, min, max);
Packit 1fb8d4
			_mm_store_si128(cb_buf + i, cb);
Packit 1fb8d4
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
Packit 1fb8d4
			cr = _mm_mulhi_epi16(r, cr_r);
Packit 1fb8d4
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
Packit 1fb8d4
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
Packit 1fb8d4
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
Packit 1fb8d4
			_mm_between_epi16(cr, min, max);
Packit 1fb8d4
			_mm_store_si128(cr_buf + i, cr);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		y_buf  += srcbump;
Packit 1fb8d4
		cb_buf += srcbump;
Packit 1fb8d4
		cr_buf += srcbump;
Packit 1fb8d4
		r_buf += dstbump;
Packit 1fb8d4
		g_buf += dstbump;
Packit 1fb8d4
		b_buf += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit 1fb8d4
	out = (BYTE*) pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R1 = 00B700B600B500B4 */
Packit 1fb8d4
				b = _mm_packus_epi16(R0, R1);				/* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				g = _mm_packus_epi16(R0, R1);				/* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R3 = 00R700R600R500R4 */
Packit 1fb8d4
				r = _mm_packus_epi16(R0, R1);				/* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit 1fb8d4
					gbLo = _mm_unpacklo_epi8(b, g);	/* R0 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
					gbHi = _mm_unpackhi_epi8(b, g);	/* R1 = G7B7G6B7G5B5G4B4 */
Packit 1fb8d4
					arLo = _mm_unpacklo_epi8(r, a);	/* R4 = FFR3FFR2FFR1FFR0 */
Packit 1fb8d4
					arHi = _mm_unpackhi_epi8(r, a);	/* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit 1fb8d4
	out = (BYTE*) pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R1 = 00B700B600B500B4 */
Packit 1fb8d4
				b = _mm_packus_epi16(R0, R1);				/* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				g = _mm_packus_epi16(R0, R1);				/* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R3 = 00R700R600R500R4 */
Packit 1fb8d4
				r = _mm_packus_epi16(R0, R1);				/* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit 1fb8d4
					gbLo = _mm_unpacklo_epi8(r, g);	/* R0 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
					gbHi = _mm_unpackhi_epi8(r, g);	/* R1 = G7B7G6B7G5B5G4B4 */
Packit 1fb8d4
					arLo = _mm_unpacklo_epi8(b, a);	/* R4 = FFR3FFR2FFR1FFR0 */
Packit 1fb8d4
					arHi = _mm_unpackhi_epi8(b, a);	/* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit 1fb8d4
	out = (BYTE*) pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R1 = 00B700B600B500B4 */
Packit 1fb8d4
				b = _mm_packus_epi16(R0, R1);				/* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				g = _mm_packus_epi16(R0, R1);				/* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R3 = 00R700R600R500R4 */
Packit 1fb8d4
				r = _mm_packus_epi16(R0, R1);				/* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit 1fb8d4
					gbLo = _mm_unpacklo_epi8(a, b);	/* R0 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
					gbHi = _mm_unpackhi_epi8(a, b);	/* R1 = G7B7G6B7G5B5G4B4 */
Packit 1fb8d4
					arLo = _mm_unpacklo_epi8(g, r);	/* R4 = FFR3FFR2FFR1FFR0 */
Packit 1fb8d4
					arHi = _mm_unpackhi_epi8(g, r);	/* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT16* pr = (const UINT16*)(pSrc[0]);
Packit 1fb8d4
	const UINT16* pg = (const UINT16*)(pSrc[1]);
Packit 1fb8d4
	const UINT16* pb = (const UINT16*)(pSrc[2]);
Packit 1fb8d4
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	BYTE* out;
Packit 1fb8d4
	UINT32 srcbump, dstbump, y;
Packit 1fb8d4
	out = (BYTE*) pDst;
Packit 1fb8d4
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
Packit 1fb8d4
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; ++y)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i r, g, b;
Packit 1fb8d4
			/* The comments below pretend these are 8-byte registers
Packit 1fb8d4
			 * rather than 16-byte, for readability.
Packit 1fb8d4
			 */
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R0 = 00B300B200B100B0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pb);
Packit 1fb8d4
				pb += 8;		/* R1 = 00B700B600B500B4 */
Packit 1fb8d4
				b = _mm_packus_epi16(R0, R1);				/* b = B7B6B5B4B3B2B1B0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R1 = 00G300G200G100G0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pg);
Packit 1fb8d4
				pg += 8;		/* R2 = 00G700G600G500G4 */
Packit 1fb8d4
				g = _mm_packus_epi16(R0, R1);				/* g = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i R0, R1;
Packit 1fb8d4
				R0 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R0 = 00R300R200R100R0 */
Packit 1fb8d4
				R1 = _mm_load_si128((__m128i*)pr);
Packit 1fb8d4
				pr += 8;		/* R3 = 00R700R600R500R4 */
Packit 1fb8d4
				r = _mm_packus_epi16(R0, R1);				/* r = R7R6R5R4R3R2R1R0 */
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				__m128i gbHi, gbLo, arHi, arLo;
Packit 1fb8d4
				{
Packit 1fb8d4
					gbLo = _mm_unpacklo_epi8(a, r);	/* R0 = G7G6G5G4G3G2G1G0 */
Packit 1fb8d4
					gbHi = _mm_unpackhi_epi8(a, r);	/* R1 = G7B7G6B7G5B5G4B4 */
Packit 1fb8d4
					arLo = _mm_unpacklo_epi8(g, b);	/* R4 = FFR3FFR2FFR1FFR0 */
Packit 1fb8d4
					arHi = _mm_unpackhi_epi8(g, b);	/* R3 = FFR7FFR6FFR5FFR4 */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR1G1B1FFR0G0B0      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR3G3B3FFR2G2B2      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR5G5B5FFR4G4B4      */
Packit 1fb8d4
				}
Packit 1fb8d4
				{
Packit 1fb8d4
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
Packit 1fb8d4
					_mm_store_si128((__m128i*)out, bgrx);
Packit 1fb8d4
					out += 16;	/* FFR7G7B7FFR6G6B6      */
Packit 1fb8d4
				}
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE R = CLIP(*pr++);
Packit 1fb8d4
			const BYTE G = CLIP(*pg++);
Packit 1fb8d4
			const BYTE B = CLIP(*pb++);
Packit 1fb8d4
			*out++ = 0xFF;
Packit 1fb8d4
			*out++ = R;
Packit 1fb8d4
			*out++ = G;
Packit 1fb8d4
			*out++ = B;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		/* Jump to next row. */
Packit 1fb8d4
		pr += srcbump;
Packit 1fb8d4
		pg += srcbump;
Packit 1fb8d4
		pb += srcbump;
Packit 1fb8d4
		out += dstbump;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    UINT32 DstFormat,
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
Packit 1fb8d4
	    (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
Packit 1fb8d4
		return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
#endif /* WITH_SSE2 */
Packit 1fb8d4
Packit 1fb8d4
/*---------------------------------------------------------------------------*/
Packit 1fb8d4
#ifdef WITH_NEON
Packit 1fb8d4
/*
 * Convert eight consecutive Y/Cb/Cr samples to eight R/G/B samples.
 * All three inputs are loaded before anything is stored.
 *
 * In order to use NEON signed 16-bit integer multiplication we need to
 * convert the floating point factors to signed int without losing
 * information.  The result of this multiplication is 32 bit and we have a
 * NEON instruction that returns the hi word of the saturated double.  Thus we
 * multiply the factors by the highest possible 2^n, take the upper 16 bits of
 * the signed 32-bit result (vqdmulhq_s16 followed by a right shift by 1 to
 * reverse the doubling) and correct this result by multiplying it by
 * 2^(16-n).  For the given factors the best possible n is 14.
 *
 * Example for calculating r:
 * r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
 */
static void neon_yCbCrToRGB_16s16s_8px(
    const INT16* pY, const INT16* pCb, const INT16* pCr,
    INT16* pR, INT16* pG, INT16* pB)
{
	const int16x8_t zero = vdupq_n_s16(0);
	const int16x8_t max = vdupq_n_s16(255);
	const int16x8_t r_cr = vdupq_n_s16(22986);	/*  1.403 << 14 */
	const int16x8_t g_cb = vdupq_n_s16(-5636);	/* -0.344 << 14 */
	const int16x8_t g_cr = vdupq_n_s16(-11698);	/* -0.714 << 14 */
	const int16x8_t b_cb = vdupq_n_s16(28999);	/*  1.770 << 14 */
	const int16x8_t c4096 = vdupq_n_s16(4096);
	/* y = (y_buf[i] + 4096) >> 2 */
	const int16x8_t y = vshrq_n_s16(vaddq_s16(vld1q_s16(pY), c4096), 2);
	const int16x8_t cb = vld1q_s16(pCb);
	const int16x8_t cr = vld1q_s16(pCr);
	int16x8_t r, g, b;
	/* r = CLIP((y + HIWORD(cr*22986)) >> 3) */
	r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
	r = vshrq_n_s16(r, 3);
	r = vminq_s16(vmaxq_s16(r, zero), max);
	vst1q_s16(pR, r);
	/* g = CLIP((y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3) */
	g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
	g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
	g = vshrq_n_s16(g, 3);
	g = vminq_s16(vmaxq_s16(g, zero), max);
	vst1q_s16(pG, g);
	/* b = CLIP((y + HIWORD(cb*28999)) >> 3) */
	b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
	b = vshrq_n_s16(b, 3);
	b = vminq_s16(vmaxq_s16(b, zero), max);
	vst1q_s16(pB, b);
}

/**
 * NEON conversion of planar 16-bit Y, Cb, Cr to planar 16-bit R, G, B.
 *
 * pSrc/pDst hold the three plane pointers, srcStep/dstStep are the byte
 * distances between rows, and roi gives the width and height in pixels.
 * Output samples are clamped to 0..255.
 *
 * Rows are processed eight pixels at a time.  A row whose width is not a
 * multiple of eight has its last (width % 8) pixels run through the same
 * vector kernel via a small stack buffer, so no read or write goes beyond
 * the row.  Row pointers advance by the exact byte step, which therefore
 * need not be a multiple of the vector size.
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
    const INT16* pSrc[3],  INT32 srcStep,
    INT16* pDst[3],  INT32 dstStep,
    const prim_size_t* roi)	/* region of interest */
{
	const BYTE* yRow  = (const BYTE*) pSrc[0];
	const BYTE* cbRow = (const BYTE*) pSrc[1];
	const BYTE* crRow = (const BYTE*) pSrc[2];
	BYTE* rRow = (BYTE*) pDst[0];
	BYTE* gRow = (BYTE*) pDst[1];
	BYTE* bRow = (BYTE*) pDst[2];
	const UINT32 vectors = roi->width / 8;	/* full 8-pixel groups per row */
	const UINT32 rest = roi->width % 8;	/* leftover pixels per row */
	UINT32 yp;

	for (yp = 0; yp < roi->height; ++yp)
	{
		const INT16* y  = (const INT16*) yRow;
		const INT16* cb = (const INT16*) cbRow;
		const INT16* cr = (const INT16*) crRow;
		INT16* r = (INT16*) rRow;
		INT16* g = (INT16*) gRow;
		INT16* b = (INT16*) bRow;
		UINT32 i;

		for (i = 0; i < vectors; i++)
		{
			neon_yCbCrToRGB_16s16s_8px(y, cb, cr, r, g, b);
			y += 8;
			cb += 8;
			cr += 8;
			r += 8;
			g += 8;
			b += 8;
		}

		if (rest)
		{
			/* Stage the partial group so the kernel can use full-width
			 * loads and stores; unused lanes are zero and are discarded. */
			INT16 in[3][8] = { { 0 } };
			INT16 out[3][8];

			for (i = 0; i < rest; i++)
			{
				in[0][i] = y[i];
				in[1][i] = cb[i];
				in[2][i] = cr[i];
			}

			neon_yCbCrToRGB_16s16s_8px(in[0], in[1], in[2], out[0], out[1], out[2]);

			for (i = 0; i < rest; i++)
			{
				r[i] = out[0][i];
				g[i] = out[1][i];
				b[i] = out[2][i];
			}
		}

		yRow  += srcStep;
		cbRow += srcStep;
		crRow += srcStep;
		rRow += dstStep;
		gRow += dstStep;
		bRow += dstStep;
	}

	return PRIMITIVES_SUCCESS;
}
Packit 1fb8d4
Packit 1fb8d4
/**
 * NEON conversion of planar 16-bit Y, Cb, Cr to interleaved 32-bit pixels
 * with a caller-chosen channel order.
 *
 * pSrc holds the Y, Cb and Cr plane pointers; srcStep and dstStep are the
 * byte distances between rows; roi gives width and height in pixels.
 * rPos/gPos/bPos/aPos are the byte indices (0..3) of red, green, blue and
 * the constant 0xFF byte inside every output pixel.
 *
 * Eight pixels are converted per vector iteration in 16.16 fixed point; the
 * last (width % 8) pixels of each row use scalar code.
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(
    const INT16* pSrc[3], UINT32 srcStep,
    BYTE* pDst, UINT32 dstStep,
    const prim_size_t* roi,
    uint8_t rPos,
    uint8_t gPos,
    uint8_t bPos,
    uint8_t aPos)
{
	UINT32 x, y;
	BYTE* pRGB = pDst;
	const INT16* pY  = pSrc[0];
	const INT16* pCb = pSrc[1];
	const INT16* pCr = pSrc[2];
	/* Source gap in INT16 elements, because pY/pCb/pCr are INT16 pointers. */
	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
	/* Destination gap in BYTES, because pRGB is a byte pointer. */
	const size_t dstPad = dstStep - (roi->width * 4);
	const size_t pad = roi->width % 8;
	const int16x4_t c4096 = vdup_n_s16(4096);

	for (y = 0; y < roi->height; y++)
	{
		for (x = 0; x < roi->width - pad; x += 8)
		{
			const int16x8_t Y = vld1q_s16(pY);
			const int16x4_t Yh = vget_high_s16(Y);
			const int16x4_t Yl = vget_low_s16(Y);
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16); /* to 16.16 fixed point */
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
			const int16x8_t Cr = vld1q_s16(pCr);
			const int16x4_t Crh = vget_high_s16(Cr);
			const int16x4_t Crl = vget_low_s16(Cr);
			const int16x8_t Cb = vld1q_s16(pCb);
			const int16x4_t Cbh = vget_high_s16(Cb);
			const int16x4_t Cbl = vget_low_s16(Cb);
			uint8x8x4_t bgrx;
			{
				/* R = (Y + 1.402525 * Cr) >> 21, saturated to a byte */
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
				bgrx.val[rPos] = vqmovun_s16(Rs);
			}
			{
				/* G = (Y - 0.343730 * Cb - 0.714401 * Cr) >> 21, saturated */
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
				const uint8x8_t G = vqmovun_s16(Gs);
				bgrx.val[gPos] = G;
			}
			{
				/* B = (Y + 1.769905 * Cb) >> 21, saturated */
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
				const uint8x8_t B = vqmovun_s16(Bs);
				bgrx.val[bPos] = B;
			}
			/* A */
			{
				bgrx.val[aPos] = vdup_n_u8(0xFF);
			}
			/* Interleaving store: val[0..3] become bytes 0..3 of each pixel. */
			vst4_u8(pRGB, bgrx);
			pY += 8;
			pCb += 8;
			pCr += 8;
			pRGB += 32;
		}

		/* Scalar tail for the last (width % 8) pixels of the row.
		 * NOTE(review): the truncating casts below yield slightly smaller
		 * coefficients than the rounded constants in the vector loop
		 * (e.g. 91915 vs 91916), so tail pixels may differ by one step. */
		for (x = 0; x < pad; x++)
		{
			const INT32 divisor = 16;
			/* Shift as unsigned: left-shifting a negative value is undefined. */
			const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor);
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(B);
			bgrx[gPos] = CLIP(G);
			bgrx[rPos] = CLIP(R);
			bgrx[aPos] = 0xFF;
			*pRGB++ = bgrx[0];
			*pRGB++ = bgrx[1];
			*pRGB++ = bgrx[2];
			*pRGB++ = bgrx[3];
		}

		/* Skip row padding to reach the start of the next row. */
		pY += srcPad;
		pCb += srcPad;
		pCr += srcPad;
		pRGB += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(
Packit 1fb8d4
    const INT16* pSrc[3], UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,			/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    const prim_size_t* roi,	/* region of interest */
Packit 1fb8d4
    uint8_t rPos,
Packit 1fb8d4
    uint8_t gPos,
Packit 1fb8d4
    uint8_t bPos,
Packit 1fb8d4
    uint8_t aPos)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	UINT32 pad = roi->width % 8;
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
Packit 1fb8d4
		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
Packit 1fb8d4
		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
Packit 1fb8d4
		BYTE* dst = pDst + y * dstStep;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < roi->width - pad; x += 8)
Packit 1fb8d4
		{
Packit 1fb8d4
			int16x8_t r = vld1q_s16(pr);
Packit 1fb8d4
			int16x8_t g = vld1q_s16(pg);
Packit 1fb8d4
			int16x8_t b = vld1q_s16(pb);
Packit 1fb8d4
			uint8x8x4_t bgrx;
Packit 1fb8d4
			bgrx.val[aPos] = vdup_n_u8(0xFF);
Packit 1fb8d4
			bgrx.val[rPos] = vqmovun_s16(r);
Packit 1fb8d4
			bgrx.val[gPos] = vqmovun_s16(g);
Packit 1fb8d4
			bgrx.val[bPos] = vqmovun_s16(b);
Packit 1fb8d4
			vst4_u8(dst, bgrx);
Packit 1fb8d4
			pr += 8;
Packit 1fb8d4
			pg += 8;
Packit 1fb8d4
			pb += 8;
Packit 1fb8d4
			dst += 32;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x ++)
Packit 1fb8d4
		{
Packit 1fb8d4
			BYTE bgrx[4];
Packit 1fb8d4
			bgrx[bPos] = *pb++;
Packit 1fb8d4
			bgrx[gPos] = *pg++;
Packit 1fb8d4
			bgrx[rPos] = *pr++;
Packit 1fb8d4
			bgrx[aPos] = 0xFF;
Packit 1fb8d4
			*dst++ = bgrx[0];
Packit 1fb8d4
			*dst++ = bgrx[1];
Packit 1fb8d4
			*dst++ = bgrx[2];
Packit 1fb8d4
			*dst++ = bgrx[3];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R(
Packit 1fb8d4
    const INT16* const pSrc[3],	/* 16-bit R,G, and B arrays */
Packit 1fb8d4
    UINT32 srcStep,			/* bytes between rows in source data */
Packit 1fb8d4
    BYTE* pDst,			/* 32-bit interleaved ARGB (ABGR?) data */
Packit 1fb8d4
    UINT32 dstStep,			/* bytes between rows in dest data */
Packit 1fb8d4
    UINT32 DstFormat,
Packit 1fb8d4
    const prim_size_t* roi)	/* region of interest */
Packit 1fb8d4
{
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_RGBA32:
Packit 1fb8d4
		case PIXEL_FORMAT_RGBX32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ARGB32:
Packit 1fb8d4
		case PIXEL_FORMAT_XRGB32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
Packit 1fb8d4
Packit 1fb8d4
		case PIXEL_FORMAT_ABGR32:
Packit 1fb8d4
		case PIXEL_FORMAT_XBGR32:
Packit 1fb8d4
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
#endif /* WITH_NEON */
Packit 1fb8d4
/* I don't see a direct IPP version of this, since the input is INT16
Packit 1fb8d4
 * YCbCr.  It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
Packit 1fb8d4
 * But that would likely be slower.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
/* ------------------------------------------------------------------------- */
Packit 1fb8d4
/**
 * Fill the colour-conversion entries of a primitives table.
 *
 * The file-level `generic` pointer is refreshed first, because the
 * accelerated dispatchers in this file fall back through it.  Then
 * primitives_init_colors() is run on the table, and finally selected
 * entries are overridden with the SSE2 or NEON variants when the matching
 * build option is enabled and the CPU reports the feature at run time.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
	/* Order of these two calls is kept exactly as it was. */
	generic = primitives_get_generic();
	primitives_init_colors(prims);

#ifdef WITH_SSE2

	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		/* YCbCr -> RGB */
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
		/* RGB -> YCbCr */
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
		/* planar RGB -> packed RGB */
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
	}

#elif defined(WITH_NEON)

	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		/* YCbCr -> RGB */
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
		/* planar RGB -> packed RGB */
		prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
	}

#endif /* WITH_SSE2 else WITH_NEON */
}
Packit 1fb8d4