Blame libfreerdp/primitives/prim_YUV_ssse3.c

Packit 1fb8d4
/**
Packit 1fb8d4
 * FreeRDP: A Remote Desktop Protocol Implementation
Packit 1fb8d4
 * Optimized YUV/RGB conversion operations
Packit 1fb8d4
 *
Packit 1fb8d4
 * Copyright 2014 Thomas Erbesdobler
Packit 1fb8d4
 * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
Packit 1fb8d4
 * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
Packit 1fb8d4
 * Copyright 2016-2017 Thincast Technologies GmbH
Packit 1fb8d4
 *
Packit 1fb8d4
 * Licensed under the Apache License, Version 2.0 (the "License");
Packit 1fb8d4
 * you may not use this file except in compliance with the License.
Packit 1fb8d4
 * You may obtain a copy of the License at
Packit 1fb8d4
 *
Packit 1fb8d4
 *     http://www.apache.org/licenses/LICENSE-2.0
Packit 1fb8d4
 *
Packit 1fb8d4
 * Unless required by applicable law or agreed to in writing, software
Packit 1fb8d4
 * distributed under the License is distributed on an "AS IS" BASIS,
Packit 1fb8d4
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Packit 1fb8d4
 * See the License for the specific language governing permissions and
Packit 1fb8d4
 * limitations under the License.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
#ifdef HAVE_CONFIG_H
Packit 1fb8d4
#include "config.h"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#include <winpr/sysinfo.h>
Packit 1fb8d4
#include <winpr/crt.h>
Packit 1fb8d4
#include <freerdp/types.h>
Packit 1fb8d4
#include <freerdp/primitives.h>
Packit 1fb8d4
Packit 1fb8d4
#include "prim_internal.h"
Packit 1fb8d4
Packit 1fb8d4
#include <emmintrin.h>
Packit 1fb8d4
#include <tmmintrin.h>
Packit 1fb8d4
Packit 1fb8d4
#if !defined(WITH_SSE2)
Packit 1fb8d4
#error "This file needs WITH_SSE2 enabled!"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
/* Fallback primitives table: the SSSE3 entry points in this file forward to it
 * whenever they meet a pixel format or a buffer alignment they do not handle. */
static primitives_t* generic = NULL;
Packit 1fb8d4
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
/* SSSE3 YUV420 -> RGB conversion                                           */
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
/* Multiply the signed 16-bit lanes of `values` by `factor` and return the full
 * 32-bit products of the four upper lanes. */
static __m128i ssse3_YUV444Pixel_scaled(__m128i values, short factor)
{
	const __m128i f = _mm_set1_epi16(factor);
	const __m128i lowHalves = _mm_mullo_epi16(values, f);
	const __m128i highHalves = _mm_mulhi_epi16(values, f);
	return _mm_unpackhi_epi16(lowHalves, highHalves);
}

/* Shift four 32-bit sums right by 8, saturate them to unsigned bytes and move
 * each byte to the position inside its pixel selected by `byteMap`. */
static __m128i ssse3_YUV444Pixel_channel(__m128i sums, __m128i byteMap)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i shifted = _mm_srai_epi32(sums, 8);
	const __m128i words = _mm_packs_epi32(shifted, zero);
	const __m128i bytes = _mm_packus_epi16(words, zero);
	return _mm_shuffle_epi8(bytes, byteMap);
}

/**
 * Convert four pixels out of a group of sixteen YUV444 samples to BGRX.
 *
 * With C = 256 * Y, D = U - 128 and E = V - 128 the channels are computed as
 *   R = (C + 403 * E) >> 8
 *   G = (C - 48 * D - 120 * E) >> 8
 *   B = (C + 475 * D) >> 8
 * each saturated to 0..255; the fourth byte of every pixel is set to 0xFF.
 *
 * @param dst   destination for the four pixels (16 bytes, unaligned store)
 * @param Yraw  sixteen Y samples
 * @param Uraw  sixteen U samples
 * @param Vraw  sixteen V samples
 * @param pos   which quarter (0..3) of the sixteen samples to convert;
 *              it indexes four-entry tables and is not range checked
 *
 * @return dst advanced by one 128-bit element
 */
static __m128i* ssse3_YUV444Pixel(__m128i* dst, __m128i Yraw, __m128i Uraw, __m128i Vraw, UINT8 pos)
{
	/* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */
	/* Note: This also applies to Visual Studio 2013 before Update 4 */
#if !defined(_MSC_VER) || (_MSC_VER > 1600)
	/* Place Y sample i in byte 1 of 32-bit lane i, i.e. the lane holds Y * 256 */
	const __m128i mapY[] =
	{
		_mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
		_mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
		_mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
		_mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80)
	};
	/* Zero-extend four chroma samples to 16 bit in the upper half of the register */
	const __m128i mapUV[] =
	{
		_mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
		_mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
		_mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
		_mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080)
	};
	/* Move result byte i to byte 2 (R), byte 1 (G) or byte 0 (B) of pixel i */
	const __m128i mask[] =
	{
		_mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
		_mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
		_mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000)
	};
#else
	/* Note: must be in little-endian format ! */
	const __m128i mapY[] =
	{
		{ 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 },
		{ 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x07, 0x80, 0x80 },
		{ 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, 0x80, 0x80, 0x0b, 0x80, 0x80 },
		{ 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, 0x80, 0x80, 0x0f, 0x80, 0x80 }
	};
	const __m128i mapUV[] =
	{
		{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, 0x80, 0x02, 0x80, 0x03, 0x80 },
		{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, 0x80, 0x06, 0x80, 0x07, 0x80 },
		{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, 0x80, 0x0a, 0x80, 0x0b, 0x80 },
		{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, 0x80, 0x0e, 0x80, 0x0f, 0x80 }
	};
	const __m128i mask[] =
	{
		{ 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80 },
		{ 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 },
		{ 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80 }
	};
#endif
	const __m128i bias = _mm_set1_epi16(128);
	const __m128i opaque = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
	/* C = Y * 256, D = U - 128, E = V - 128 */
	const __m128i luma = _mm_shuffle_epi8(Yraw, mapY[pos]);
	const __m128i cb = _mm_sub_epi16(_mm_shuffle_epi8(Uraw, mapUV[pos]), bias);
	const __m128i cr = _mm_sub_epi16(_mm_shuffle_epi8(Vraw, mapUV[pos]), bias);
	/* One register per colour channel, already placed at its byte inside the pixel */
	const __m128i redSum = _mm_add_epi32(luma, ssse3_YUV444Pixel_scaled(cr, 403));
	const __m128i greenChroma = _mm_add_epi32(ssse3_YUV444Pixel_scaled(cb, 48),
	                                          ssse3_YUV444Pixel_scaled(cr, 120));
	const __m128i greenSum = _mm_sub_epi32(luma, greenChroma);
	const __m128i blueSum = _mm_add_epi32(luma, ssse3_YUV444Pixel_scaled(cb, 475));
	const __m128i red = ssse3_YUV444Pixel_channel(redSum, mask[0]);
	const __m128i green = ssse3_YUV444Pixel_channel(greenSum, mask[1]);
	const __m128i blue = ssse3_YUV444Pixel_channel(blueSum, mask[2]);
	const __m128i pixels = _mm_or_si128(_mm_or_si128(opaque, red), _mm_or_si128(green, blue));
	_mm_storeu_si128(dst, pixels);
	return dst + 1;
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_YUV420ToRGB_BGRX(
Packit 1fb8d4
    const BYTE** pSrc, const UINT32* srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep,
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT32 nWidth = roi->width;
Packit 1fb8d4
	const UINT32 nHeight = roi->height;
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
Packit 1fb8d4
	UINT32 y;
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < nHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
		__m128i* dst = (__m128i*)(pDst + dstStep * y);
Packit 1fb8d4
		const BYTE* YData = pSrc[0] + y * srcStep[0];
Packit 1fb8d4
		const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
Packit 1fb8d4
		const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < nWidth - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			const __m128i Y = _mm_loadu_si128((__m128i*)YData);
Packit 1fb8d4
			const __m128i uRaw = _mm_loadu_si128((__m128i*)UData);
Packit 1fb8d4
			const __m128i vRaw = _mm_loadu_si128((__m128i*)VData);
Packit 1fb8d4
			const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
Packit 1fb8d4
			const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
Packit 1fb8d4
			YData += 16;
Packit 1fb8d4
			UData += 8;
Packit 1fb8d4
			VData += 8;
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE Y = *YData++;
Packit 1fb8d4
			const BYTE U = *UData;
Packit 1fb8d4
			const BYTE V = *VData;
Packit 1fb8d4
			const BYTE r = YUV2R(Y, U, V);
Packit 1fb8d4
			const BYTE g = YUV2G(Y, U, V);
Packit 1fb8d4
			const BYTE b = YUV2B(Y, U, V);
Packit 1fb8d4
			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF);
Packit 1fb8d4
Packit 1fb8d4
			if (x % 2)
Packit 1fb8d4
			{
Packit 1fb8d4
				UData++;
Packit 1fb8d4
				VData++;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_YUV420ToRGB(
Packit 1fb8d4
    const BYTE** pSrc, const UINT32* srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
			return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(
Packit 1fb8d4
    const BYTE** pSrc, const UINT32* srcStep,
Packit 1fb8d4
    BYTE* pDst, UINT32 dstStep,
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT32 nWidth = roi->width;
Packit 1fb8d4
	const UINT32 nHeight = roi->height;
Packit 1fb8d4
	const UINT32 pad = roi->width % 16;
Packit 1fb8d4
	UINT32 y;
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < nHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		UINT32 x;
Packit 1fb8d4
		__m128i* dst = (__m128i*)(pDst + dstStep * y);
Packit 1fb8d4
		const BYTE* YData = pSrc[0] + y * srcStep[0];
Packit 1fb8d4
		const BYTE* UData = pSrc[1] + y * srcStep[1];
Packit 1fb8d4
		const BYTE* VData = pSrc[2] + y * srcStep[2];
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < nWidth - pad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			__m128i Y = _mm_load_si128((__m128i*)YData);
Packit 1fb8d4
			__m128i U = _mm_load_si128((__m128i*)UData);
Packit 1fb8d4
			__m128i V = _mm_load_si128((__m128i*)VData);
Packit 1fb8d4
			YData += 16;
Packit 1fb8d4
			UData += 16;
Packit 1fb8d4
			VData += 16;
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
Packit 1fb8d4
			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < pad; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const BYTE Y = *YData++;
Packit 1fb8d4
			const BYTE U = *UData++;
Packit 1fb8d4
			const BYTE V = *VData++;
Packit 1fb8d4
			const BYTE r = YUV2R(Y, U, V);
Packit 1fb8d4
			const BYTE g = YUV2G(Y, U, V);
Packit 1fb8d4
			const BYTE b = YUV2B(Y, U, V);
Packit 1fb8d4
			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF);
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE** pSrc, const UINT32* srcStep,
Packit 1fb8d4
        BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
Packit 1fb8d4
        const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	if ((unsigned long)pSrc[0] % 16 || (unsigned long)pSrc[1] % 16 || (unsigned long)pSrc[2] % 16 ||
Packit 1fb8d4
	    srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
Packit 1fb8d4
		return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
Packit 1fb8d4
	switch (DstFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
			return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
/* SSSE3 RGB -> YUV420 conversion                                          **/
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
/**
 * Note (nfedera):
 * The forward transformation factors used for RGB to YUV are based on the
 * values specified in [Rec. ITU-R BT.709-6] Section 3:
 * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
 *
 * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
 * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
 * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
 *
 * The most accurate integer arithmetic approximation when using 8-bit signed
 * integer factors with 16-bit signed integer intermediate results is:
 *
 * Y = ( ( 27 * R + 92 * G +  9 * B) >> 7 );
 * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128;
 * V = ( ( 128 * R - 116 * G -  12 * B) >> 8 ) + 128;
 *
 * Because the signed 8-bit range is [-128,127], the U and V factors of 128
 * are rounded to 127.
 */
Packit 1fb8d4
Packit 1fb8d4
/* Per-byte weights for _mm_maddubs_epi16, one 4-byte group per pixel.
 * _mm_set_epi8 lists the most significant byte first, so in memory order each
 * group reads B, G, R, X with the X byte weighted 0. */
#define BGRX_Y_FACTORS \
	_mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
#define BGRX_U_FACTORS \
	_mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
#define BGRX_V_FACTORS \
	_mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)

/* Subtracting this vector with _mm_sub_epi8 adds 128 to every byte */
#define CONST128_FACTORS \
	_mm_set1_epi8(-128)

/* Right shifts applied to the weighted sums of each component */
#define Y_SHIFT 7
#define U_SHIFT 8
#define V_SHIFT 8
Packit 1fb8d4
Packit 1fb8d4
/*
TODO:
RGB[AX] can be supported simply by using the following factors. Instead of
loading the globals directly, the functions below could be passed pointers to
the correct vectors depending on the source picture format.

PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
      27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
     -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
      64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
};
*/
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
/* compute the luma (Y) component from a single rgb source line */
Packit 1fb8d4
Packit 1fb8d4
static INLINE void ssse3_RGBToYUV420_BGRX_Y(
Packit 1fb8d4
    const BYTE* src, BYTE* dst, UINT32 width)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x;
Packit 1fb8d4
	__m128i x0, x1, x2, x3;
Packit 1fb8d4
	const __m128i y_factors = BGRX_Y_FACTORS;
Packit 1fb8d4
	const __m128i* argb = (const __m128i*) src;
Packit 1fb8d4
	__m128i* ydst = (__m128i*) dst;
Packit 1fb8d4
Packit 1fb8d4
	for (x = 0; x < width; x += 16)
Packit 1fb8d4
	{
Packit 1fb8d4
		/* store 16 rgba pixels in 4 128 bit registers */
Packit 1fb8d4
		x0 = _mm_load_si128(argb++); // 1st 4 pixels
Packit 1fb8d4
		x1 = _mm_load_si128(argb++); // 2nd 4 pixels
Packit 1fb8d4
		x2 = _mm_load_si128(argb++); // 3rd 4 pixels
Packit 1fb8d4
		x3 = _mm_load_si128(argb++); // 4th 4 pixels
Packit 1fb8d4
		/* multiplications and subtotals */
Packit 1fb8d4
		x0 = _mm_maddubs_epi16(x0, y_factors);
Packit 1fb8d4
		x1 = _mm_maddubs_epi16(x1, y_factors);
Packit 1fb8d4
		x2 = _mm_maddubs_epi16(x2, y_factors);
Packit 1fb8d4
		x3 = _mm_maddubs_epi16(x3, y_factors);
Packit 1fb8d4
		/* the total sums */
Packit 1fb8d4
		x0 = _mm_hadd_epi16(x0, x1);
Packit 1fb8d4
		x2 = _mm_hadd_epi16(x2, x3);
Packit 1fb8d4
		/* shift the results */
Packit 1fb8d4
		x0 = _mm_srli_epi16(x0, Y_SHIFT);
Packit 1fb8d4
		x2 = _mm_srli_epi16(x2, Y_SHIFT);
Packit 1fb8d4
		/* pack the 16 words into bytes */
Packit 1fb8d4
		x0 = _mm_packus_epi16(x0, x2);
Packit 1fb8d4
		/* save to y plane */
Packit 1fb8d4
		_mm_storeu_si128(ydst++, x0);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* compute the chrominance (UV) components from two rgb source lines */
Packit 1fb8d4
Packit 1fb8d4
static INLINE void ssse3_RGBToYUV420_BGRX_UV(
Packit 1fb8d4
    const BYTE* src1, const BYTE* src2,
Packit 1fb8d4
    BYTE* dst1, BYTE* dst2, UINT32 width)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x;
Packit 1fb8d4
	const __m128i u_factors = BGRX_U_FACTORS;
Packit 1fb8d4
	const __m128i v_factors = BGRX_V_FACTORS;
Packit 1fb8d4
	const __m128i vector128 = CONST128_FACTORS;
Packit 1fb8d4
	__m128i x0, x1, x2, x3, x4, x5;
Packit 1fb8d4
	const __m128i* rgb1 = (const __m128i*)src1;
Packit 1fb8d4
	const __m128i* rgb2 = (const __m128i*)src2;
Packit 1fb8d4
	__m64* udst = (__m64*)dst1;
Packit 1fb8d4
	__m64* vdst = (__m64*)dst2;
Packit 1fb8d4
Packit 1fb8d4
	for (x = 0; x < width; x += 16)
Packit 1fb8d4
	{
Packit 1fb8d4
		/* subsample 16x2 pixels into 16x1 pixels */
Packit 1fb8d4
		x0 = _mm_load_si128(rgb1++);
Packit 1fb8d4
		x4 = _mm_load_si128(rgb2++);
Packit 1fb8d4
		x0 = _mm_avg_epu8(x0, x4);
Packit 1fb8d4
		x1 = _mm_load_si128(rgb1++);
Packit 1fb8d4
		x4 = _mm_load_si128(rgb2++);
Packit 1fb8d4
		x1 = _mm_avg_epu8(x1, x4);
Packit 1fb8d4
		x2 = _mm_load_si128(rgb1++);
Packit 1fb8d4
		x4 = _mm_load_si128(rgb2++);
Packit 1fb8d4
		x2 = _mm_avg_epu8(x2, x4);
Packit 1fb8d4
		x3 = _mm_load_si128(rgb1++);
Packit 1fb8d4
		x4 = _mm_load_si128(rgb2++);
Packit 1fb8d4
		x3 = _mm_avg_epu8(x3, x4);
Packit 1fb8d4
		/* subsample these 16x1 pixels into 8x1 pixels */
Packit 1fb8d4
		/**
Packit 1fb8d4
		 * shuffle controls
Packit 1fb8d4
		 * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
Packit 1fb8d4
		 * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
Packit 1fb8d4
		 */
Packit 1fb8d4
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
Packit 1fb8d4
		x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
Packit 1fb8d4
		x0 = _mm_avg_epu8(x0, x4);
Packit 1fb8d4
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
Packit 1fb8d4
		x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
Packit 1fb8d4
		x1 = _mm_avg_epu8(x1, x4);
Packit 1fb8d4
		/* multiplications and subtotals */
Packit 1fb8d4
		x2 = _mm_maddubs_epi16(x0, u_factors);
Packit 1fb8d4
		x3 = _mm_maddubs_epi16(x1, u_factors);
Packit 1fb8d4
		x4 = _mm_maddubs_epi16(x0, v_factors);
Packit 1fb8d4
		x5 = _mm_maddubs_epi16(x1, v_factors);
Packit 1fb8d4
		/* the total sums */
Packit 1fb8d4
		x0 = _mm_hadd_epi16(x2, x3);
Packit 1fb8d4
		x1 = _mm_hadd_epi16(x4, x5);
Packit 1fb8d4
		/* shift the results */
Packit 1fb8d4
		x0 = _mm_srai_epi16(x0, U_SHIFT);
Packit 1fb8d4
		x1 = _mm_srai_epi16(x1, V_SHIFT);
Packit 1fb8d4
		/* pack the 16 words into bytes */
Packit 1fb8d4
		x0 = _mm_packs_epi16(x0, x1);
Packit 1fb8d4
		/* add 128 */
Packit 1fb8d4
		x0 = _mm_sub_epi8(x0, vector128);
Packit 1fb8d4
		/* the lower 8 bytes go to the u plane */
Packit 1fb8d4
		_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
Packit 1fb8d4
		/* the upper 8 bytes go to the v plane */
Packit 1fb8d4
		_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToYUV420_BGRX(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst[3], UINT32 dstStep[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 y;
Packit 1fb8d4
	const BYTE* argb = pSrc;
Packit 1fb8d4
	BYTE* ydst = pDst[0];
Packit 1fb8d4
	BYTE* udst = pDst[1];
Packit 1fb8d4
	BYTE* vdst = pDst[2];
Packit 1fb8d4
Packit 1fb8d4
	if (roi->height < 1 || roi->width < 1)
Packit 1fb8d4
	{
Packit 1fb8d4
		return !PRIMITIVES_SUCCESS;
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
Packit 1fb8d4
	{
Packit 1fb8d4
		return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height - 1; y += 2)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BYTE* line1 = argb;
Packit 1fb8d4
		const BYTE* line2 = argb + srcStep;
Packit 1fb8d4
		ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
Packit 1fb8d4
		ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
Packit 1fb8d4
		ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
Packit 1fb8d4
		argb += 2 * srcStep;
Packit 1fb8d4
		ydst += 2 * dstStep[0];
Packit 1fb8d4
		udst += 1 * dstStep[1];
Packit 1fb8d4
		vdst += 1 * dstStep[2];
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	if (roi->height & 1)
Packit 1fb8d4
	{
Packit 1fb8d4
		/* pass the same last line of an odd height twice for UV */
Packit 1fb8d4
		ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
Packit 1fb8d4
		ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToYUV420(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst[3], UINT32 dstStep[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (srcFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
			return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
/* SSSE3 RGB -> AVC444-YUV conversion                                      **/
Packit 1fb8d4
/****************************************************************************/
Packit 1fb8d4
Packit 1fb8d4
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
Packit 1fb8d4
    const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
Packit 1fb8d4
    BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x;
Packit 1fb8d4
	const __m128i* argbEven = (const __m128i*) srcEven;
Packit 1fb8d4
	const __m128i* argbOdd = (const __m128i*) srcOdd;
Packit 1fb8d4
	const __m128i y_factors = BGRX_Y_FACTORS;
Packit 1fb8d4
	const __m128i u_factors = BGRX_U_FACTORS;
Packit 1fb8d4
	const __m128i v_factors = BGRX_V_FACTORS;
Packit 1fb8d4
	const __m128i vector128 = CONST128_FACTORS;
Packit 1fb8d4
Packit 1fb8d4
	for (x = 0; x < width; x += 16)
Packit 1fb8d4
	{
Packit 1fb8d4
		/* store 16 rgba pixels in 4 128 bit registers */
Packit 1fb8d4
		const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
Packit 1fb8d4
		const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
Packit 1fb8d4
		const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
Packit 1fb8d4
		const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
Packit 1fb8d4
		const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
Packit 1fb8d4
		const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
Packit 1fb8d4
		const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
Packit 1fb8d4
		const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
Packit 1fb8d4
		{
Packit 1fb8d4
			/* Y: multiplications with subtotals and horizontal sums */
Packit 1fb8d4
			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xe2, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xe4, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i ye = _mm_packus_epi16(ye1, ye2);
Packit 1fb8d4
			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xo2, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xo4, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i yo = _mm_packus_epi16(yo1, yo2);
Packit 1fb8d4
			/* store y [b1] */
Packit 1fb8d4
			_mm_storeu_si128((__m128i*)b1Even, ye);
Packit 1fb8d4
			b1Even += 16;
Packit 1fb8d4
Packit 1fb8d4
			if (b1Odd)
Packit 1fb8d4
			{
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)b1Odd, yo);
Packit 1fb8d4
				b1Odd += 16;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		{
Packit 1fb8d4
			/* We have now
Packit 1fb8d4
			   * 16 even U values in ue
Packit 1fb8d4
			   * 16 odd U values in uo
Packit 1fb8d4
			   *
Packit 1fb8d4
			   * We need to split these according to
Packit 1fb8d4
			   * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
Packit 1fb8d4
			__m128i ue, uo;
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe2, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe4, u_factors)), U_SHIFT);
Packit 1fb8d4
				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (b1Odd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo2, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo4, u_factors)), U_SHIFT);
Packit 1fb8d4
				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			/* Now we need the following storage distribution:
Packit 1fb8d4
			 * 2x   2y    -> b2
Packit 1fb8d4
			 * x    2y+1  -> b4
Packit 1fb8d4
			 * 2x+1 2y    -> b6 */
Packit 1fb8d4
			if (b1Odd) /* b2 */
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i hi = _mm_add_epi16(ueh, uoh);
Packit 1fb8d4
				const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i lo = _mm_add_epi16(uel, uol);
Packit 1fb8d4
				const __m128i added = _mm_hadd_epi16(lo, hi);
Packit 1fb8d4
				const __m128i avg16 = _mm_srai_epi16(added, 2);
Packit 1fb8d4
				const __m128i avg = _mm_packus_epi16(avg16, avg16);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b2, avg);
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 12, 10, 8, 6, 4, 2, 0);
Packit 1fb8d4
				const __m128i ud = _mm_shuffle_epi8(ue, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b2, ud);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			b2 += 8;
Packit 1fb8d4
Packit 1fb8d4
			if (b1Odd) /* b4 */
Packit 1fb8d4
			{
Packit 1fb8d4
				_mm_store_si128((__m128i*)b4, uo);
Packit 1fb8d4
				b4 += 16;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			{
Packit 1fb8d4
				/* b6 */
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				const __m128i ude = _mm_shuffle_epi8(ue, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b6, ude);
Packit 1fb8d4
				b6 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
		{
Packit 1fb8d4
			/* We have now
Packit 1fb8d4
			   * 16 even V values in ue
Packit 1fb8d4
			   * 16 odd V values in uo
Packit 1fb8d4
			   *
Packit 1fb8d4
			   * We need to split these according to
Packit 1fb8d4
			   * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
Packit 1fb8d4
			__m128i ve, vo;
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe2, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe4, v_factors)), V_SHIFT);
Packit 1fb8d4
				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (b1Odd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo2, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo4, v_factors)), V_SHIFT);
Packit 1fb8d4
				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			/* Now we need the following storage distribution:
Packit 1fb8d4
			 * 2x   2y    -> b3
Packit 1fb8d4
			 * x    2y+1  -> b5
Packit 1fb8d4
			 * 2x+1 2y    -> b7 */
Packit 1fb8d4
			if (b1Odd) /* b3 */
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i hi = _mm_add_epi16(veh, voh);
Packit 1fb8d4
				const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
Packit 1fb8d4
				const __m128i lo = _mm_add_epi16(vel, vol);
Packit 1fb8d4
				const __m128i added = _mm_hadd_epi16(lo, hi);
Packit 1fb8d4
				const __m128i avg16 = _mm_srai_epi16(added, 2);
Packit 1fb8d4
				const __m128i avg = _mm_packus_epi16(avg16, avg16);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b3, avg);
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 12, 10, 8, 6, 4, 2, 0);
Packit 1fb8d4
				const __m128i vd = _mm_shuffle_epi8(ve, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b3, vd);
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			b3 += 8;
Packit 1fb8d4
Packit 1fb8d4
			if (b1Odd) /* b5 */
Packit 1fb8d4
			{
Packit 1fb8d4
				_mm_store_si128((__m128i*)b5, vo);
Packit 1fb8d4
				b5 += 16;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			{
Packit 1fb8d4
				/* b7 */
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				const __m128i vde = _mm_shuffle_epi8(ve, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)b7, vde);
Packit 1fb8d4
				b7 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst1[3], const UINT32 dst1Step[3],
Packit 1fb8d4
    BYTE* pDst2[3], const UINT32 dst2Step[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 y;
Packit 1fb8d4
	const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
Packit 1fb8d4
Packit 1fb8d4
	if (roi->height < 1 || roi->width < 1)
Packit 1fb8d4
		return !PRIMITIVES_SUCCESS;
Packit 1fb8d4
Packit 1fb8d4
	if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
Packit 1fb8d4
		return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; y += 2)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BOOL last = (y >= (roi->height - 1));
Packit 1fb8d4
		const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
Packit 1fb8d4
		const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
Packit 1fb8d4
		const UINT32 i = y >> 1;
Packit 1fb8d4
		const UINT32 n = (i & ~7) + i;
Packit 1fb8d4
		BYTE* b1Even = pDst1[0] + y * dst1Step[0];
Packit 1fb8d4
		BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
Packit 1fb8d4
		BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
Packit 1fb8d4
		BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
Packit 1fb8d4
		BYTE* b4 = pDst2[0] + dst2Step[0] * n;
Packit 1fb8d4
		BYTE* b5 = b4 + 8 * dst2Step[0];
Packit 1fb8d4
		BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
Packit 1fb8d4
		BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
Packit 1fb8d4
		ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
Packit 1fb8d4
		                                     roi->width);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToAVC444YUV(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst1[3], const UINT32 dst1Step[3],
Packit 1fb8d4
    BYTE* pDst2[3], const UINT32 dst2Step[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (srcFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
			return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* Mapping of arguments:
Packit 1fb8d4
 *
Packit 1fb8d4
 * b1 [even lines] -> yLumaDstEven
Packit 1fb8d4
 * b1 [odd lines]  -> yLumaDstOdd
Packit 1fb8d4
 * b2              -> uLumaDst
Packit 1fb8d4
 * b3              -> vLumaDst
Packit 1fb8d4
 * b4              -> yChromaDst1
Packit 1fb8d4
 * b5              -> yChromaDst2
Packit 1fb8d4
 * b6              -> uChromaDst1
Packit 1fb8d4
 * b7              -> uChromaDst2
Packit 1fb8d4
 * b8              -> vChromaDst1
Packit 1fb8d4
 * b9              -> vChromaDst2
Packit 1fb8d4
 */
Packit 1fb8d4
static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
Packit 1fb8d4
    const BYTE* srcEven, const BYTE* srcOdd,
Packit 1fb8d4
    BYTE* yLumaDstEven, BYTE* yLumaDstOdd,
Packit 1fb8d4
    BYTE* uLumaDst, BYTE* vLumaDst,
Packit 1fb8d4
    BYTE* yEvenChromaDst1, BYTE* yEvenChromaDst2,
Packit 1fb8d4
    BYTE* yOddChromaDst1, BYTE* yOddChromaDst2,
Packit 1fb8d4
    BYTE* uChromaDst1, BYTE* uChromaDst2,
Packit 1fb8d4
    BYTE* vChromaDst1, BYTE* vChromaDst2,
Packit 1fb8d4
    UINT32 width)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x;
Packit 1fb8d4
	const __m128i vector128 = CONST128_FACTORS;
Packit 1fb8d4
	const __m128i* argbEven = (const __m128i*) srcEven;
Packit 1fb8d4
	const __m128i* argbOdd = (const __m128i*) srcOdd;
Packit 1fb8d4
Packit 1fb8d4
	for (x = 0; x < width; x += 16)
Packit 1fb8d4
	{
Packit 1fb8d4
		/* store 16 rgba pixels in 4 128 bit registers
Packit 1fb8d4
		 * for even and odd rows.
Packit 1fb8d4
		 */
Packit 1fb8d4
		const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
Packit 1fb8d4
		const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
Packit 1fb8d4
		const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
Packit 1fb8d4
		const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
Packit 1fb8d4
		const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */
Packit 1fb8d4
		const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */
Packit 1fb8d4
		const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */
Packit 1fb8d4
		const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */
Packit 1fb8d4
		{
Packit 1fb8d4
			/* Y: multiplications with subtotals and horizontal sums */
Packit 1fb8d4
			const __m128i y_factors = BGRX_Y_FACTORS;
Packit 1fb8d4
			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xe2, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xe4, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i ye = _mm_packus_epi16(ye1, ye2);
Packit 1fb8d4
			/* store y [b1] */
Packit 1fb8d4
			_mm_storeu_si128((__m128i*)yLumaDstEven, ye);
Packit 1fb8d4
			yLumaDstEven += 16;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		if (yLumaDstOdd)
Packit 1fb8d4
		{
Packit 1fb8d4
			const __m128i y_factors = BGRX_Y_FACTORS;
Packit 1fb8d4
			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xo2, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
Packit 1fb8d4
			                                   _mm_maddubs_epi16(xo4, y_factors)), Y_SHIFT);
Packit 1fb8d4
			const __m128i yo = _mm_packus_epi16(yo1, yo2);
Packit 1fb8d4
			_mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
Packit 1fb8d4
			yLumaDstOdd += 16;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		{
Packit 1fb8d4
			/* We have now
Packit 1fb8d4
			   * 16 even U values in ue
Packit 1fb8d4
			   * 16 odd U values in uo
Packit 1fb8d4
			   *
Packit 1fb8d4
			   * We need to split these according to
Packit 1fb8d4
			   * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
Packit 1fb8d4
			/* U: multiplications with subtotals and horizontal sums */
Packit 1fb8d4
			__m128i ue, uo, uavg;
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u_factors = BGRX_U_FACTORS;
Packit 1fb8d4
				const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe2, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe4, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
Packit 1fb8d4
				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
Packit 1fb8d4
				uavg = ueavg;
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u_factors = BGRX_U_FACTORS;
Packit 1fb8d4
				const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo2, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo4, u_factors)), U_SHIFT);
Packit 1fb8d4
				const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
Packit 1fb8d4
				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
Packit 1fb8d4
				uavg = _mm_add_epi16(uavg, uoavg);
Packit 1fb8d4
				uavg = _mm_srai_epi16(uavg, 2);
Packit 1fb8d4
				uavg = _mm_packs_epi16(uavg, uoavg);
Packit 1fb8d4
				uavg = _mm_sub_epi8(uavg, vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
			/* Now we need the following storage distribution:
Packit 1fb8d4
			 * 2x   2y    -> uLumaDst
Packit 1fb8d4
			 * 2x+1  y    -> yChromaDst1
Packit 1fb8d4
			 * 4x   2y+1  -> uChromaDst1
Packit 1fb8d4
			 * 4x+2 2y+1  -> vChromaDst1 */
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				const __m128i ude = _mm_shuffle_epi8(ue, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
Packit 1fb8d4
				yEvenChromaDst1 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				const __m128i udo = _mm_shuffle_epi8(uo, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
Packit 1fb8d4
				yOddChromaDst1 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 10, 6, 2, 12, 8, 4, 0);
Packit 1fb8d4
				const __m128i ud = _mm_shuffle_epi8(uo, mask);
Packit 1fb8d4
				int* uDst1 = (int*)uChromaDst1;
Packit 1fb8d4
				int* vDst1 = (int*)vChromaDst1;
Packit 1fb8d4
				const int* src = (const int*)&ud;
Packit 1fb8d4
				_mm_stream_si32(uDst1, src[0]);
Packit 1fb8d4
				_mm_stream_si32(vDst1, src[1]);
Packit 1fb8d4
				uChromaDst1 += 4;
Packit 1fb8d4
				vChromaDst1 += 4;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)uLumaDst, uavg);
Packit 1fb8d4
				uLumaDst += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 12, 10, 8, 6, 4, 2, 0);
Packit 1fb8d4
				const __m128i ud = _mm_shuffle_epi8(ue, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)uLumaDst, ud);
Packit 1fb8d4
				uLumaDst += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		{
Packit 1fb8d4
			/* V: multiplications with subtotals and horizontal sums */
Packit 1fb8d4
			__m128i ve, vo, vavg;
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i v_factors = BGRX_V_FACTORS;
Packit 1fb8d4
				const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe2, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xe4, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
Packit 1fb8d4
				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
Packit 1fb8d4
				vavg = veavg;
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i v_factors = BGRX_V_FACTORS;
Packit 1fb8d4
				const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo2, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
Packit 1fb8d4
				                                   _mm_maddubs_epi16(xo4, v_factors)), V_SHIFT);
Packit 1fb8d4
				const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
Packit 1fb8d4
				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
Packit 1fb8d4
				vavg = _mm_add_epi16(vavg, voavg);
Packit 1fb8d4
				vavg = _mm_srai_epi16(vavg, 2);
Packit 1fb8d4
				vavg = _mm_packs_epi16(vavg, voavg);
Packit 1fb8d4
				vavg = _mm_sub_epi8(vavg, vector128);
Packit 1fb8d4
			}
Packit 1fb8d4
			/* Now we need the following storage distribution:
Packit 1fb8d4
			 * 2x   2y    -> vLumaDst
Packit 1fb8d4
			 * 2x+1  y    -> yChromaDst2
Packit 1fb8d4
			 * 4x   2y+1  -> uChromaDst2
Packit 1fb8d4
			 * 4x+2 2y+1  -> vChromaDst2 */
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				__m128i vde = _mm_shuffle_epi8(ve, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
Packit 1fb8d4
				yEvenChromaDst2 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  15, 13, 11, 9, 7, 5, 3, 1);
Packit 1fb8d4
				__m128i vdo = _mm_shuffle_epi8(vo, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
Packit 1fb8d4
				yOddChromaDst2 += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 10, 6, 2, 12, 8, 4, 0);
Packit 1fb8d4
				const __m128i vd = _mm_shuffle_epi8(vo, mask);
Packit 1fb8d4
				int* uDst2 = (int*)uChromaDst2;
Packit 1fb8d4
				int* vDst2 = (int*)vChromaDst2;
Packit 1fb8d4
				const int* src = (const int*)&vd;
Packit 1fb8d4
				_mm_stream_si32(uDst2, src[0]);
Packit 1fb8d4
				_mm_stream_si32(vDst2, src[1]);
Packit 1fb8d4
				uChromaDst2 += 4;
Packit 1fb8d4
				vChromaDst2 += 4;
Packit 1fb8d4
			}
Packit 1fb8d4
Packit 1fb8d4
			if (yLumaDstOdd)
Packit 1fb8d4
			{
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)vLumaDst, vavg);
Packit 1fb8d4
				vLumaDst += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
			else
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
Packit 1fb8d4
				                                  14, 12, 10, 8, 6, 4, 2, 0);
Packit 1fb8d4
				__m128i vd = _mm_shuffle_epi8(ve, mask);
Packit 1fb8d4
				_mm_storel_epi64((__m128i*)vLumaDst, vd);
Packit 1fb8d4
				vLumaDst += 8;
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst1[3], const UINT32 dst1Step[3],
Packit 1fb8d4
    BYTE* pDst2[3], const UINT32 dst2Step[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 y;
Packit 1fb8d4
Packit 1fb8d4
	if (roi->height < 1 || roi->width < 1)
Packit 1fb8d4
		return !PRIMITIVES_SUCCESS;
Packit 1fb8d4
Packit 1fb8d4
	if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
Packit 1fb8d4
		return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
Packit 1fb8d4
	for (y = 0; y < roi->height; y += 2)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BYTE* srcEven = (pSrc + y * srcStep);
Packit 1fb8d4
		const BYTE* srcOdd = (srcEven + srcStep);
Packit 1fb8d4
		BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
Packit 1fb8d4
		BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
Packit 1fb8d4
		BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
Packit 1fb8d4
		BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
Packit 1fb8d4
		BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
Packit 1fb8d4
		BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
Packit 1fb8d4
		BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
Packit 1fb8d4
		BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
Packit 1fb8d4
		BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
Packit 1fb8d4
		BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
Packit 1fb8d4
		BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
Packit 1fb8d4
		BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
Packit 1fb8d4
		ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven,
Packit 1fb8d4
		                                       dstLumaYOdd, dstLumaU, dstLumaV,
Packit 1fb8d4
		                                       dstEvenChromaY1, dstEvenChromaY2,
Packit 1fb8d4
		                                       dstOddChromaY1, dstOddChromaY2,
Packit 1fb8d4
		                                       dstChromaU1, dstChromaU2,
Packit 1fb8d4
		                                       dstChromaV1, dstChromaV2,
Packit 1fb8d4
		                                       roi->width);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_RGBToAVC444YUVv2(
Packit 1fb8d4
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
Packit 1fb8d4
    BYTE* pDst1[3], const UINT32 dst1Step[3],
Packit 1fb8d4
    BYTE* pDst2[3], const UINT32 dst2Step[3],
Packit 1fb8d4
    const prim_size_t* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	switch (srcFormat)
Packit 1fb8d4
	{
Packit 1fb8d4
		case PIXEL_FORMAT_BGRX32:
Packit 1fb8d4
		case PIXEL_FORMAT_BGRA32:
Packit 1fb8d4
			return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3],
Packit 1fb8d4
                                    BYTE* pDstRaw[3], const UINT32 dstStep[3],
Packit 1fb8d4
                                    const RECTANGLE_16* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	const UINT32 nWidth = roi->right - roi->left;
Packit 1fb8d4
	const UINT32 nHeight = roi->bottom - roi->top;
Packit 1fb8d4
	const UINT32 halfWidth = (nWidth + 1) / 2;
Packit 1fb8d4
	const UINT32 halfPad = halfWidth % 16;
Packit 1fb8d4
	const UINT32 halfHeight = (nHeight + 1) / 2;
Packit 1fb8d4
	const UINT32 oddY = 1;
Packit 1fb8d4
	const UINT32 evenY = 0;
Packit 1fb8d4
	const UINT32 oddX = 1;
Packit 1fb8d4
	const UINT32 evenX = 0;
Packit 1fb8d4
	const BYTE* pSrc[3] =
Packit 1fb8d4
	{
Packit 1fb8d4
		pSrcRaw[0] + roi->top* srcStep[0] + roi->left,
Packit 1fb8d4
		pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
Packit 1fb8d4
		pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2
Packit 1fb8d4
	};
Packit 1fb8d4
	BYTE* pDst[3] =
Packit 1fb8d4
	{
Packit 1fb8d4
		pDstRaw[0] + roi->top* dstStep[0] + roi->left,
Packit 1fb8d4
		pDstRaw[1] + roi->top* dstStep[1] + roi->left,
Packit 1fb8d4
		pDstRaw[2] + roi->top* dstStep[2] + roi->left
Packit 1fb8d4
	};
Packit 1fb8d4
Packit 1fb8d4
	/* Y data is already here... */
Packit 1fb8d4
	/* B1 */
Packit 1fb8d4
	for (y = 0; y < nHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BYTE* Ym = pSrc[0] + srcStep[0] * y;
Packit 1fb8d4
		BYTE* pY = pDst[0] + dstStep[0] * y;
Packit 1fb8d4
		memcpy(pY, Ym, nWidth);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	/* The first half of U, V are already here part of this frame. */
Packit 1fb8d4
	/* B2 and B3 */
Packit 1fb8d4
	for (y = 0; y < halfHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const UINT32 val2y = (2 * y + evenY);
Packit 1fb8d4
		const UINT32 val2y1 = val2y + oddY;
Packit 1fb8d4
		const BYTE* Um = pSrc[1] + srcStep[1] * y;
Packit 1fb8d4
		const BYTE* Vm = pSrc[2] + srcStep[2] * y;
Packit 1fb8d4
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
Packit 1fb8d4
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
Packit 1fb8d4
		BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
Packit 1fb8d4
		BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < halfWidth - halfPad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
Packit 1fb8d4
			const __m128i unpackLow = _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u = _mm_loadu_si128((__m128i*)&Um[x]);
Packit 1fb8d4
				const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
Packit 1fb8d4
				const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pU[2 * x], uHigh);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow);
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u = _mm_loadu_si128((__m128i*)&Vm[x]);
Packit 1fb8d4
				const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
Packit 1fb8d4
				const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
Packit 1fb8d4
				_mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (; x < halfWidth; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 val2x = 2 * x + evenX;
Packit 1fb8d4
			const UINT32 val2x1 = val2x + oddX;
Packit 1fb8d4
			pU[val2x] = Um[x];
Packit 1fb8d4
			pV[val2x] = Vm[x];
Packit 1fb8d4
			pU[val2x1] = Um[x];
Packit 1fb8d4
			pV[val2x1] = Vm[x];
Packit 1fb8d4
			pU1[val2x] = Um[x];
Packit 1fb8d4
			pV1[val2x] = Vm[x];
Packit 1fb8d4
			pU1[val2x1] = Um[x];
Packit 1fb8d4
			pV1[val2x1] = Vm[x];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/* Chroma reconstruction filter for 16 bytes (8 horizontal sample pairs).
 *
 * pSrcDst is the upper row and is updated in place, pSrc2 is the row below.
 * Every even byte of the upper row becomes
 *     saturate_u8(4 * even - odd - below_even - below_odd)
 * while the odd bytes of the upper row are written back unchanged.
 */
static INLINE void ssse3_filter(BYTE* pSrcDst, const BYTE* pSrc2)
{
	/* Selectors that widen the even / odd bytes to 16 bit lanes. */
	const __m128i widenEven = _mm_set_epi8(0x80, 14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80, 6, 0x80, 4,
	                                       0x80, 2, 0x80, 0);
	const __m128i widenOdd = _mm_set_epi8(0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5,
	                                      0x80, 3, 0x80, 1);
	/* Selector interleaving the lower 8 bytes with the upper 8 bytes. */
	const __m128i zip = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
	const __m128i upper = _mm_loadu_si128((const __m128i*)pSrcDst);
	const __m128i lower = _mm_loadu_si128((const __m128i*)pSrc2);
	const __m128i upperOdd = _mm_shuffle_epi8(upper, widenOdd);
	const __m128i center4 = _mm_slli_epi16(_mm_shuffle_epi8(upper, widenEven), 2);
	const __m128i neighbours = _mm_add_epi16(_mm_add_epi16(upperOdd,
	                                         _mm_shuffle_epi8(lower, widenEven)),
	                                         _mm_shuffle_epi8(lower, widenOdd));
	const __m128i filtered = _mm_sub_epi16(center4, neighbours);
	/* Low 8 bytes: clamped filter results, high 8 bytes: untouched odd samples. */
	const __m128i merged = _mm_packus_epi16(filtered, upperOdd);
	_mm_storeu_si128((__m128i*)pSrcDst, _mm_shuffle_epi8(merged, zip));
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3],
Packit 1fb8d4
                                    const RECTANGLE_16* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT32 oddY = 1;
Packit 1fb8d4
	const UINT32 evenY = 0;
Packit 1fb8d4
	const UINT32 nWidth = roi->right - roi->left;
Packit 1fb8d4
	const UINT32 nHeight = roi->bottom - roi->top;
Packit 1fb8d4
	const UINT32 halfHeight = (nHeight + 1) / 2;
Packit 1fb8d4
	const UINT32 halfWidth = (nWidth + 1) / 2;
Packit 1fb8d4
	const UINT32 halfPad = halfWidth % 16;
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
Packit 1fb8d4
	/* Filter */
Packit 1fb8d4
	for (y = roi->top; y < halfHeight + roi->top; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const UINT32 val2y = (y * 2 + evenY);
Packit 1fb8d4
		const UINT32 val2y1 = val2y + oddY;
Packit 1fb8d4
		BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
Packit 1fb8d4
		BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
Packit 1fb8d4
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
Packit 1fb8d4
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
Packit 1fb8d4
Packit 1fb8d4
		if (val2y1 > nHeight)
Packit 1fb8d4
			continue;
Packit 1fb8d4
Packit 1fb8d4
		for (x = roi->left; x < halfWidth + roi->left - halfPad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			ssse3_filter(&pU[2 * x], &pU1[2 * x]);
Packit 1fb8d4
			ssse3_filter(&pV[2 * x], &pV1[2 * x]);
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (; x < halfWidth + roi->left; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 val2x = (x * 2);
Packit 1fb8d4
			const UINT32 val2x1 = val2x + 1;
Packit 1fb8d4
			const INT32 up = pU[val2x] * 4;
Packit 1fb8d4
			const INT32 vp = pV[val2x] * 4;
Packit 1fb8d4
			INT32 u2020;
Packit 1fb8d4
			INT32 v2020;
Packit 1fb8d4
Packit 1fb8d4
			if (val2x1 > nWidth)
Packit 1fb8d4
				continue;
Packit 1fb8d4
Packit 1fb8d4
			u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
Packit 1fb8d4
			v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
Packit 1fb8d4
			pU[val2x] = CLIP(u2020);
Packit 1fb8d4
			pV[val2x] = CLIP(v2020);
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return PRIMITIVES_SUCCESS;
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3],
Packit 1fb8d4
                                        BYTE* pDstRaw[3], const UINT32 dstStep[3],
Packit 1fb8d4
                                        const RECTANGLE_16* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	const UINT32 mod = 16;
Packit 1fb8d4
	UINT32 uY = 0;
Packit 1fb8d4
	UINT32 vY = 0;
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	const UINT32 nWidth = roi->right - roi->left;
Packit 1fb8d4
	const UINT32 nHeight = roi->bottom - roi->top;
Packit 1fb8d4
	const UINT32 halfWidth = (nWidth + 1) / 2;
Packit 1fb8d4
	const UINT32 halfPad = halfWidth % 16;
Packit 1fb8d4
	const UINT32 halfHeight = (nHeight + 1) / 2;
Packit 1fb8d4
	const UINT32 oddY = 1;
Packit 1fb8d4
	const UINT32 evenY = 0;
Packit 1fb8d4
	const UINT32 oddX = 1;
Packit 1fb8d4
	/* The auxilary frame is aligned to multiples of 16x16.
Packit 1fb8d4
	 * We need the padded height for B4 and B5 conversion. */
Packit 1fb8d4
	const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
Packit 1fb8d4
	const BYTE* pSrc[3] =
Packit 1fb8d4
	{
Packit 1fb8d4
		pSrcRaw[0] + roi->top* srcStep[0] + roi->left,
Packit 1fb8d4
		pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
Packit 1fb8d4
		pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2
Packit 1fb8d4
	};
Packit 1fb8d4
	BYTE* pDst[3] =
Packit 1fb8d4
	{
Packit 1fb8d4
		pDstRaw[0] + roi->top* dstStep[0] + roi->left,
Packit 1fb8d4
		pDstRaw[1] + roi->top* dstStep[1] + roi->left,
Packit 1fb8d4
		pDstRaw[2] + roi->top* dstStep[2] + roi->left
Packit 1fb8d4
	};
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i mask = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
Packit 1fb8d4
	                                  0x80);
Packit 1fb8d4
Packit 1fb8d4
	/* The second half of U and V is a bit more tricky... */
Packit 1fb8d4
	/* B4 and B5 */
Packit 1fb8d4
	for (y = 0; y < padHeigth; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BYTE* Ya = pSrc[0] + srcStep[0] * y;
Packit 1fb8d4
		BYTE* pX;
Packit 1fb8d4
Packit 1fb8d4
		if ((y) % mod < (mod + 1) / 2)
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 pos = (2 * uY++ + oddY);
Packit 1fb8d4
Packit 1fb8d4
			if (pos >= nHeight)
Packit 1fb8d4
				continue;
Packit 1fb8d4
Packit 1fb8d4
			pX = pDst[1] + dstStep[1] * pos;
Packit 1fb8d4
		}
Packit 1fb8d4
		else
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 pos = (2 * vY++ + oddY);
Packit 1fb8d4
Packit 1fb8d4
			if (pos >= nHeight)
Packit 1fb8d4
				continue;
Packit 1fb8d4
Packit 1fb8d4
			pX = pDst[2] + dstStep[2] * pos;
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		memcpy(pX, Ya, nWidth);
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	/* B6 and B7 */
Packit 1fb8d4
	for (y = 0; y < halfHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const UINT32 val2y = (y * 2 + evenY);
Packit 1fb8d4
		const BYTE* Ua = pSrc[1] + srcStep[1] * y;
Packit 1fb8d4
		const BYTE* Va = pSrc[2] + srcStep[2] * y;
Packit 1fb8d4
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
Packit 1fb8d4
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < halfWidth - halfPad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u = _mm_loadu_si128((__m128i*)&Ua[x]);
Packit 1fb8d4
				const __m128i u2 = _mm_unpackhi_epi8(u, zero);
Packit 1fb8d4
				const __m128i u1 = _mm_unpacklo_epi8(u, zero);
Packit 1fb8d4
				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u = _mm_loadu_si128((__m128i*)&Va[x]);
Packit 1fb8d4
				const __m128i u2 = _mm_unpackhi_epi8(u, zero);
Packit 1fb8d4
				const __m128i u1 = _mm_unpacklo_epi8(u, zero);
Packit 1fb8d4
				_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (; x < halfWidth; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 val2x1 = (x * 2 + oddX);
Packit 1fb8d4
			pU[val2x1] = Ua[x];
Packit 1fb8d4
			pV[val2x1] = Va[x];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	/* Filter */
Packit 1fb8d4
	return ssse3_ChromaFilter(pDst, dstStep, roi);
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3],
Packit 1fb8d4
                                        UINT32 nTotalWidth, UINT32 nTotalHeight,
Packit 1fb8d4
                                        BYTE* pDst[3], const UINT32 dstStep[3],
Packit 1fb8d4
                                        const RECTANGLE_16* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	UINT32 x, y;
Packit 1fb8d4
	const UINT32 nWidth = roi->right - roi->left;
Packit 1fb8d4
	const UINT32 nHeight = roi->bottom - roi->top;
Packit 1fb8d4
	const UINT32 halfWidth = (nWidth + 1) / 2;
Packit 1fb8d4
	const UINT32 halfPad = halfWidth % 16;
Packit 1fb8d4
	const UINT32 halfHeight = (nHeight + 1) / 2;
Packit 1fb8d4
	const UINT32 quaterWidth = (nWidth + 3) / 4;
Packit 1fb8d4
	const UINT32 quaterPad = quaterWidth % 16;
Packit 1fb8d4
	const __m128i zero = _mm_setzero_si128();
Packit 1fb8d4
	const __m128i mask = _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
Packit 1fb8d4
	                                  0x80, 0);
Packit 1fb8d4
	const __m128i mask2 = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
Packit 1fb8d4
	                                   0x80);
Packit 1fb8d4
	const __m128i shuffle1 = _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10,
Packit 1fb8d4
	                                      0x80, 9, 0x80, 8);
Packit 1fb8d4
	const __m128i shuffle2 = _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1,
Packit 1fb8d4
	                                      0x80, 0);
Packit 1fb8d4
Packit 1fb8d4
	/* B4 and B5: odd UV values for width/2, height */
Packit 1fb8d4
	for (y = 0; y < nHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const UINT32 yTop = y + roi->top;
Packit 1fb8d4
		const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
Packit 1fb8d4
		const BYTE* pYaV = pYaU + nTotalWidth / 2;
Packit 1fb8d4
		BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
Packit 1fb8d4
		BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < halfWidth - halfPad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i u = _mm_loadu_si128((__m128i*)&pYaU[x]);
Packit 1fb8d4
				const __m128i u2 = _mm_unpackhi_epi8(zero, u);
Packit 1fb8d4
				const __m128i u1 = _mm_unpacklo_epi8(zero, u);
Packit 1fb8d4
				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i v = _mm_loadu_si128((__m128i*)&pYaV[x]);
Packit 1fb8d4
				const __m128i v2 = _mm_unpackhi_epi8(zero, v);
Packit 1fb8d4
				const __m128i v1 = _mm_unpacklo_epi8(zero, v);
Packit 1fb8d4
				_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
Packit 1fb8d4
				_mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (; x < halfWidth; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			const UINT32 odd = 2 * x + 1;
Packit 1fb8d4
			pU[odd] = pYaU[x];
Packit 1fb8d4
			pV[odd] = pYaV[x];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	/* B6 - B9 */
Packit 1fb8d4
	for (y = 0; y < halfHeight; y++)
Packit 1fb8d4
	{
Packit 1fb8d4
		const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
Packit 1fb8d4
		const BYTE* pUaV = pUaU + nTotalWidth / 4;
Packit 1fb8d4
		const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
Packit 1fb8d4
		const BYTE* pVaV = pVaU + nTotalWidth / 4;
Packit 1fb8d4
		BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
Packit 1fb8d4
		BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
Packit 1fb8d4
Packit 1fb8d4
		for (x = 0; x < quaterWidth - quaterPad; x += 16)
Packit 1fb8d4
		{
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i uU = _mm_loadu_si128((__m128i*)&pUaU[x]);
Packit 1fb8d4
				const __m128i uV = _mm_loadu_si128((__m128i*)&pVaU[x]);
Packit 1fb8d4
				const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
Packit 1fb8d4
				const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
Packit 1fb8d4
				const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
Packit 1fb8d4
				const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
Packit 1fb8d4
				const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
Packit 1fb8d4
				const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
Packit 1fb8d4
				_mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
Packit 1fb8d4
				_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
Packit 1fb8d4
			}
Packit 1fb8d4
			{
Packit 1fb8d4
				const __m128i vU = _mm_loadu_si128((__m128i*)&pUaV[x]);
Packit 1fb8d4
				const __m128i vV = _mm_loadu_si128((__m128i*)&pVaV[x]);
Packit 1fb8d4
				const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
Packit 1fb8d4
				const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
Packit 1fb8d4
				const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
Packit 1fb8d4
				const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
Packit 1fb8d4
				const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
Packit 1fb8d4
				const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
Packit 1fb8d4
				_mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
Packit 1fb8d4
				_mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
Packit 1fb8d4
				_mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
Packit 1fb8d4
				_mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
Packit 1fb8d4
			}
Packit 1fb8d4
		}
Packit 1fb8d4
Packit 1fb8d4
		for (; x < quaterWidth; x++)
Packit 1fb8d4
		{
Packit 1fb8d4
			pU[4 * x + 0] = pUaU[x];
Packit 1fb8d4
			pV[4 * x + 0] = pUaV[x];
Packit 1fb8d4
			pU[4 * x + 2] = pVaU[x];
Packit 1fb8d4
			pV[4 * x + 2] = pVaV[x];
Packit 1fb8d4
		}
Packit 1fb8d4
	}
Packit 1fb8d4
Packit 1fb8d4
	return ssse3_ChromaFilter(pDst, dstStep, roi);
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
static pstatus_t ssse3_YUV420CombineToYUV444(
Packit 1fb8d4
    avc444_frame_type type,
Packit 1fb8d4
    const BYTE* pSrc[3], const UINT32 srcStep[3],
Packit 1fb8d4
    UINT32 nWidth, UINT32 nHeight,
Packit 1fb8d4
    BYTE* pDst[3], const UINT32 dstStep[3],
Packit 1fb8d4
    const RECTANGLE_16* roi)
Packit 1fb8d4
{
Packit 1fb8d4
	if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
Packit 1fb8d4
		return -1;
Packit 1fb8d4
Packit 1fb8d4
	if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
Packit 1fb8d4
		return -1;
Packit 1fb8d4
Packit 1fb8d4
	if (!roi)
Packit 1fb8d4
		return -1;
Packit 1fb8d4
Packit 1fb8d4
	switch (type)
Packit 1fb8d4
	{
Packit 1fb8d4
		case AVC444_LUMA:
Packit 1fb8d4
			return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case AVC444_CHROMAv1:
Packit 1fb8d4
			return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		case AVC444_CHROMAv2:
Packit 1fb8d4
			return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
Packit 1fb8d4
Packit 1fb8d4
		default:
Packit 1fb8d4
			return -1;
Packit 1fb8d4
	}
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/**
 * Fill in the YUV primitives of prims.
 *
 * Caches the generic primitives table in the file-level `generic` pointer,
 * installs the defaults through primitives_init_YUV and, if both CPU feature
 * checks succeed, replaces the YUV entries with the SSSE3 implementations of
 * this file.
 *
 * @param prims primitives table to initialise
 */
void primitives_init_YUV_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_YUV(prims);

	/* Keep the defaults unless both feature checks pass. */
	if (!IsProcessorFeaturePresentEx(PF_EX_SSSE3))
		return;

	if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
		return;

	/* YUV -> RGB and stream combination */
	prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
	prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
	prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
	/* RGB -> YUV */
	prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
	prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
	prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
}