|
Packit |
1fb8d4 |
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized Color conversion operations.
 * vi:ts=4 sw=4:
 *
 * Copyright 2011 Stephen Erisman
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifdef HAVE_CONFIG_H
|
|
Packit |
1fb8d4 |
#include "config.h"
|
|
Packit |
1fb8d4 |
#endif
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#include <freerdp/types.h>
|
|
Packit |
1fb8d4 |
#include <freerdp/primitives.h>
|
|
Packit |
1fb8d4 |
#include <winpr/sysinfo.h>
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifdef WITH_SSE2
|
|
Packit |
1fb8d4 |
#include <emmintrin.h>
|
|
Packit |
1fb8d4 |
#elif defined(WITH_NEON)
|
|
Packit |
1fb8d4 |
#include <arm_neon.h>
|
|
Packit |
1fb8d4 |
#endif /* WITH_SSE2 else WITH_NEON */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#include "prim_internal.h"
|
|
Packit |
1fb8d4 |
#include "prim_templates.h"
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Table of generic primitives that the SSE2 routines in this file call when
 * their alignment or format requirements are not met. It starts out NULL;
 * the code that assigns it is not in this part of the file. */
static primitives_t* generic = NULL;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifdef WITH_SSE2
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* GNU_INLINE: attribute list placed on inline helpers such as
 * _mm_prefetch_buffer. Compilers that define __GNUC__ get
 * gnu_inline + always_inline + artificial; all others get an empty expansion.
 */
#if defined(__GNUC__)
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Cache line size, in bytes, used as the stride between prefetch hints. */
#define CACHE_LINE_BYTES 64

/* Clamp every signed 16-bit lane of _val to the range [_min, _max] and store
 * the result back into _val (which must therefore be a modifiable lvalue).
 * Arguments are parenthesized so that arbitrary expressions can be passed.
 */
#define _mm_between_epi16(_val, _min, _max)                            \
	do                                                                 \
	{                                                                  \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
	} while (0)
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifdef DO_PREFETCH
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
/* Issue one non-temporal prefetch hint (_MM_HINT_NTA) per cache line of a
 * buffer.
 *
 * buffer     start of the region to prefetch
 * num_bytes  length of the region in bytes; values <= 0 prefetch nothing
 *
 * The length is rounded down to whole 16-byte vectors, and hints are issued
 * every CACHE_LINE_BYTES from the start of the buffer.
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
    char* buffer,
    int num_bytes)
{
	const size_t vectorsPerLine = CACHE_LINE_BYTES / sizeof(__m128i);
	const __m128i* vec = (const __m128i*) buffer;
	size_t vectorCount;
	size_t idx;

	/* A negative length would otherwise be converted to a huge unsigned
	 * count in the loop bound below. */
	if (num_bytes <= 0)
		return;

	vectorCount = (size_t) num_bytes / sizeof(__m128i);

	for (idx = 0; idx < vectorCount; idx += vectorsPerLine)
	{
		_mm_prefetch((const char*)(&vec[idx]), _MM_HINT_NTA);
	}
}
|
|
Packit |
1fb8d4 |
#endif /* DO_PREFETCH */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3],
|
|
Packit |
1fb8d4 |
int srcStep,
|
|
Packit |
1fb8d4 |
INT16* pDst[3],
|
|
Packit |
1fb8d4 |
int dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
|
|
Packit |
1fb8d4 |
__m128i* y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
|
|
Packit |
1fb8d4 |
int srcbump, dstbump, yp, imax;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
if (((ULONG_PTR)(pSrc[0]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[1]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[2]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[0]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[1]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[2]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| (roi->width & 0x07)
|
|
Packit |
1fb8d4 |
|| (srcStep & 127)
|
|
Packit |
1fb8d4 |
|| (dstStep & 127))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* We can't maintain 16-byte alignment. */
|
|
Packit |
1fb8d4 |
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
|
|
Packit |
1fb8d4 |
pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
zero = _mm_setzero_si128();
|
|
Packit |
1fb8d4 |
max = _mm_set1_epi16(255);
|
|
Packit |
1fb8d4 |
y_buf = (__m128i*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
cb_buf = (__m128i*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
cr_buf = (__m128i*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
r_buf = (__m128i*)(pDst[0]);
|
|
Packit |
1fb8d4 |
g_buf = (__m128i*)(pDst[1]);
|
|
Packit |
1fb8d4 |
b_buf = (__m128i*)(pDst[2]);
|
|
Packit |
1fb8d4 |
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
|
|
Packit |
1fb8d4 |
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
|
|
Packit |
1fb8d4 |
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
|
|
Packit |
1fb8d4 |
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
|
|
Packit |
1fb8d4 |
c4096 = _mm_set1_epi16(4096);
|
|
Packit |
1fb8d4 |
srcbump = srcStep / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
dstbump = dstStep / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
#ifdef DO_PREFETCH
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Prefetch Y's, Cb's, and Cr's. */
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; yp++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcbump;
|
|
Packit |
1fb8d4 |
cb_buf += srcbump;
|
|
Packit |
1fb8d4 |
cr_buf += srcbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf = (__m128i*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
cb_buf = (__m128i*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
cr_buf = (__m128i*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
#endif /* DO_PREFETCH */
|
|
Packit |
1fb8d4 |
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; ++yp)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax; i++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* In order to use SSE2 signed 16-bit integer multiplication
|
|
Packit |
1fb8d4 |
* we need to convert the floating point factors to signed int
|
|
Packit |
1fb8d4 |
* without losing information.
|
|
Packit |
1fb8d4 |
* The result of this multiplication is 32 bit and we have two
|
|
Packit |
1fb8d4 |
* SSE instructions that return either the hi or lo word.
|
|
Packit |
1fb8d4 |
* Thus we will multiply the factors by the highest possible 2^n,
|
|
Packit |
1fb8d4 |
* take the upper 16 bits of the signed 32-bit result
|
|
Packit |
1fb8d4 |
* (_mm_mulhi_epi16) and correct this result by multiplying
|
|
Packit |
1fb8d4 |
* it by 2^(16-n).
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* For the given factors in the conversion matrix the best
|
|
Packit |
1fb8d4 |
* possible n is 14.
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* Example for calculating r:
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
|
Packit |
1fb8d4 |
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
|
Packit |
1fb8d4 |
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
/* y = (y_r_buf[i] + 4096) >> 2 */
|
|
Packit |
1fb8d4 |
__m128i y, cb, cr, r, g, b;
|
|
Packit |
1fb8d4 |
y = _mm_load_si128(y_buf + i);
|
|
Packit |
1fb8d4 |
y = _mm_add_epi16(y, c4096);
|
|
Packit |
1fb8d4 |
y = _mm_srai_epi16(y, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_g_buf[i]; */
|
|
Packit |
1fb8d4 |
cb = _mm_load_si128(cb_buf + i);
|
|
Packit |
1fb8d4 |
/* cr = cr_b_buf[i]; */
|
|
Packit |
1fb8d4 |
cr = _mm_load_si128(cr_buf + i);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
|
|
Packit |
1fb8d4 |
r = _mm_srai_epi16(r, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(r, zero, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(r_buf + i, r);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
|
|
Packit |
1fb8d4 |
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
|
|
Packit |
1fb8d4 |
g = _mm_srai_epi16(g, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(g, zero, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(g_buf + i, g);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
|
|
Packit |
1fb8d4 |
b = _mm_srai_epi16(b, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(b, zero, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(b_buf + i, b);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcbump;
|
|
Packit |
1fb8d4 |
cb_buf += srcbump;
|
|
Packit |
1fb8d4 |
cr_buf += srcbump;
|
|
Packit |
1fb8d4 |
r_buf += dstbump;
|
|
Packit |
1fb8d4 |
g_buf += dstbump;
|
|
Packit |
1fb8d4 |
b_buf += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], UINT32 srcStep,
|
|
Packit |
1fb8d4 |
BYTE* pDst, UINT32 dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i zero = _mm_setzero_si128();
|
|
Packit |
1fb8d4 |
const __m128i max = _mm_set1_epi16(255);
|
|
Packit |
1fb8d4 |
const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i c4096 = _mm_set1_epi16(4096);
|
|
Packit |
1fb8d4 |
const INT16* y_buf = (INT16*)pSrc[0];
|
|
Packit |
1fb8d4 |
const INT16* cb_buf = (INT16*)pSrc[1];
|
|
Packit |
1fb8d4 |
const INT16* cr_buf = (INT16*)pSrc[2];
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
const UINT32 step = sizeof(__m128i) / sizeof(INT16);
|
|
Packit |
1fb8d4 |
const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
BYTE* d_buf = pDst;
|
|
Packit |
1fb8d4 |
int yp;
|
|
Packit |
1fb8d4 |
const size_t dstPad = (dstStep - roi->width * 4);
|
|
Packit |
1fb8d4 |
#ifdef DO_PREFETCH
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Prefetch Y's, Cb's, and Cr's. */
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; yp++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax;
|
|
Packit |
1fb8d4 |
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
cb_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
cr_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf = (INT16*)pSrc[0];
|
|
Packit |
1fb8d4 |
cb_buf = (INT16*)pSrc[1];
|
|
Packit |
1fb8d4 |
cr_buf = (INT16*)pSrc[2];
|
|
Packit |
1fb8d4 |
#endif /* DO_PREFETCH */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; ++yp)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax; i += 2)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* In order to use SSE2 signed 16-bit integer multiplication
|
|
Packit |
1fb8d4 |
* we need to convert the floating point factors to signed int
|
|
Packit |
1fb8d4 |
* without losing information.
|
|
Packit |
1fb8d4 |
* The result of this multiplication is 32 bit and we have two
|
|
Packit |
1fb8d4 |
* SSE instructions that return either the hi or lo word.
|
|
Packit |
1fb8d4 |
* Thus we will multiply the factors by the highest possible 2^n,
|
|
Packit |
1fb8d4 |
* take the upper 16 bits of the signed 32-bit result
|
|
Packit |
1fb8d4 |
* (_mm_mulhi_epi16) and correct this result by multiplying
|
|
Packit |
1fb8d4 |
* it by 2^(16-n).
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* For the given factors in the conversion matrix the best
|
|
Packit |
1fb8d4 |
* possible n is 14.
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* Example for calculating r:
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
|
Packit |
1fb8d4 |
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
|
Packit |
1fb8d4 |
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
/* y = (y_r_buf[i] + 4096) >> 2 */
|
|
Packit |
1fb8d4 |
__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
|
|
Packit |
1fb8d4 |
y1 = _mm_load_si128((__m128i*)y_buf);
|
|
Packit |
1fb8d4 |
y_buf += step;
|
|
Packit |
1fb8d4 |
y1 = _mm_add_epi16(y1, c4096);
|
|
Packit |
1fb8d4 |
y1 = _mm_srai_epi16(y1, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_g_buf[i]; */
|
|
Packit |
1fb8d4 |
cb1 = _mm_load_si128((__m128i*)cb_buf);
|
|
Packit |
1fb8d4 |
cb_buf += step;
|
|
Packit |
1fb8d4 |
/* cr = cr_b_buf[i]; */
|
|
Packit |
1fb8d4 |
cr1 = _mm_load_si128((__m128i*)cr_buf);
|
|
Packit |
1fb8d4 |
cr_buf += step;
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
|
|
Packit |
1fb8d4 |
r1 = _mm_srai_epi16(r1, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(r1, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
|
|
Packit |
1fb8d4 |
g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
|
|
Packit |
1fb8d4 |
g1 = _mm_srai_epi16(g1, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(g1, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
|
|
Packit |
1fb8d4 |
b1 = _mm_srai_epi16(b1, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(b1, zero, max);
|
|
Packit |
1fb8d4 |
y2 = _mm_load_si128((__m128i*)y_buf);
|
|
Packit |
1fb8d4 |
y_buf += step;
|
|
Packit |
1fb8d4 |
y2 = _mm_add_epi16(y2, c4096);
|
|
Packit |
1fb8d4 |
y2 = _mm_srai_epi16(y2, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_g_buf[i]; */
|
|
Packit |
1fb8d4 |
cb2 = _mm_load_si128((__m128i*)cb_buf);
|
|
Packit |
1fb8d4 |
cb_buf += step;
|
|
Packit |
1fb8d4 |
/* cr = cr_b_buf[i]; */
|
|
Packit |
1fb8d4 |
cr2 = _mm_load_si128((__m128i*)cr_buf);
|
|
Packit |
1fb8d4 |
cr_buf += step;
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
|
|
Packit |
1fb8d4 |
r2 = _mm_srai_epi16(r2, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(r2, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
|
|
Packit |
1fb8d4 |
g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
|
|
Packit |
1fb8d4 |
g2 = _mm_srai_epi16(g2, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(g2, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
|
|
Packit |
1fb8d4 |
b2 = _mm_srai_epi16(b2, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(b2, zero, max);
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1, R2, R3, R4;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
R0 = b1; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R1 = b2; /* R1 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
R1 = g1; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R2 = g2; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */
|
|
Packit |
1fb8d4 |
R0 = r1; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R3 = r2; /* R3 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
|
|
Packit |
1fb8d4 |
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
|
|
Packit |
1fb8d4 |
R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */
|
|
Packit |
1fb8d4 |
R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */
|
|
Packit |
1fb8d4 |
R0 = R4; /* R0 = R4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */
|
|
Packit |
1fb8d4 |
R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */
|
|
Packit |
1fb8d4 |
R2 = R3; /* R2 = R3 */
|
|
Packit |
1fb8d4 |
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
|
|
Packit |
1fb8d4 |
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < pad; i++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const INT32 divisor = 16;
|
|
Packit |
1fb8d4 |
const INT32 Y = ((*y_buf++) + 4096) << divisor;
|
|
Packit |
1fb8d4 |
const INT32 Cb = (*cb_buf++);
|
|
Packit |
1fb8d4 |
const INT32 Cr = (*cr_buf++);
|
|
Packit |
1fb8d4 |
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(B);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(G);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(R);
|
|
Packit |
1fb8d4 |
*d_buf++ = 0xFF;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
d_buf += dstPad;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], UINT32 srcStep,
|
|
Packit |
1fb8d4 |
BYTE* pDst, UINT32 dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i zero = _mm_setzero_si128();
|
|
Packit |
1fb8d4 |
const __m128i max = _mm_set1_epi16(255);
|
|
Packit |
1fb8d4 |
const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
|
|
Packit |
1fb8d4 |
const __m128i c4096 = _mm_set1_epi16(4096);
|
|
Packit |
1fb8d4 |
const INT16* y_buf = (INT16*)pSrc[0];
|
|
Packit |
1fb8d4 |
const INT16* cb_buf = (INT16*)pSrc[1];
|
|
Packit |
1fb8d4 |
const INT16* cr_buf = (INT16*)pSrc[2];
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
const UINT32 step = sizeof(__m128i) / sizeof(INT16);
|
|
Packit |
1fb8d4 |
const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
BYTE* d_buf = pDst;
|
|
Packit |
1fb8d4 |
int yp;
|
|
Packit |
1fb8d4 |
const size_t dstPad = (dstStep - roi->width * 4);
|
|
Packit |
1fb8d4 |
#ifdef DO_PREFETCH
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Prefetch Y's, Cb's, and Cr's. */
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; yp++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax;
|
|
Packit |
1fb8d4 |
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
cb_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
cr_buf += srcStep / sizeof(INT16);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf = (INT16*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
cb_buf = (INT16*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
cr_buf = (INT16*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
#endif /* DO_PREFETCH */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; ++yp)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax; i += 2)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* In order to use SSE2 signed 16-bit integer multiplication
|
|
Packit |
1fb8d4 |
* we need to convert the floating point factors to signed int
|
|
Packit |
1fb8d4 |
* without losing information.
|
|
Packit |
1fb8d4 |
* The result of this multiplication is 32 bit and we have two
|
|
Packit |
1fb8d4 |
* SSE instructions that return either the hi or lo word.
|
|
Packit |
1fb8d4 |
* Thus we will multiply the factors by the highest possible 2^n,
|
|
Packit |
1fb8d4 |
* take the upper 16 bits of the signed 32-bit result
|
|
Packit |
1fb8d4 |
* (_mm_mulhi_epi16) and correct this result by multiplying
|
|
Packit |
1fb8d4 |
* it by 2^(16-n).
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* For the given factors in the conversion matrix the best
|
|
Packit |
1fb8d4 |
* possible n is 14.
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* Example for calculating r:
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
|
Packit |
1fb8d4 |
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
|
Packit |
1fb8d4 |
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
|
Packit |
1fb8d4 |
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
/* y = (y_r_buf[i] + 4096) >> 2 */
|
|
Packit |
1fb8d4 |
__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
|
|
Packit |
1fb8d4 |
y1 = _mm_load_si128((__m128i*)y_buf);
|
|
Packit |
1fb8d4 |
y_buf += step;
|
|
Packit |
1fb8d4 |
y1 = _mm_add_epi16(y1, c4096);
|
|
Packit |
1fb8d4 |
y1 = _mm_srai_epi16(y1, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_g_buf[i]; */
|
|
Packit |
1fb8d4 |
cb1 = _mm_load_si128((__m128i*)cb_buf);
|
|
Packit |
1fb8d4 |
cb_buf += step;
|
|
Packit |
1fb8d4 |
/* cr = cr_b_buf[i]; */
|
|
Packit |
1fb8d4 |
cr1 = _mm_load_si128((__m128i*)cr_buf);
|
|
Packit |
1fb8d4 |
cr_buf += step;
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
|
|
Packit |
1fb8d4 |
r1 = _mm_srai_epi16(r1, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(r1, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
|
|
Packit |
1fb8d4 |
g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
|
|
Packit |
1fb8d4 |
g1 = _mm_srai_epi16(g1, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(g1, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
|
|
Packit |
1fb8d4 |
b1 = _mm_srai_epi16(b1, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(b1, zero, max);
|
|
Packit |
1fb8d4 |
y2 = _mm_load_si128((__m128i*)y_buf);
|
|
Packit |
1fb8d4 |
y_buf += step;
|
|
Packit |
1fb8d4 |
y2 = _mm_add_epi16(y2, c4096);
|
|
Packit |
1fb8d4 |
y2 = _mm_srai_epi16(y2, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_g_buf[i]; */
|
|
Packit |
1fb8d4 |
cb2 = _mm_load_si128((__m128i*)cb_buf);
|
|
Packit |
1fb8d4 |
cb_buf += step;
|
|
Packit |
1fb8d4 |
/* cr = cr_b_buf[i]; */
|
|
Packit |
1fb8d4 |
cr2 = _mm_load_si128((__m128i*)cr_buf);
|
|
Packit |
1fb8d4 |
cr_buf += step;
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
|
|
Packit |
1fb8d4 |
r2 = _mm_srai_epi16(r2, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(r2, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
|
|
Packit |
1fb8d4 |
g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
|
|
Packit |
1fb8d4 |
g2 = _mm_srai_epi16(g2, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(g2, zero, max);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
|
|
Packit |
1fb8d4 |
b2 = _mm_srai_epi16(b2, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(b2, zero, max);
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1, R2, R3, R4;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
R0 = r1; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R1 = r2; /* R1 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
R1 = g1; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R2 = g2; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */
|
|
Packit |
1fb8d4 |
R0 = b1; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R3 = b2; /* R3 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
|
|
Packit |
1fb8d4 |
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
|
|
Packit |
1fb8d4 |
R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */
|
|
Packit |
1fb8d4 |
R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */
|
|
Packit |
1fb8d4 |
R0 = R4; /* R0 = R4 */
|
|
Packit |
1fb8d4 |
R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */
|
|
Packit |
1fb8d4 |
R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */
|
|
Packit |
1fb8d4 |
R2 = R3; /* R2 = R3 */
|
|
Packit |
1fb8d4 |
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
|
|
Packit |
1fb8d4 |
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
|
|
Packit |
1fb8d4 |
d_buf += sizeof(__m128i);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < pad; i++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const INT32 divisor = 16;
|
|
Packit |
1fb8d4 |
const INT32 Y = ((*y_buf++) + 4096) << divisor;
|
|
Packit |
1fb8d4 |
const INT32 Cb = (*cb_buf++);
|
|
Packit |
1fb8d4 |
const INT32 Cr = (*cr_buf++);
|
|
Packit |
1fb8d4 |
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(R);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(G);
|
|
Packit |
1fb8d4 |
*d_buf++ = CLIP(B);
|
|
Packit |
1fb8d4 |
*d_buf++ = 0xFF;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
d_buf += dstPad;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], UINT32 srcStep,
|
|
Packit |
1fb8d4 |
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
if (((ULONG_PTR)(pSrc[0]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[1]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[2]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst) & 0x0f)
|
|
Packit |
1fb8d4 |
|| (srcStep & 0x0f)
|
|
Packit |
1fb8d4 |
|| (dstStep & 0x0f))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* We can't maintain 16-byte alignment. */
|
|
Packit |
1fb8d4 |
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep,
|
|
Packit |
1fb8d4 |
pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
switch (DstFormat)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRX32:
|
|
Packit |
1fb8d4 |
return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBX32:
|
|
Packit |
1fb8d4 |
return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
default:
|
|
Packit |
1fb8d4 |
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
|
|
Packit |
1fb8d4 |
* numbers. See the general code above.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3],
|
|
Packit |
1fb8d4 |
int srcStep,
|
|
Packit |
1fb8d4 |
INT16* pDst[3],
|
|
Packit |
1fb8d4 |
int dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
|
|
Packit |
1fb8d4 |
__m128i* r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
|
|
Packit |
1fb8d4 |
int srcbump, dstbump, yp, imax;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
if (((ULONG_PTR)(pSrc[0]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[1]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pSrc[2]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[0]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[1]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| ((ULONG_PTR)(pDst[2]) & 0x0f)
|
|
Packit |
1fb8d4 |
|| (roi->width & 0x07)
|
|
Packit |
1fb8d4 |
|| (srcStep & 127)
|
|
Packit |
1fb8d4 |
|| (dstStep & 127))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* We can't maintain 16-byte alignment. */
|
|
Packit |
1fb8d4 |
return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
|
|
Packit |
1fb8d4 |
pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
min = _mm_set1_epi16(-128 * 32);
|
|
Packit |
1fb8d4 |
max = _mm_set1_epi16(127 * 32);
|
|
Packit |
1fb8d4 |
r_buf = (__m128i*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
g_buf = (__m128i*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
b_buf = (__m128i*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
y_buf = (__m128i*)(pDst[0]);
|
|
Packit |
1fb8d4 |
cb_buf = (__m128i*)(pDst[1]);
|
|
Packit |
1fb8d4 |
cr_buf = (__m128i*)(pDst[2]);
|
|
Packit |
1fb8d4 |
y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
|
|
Packit |
1fb8d4 |
y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
|
|
Packit |
1fb8d4 |
y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
|
|
Packit |
1fb8d4 |
cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
|
|
Packit |
1fb8d4 |
cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
|
|
Packit |
1fb8d4 |
cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
|
|
Packit |
1fb8d4 |
cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
|
|
Packit |
1fb8d4 |
cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
|
|
Packit |
1fb8d4 |
cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
|
|
Packit |
1fb8d4 |
srcbump = srcStep / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
dstbump = dstStep / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
#ifdef DO_PREFETCH
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Prefetch RGB's. */
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; yp++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
r_buf += srcbump;
|
|
Packit |
1fb8d4 |
g_buf += srcbump;
|
|
Packit |
1fb8d4 |
b_buf += srcbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
r_buf = (__m128i*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
g_buf = (__m128i*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
b_buf = (__m128i*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
#endif /* DO_PREFETCH */
|
|
Packit |
1fb8d4 |
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; ++yp)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax; i++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* In order to use SSE2 signed 16-bit integer multiplication we
|
|
Packit |
1fb8d4 |
* need to convert the floating point factors to signed int
|
|
Packit |
1fb8d4 |
* without loosing information. The result of this multiplication
|
|
Packit |
1fb8d4 |
* is 32 bit and using SSE2 we get either the product's hi or lo
|
|
Packit |
1fb8d4 |
* word. Thus we will multiply the factors by the highest
|
|
Packit |
1fb8d4 |
* possible 2^n and take the upper 16 bits of the signed 32-bit
|
|
Packit |
1fb8d4 |
* result (_mm_mulhi_epi16). Since the final result needs to
|
|
Packit |
1fb8d4 |
* be scaled by << 5 and also in in order to keep the precision
|
|
Packit |
1fb8d4 |
* within the upper 16 bits we will also have to scale the RGB
|
|
Packit |
1fb8d4 |
* values used in the multiplication by << 5+(16-n).
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
__m128i r, g, b, y, cb, cr;
|
|
Packit |
1fb8d4 |
r = _mm_load_si128(y_buf + i);
|
|
Packit |
1fb8d4 |
g = _mm_load_si128(g_buf + i);
|
|
Packit |
1fb8d4 |
b = _mm_load_si128(b_buf + i);
|
|
Packit |
1fb8d4 |
/* r<<6; g<<6; b<<6 */
|
|
Packit |
1fb8d4 |
r = _mm_slli_epi16(r, 6);
|
|
Packit |
1fb8d4 |
g = _mm_slli_epi16(g, 6);
|
|
Packit |
1fb8d4 |
b = _mm_slli_epi16(b, 6);
|
|
Packit |
1fb8d4 |
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
|
|
Packit |
1fb8d4 |
y = _mm_mulhi_epi16(r, y_r);
|
|
Packit |
1fb8d4 |
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
|
|
Packit |
1fb8d4 |
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
|
|
Packit |
1fb8d4 |
y = _mm_add_epi16(y, min);
|
|
Packit |
1fb8d4 |
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(y, min, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(y_buf + i, y);
|
|
Packit |
1fb8d4 |
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
|
|
Packit |
1fb8d4 |
cb = _mm_mulhi_epi16(r, cb_r);
|
|
Packit |
1fb8d4 |
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
|
|
Packit |
1fb8d4 |
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
|
|
Packit |
1fb8d4 |
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(cb, min, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(cb_buf + i, cb);
|
|
Packit |
1fb8d4 |
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
|
|
Packit |
1fb8d4 |
cr = _mm_mulhi_epi16(r, cr_r);
|
|
Packit |
1fb8d4 |
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
|
|
Packit |
1fb8d4 |
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
|
|
Packit |
1fb8d4 |
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
|
|
Packit |
1fb8d4 |
_mm_between_epi16(cr, min, max);
|
|
Packit |
1fb8d4 |
_mm_store_si128(cr_buf + i, cr);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcbump;
|
|
Packit |
1fb8d4 |
cb_buf += srcbump;
|
|
Packit |
1fb8d4 |
cr_buf += srcbump;
|
|
Packit |
1fb8d4 |
r_buf += dstbump;
|
|
Packit |
1fb8d4 |
g_buf += dstbump;
|
|
Packit |
1fb8d4 |
b_buf += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const UINT16* pr = (const UINT16*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
const UINT16* pg = (const UINT16*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
const UINT16* pb = (const UINT16*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
|
|
Packit |
1fb8d4 |
BYTE* out;
|
|
Packit |
1fb8d4 |
UINT32 srcbump, dstbump, y;
|
|
Packit |
1fb8d4 |
out = (BYTE*) pDst;
|
|
Packit |
1fb8d4 |
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
|
Packit |
1fb8d4 |
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; ++y)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 16)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i r, g, b;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R1 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R3 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i gbHi, gbLo, arHi, arLo;
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
gbLo = _mm_unpacklo_epi8(b, g); /* R0 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
gbHi = _mm_unpackhi_epi8(b, g); /* R1 = G7B7G6B7G5B5G4B4 */
|
|
Packit |
1fb8d4 |
arLo = _mm_unpacklo_epi8(r, a); /* R4 = FFR3FFR2FFR1FFR0 */
|
|
Packit |
1fb8d4 |
arHi = _mm_unpackhi_epi8(r, a); /* R3 = FFR7FFR6FFR5FFR4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR1G1B1FFR0G0B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR3G3B3FFR2G2B2 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR5G5B5FFR4G4B4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR7G7B7FFR6G6B6 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const BYTE R = CLIP(*pr++);
|
|
Packit |
1fb8d4 |
const BYTE G = CLIP(*pg++);
|
|
Packit |
1fb8d4 |
const BYTE B = CLIP(*pb++);
|
|
Packit |
1fb8d4 |
*out++ = B;
|
|
Packit |
1fb8d4 |
*out++ = G;
|
|
Packit |
1fb8d4 |
*out++ = R;
|
|
Packit |
1fb8d4 |
*out++ = 0xFF;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Jump to next row. */
|
|
Packit |
1fb8d4 |
pr += srcbump;
|
|
Packit |
1fb8d4 |
pg += srcbump;
|
|
Packit |
1fb8d4 |
pb += srcbump;
|
|
Packit |
1fb8d4 |
out += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const UINT16* pr = (const UINT16*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
const UINT16* pg = (const UINT16*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
const UINT16* pb = (const UINT16*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
|
|
Packit |
1fb8d4 |
BYTE* out;
|
|
Packit |
1fb8d4 |
UINT32 srcbump, dstbump, y;
|
|
Packit |
1fb8d4 |
out = (BYTE*) pDst;
|
|
Packit |
1fb8d4 |
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
|
Packit |
1fb8d4 |
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; ++y)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 16)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i r, g, b;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R1 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R3 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i gbHi, gbLo, arHi, arLo;
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
gbLo = _mm_unpacklo_epi8(r, g); /* R0 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
gbHi = _mm_unpackhi_epi8(r, g); /* R1 = G7B7G6B7G5B5G4B4 */
|
|
Packit |
1fb8d4 |
arLo = _mm_unpacklo_epi8(b, a); /* R4 = FFR3FFR2FFR1FFR0 */
|
|
Packit |
1fb8d4 |
arHi = _mm_unpackhi_epi8(b, a); /* R3 = FFR7FFR6FFR5FFR4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR1G1B1FFR0G0B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR3G3B3FFR2G2B2 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR5G5B5FFR4G4B4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR7G7B7FFR6G6B6 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const BYTE R = CLIP(*pr++);
|
|
Packit |
1fb8d4 |
const BYTE G = CLIP(*pg++);
|
|
Packit |
1fb8d4 |
const BYTE B = CLIP(*pb++);
|
|
Packit |
1fb8d4 |
*out++ = R;
|
|
Packit |
1fb8d4 |
*out++ = G;
|
|
Packit |
1fb8d4 |
*out++ = B;
|
|
Packit |
1fb8d4 |
*out++ = 0xFF;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Jump to next row. */
|
|
Packit |
1fb8d4 |
pr += srcbump;
|
|
Packit |
1fb8d4 |
pg += srcbump;
|
|
Packit |
1fb8d4 |
pb += srcbump;
|
|
Packit |
1fb8d4 |
out += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const UINT16* pr = (const UINT16*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
const UINT16* pg = (const UINT16*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
const UINT16* pb = (const UINT16*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
|
|
Packit |
1fb8d4 |
BYTE* out;
|
|
Packit |
1fb8d4 |
UINT32 srcbump, dstbump, y;
|
|
Packit |
1fb8d4 |
out = (BYTE*) pDst;
|
|
Packit |
1fb8d4 |
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
|
Packit |
1fb8d4 |
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; ++y)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 16)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i r, g, b;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R1 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R3 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i gbHi, gbLo, arHi, arLo;
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
gbLo = _mm_unpacklo_epi8(a, b); /* R0 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
gbHi = _mm_unpackhi_epi8(a, b); /* R1 = G7B7G6B7G5B5G4B4 */
|
|
Packit |
1fb8d4 |
arLo = _mm_unpacklo_epi8(g, r); /* R4 = FFR3FFR2FFR1FFR0 */
|
|
Packit |
1fb8d4 |
arHi = _mm_unpackhi_epi8(g, r); /* R3 = FFR7FFR6FFR5FFR4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR1G1B1FFR0G0B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR3G3B3FFR2G2B2 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR5G5B5FFR4G4B4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR7G7B7FFR6G6B6 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const BYTE R = CLIP(*pr++);
|
|
Packit |
1fb8d4 |
const BYTE G = CLIP(*pg++);
|
|
Packit |
1fb8d4 |
const BYTE B = CLIP(*pb++);
|
|
Packit |
1fb8d4 |
*out++ = 0xFF;
|
|
Packit |
1fb8d4 |
*out++ = B;
|
|
Packit |
1fb8d4 |
*out++ = G;
|
|
Packit |
1fb8d4 |
*out++ = R;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Jump to next row. */
|
|
Packit |
1fb8d4 |
pr += srcbump;
|
|
Packit |
1fb8d4 |
pg += srcbump;
|
|
Packit |
1fb8d4 |
pb += srcbump;
|
|
Packit |
1fb8d4 |
out += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const UINT16* pr = (const UINT16*)(pSrc[0]);
|
|
Packit |
1fb8d4 |
const UINT16* pg = (const UINT16*)(pSrc[1]);
|
|
Packit |
1fb8d4 |
const UINT16* pb = (const UINT16*)(pSrc[2]);
|
|
Packit |
1fb8d4 |
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
|
|
Packit |
1fb8d4 |
const UINT32 pad = roi->width % 16;
|
|
Packit |
1fb8d4 |
BYTE* out;
|
|
Packit |
1fb8d4 |
UINT32 srcbump, dstbump, y;
|
|
Packit |
1fb8d4 |
out = (BYTE*) pDst;
|
|
Packit |
1fb8d4 |
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
|
Packit |
1fb8d4 |
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; ++y)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 16)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i r, g, b;
|
|
Packit |
1fb8d4 |
/* The comments below pretend these are 8-byte registers
|
|
Packit |
1fb8d4 |
* rather than 16-byte, for readability.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R0 = 00B300B200B100B0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pb);
|
|
Packit |
1fb8d4 |
pb += 8; /* R1 = 00B700B600B500B4 */
|
|
Packit |
1fb8d4 |
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R1 = 00G300G200G100G0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pg);
|
|
Packit |
1fb8d4 |
pg += 8; /* R2 = 00G700G600G500G4 */
|
|
Packit |
1fb8d4 |
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i R0, R1;
|
|
Packit |
1fb8d4 |
R0 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R0 = 00R300R200R100R0 */
|
|
Packit |
1fb8d4 |
R1 = _mm_load_si128((__m128i*)pr);
|
|
Packit |
1fb8d4 |
pr += 8; /* R3 = 00R700R600R500R4 */
|
|
Packit |
1fb8d4 |
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
__m128i gbHi, gbLo, arHi, arLo;
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
gbLo = _mm_unpacklo_epi8(a, r); /* R0 = G7G6G5G4G3G2G1G0 */
|
|
Packit |
1fb8d4 |
gbHi = _mm_unpackhi_epi8(a, r); /* R1 = G7B7G6B7G5B5G4B4 */
|
|
Packit |
1fb8d4 |
arLo = _mm_unpacklo_epi8(g, b); /* R4 = FFR3FFR2FFR1FFR0 */
|
|
Packit |
1fb8d4 |
arHi = _mm_unpackhi_epi8(g, b); /* R3 = FFR7FFR6FFR5FFR4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR1G1B1FFR0G0B0 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR3G3B3FFR2G2B2 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR5G5B5FFR4G4B4 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
|
|
Packit |
1fb8d4 |
_mm_store_si128((__m128i*)out, bgrx);
|
|
Packit |
1fb8d4 |
out += 16; /* FFR7G7B7FFR6G6B6 */
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const BYTE R = CLIP(*pr++);
|
|
Packit |
1fb8d4 |
const BYTE G = CLIP(*pg++);
|
|
Packit |
1fb8d4 |
const BYTE B = CLIP(*pb++);
|
|
Packit |
1fb8d4 |
*out++ = 0xFF;
|
|
Packit |
1fb8d4 |
*out++ = R;
|
|
Packit |
1fb8d4 |
*out++ = G;
|
|
Packit |
1fb8d4 |
*out++ = B;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Jump to next row. */
|
|
Packit |
1fb8d4 |
pr += srcbump;
|
|
Packit |
1fb8d4 |
pg += srcbump;
|
|
Packit |
1fb8d4 |
pb += srcbump;
|
|
Packit |
1fb8d4 |
out += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
UINT32 DstFormat,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
|
|
Packit |
1fb8d4 |
(srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
|
|
Packit |
1fb8d4 |
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
switch (DstFormat)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRX32:
|
|
Packit |
1fb8d4 |
return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBX32:
|
|
Packit |
1fb8d4 |
return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ABGR32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XBGR32:
|
|
Packit |
1fb8d4 |
return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ARGB32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XRGB32:
|
|
Packit |
1fb8d4 |
return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
default:
|
|
Packit |
1fb8d4 |
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
#endif /* WITH_SSE2 */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
1fb8d4 |
#ifdef WITH_NEON
|
|
Packit |
1fb8d4 |
static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], INT32 srcStep,
|
|
Packit |
1fb8d4 |
INT16* pDst[3], INT32 dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* TODO: If necessary, check alignments and call the general version. */
|
|
Packit |
1fb8d4 |
int16x8_t zero = vdupq_n_s16(0);
|
|
Packit |
1fb8d4 |
int16x8_t max = vdupq_n_s16(255);
|
|
Packit |
1fb8d4 |
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
|
|
Packit |
1fb8d4 |
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
|
|
Packit |
1fb8d4 |
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
|
|
Packit |
1fb8d4 |
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
|
|
Packit |
1fb8d4 |
int16x8_t c4096 = vdupq_n_s16(4096);
|
|
Packit |
1fb8d4 |
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
|
|
Packit |
1fb8d4 |
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
|
|
Packit |
1fb8d4 |
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
|
|
Packit |
1fb8d4 |
int16x8_t* r_buf = (int16x8_t*) pDst[0];
|
|
Packit |
1fb8d4 |
int16x8_t* g_buf = (int16x8_t*) pDst[1];
|
|
Packit |
1fb8d4 |
int16x8_t* b_buf = (int16x8_t*) pDst[2];
|
|
Packit |
1fb8d4 |
int srcbump = srcStep / sizeof(int16x8_t);
|
|
Packit |
1fb8d4 |
int dstbump = dstStep / sizeof(int16x8_t);
|
|
Packit |
1fb8d4 |
int yp;
|
|
Packit |
1fb8d4 |
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (yp = 0; yp < roi->height; ++yp)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int i;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (i = 0; i < imax; i++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/*
|
|
Packit |
1fb8d4 |
In order to use NEON signed 16-bit integer multiplication we need to convert
|
|
Packit |
1fb8d4 |
the floating point factors to signed int without loosing information.
|
|
Packit |
1fb8d4 |
The result of this multiplication is 32 bit and we have a NEON instruction
|
|
Packit |
1fb8d4 |
that returns the hi word of the saturated double.
|
|
Packit |
1fb8d4 |
Thus we will multiply the factors by the highest possible 2^n, take the
|
|
Packit |
1fb8d4 |
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
|
|
Packit |
1fb8d4 |
shift by 1 to reverse the doubling) and correct this result by multiplying it
|
|
Packit |
1fb8d4 |
by 2^(16-n).
|
|
Packit |
1fb8d4 |
For the given factors in the conversion matrix the best possible n is 14.
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
Example for calculating r:
|
|
Packit |
1fb8d4 |
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
|
Packit |
1fb8d4 |
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
|
Packit |
1fb8d4 |
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
|
Packit |
1fb8d4 |
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
/* y = (y_buf[i] + 4096) >> 2 */
|
|
Packit |
1fb8d4 |
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
|
|
Packit |
1fb8d4 |
y = vaddq_s16(y, c4096);
|
|
Packit |
1fb8d4 |
y = vshrq_n_s16(y, 2);
|
|
Packit |
1fb8d4 |
/* cb = cb_buf[i]; */
|
|
Packit |
1fb8d4 |
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
|
|
Packit |
1fb8d4 |
/* cr = cr_buf[i]; */
|
|
Packit |
1fb8d4 |
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cr*22986)) >> 3 */
|
|
Packit |
1fb8d4 |
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
|
|
Packit |
1fb8d4 |
r = vshrq_n_s16(r, 3);
|
|
Packit |
1fb8d4 |
/* r_buf[i] = CLIP(r); */
|
|
Packit |
1fb8d4 |
r = vminq_s16(vmaxq_s16(r, zero), max);
|
|
Packit |
1fb8d4 |
vst1q_s16((INT16*)&r_buf[i], r);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
|
Packit |
1fb8d4 |
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
|
|
Packit |
1fb8d4 |
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
|
|
Packit |
1fb8d4 |
g = vshrq_n_s16(g, 3);
|
|
Packit |
1fb8d4 |
/* g_buf[i] = CLIP(g); */
|
|
Packit |
1fb8d4 |
g = vminq_s16(vmaxq_s16(g, zero), max);
|
|
Packit |
1fb8d4 |
vst1q_s16((INT16*)&g_buf[i], g);
|
|
Packit |
1fb8d4 |
/* (y + HIWORD(cb*28999)) >> 3 */
|
|
Packit |
1fb8d4 |
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
|
|
Packit |
1fb8d4 |
b = vshrq_n_s16(b, 3);
|
|
Packit |
1fb8d4 |
/* b_buf[i] = CLIP(b); */
|
|
Packit |
1fb8d4 |
b = vminq_s16(vmaxq_s16(b, zero), max);
|
|
Packit |
1fb8d4 |
vst1q_s16((INT16*)&b_buf[i], b);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
y_buf += srcbump;
|
|
Packit |
1fb8d4 |
cb_buf += srcbump;
|
|
Packit |
1fb8d4 |
cr_buf += srcbump;
|
|
Packit |
1fb8d4 |
r_buf += dstbump;
|
|
Packit |
1fb8d4 |
g_buf += dstbump;
|
|
Packit |
1fb8d4 |
b_buf += dstbump;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], UINT32 srcStep,
|
|
Packit |
1fb8d4 |
BYTE* pDst, UINT32 dstStep,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi,
|
|
Packit |
1fb8d4 |
uint8_t rPos,
|
|
Packit |
1fb8d4 |
uint8_t gPos,
|
|
Packit |
1fb8d4 |
uint8_t bPos,
|
|
Packit |
1fb8d4 |
uint8_t aPos)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x, y;
|
|
Packit |
1fb8d4 |
BYTE* pRGB = pDst;
|
|
Packit |
1fb8d4 |
const INT16* pY = pSrc[0];
|
|
Packit |
1fb8d4 |
const INT16* pCb = pSrc[1];
|
|
Packit |
1fb8d4 |
const INT16* pCr = pSrc[2];
|
|
Packit |
1fb8d4 |
const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
|
|
Packit |
1fb8d4 |
const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
|
|
Packit |
1fb8d4 |
const size_t pad = roi->width % 8;
|
|
Packit |
1fb8d4 |
const int16x4_t c4096 = vdup_n_s16(4096);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; y++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 8)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const int16x8_t Y = vld1q_s16(pY);
|
|
Packit |
1fb8d4 |
const int16x4_t Yh = vget_high_s16(Y);
|
|
Packit |
1fb8d4 |
const int16x4_t Yl = vget_low_s16(Y);
|
|
Packit |
1fb8d4 |
const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
|
|
Packit |
1fb8d4 |
const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
|
|
Packit |
1fb8d4 |
const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
|
|
Packit |
1fb8d4 |
const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
|
|
Packit |
1fb8d4 |
const int16x8_t Cr = vld1q_s16(pCr);
|
|
Packit |
1fb8d4 |
const int16x4_t Crh = vget_high_s16(Cr);
|
|
Packit |
1fb8d4 |
const int16x4_t Crl = vget_low_s16(Cr);
|
|
Packit |
1fb8d4 |
const int16x8_t Cb = vld1q_s16(pCb);
|
|
Packit |
1fb8d4 |
const int16x4_t Cbh = vget_high_s16(Cb);
|
|
Packit |
1fb8d4 |
const int16x4_t Cbl = vget_low_s16(Cb);
|
|
Packit |
1fb8d4 |
uint8x8x4_t bgrx;
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* R */
|
|
Packit |
1fb8d4 |
const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
|
|
Packit |
1fb8d4 |
const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
|
|
Packit |
1fb8d4 |
const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
|
|
Packit |
1fb8d4 |
const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
|
|
Packit |
1fb8d4 |
const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
|
|
Packit |
1fb8d4 |
bgrx.val[rPos] = vqmovun_s16(Rs);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* G */
|
|
Packit |
1fb8d4 |
const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
|
|
Packit |
1fb8d4 |
const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
|
|
Packit |
1fb8d4 |
const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
|
|
Packit |
1fb8d4 |
const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
|
|
Packit |
1fb8d4 |
const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
|
|
Packit |
1fb8d4 |
const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
|
|
Packit |
1fb8d4 |
const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
|
|
Packit |
1fb8d4 |
const uint8x8_t G = vqmovun_s16(Gs);
|
|
Packit |
1fb8d4 |
bgrx.val[gPos] = G;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
/* B */
|
|
Packit |
1fb8d4 |
const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
|
|
Packit |
1fb8d4 |
const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
|
|
Packit |
1fb8d4 |
const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
|
|
Packit |
1fb8d4 |
const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
|
|
Packit |
1fb8d4 |
const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
|
|
Packit |
1fb8d4 |
const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
|
|
Packit |
1fb8d4 |
const uint8x8_t B = vqmovun_s16(Bs);
|
|
Packit |
1fb8d4 |
bgrx.val[bPos] = B;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
/* A */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
vst4_u8(pRGB, bgrx);
|
|
Packit |
1fb8d4 |
pY += 8;
|
|
Packit |
1fb8d4 |
pCb += 8;
|
|
Packit |
1fb8d4 |
pCr += 8;
|
|
Packit |
1fb8d4 |
pRGB += 32;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const INT32 divisor = 16;
|
|
Packit |
1fb8d4 |
const INT32 Y = ((*pY++) + 4096) << divisor;
|
|
Packit |
1fb8d4 |
const INT32 Cb = (*pCb++);
|
|
Packit |
1fb8d4 |
const INT32 Cr = (*pCr++);
|
|
Packit |
1fb8d4 |
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
|
|
Packit |
1fb8d4 |
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
|
|
Packit |
1fb8d4 |
INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
|
|
Packit |
1fb8d4 |
BYTE bgrx[4];
|
|
Packit |
1fb8d4 |
bgrx[bPos] = CLIP(B);
|
|
Packit |
1fb8d4 |
bgrx[gPos] = CLIP(G);
|
|
Packit |
1fb8d4 |
bgrx[rPos] = CLIP(R);
|
|
Packit |
1fb8d4 |
bgrx[aPos] = 0xFF;
|
|
Packit |
1fb8d4 |
*pRGB++ = bgrx[0];
|
|
Packit |
1fb8d4 |
*pRGB++ = bgrx[1];
|
|
Packit |
1fb8d4 |
*pRGB++ = bgrx[2];
|
|
Packit |
1fb8d4 |
*pRGB++ = bgrx[3];
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
pY += srcPad;
|
|
Packit |
1fb8d4 |
pCb += srcPad;
|
|
Packit |
1fb8d4 |
pCr += srcPad;
|
|
Packit |
1fb8d4 |
pRGB += dstPad;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(
|
|
Packit |
1fb8d4 |
const INT16* pSrc[3], UINT32 srcStep,
|
|
Packit |
1fb8d4 |
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
switch (DstFormat)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRX32:
|
|
Packit |
1fb8d4 |
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBX32:
|
|
Packit |
1fb8d4 |
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ARGB32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XRGB32:
|
|
Packit |
1fb8d4 |
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ABGR32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XBGR32:
|
|
Packit |
1fb8d4 |
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
default:
|
|
Packit |
1fb8d4 |
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
const prim_size_t* roi, /* region of interest */
|
|
Packit |
1fb8d4 |
uint8_t rPos,
|
|
Packit |
1fb8d4 |
uint8_t gPos,
|
|
Packit |
1fb8d4 |
uint8_t bPos,
|
|
Packit |
1fb8d4 |
uint8_t aPos)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
UINT32 x, y;
|
|
Packit |
1fb8d4 |
UINT32 pad = roi->width % 8;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (y = 0; y < roi->height; y++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
|
|
Packit |
1fb8d4 |
const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
|
|
Packit |
1fb8d4 |
const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
|
|
Packit |
1fb8d4 |
BYTE* dst = pDst + y * dstStep;
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < roi->width - pad; x += 8)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
int16x8_t r = vld1q_s16(pr);
|
|
Packit |
1fb8d4 |
int16x8_t g = vld1q_s16(pg);
|
|
Packit |
1fb8d4 |
int16x8_t b = vld1q_s16(pb);
|
|
Packit |
1fb8d4 |
uint8x8x4_t bgrx;
|
|
Packit |
1fb8d4 |
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
|
Packit |
1fb8d4 |
bgrx.val[rPos] = vqmovun_s16(r);
|
|
Packit |
1fb8d4 |
bgrx.val[gPos] = vqmovun_s16(g);
|
|
Packit |
1fb8d4 |
bgrx.val[bPos] = vqmovun_s16(b);
|
|
Packit |
1fb8d4 |
vst4_u8(dst, bgrx);
|
|
Packit |
1fb8d4 |
pr += 8;
|
|
Packit |
1fb8d4 |
pg += 8;
|
|
Packit |
1fb8d4 |
pb += 8;
|
|
Packit |
1fb8d4 |
dst += 32;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
for (x = 0; x < pad; x ++)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
BYTE bgrx[4];
|
|
Packit |
1fb8d4 |
bgrx[bPos] = *pb++;
|
|
Packit |
1fb8d4 |
bgrx[gPos] = *pg++;
|
|
Packit |
1fb8d4 |
bgrx[rPos] = *pr++;
|
|
Packit |
1fb8d4 |
bgrx[aPos] = 0xFF;
|
|
Packit |
1fb8d4 |
*dst++ = bgrx[0];
|
|
Packit |
1fb8d4 |
*dst++ = bgrx[1];
|
|
Packit |
1fb8d4 |
*dst++ = bgrx[2];
|
|
Packit |
1fb8d4 |
*dst++ = bgrx[3];
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
return PRIMITIVES_SUCCESS;
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R(
|
|
Packit |
1fb8d4 |
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
|
Packit |
1fb8d4 |
UINT32 srcStep, /* bytes between rows in source data */
|
|
Packit |
1fb8d4 |
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
|
Packit |
1fb8d4 |
UINT32 dstStep, /* bytes between rows in dest data */
|
|
Packit |
1fb8d4 |
UINT32 DstFormat,
|
|
Packit |
1fb8d4 |
const prim_size_t* roi) /* region of interest */
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
switch (DstFormat)
|
|
Packit |
1fb8d4 |
{
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_BGRX32:
|
|
Packit |
1fb8d4 |
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBA32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_RGBX32:
|
|
Packit |
1fb8d4 |
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ARGB32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XRGB32:
|
|
Packit |
1fb8d4 |
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_ABGR32:
|
|
Packit |
1fb8d4 |
case PIXEL_FORMAT_XBGR32:
|
|
Packit |
1fb8d4 |
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
default:
|
|
Packit |
1fb8d4 |
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
}
|
|
Packit |
1fb8d4 |
#endif /* WITH_NEON */
|
|
Packit |
1fb8d4 |
/* I don't see a direct IPP version of this, since the input is INT16
 * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
 * But that would likely be slower.
 */
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* ------------------------------------------------------------------------- */
|
|
Packit |
1fb8d4 |
/**
 * Install the colour-conversion primitives into prims.
 *
 * Caches the generic primitive table for use as a fallback, fills prims via
 * primitives_init_colors(), and then, if the build enables SSE2 or NEON and
 * the processor reports the matching feature, replaces the corresponding
 * entries with the optimized implementations from this file.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_colors(prims);
#if defined(WITH_SSE2)

	/* Keep the entries set above when the CPU lacks SSE2. */
	if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
		return;

	prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
#elif defined(WITH_NEON)

	/* Keep the entries set above when the CPU lacks NEON. */
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
		return;

	prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
#endif /* WITH_SSE2 */
}
|
|
Packit |
1fb8d4 |
|