Blame libfreerdp/primitives/prim_alphaComp_opt.c

Packit 1fb8d4
/* FreeRDP: A Remote Desktop Protocol Client
Packit 1fb8d4
 * Optimized alpha blending routines.
Packit 1fb8d4
 * vi:ts=4 sw=4:
Packit 1fb8d4
 *
Packit 1fb8d4
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
Packit 1fb8d4
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
Packit 1fb8d4
 * not use this file except in compliance with the License. You may obtain
Packit 1fb8d4
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
Packit 1fb8d4
 * Unless required by applicable law or agreed to in writing, software
Packit 1fb8d4
 * distributed under the License is distributed on an "AS IS" BASIS,
Packit 1fb8d4
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
Packit 1fb8d4
 * or implied. See the License for the specific language governing
Packit 1fb8d4
 * permissions and limitations under the License.
Packit 1fb8d4
 *
Packit 1fb8d4
 * Note: this code assumes the second operand is fully opaque,
Packit 1fb8d4
 * i.e.
Packit 1fb8d4
 *   newval = alpha1*val1 + (1-alpha1)*val2
Packit 1fb8d4
 * rather than
Packit 1fb8d4
 *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
Packit 1fb8d4
 * The IPP gives other options.
Packit 1fb8d4
 */
Packit 1fb8d4
Packit 1fb8d4
#ifdef HAVE_CONFIG_H
Packit 1fb8d4
#include "config.h"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#include <freerdp/types.h>
Packit 1fb8d4
#include <freerdp/primitives.h>
Packit 1fb8d4
#include <winpr/sysinfo.h>
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
#include <emmintrin.h>
Packit 1fb8d4
#include <pmmintrin.h>
Packit 1fb8d4
#endif /* WITH_SSE2 */
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_IPP
Packit 1fb8d4
#include <ippi.h>
Packit 1fb8d4
#endif /* WITH_IPP */
Packit 1fb8d4
Packit 1fb8d4
#include "prim_internal.h"
Packit 1fb8d4
Packit 1fb8d4
/* Primitives table obtained from primitives_get_generic() in
 * primitives_init_alphaComp_opt(). The optimized routine in this file calls
 * generic->alphaComp_argb for the spans it does not process with SSE
 * registers. NULL until the init function has run. */
static primitives_t* generic = NULL;
Packit 1fb8d4
Packit 1fb8d4
/* ------------------------------------------------------------------------- */
Packit 1fb8d4
#ifdef WITH_SSE2
Packit 1fb8d4
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
Packit 1fb8d4
Packit Service 5a9772
/* Blend two pixels whose channels have already been widened to 16-bit lanes.
 *
 * src1w / src2w each hold two pixels as eight 16-bit lanes (four lanes per
 * pixel, upper byte of every lane zero).  Lane 3 of each pixel in src1w is
 * used as that pixel's alpha.  For every lane the result is
 *
 *     (((alpha + 1) * (src1 - src2)) >> 8) + src2
 *
 * reduced to its low 8 bits.  The 16-bit multiply may wrap, but only the low
 * byte of the final sum is kept, so the wrapped bits never reach the output;
 * the mask also keeps the later unsigned-saturating pack from clamping.
 */
static __m128i sse2_alphaComp_blend_pair(__m128i src1w, __m128i src2w)
{
	const __m128i one = _mm_set1_epi16(1);
	const __m128i lowByte = _mm_set1_epi16(0x00ffU);
	__m128i diff;
	__m128i alpha;
	__m128i blended;

	/* Per-lane difference src1 - src2. */
	diff = _mm_subs_epi16(src1w, src2w);
	/* Broadcast lane 3 of each pixel across that pixel's four lanes. */
	alpha = _mm_shufflelo_epi16(src1w, 0xff);
	alpha = _mm_shufflehi_epi16(alpha, 0xff);
	/* Add one to the alphas so that alpha 255 scales by 256. */
	alpha = _mm_adds_epi16(alpha, one);
	/* Multiply, keep the low word, then divide by 256. */
	blended = _mm_mullo_epi16(alpha, diff);
	blended = _mm_srai_epi16(blended, 8);
	/* Add the second operand back in. */
	blended = _mm_adds_epi16(blended, src2w);
	/* Must mask off the upper bytes or the pack would saturate. */
	return _mm_and_si128(blended, lowByte);
}

/* Alpha-composite pSrc1 over pSrc2 into pDst, four 32-bit pixels at a time.
 *
 * pSrc1, pSrc2  source images, read as 32-bit pixels
 * pDst          destination image, written as 32-bit pixels
 * src1Step, src2Step, dstStep
 *               distance in bytes between the starts of consecutive rows
 * width         pixels per row
 * height        number of rows
 *
 * Returns PRIMITIVES_SUCCESS, or the first non-success status returned by
 * generic->alphaComp_argb.
 *
 * Rows narrower than 4 pixels are handed to the generic routine entirely.
 * Otherwise each row is split into a lead-in (generic) that brings the
 * destination pointer to a 16-byte boundary, a run of 4-pixel SSE blocks
 * stored with an aligned store, and a remainder (generic).  If the
 * destination is not even 4-byte aligned the whole row goes through the
 * generic routine.
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2,
                                     UINT32 src2Step, BYTE* pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*)pSrc1;
	const UINT32* sptr2 = (const UINT32*)pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i zero;

	if ((width <= 0) || (height <= 0))
		return PRIMITIVES_SUCCESS;

	if (width < 4) /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
		                               height);
	}

	dptr = (UINT32*)pDst;
	/* Row padding, expressed in pixels, to skip after each processed row. */
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump = (dstStep - linebytes) / sizeof(UINT32);
	zero = _mm_set1_epi32(0);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		switch ((ULONG_PTR)dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i px1, px2, hi, lo;
			/* Four pixels from each source; only dptr is known to be aligned. */
			px1 = LOAD_SI128(sptr1);
			sptr1 += 4;
			px2 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* Widen bytes to 16-bit lanes (upper / lower two pixels) and blend. */
			hi = sse2_alphaComp_blend_pair(_mm_unpackhi_epi8(px1, zero),
			                               _mm_unpackhi_epi8(px2, zero));
			lo = sse2_alphaComp_blend_pair(_mm_unpacklo_epi8(px1, zero),
			                               _mm_unpacklo_epi8(px2, zero));
			/* Narrow back to bytes: lower pair first, upper pair second. */
			_mm_store_si128((__m128i*)dptr, _mm_packus_epi16(lo, hi));
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, pixels, 1);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
Packit 1fb8d4
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#ifdef WITH_IPP
Packit 1fb8d4
/* ------------------------------------------------------------------------- */
Packit Service 5a9772
static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2,
Packit Service 5a9772
                                    INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width,
Packit Service 5a9772
                                    INT32 height)
Packit 1fb8d4
{
Packit 1fb8d4
	IppiSize sz;
Packit Service 5a9772
	sz.width = width;
Packit 1fb8d4
	sz.height = height;
Packit Service 5a9772
	return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver);
Packit 1fb8d4
}
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
/* ------------------------------------------------------------------------- */
Packit 1fb8d4
/* Install the alpha-composition primitive into prims.
 *
 * Records the table from primitives_get_generic() for use as a fallback,
 * runs primitives_init_alphaComp(prims), and then overrides
 * prims->alphaComp_argb with the IPP version (WITH_IPP builds) or, in
 * WITH_SSE2 builds, with the SSE version when the CPU reports both SSE2 and
 * SSE3.
 */
void primitives_init_alphaComp_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_alphaComp(prims);
#if defined(WITH_IPP)
	prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)

	if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
		return;

	/* SSE3 is needed as well, for LDDQU. */
	if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
		return;

	prims->alphaComp_argb = sse2_alphaComp_argb;
#endif
}