Blame libfreerdp/codec/rfx_neon.c

Packit Service fa4841
/*
   FreeRDP: A Remote Desktop Protocol Implementation
   RemoteFX Codec Library - NEON Optimizations

   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
Packit Service fa4841
Packit Service fa4841
#ifdef HAVE_CONFIG_H
Packit Service fa4841
#include "config.h"
Packit Service fa4841
#endif
Packit Service fa4841
Packit Service fa4841
#if defined(__ARM_NEON__)
Packit Service fa4841
Packit Service fa4841
#include <stdio.h>
Packit Service fa4841
#include <stdlib.h>
Packit Service fa4841
#include <string.h>
Packit Service fa4841
#include <arm_neon.h>
Packit Service fa4841
#include <winpr/sysinfo.h>
Packit Service fa4841
Packit Service fa4841
#include "rfx_types.h"
Packit Service fa4841
#include "rfx_neon.h"
Packit Service fa4841
Packit Service fa4841
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
Packit Service fa4841
Packit Service fa4841
/*
 * De-quantize one sub-band in place.
 *
 * Every 16-bit coefficient in buffer[0 .. buffer_size) is shifted with
 * vshlq_s16 by `factor` bits, eight coefficients per iteration.
 *
 * buffer      - coefficients, read and written in place
 * buffer_size - number of INT16 coefficients; consumed in groups of 8, and
 *               the first group is always processed (do/while loop)
 * factor      - shift count, replicated into all eight 16-bit lanes
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const int buffer_size, const UINT32 factor)
{
	const int16x8_t shift = vdupq_n_s16(factor);
	INT16* cur = buffer;
	INT16* const end = buffer + buffer_size;

	/* The first vector is handled unconditionally, then we continue until `end`. */
	do
	{
		vst1q_s16(cur, vshlq_s16(vld1q_s16(cur), shift));
		cur += 8;
	} while (cur < end);
}
Packit Service fa4841
Packit Service fa4841
/*
 * De-quantize all ten sub-bands of a 4096-coefficient buffer in place.
 *
 * Each sub-band is shifted by (its quantVals entry - 1) bits.
 *
 * buffer    - coefficient buffer covering offsets 0 .. 4095
 * quantVals - quantization values; indices 0 .. 9 are read
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* quantVals)
{
	/* Sub-band layout: start offset, coefficient count, index into quantVals. */
	static const struct
	{
		int offset;
		int count;
		int quant;
	} bands[] = {
		{ 0, 1024, 8 },    /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072, 256, 5 },  /* HL2 */
		{ 3328, 256, 4 },  /* LH2 */
		{ 3584, 256, 6 },  /* HH2 */
		{ 3840, 64, 2 },   /* HL3 */
		{ 3904, 64, 1 },   /* LH3 */
		{ 3968, 64, 3 },   /* HH3 */
		{ 4032, 64, 0 }    /* LL3 */
	};
	unsigned int i;

	for (i = 0; i < sizeof(bands) / sizeof(bands[0]); i++)
	{
		rfx_quantization_decode_block_NEON(&buffer[bands[i].offset], bands[i].count,
		                                   quantVals[bands[i].quant] - 1);
	}
}
Packit Service fa4841
Packit Service fa4841
/*
 * Horizontal inverse DWT of one low/high sub-band pair.
 *
 * For each of the subband_width rows, subband_width low-pass coefficients
 * from `l` and subband_width high-pass coefficients from `h` are combined
 * into 2 * subband_width interleaved output values in `dst`:
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)          (h[-1] := h[0])
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *                 (past the end of the row, dst[2n + 2] := dst[2n])
 *
 * l             - low-pass input; it is overwritten with the even results,
 *                 which the odd pass then reads back
 * h             - high-pass input (read only)
 * dst           - output, 2 * subband_width values per row
 * subband_width - rows and coefficients per row; rows are consumed eight
 *                 coefficients at a time
 *
 * The neighbour vectors at the row boundaries are assembled from registers
 * that are already loaded, so nothing before `h` or past the end of `l` is
 * read.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/*
				 * Row start: h[-1] mirrors h[0]. Rotate h_n by one lane and
				 * patch lane 0, giving {h0, h0, h1, ..., h6} without reading
				 * the element in front of the row.
				 */
				h_n_m = vextq_s16(h_n, h_n, 7);
				h_n_m = vsetq_lane_s16(vgetq_lane_s16(h_n, 0), h_n_m, 0);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Even results are parked in l; the odd pass below reads them back. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the current row for the odd pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			int16x8_t dst_n_p;

			if (n == subband_width - 8)
			{
				/*
				 * Row end: the value after the last even coefficient mirrors
				 * the last one. Rotate and patch lane 7, giving
				 * {d1, ..., d7, d7} without reading past the row.
				 */
				dst_n_p = vextq_s16(dst_n.val[0], dst_n.val[0], 1);
				dst_n_p = vsetq_lane_s16(vgetq_lane_s16(dst_n.val[0], 7), dst_n_p, 7);
			}
			else
			{
				dst_n_p = vld1q_s16(l_ptr + 1);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* vst2q interleaves even (val[0]) and odd (val[1]) results. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
Packit Service fa4841
Packit Service fa4841
/*
 * Vertical inverse DWT.
 *
 * `l` and `h` each hold subband_width rows of 2 * subband_width values.
 * They are combined into 2 * subband_width output rows in `dst`, with even
 * and odd rows alternating:
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)          (h[-1] := h[0])
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *                 (for the last row, dst[2n + 2] := dst[2n])
 *
 * where the index n selects a row and the formula is applied to every
 * column, eight columns per vector.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	const int row_len = subband_width + subband_width;
	const int16x8_t one = vdupq_n_s16(1);
	int row, col;

	/* First pass: all even output rows. */
	for (row = 0; row < subband_width; row++)
	{
		const INT16* low_row = l + row * row_len;
		const INT16* high_row = h + row * row_len;
		/* The row above the first high row is mirrored onto the first row itself. */
		const INT16* high_prev = (row == 0) ? high_row : high_row - row_len;
		INT16* out_row = dst + 2 * row * row_len;

		for (col = 0; col < row_len; col += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t sum = vaddq_s16(vld1q_s16(high_row + col), one);
			sum = vaddq_s16(sum, vld1q_s16(high_prev + col));
			sum = vshrq_n_s16(sum, 1);
			vst1q_s16(out_row + col, vsubq_s16(vld1q_s16(low_row + col), sum));
		}
	}

	/* Second pass: odd output rows, computed from the even rows around them. */
	for (row = 0; row < subband_width; row++)
	{
		const INT16* high_row = h + row * row_len;
		INT16* out_row = dst + (2 * row + 1) * row_len;
		const INT16* above = out_row - row_len;
		/* There is no even row below the last odd row; reuse the one above. */
		const INT16* below = (row == subband_width - 1) ? above : out_row + row_len;

		for (col = 0; col < row_len; col += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t high2 = vshlq_n_s16(vld1q_s16(high_row + col), 1);
			int16x8_t avg = vaddq_s16(vld1q_s16(above + col), vld1q_s16(below + col));
			avg = vshrq_n_s16(avg, 1);
			vst1q_s16(out_row + col, vaddq_s16(avg, high2));
		}
	}
}
Packit Service fa4841
Packit Service fa4841
/*
 * Inverse 2D DWT of one decomposition level.
 *
 * `buffer` holds four sub-bands of subband_width * subband_width
 * coefficients each, stored in HL, LH, HH, LL order. The horizontal pass
 * writes its L and H halves into the scratch buffer `idwt`; the vertical
 * pass then writes the final result back into `buffer`.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* buffer, INT16* idwt, int subband_width)
{
	const int band_len = subband_width * subband_width;
	/* Sub-band positions inside buffer: HL(0), LH(1), HH(2), LL(3). */
	INT16* const band_hl = buffer;
	INT16* const band_lh = buffer + band_len;
	INT16* const band_hh = buffer + band_len * 2;
	INT16* const band_ll = buffer + band_len * 3;
	/* Horizontal results: L half first, H half two band lengths further on. */
	INT16* const low_half = idwt;
	INT16* const high_half = idwt + band_len * 2;

	/* Horizontal pass: L is built from LL and HL, H from LH and HH. */
	rfx_dwt_2d_decode_block_horiz_NEON(band_ll, band_hl, low_half, subband_width);
	rfx_dwt_2d_decode_block_horiz_NEON(band_lh, band_hh, high_half, subband_width);
	/* Vertical pass: combine L and H back into the original buffer. */
	rfx_dwt_2d_decode_block_vert_NEON(low_half, high_half, buffer, subband_width);
}
Packit Service fa4841
Packit Service fa4841
/*
 * Full inverse 2D DWT: three levels are decoded in sequence, from the
 * smallest sub-bands (width 8, at offset 3840) to the largest (width 32, at
 * offset 0). `dwt_buffer` is passed to every level as scratch space.
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
	INT16* const level_w8 = buffer + 3840;
	INT16* const level_w16 = buffer + 3072;
	INT16* const level_w32 = buffer;

	rfx_dwt_2d_decode_block_NEON(level_w8, dwt_buffer, 8);
	rfx_dwt_2d_decode_block_NEON(level_w16, dwt_buffer, 16);
	rfx_dwt_2d_decode_block_NEON(level_w32, dwt_buffer, 32);
}
Packit Service fa4841
Packit Service fa4841
/*
 * Install the NEON implementations of quantization decode and inverse DWT
 * into `context` when the processor reports NEON support. The context is
 * left untouched otherwise.
 */
void rfx_init_neon(RFX_CONTEXT* context)
{
	/* Nothing to do on CPUs without NEON. */
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
		return;

	DEBUG_RFX("Using NEON optimizations");
	PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
	context->quantization_decode = rfx_quantization_decode_NEON;
	context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
}
Packit Service fa4841
Packit Service fa4841
#endif // __ARM_NEON__