Blame libfreerdp/codec/rfx_neon.c

Packit 1fb8d4
/*
Packit 1fb8d4
   FreeRDP: A Remote Desktop Protocol Implementation
Packit 1fb8d4
   RemoteFX Codec Library - NEON Optimizations
Packit 1fb8d4
Packit 1fb8d4
   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
Packit 1fb8d4
Packit 1fb8d4
   Licensed under the Apache License, Version 2.0 (the "License");
Packit 1fb8d4
   you may not use this file except in compliance with the License.
Packit 1fb8d4
   You may obtain a copy of the License at
Packit 1fb8d4
Packit 1fb8d4
       http://www.apache.org/licenses/LICENSE-2.0
Packit 1fb8d4
Packit 1fb8d4
   Unless required by applicable law or agreed to in writing, software
Packit 1fb8d4
   distributed under the License is distributed on an "AS IS" BASIS,
Packit 1fb8d4
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Packit 1fb8d4
   See the License for the specific language governing permissions and
Packit 1fb8d4
   limitations under the License.
Packit 1fb8d4
*/
Packit 1fb8d4
Packit 1fb8d4
#ifdef HAVE_CONFIG_H
Packit 1fb8d4
#include "config.h"
Packit 1fb8d4
#endif
Packit 1fb8d4
Packit 1fb8d4
#if defined(__ARM_NEON__)
Packit 1fb8d4
Packit 1fb8d4
#include <stdio.h>
Packit 1fb8d4
#include <stdlib.h>
Packit 1fb8d4
#include <string.h>
Packit 1fb8d4
#include <arm_neon.h>
Packit 1fb8d4
#include <winpr/sysinfo.h>
Packit 1fb8d4
Packit 1fb8d4
#include "rfx_types.h"
Packit 1fb8d4
#include "rfx_neon.h"
Packit 1fb8d4
Packit 1fb8d4
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
Packit 1fb8d4
Packit 1fb8d4
/*
 * Dequantize one sub-band in place.
 *
 * buffer       first coefficient of the sub-band; rewritten in place
 * buffer_size  number of 16-bit coefficients in the sub-band
 * factor       shift amount, broadcast to every lane and applied with vshlq_s16
 *
 * Coefficients are handled eight at a time. The end-of-range test runs after
 * each vector, so the first eight coefficients are always processed.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const int buffer_size, const UINT32 factor)
{
	/* Same shift amount in all eight 16-bit lanes. */
	const int16x8_t shift = vdupq_n_s16(factor);
	INT16* cursor = buffer;
	INT16* const limit = buffer + buffer_size;

	do
	{
		const int16x8_t coeffs = vld1q_s16(cursor);
		vst1q_s16(cursor, vshlq_s16(coeffs, shift));
		cursor += 8;
	}
	while (cursor < limit);
}
Packit 1fb8d4
Packit 1fb8d4
/*
 * Dequantize all ten sub-bands of a tile in place.
 *
 * buffer     4096 coefficients laid out as
 *            HL1, LH1, HH1 (1024 each), HL2, LH2, HH2 (256 each),
 *            HL3, LH3, HH3, LL3 (64 each)
 * quantVals  ten quantization values; each sub-band is shifted by its
 *            value minus one
 */
void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* quantVals)
{
	enum
	{
		LEVEL1_BAND_SIZE = 1024,
		LEVEL2_BAND_SIZE = 256,
		LEVEL3_BAND_SIZE = 64
	};
	/* Walks the buffer sub-band by sub-band, in storage order. */
	INT16* band = buffer;

	/* Level 1 */
	rfx_quantization_decode_block_NEON(band, LEVEL1_BAND_SIZE, quantVals[8] - 1); /* HL1 */
	band += LEVEL1_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL1_BAND_SIZE, quantVals[7] - 1); /* LH1 */
	band += LEVEL1_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL1_BAND_SIZE, quantVals[9] - 1); /* HH1 */
	band += LEVEL1_BAND_SIZE;

	/* Level 2 */
	rfx_quantization_decode_block_NEON(band, LEVEL2_BAND_SIZE, quantVals[5] - 1); /* HL2 */
	band += LEVEL2_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL2_BAND_SIZE, quantVals[4] - 1); /* LH2 */
	band += LEVEL2_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL2_BAND_SIZE, quantVals[6] - 1); /* HH2 */
	band += LEVEL2_BAND_SIZE;

	/* Level 3 */
	rfx_quantization_decode_block_NEON(band, LEVEL3_BAND_SIZE, quantVals[2] - 1); /* HL3 */
	band += LEVEL3_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL3_BAND_SIZE, quantVals[1] - 1); /* LH3 */
	band += LEVEL3_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL3_BAND_SIZE, quantVals[3] - 1); /* HH3 */
	band += LEVEL3_BAND_SIZE;
	rfx_quantization_decode_block_NEON(band, LEVEL3_BAND_SIZE, quantVals[0] - 1); /* LL3 */
}
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
Packit 1fb8d4
/*
 * Horizontal inverse DWT of one pair of sub-bands.
 *
 * l              low-pass input, subband_width rows of subband_width
 *                coefficients; overwritten in place with the even output
 *                coefficients
 * h              high-pass input, same dimensions; read only
 * dst            output, subband_width rows of 2 * subband_width
 *                coefficients with even and odd results interleaved
 * subband_width  width and height of the input sub-bands; the boundary
 *                handling below assumes a multiple of 8
 *
 * Row boundaries are mirrored: h[-1] is taken as h[0], and dst[2n + 2] past
 * the last even coefficient is taken as that last even coefficient. The
 * mirrored vectors are assembled from registers that are already loaded, so
 * no element before the start or past the end of a row is ever read.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	const int16x8_t one = vdupq_n_s16(1);

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/*
				 * Start of row: there is no h[-1]. Build
				 * { h[0], h[0], h[1], ..., h[6] } from h_n instead of
				 * loading from h_ptr - 1, which lies outside the row.
				 */
				int16x8_t first = vdupq_n_s16(vgetq_lane_s16(h_n, 0));
				h_n_m = vextq_s16(first, h_n, 7);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, one);
			tmp_n = vshrq_n_s16(tmp_n, 1);
			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			/* Even results are kept in the low-pass row for the odd pass. */
			vst1q_s16(l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind both inputs to the start of the current row. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);
			int16x8x2_t dst_n;
			int16x8_t dst_n_p;
			dst_n.val[0] = vld1q_s16(l_ptr);

			if (n == subband_width - 8)
			{
				/*
				 * End of row: there is no dst[2n + 2] for the last
				 * coefficient. Build { e[1], ..., e[7], e[7] } from the
				 * even vector instead of loading from l_ptr + 1, which
				 * would run one element past the row.
				 */
				int16x8_t last = vdupq_n_s16(vgetq_lane_s16(dst_n.val[0], 7));
				dst_n_p = vextq_s16(dst_n.val[0], last, 1);
			}
			else
			{
				dst_n_p = vld1q_s16(l_ptr + 1);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleaved store: even, odd, even, odd, ... */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
Packit 1fb8d4
Packit 1fb8d4
/*
 * Vertical inverse DWT over the two half-images produced by the horizontal
 * pass.
 *
 * l              low-pass rows, each 2 * subband_width coefficients wide
 * h              high-pass rows, same layout
 * dst            output; even rows are written by the first pass, odd rows
 *                by the second pass
 * subband_width  number of rows in each input; the output has twice as many
 *
 * Boundaries are mirrored: the first even row uses h[0] in place of h[-1],
 * and the last odd row uses the even row above it in place of the missing
 * even row below.
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	const int row_len = subband_width + subband_width;
	const int16x8_t one = vdupq_n_s16(1);
	INT16* low = l;
	INT16* high = h;
	INT16* out = dst;
	int row, col;

	/* Pass 1 - even rows: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
	for (row = 0; row < subband_width; row++)
	{
		for (col = 0; col < row_len; col += 8)
		{
			const int16x8_t low_cur = vld1q_s16(low);
			const int16x8_t high_cur = vld1q_s16(high);
			int16x8_t high_prev;
			int16x8_t sum;

			/* The first row has no predecessor; reuse the current one. */
			if (row == 0)
				high_prev = high_cur;
			else
				high_prev = vld1q_s16(high - row_len);

			sum = vaddq_s16(vaddq_s16(high_cur, one), high_prev);
			vst1q_s16(out, vsubq_s16(low_cur, vshrq_n_s16(sum, 1)));

			low += 8;
			high += 8;
			out += 8;
		}

		/* Skip the odd row that the second pass fills in. */
		out += row_len;
	}

	/* Pass 2 - odd rows: dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1) */
	high = h;
	out = dst + row_len;

	for (row = 0; row < subband_width; row++)
	{
		for (col = 0; col < row_len; col += 8)
		{
			const int16x8_t high_x2 = vshlq_n_s16(vld1q_s16(high), 1);
			const int16x8_t above = vld1q_s16(out - row_len);
			int16x8_t below;
			int16x8_t mean;

			/* The last odd row has no even row below it; mirror the one above. */
			if (row == subband_width - 1)
				below = above;
			else
				below = vld1q_s16(out + row_len);

			mean = vshrq_n_s16(vaddq_s16(above, below), 1);
			vst1q_s16(out, vaddq_s16(mean, high_x2));

			high += 8;
			out += 8;
		}

		/* Skip the even row already written by the first pass. */
		out += row_len;
	}
}
Packit 1fb8d4
Packit 1fb8d4
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
Packit 1fb8d4
rfx_dwt_2d_decode_block_NEON(INT16* buffer, INT16* idwt, int subband_width)
Packit 1fb8d4
{
Packit 1fb8d4
	INT16* hl, * lh, * hh, * ll;
Packit 1fb8d4
	INT16* l_dst, * h_dst;
Packit 1fb8d4
	/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
Packit 1fb8d4
	/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
Packit 1fb8d4
	/* The lower part L uses LL(3) and HL(0). */
Packit 1fb8d4
	/* The higher part H uses LH(1) and HH(2). */
Packit 1fb8d4
	ll = buffer + subband_width * subband_width * 3;
Packit 1fb8d4
	hl = buffer;
Packit 1fb8d4
	l_dst = idwt;
Packit 1fb8d4
	rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
Packit 1fb8d4
	lh = buffer + subband_width * subband_width;
Packit 1fb8d4
	hh = buffer + subband_width * subband_width * 2;
Packit 1fb8d4
	h_dst = idwt + subband_width * subband_width * 2;
Packit 1fb8d4
	rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
Packit 1fb8d4
	/* Inverse DWT in vertical direction, results are stored in original buffer. */
Packit 1fb8d4
	rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
Packit 1fb8d4
}
Packit 1fb8d4
Packit 1fb8d4
/*
 * Inverse 2D DWT of a whole tile, one decomposition level at a time.
 *
 * buffer      coefficient buffer; each level is reconstructed in place,
 *             starting with the smallest sub-bands (width 8) and ending
 *             with the largest (width 32)
 * dwt_buffer  scratch buffer shared by all three levels
 */
void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
	INT16* const level3 = buffer + 3840; /* 8x8 sub-bands */
	INT16* const level2 = buffer + 3072; /* 16x16 sub-bands */
	INT16* const level1 = buffer;        /* 32x32 sub-bands */

	rfx_dwt_2d_decode_block_NEON(level3, dwt_buffer, 8);
	rfx_dwt_2d_decode_block_NEON(level2, dwt_buffer, 16);
	rfx_dwt_2d_decode_block_NEON(level1, dwt_buffer, 32);
}
Packit 1fb8d4
Packit 1fb8d4
/*
 * Install the NEON implementations into an RFX context.
 *
 * Does nothing when the processor does not report NEON support. Otherwise
 * the profiler entries are renamed and the quantization and DWT decode
 * callbacks of the context are replaced with the NEON variants.
 */
void rfx_init_neon(RFX_CONTEXT* context)
{
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
		return;

	DEBUG_RFX("Using NEON optimizations");

	/* Only the profiler name changes for YCbCr->RGB; no callback is set here. */
	PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");

	context->quantization_decode = rfx_quantization_decode_NEON;
	context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
}
Packit 1fb8d4
Packit 1fb8d4
#endif // __ARM_NEON__
Packit 1fb8d4