/*
	synth_neon_s32: ARM NEON optimized synth (32-bit output version)

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/* Symbolic names for the first three argument registers. */
#define WINDOW r0	/* window pointer (1st argument) */
#define B0 r1		/* b0 pointer (2nd argument) */
#define SAMPLES r2	/* output sample pointer (3rd argument) */

/*
	int synth_1to1_s32_neon_asm(real *window, real *b0, int *samples, int bo1);
	return value: number of clipped samples (0)

	window, b0 and samples arrive in r0-r2 (see the aliases above),
	bo1 arrives in r3.
*/

#ifndef _M_ARM
	.code 32
#endif
#ifndef __APPLE__
	.fpu neon
#endif

	.text
	GLOBAL_SYMBOL ASM_NAME(synth_1to1_s32_neon_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_s32_neon_asm), %function
#endif
	ALIGN4
ASM_NAME(synth_1to1_s32_neon_asm):
/*
	Computes 32 output samples as 16-tap dot products of window rows and
	b0 rows, converts them to signed 32-bit values and stores them into
	every second 32-bit slot of the output. Returns the number of samples
	whose magnitude exceeded the clipping threshold.

	Register use:
	  r0 (WINDOW)   window pointer, advances by 128 bytes per row
	  r1 (B0)       b0 pointer, advances by 64 bytes per row in the first
	                loop and retreats by 64 bytes per row in the second
	  r2 (SAMPLES)  output pointer, advances by 32 bytes per iteration
	  r3            bo1 on entry, afterwards the loop counter
	  r4            window row stride in bytes (128)
	  r5            scratch while aligning sp, then the b0 row stride
	  r6            stack pointer value to restore on exit
	  [sp]          16-byte aligned slot with four per-lane clip counters
*/
	push		{r4-r6, lr}
	vpush		{q4-q7}					/* callee-saved NEON registers used below */
	mov			r6, sp					/* remember sp for the epilogue */
	sub			sp, sp, #16				/* room for the clip counter slot ... */
	mov			r5, sp
	and			r5, r5, #0xf
	sub			sp, sp, r5				/* ... rounded down to 16-byte alignment */

	/* The counters are accumulated with load/add/store in both loops,
	   so the slot has to start out as zero. */
	vmov.i32	q0, #0
	vst1.32		{q0}, [sp, :128]

	/* WINDOW = window + (16 - bo1) floats */
	add			WINDOW, WINDOW, #64
	sub			WINDOW, WINDOW, r3, lsl #2

	mov			r3, #4					/* 4 iterations, 4 samples each */
	mov			r4, #128
	mov			r5, #64					/* b0 walks forward */
1:
	/* Taps 0-7 of four consecutive window rows. */
	vld1.32		{q0,q1}, [WINDOW], r4
	vld1.32		{q2,q3}, [WINDOW], r4
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW]
	sub			WINDOW, WINDOW, #352	/* back to row 0, taps 8-15 (3*128 - 32) */
	/* Taps 0-7 of four consecutive b0 rows. */
	vld1.32		{q8,q9}, [B0, :128], r5
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128]
	/* After the swaps q0/q2/q1/q3 hold taps 0-3 of rows 0/1/2/3 and
	   q4/q6/q5/q7 hold taps 4-7 of the same rows. */
	vswp		q1, q4
	vswp		q3, q6
	sub			B0, B0, #160			/* back to row 0, taps 8-15 (3*64 - 32) */
	/* Accumulators: q0 = row 0, q2 = row 1, q1 = row 2, q3 = row 3. */
	vmul.f32	q0, q0, q8
	vmul.f32	q2, q2, q10
	vmul.f32	q1, q1, q12
	vmul.f32	q3, q3, q14
	vmla.f32	q0, q4, q9
	vmla.f32	q2, q6, q11
	vmla.f32	q1, q5, q13
	vmla.f32	q3, q7, q15
	/* Taps 8-15 of rows 0-2; row 3 follows after some registers are free. */
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW], r4
	vld1.32		{q8,q9}, [WINDOW], r4
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128], r5
	vswp		q5, q6
	vswp		q11, q12
	vmla.f32	q0, q4, q10
	vmla.f32	q2, q5, q11
	vmla.f32	q1, q8, q14
	/* Taps 8-15 of row 3. */
	vld1.32		{q4,q5}, [WINDOW]
	vld1.32		{q10,q11}, [B0, :128]!	/* B0 now points at the next group of 4 rows */
	add			WINDOW, WINDOW, #96		/* WINDOW now points at the next group of 4 rows */
	vmla.f32	q3, q4, q10
	vmla.f32	q0, q6, q12
	vmla.f32	q2, q7, q13
	vmla.f32	q1, q9, q15
	vmla.f32	q3, q5, q11
	vmvn.i32	q5, #0xb9000000			/* 0x46ffffff: largest float below 32768.0 */
	/* Horizontal sums: q0 = { row 0, row 1, row 2, row 3 }. */
	vpadd.f32	d0, d0, d1
	vpadd.f32	d4, d4, d5
	vpadd.f32	d2, d2, d3
	vpadd.f32	d6, d6, d7
	vld1.32		{q6}, [sp, :128]		/* running clip counters */
	vpadd.f32	d0, d0, d4
	vpadd.f32	d1, d2, d6

	vcvt.s32.f32	q3, q0, #16			/* to s32 fixed point with 16 fraction bits */
	vacgt.f32	q5, q0, q5				/* all-ones where |sum| exceeds the threshold */
	vld2.32		{q1,q2}, [SAMPLES]		/* de-interleave: q1 = even slots, q2 = odd slots */
	vshr.u32	q5, q5, #31				/* mask -> 0 or 1 per lane */
	vmov		q1, q3					/* replace the even slots, keep the odd ones */
	vst2.32		{q1,q2}, [SAMPLES]!
	vadd.i32	q5, q5, q6
	vst1.32		{q5}, [sp, :128]

	subs		r3, r3, #1
	bne			1b

	/* Second half: same computation, but b0 rows are visited in
	   descending address order. */
	mov			r3, #4
	mov			r5, #-64
1:
	vld1.32		{q0,q1}, [WINDOW], r4
	vld1.32		{q2,q3}, [WINDOW], r4
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW]
	sub			WINDOW, WINDOW, #352	/* back to row 0, taps 8-15 */
	vld1.32		{q8,q9}, [B0, :128], r5
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128]
	vswp		q1, q4
	vswp		q3, q6
	add			B0, B0, #224			/* back to row 0, taps 8-15 (3*64 + 32) */
	vmul.f32	q0, q0, q8
	vmul.f32	q2, q2, q10
	vmul.f32	q1, q1, q12
	vmul.f32	q3, q3, q14
	vmla.f32	q0, q4, q9
	vmla.f32	q2, q6, q11
	vmla.f32	q1, q5, q13
	vmla.f32	q3, q7, q15
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW], r4
	vld1.32		{q8,q9}, [WINDOW], r4
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128], r5
	vswp		q5, q6
	vswp		q11, q12
	vmla.f32	q0, q4, q10
	vmla.f32	q2, q5, q11
	vmla.f32	q1, q8, q14
	vld1.32		{q4,q5}, [WINDOW]
	vld1.32		{q10,q11}, [B0, :128]
	add			WINDOW, WINDOW, #96		/* next group of 4 window rows */
	sub			B0, B0, #96				/* next (lower) group of 4 b0 rows */
	vmla.f32	q3, q4, q10
	vmla.f32	q0, q6, q12
	vmla.f32	q2, q7, q13
	vmla.f32	q1, q9, q15
	vmla.f32	q3, q5, q11
	vmvn.i32	q5, #0xb9000000			/* 0x46ffffff: largest float below 32768.0 */
	vpadd.f32	d0, d0, d1
	vpadd.f32	d4, d4, d5
	vpadd.f32	d2, d2, d3
	vpadd.f32	d6, d6, d7
	vld1.32		{q6}, [sp, :128]		/* running clip counters */
	vpadd.f32	d0, d0, d4
	vpadd.f32	d1, d2, d6

	vcvt.s32.f32	q3, q0, #16			/* to s32 fixed point with 16 fraction bits */
	vacgt.f32	q5, q0, q5				/* all-ones where |sum| exceeds the threshold */
	vld2.32		{q1,q2}, [SAMPLES]
	vshr.u32	q5, q5, #31
	vmov		q1, q3
	vst2.32		{q1,q2}, [SAMPLES]!
	vadd.i32	q5, q5, q6
	vst1.32		{q5}, [sp, :128]

	subs		r3, r3, #1
	bne			1b

	/* Return value: sum of the four per-lane clip counters. */
	vld1.32		{q0}, [sp, :128]
	vpadd.i32	d0, d0, d1
	vpadd.i32	d0, d0, d0
	vmov.32		r0, d0[0]

	mov			sp, r6					/* drop the aligned scratch slot */
	vpop		{q4-q7}
	pop			{r4-r6, pc}

NONEXEC_STACK