Blame src/libmpg123/synth_stereo_neon_accurate.S

Packit c32a2d
/*
	synth_stereo_neon_accurate: ARM NEON optimized synth (stereo specific, MPEG compliant 16-bit output version)

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
#define WINDOW r0
Packit c32a2d
#define B0L r1
Packit c32a2d
#define B0R r2
Packit c32a2d
#define SAMPLES r3
Packit c32a2d
Packit c32a2d
/*
	int synth_1to1_s_neon_accurate_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);
	return value: number of clipped samples
*/
Packit c32a2d
Packit c32a2d
	.code 32
Packit c32a2d
#ifndef __APPLE__
Packit c32a2d
	.fpu neon
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
	.text
Packit c32a2d
	.globl ASM_NAME(synth_1to1_s_neon_accurate_asm)
Packit c32a2d
#ifdef __ELF__
Packit c32a2d
	.type ASM_NAME(synth_1to1_s_neon_accurate_asm), %function
Packit c32a2d
#endif
Packit c32a2d
ASM_NAME(synth_1to1_s_neon_accurate_asm):
/*
	Stereo synthesis window, 16-bit output, with clip counting.

	In:   r0 (WINDOW)  = window coefficients
	      r1 (B0L)     = left channel input, 16 floats per row
	      r2 (B0R)     = right channel input, 16 floats per row
	      r3 (SAMPLES) = output, interleaved signed 16-bit L/R
	      [sp] at entry = bo1 (fifth argument, passed on the stack)
	Out:  r0 = number of samples whose magnitude exceeded the 16-bit range

	Register use inside the function:
	      r4 = bo1, later the loop counter (4 passes per loop)
	      r5 = 128, byte step between the two window rows of a frame pair
	      r6 = +64 / -64, byte step between input rows (first / second loop)
	      r7 = stack pointer as it was after the register saves
	      q0-q3   = window coefficients
	      q4-q11  = input samples (q4-q7 are callee-saved, hence the vpush)
	      q12-q15 = accumulators, then scratch for rounding and clip test

	All sixteen q registers are busy in the loops, so the four per-lane clip
	counters live in a 16-byte scratch slot on the stack at [sp].

	Each loop pass emits four stereo frames (two stores of L0 R0 L1 R1);
	two loops of four passes give 32 frames in total.
*/
	push		{r4-r7, lr}
	vpush		{q4-q7}
	/* 20 bytes of core registers + 64 bytes of q4-q7 = 84 bytes up to bo1. */
	ldr			r4, [sp, #84]
	mov			r7, sp
	/* Reserve the clip counter slot and align it downwards (256-byte boundary). */
	sub			sp, sp, #16
	bic			sp, #0xff
	/*
		Start the clip counters at zero. The slot is fresh stack memory with
		undefined contents and the loops only ever load-add-store it, so
		without this the returned count would include whatever was there.
		q12 is free here: it is overwritten by the first vmul below.
	*/
	vmov.i32	q12, #0
	vst1.32		{q12}, [sp, :128]

	/* WINDOW = window + 16 floats - bo1 floats */
	add			WINDOW, WINDOW, #64
	sub			WINDOW, WINDOW, r4, lsl #2

	mov			r4, #4
	mov			r5, #128
	mov			r6, #64
	/* First loop: input rows are walked upwards (r6 = +64). */
1:
	/* Frame pair A, taps 0-7: q0,q1 x row n and q2,q3 x row n+1. */
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	vmul.f32	q12, q0, q4
	vmul.f32	q13, q0, q6
	vmul.f32	q14, q2, q8
	vmul.f32	q15, q2, q10
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Step back to the second half (taps 8-15) of the same rows. */
	sub			WINDOW, WINDOW, #96
	sub			B0L, B0L, #32
	sub			B0R, B0R, #32
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	/* Writeback of 32 bytes leaves B0L/B0R at the start of row n+2. */
	vld1.32		{q8,q9}, [B0L, :128]!
	vld1.32		{q10,q11}, [B0R, :128]!
	vmla.f32	q12, q0, q4
	vmla.f32	q13, q0, q6
	vmla.f32	q14, q2, q8
	vmla.f32	q15, q2, q10
	/* WINDOW now points at the window row for the next frame pair. */
	add			WINDOW, WINDOW, #96
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11

	/* Load taps 0-7 for frame pair B before finishing pair A. */
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	/* Horizontal sums: q12 ends up as (L0, R0, L1, R1). */
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vpadd.f32	d26, d28, d29
	vpadd.f32	d27, d30, d31
	/* q15 = 0x4b400000 = 1.5 * 2^23, added to leave the rounded integer in the low mantissa bits. */
	vmov.i32	q15, #0x4b000000
	/* q14 = ~0xb9000000 = 0x46ffffff, the largest float below 32768. */
	vmvn.i32	q14, #0xb9000000
	vorr.i32	q15, #0x00400000
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	/* Lane mask: all ones where |sample| exceeds the limit. */
	vacgt.f32	q14, q12, q14
	vadd.f32	q13, q12, q15
	vld1.32		{q15}, [sp, :128]
	/* Turn the mask into 0/1 and add it to the running clip counters. */
	vshr.u32	q14, q14, #31
	/* Drop sign, exponent and the 0x00400000 bit, then shift back with saturation to 16 bits. */
	vshl.i32	q13, q13, #10
	vadd.i32	q14, q14, q15
	vqshrn.s32	d26, q13, #10
	vst1.32		{q14}, [sp, :128]
	vst1.16		{d26}, [SAMPLES]!
	/* Frame pair B, taps 0-7. */
	vmul.f32	q12, q0, q4
	vmul.f32	q13, q0, q6
	vmul.f32	q14, q2, q8
	vmul.f32	q15, q2, q10
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Frame pair B, taps 8-15. */
	sub			WINDOW, WINDOW, #96
	sub			B0L, B0L, #32
	sub			B0R, B0R, #32
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]!
	vld1.32		{q10,q11}, [B0R, :128]!
	vmla.f32	q12, q0, q4
	vmla.f32	q13, q0, q6
	vmla.f32	q14, q2, q8
	vmla.f32	q15, q2, q10
	add			WINDOW, WINDOW, #96
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Reduce, round, count clips and store frame pair B (same steps as pair A). */
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vpadd.f32	d26, d28, d29
	vpadd.f32	d27, d30, d31
	vmov.i32	q15, #0x4b000000
	vmvn.i32	q14, #0xb9000000
	vorr.i32	q15, #0x00400000
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vacgt.f32	q14, q12, q14
	vadd.f32	q13, q12, q15
	vld1.32		{q15}, [sp, :128]
	vshr.u32	q14, q14, #31
	vshl.i32	q13, q13, #10
	vadd.i32	q14, q14, q15
	vqshrn.s32	d26, q13, #10
	vst1.32		{q14}, [sp, :128]
	vst1.16		{d26}, [SAMPLES]!

	subs		r4, r4, #1
	bne			1b

	mov			r4, #4
	mov			r6, #-64
	/* Second loop: the window keeps advancing, input rows are walked downwards (r6 = -64). */
1:
	/* Frame pair A, taps 0-7: q0,q1 x row n and q2,q3 x row n-1. */
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	vmul.f32	q12, q0, q4
	vmul.f32	q13, q0, q6
	vmul.f32	q14, q2, q8
	vmul.f32	q15, q2, q10
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Move to the second half (taps 8-15) of the same rows. */
	sub			WINDOW, WINDOW, #96
	add			B0L, B0L, #96
	add			B0R, B0R, #96
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	vmla.f32	q12, q0, q4
	vmla.f32	q13, q0, q6
	vmla.f32	q14, q2, q8
	vmla.f32	q15, q2, q10
	add			WINDOW, WINDOW, #96
	/* Leaves B0L/B0R at the start of row n-2. */
	sub			B0L, B0L, #96
	sub			B0R, B0R, #96
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11

	/* Load taps 0-7 for frame pair B before finishing pair A. */
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	/* Reduce to (L0, R0, L1, R1), round, count clips and store, as in the first loop. */
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vpadd.f32	d26, d28, d29
	vpadd.f32	d27, d30, d31
	vmov.i32	q15, #0x4b000000
	vmvn.i32	q14, #0xb9000000
	vorr.i32	q15, #0x00400000
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vacgt.f32	q14, q12, q14
	vadd.f32	q13, q12, q15
	vld1.32		{q15}, [sp, :128]
	vshr.u32	q14, q14, #31
	vshl.i32	q13, q13, #10
	vadd.i32	q14, q14, q15
	vqshrn.s32	d26, q13, #10
	vst1.32		{q14}, [sp, :128]
	vst1.16		{d26}, [SAMPLES]!
	/* Frame pair B, taps 0-7. */
	vmul.f32	q12, q0, q4
	vmul.f32	q13, q0, q6
	vmul.f32	q14, q2, q8
	vmul.f32	q15, q2, q10
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Frame pair B, taps 8-15. */
	sub			WINDOW, WINDOW, #96
	add			B0L, B0L, #96
	add			B0R, B0R, #96
	vld1.32		{q0,q1}, [WINDOW], r5
	vld1.32		{q2,q3}, [WINDOW]
	vld1.32		{q4,q5}, [B0L, :128], r6
	vld1.32		{q6,q7}, [B0R, :128], r6
	vld1.32		{q8,q9}, [B0L, :128]
	vld1.32		{q10,q11}, [B0R, :128]
	vmla.f32	q12, q0, q4
	vmla.f32	q13, q0, q6
	vmla.f32	q14, q2, q8
	vmla.f32	q15, q2, q10
	add			WINDOW, WINDOW, #96
	sub			B0L, B0L, #96
	sub			B0R, B0R, #96
	vmla.f32	q12, q1, q5
	vmla.f32	q13, q1, q7
	vmla.f32	q14, q3, q9
	vmla.f32	q15, q3, q11
	/* Reduce, round, count clips and store frame pair B. */
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vpadd.f32	d26, d28, d29
	vpadd.f32	d27, d30, d31
	vmov.i32	q15, #0x4b000000
	vmvn.i32	q14, #0xb9000000
	vorr.i32	q15, #0x00400000
	vpadd.f32	d24, d24, d25
	vpadd.f32	d25, d26, d27
	vacgt.f32	q14, q12, q14
	vadd.f32	q13, q12, q15
	vld1.32		{q15}, [sp, :128]
	vshr.u32	q14, q14, #31
	vshl.i32	q13, q13, #10
	vadd.i32	q14, q14, q15
	vqshrn.s32	d26, q13, #10
	vst1.32		{q14}, [sp, :128]
	vst1.16		{d26}, [SAMPLES]!

	subs		r4, r4, #1
	bne			1b

	/* Return value: sum of the four per-lane clip counters. */
	vld1.32		{q0}, [sp, :128]
	vpadd.i32	d0, d0, d1
	vpadd.i32	d0, d0, d0
	vmov.32		r0, d0[0]

	/* Drop the aligned scratch slot and restore the saved registers. */
	mov			sp, r7
	vpop		{q4-q7}
	pop			{r4-r7, pc}
Packit c32a2d
Packit c32a2d
NONEXEC_STACK