/*
	synth_neon_float: ARM NEON optimized synth (float output version)

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/* Register aliases for the three pointer arguments of synth_1to1_real_neon_asm. */
#define WINDOW r0
#define B0 r1
#define SAMPLES r2

/*
	int synth_1to1_real_neon_asm(real *window, real *b0, real *samples, int bo1);
	return value: number of clipped samples (0)
*/

/* .code 32 is emitted unless _M_ARM is defined. */
#ifndef _M_ARM
	.code 32
#endif
/* .fpu neon is emitted unless __APPLE__ is defined. */
#ifndef __APPLE__
	.fpu neon
#endif

/*
	synth_1to1_real_neon_asm

	In:   r0 (WINDOW)  = window
	      r1 (B0)      = b0; every load through B0 carries a :128
	                     alignment qualifier
	      r2 (SAMPLES) = samples
	      r3           = bo1
	Out:  r0 = 0
	Preserved:   r4-r5 and q4-q7 are pushed on entry and popped on exit.
	Overwritten: r1-r3, q0-q3, q8-q15 and the condition flags.

	Two loops of 4 iterations each. Every iteration computes four
	16-element dot products of 32-bit floats read through WINDOW and B0,
	scales them by the constant 0x38000000 (2^-15 as an IEEE-754 single)
	and stores them to elements 0, 2, 4 and 6 at SAMPLES; elements 1, 3,
	5 and 7 are loaded and stored back unchanged.
	Per iteration WINDOW advances 512 bytes and SAMPLES 32 bytes; B0
	advances 256 bytes in the first loop and goes back 256 bytes in the
	second loop.
*/
	.text
	GLOBAL_SYMBOL ASM_NAME(synth_1to1_real_neon_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_real_neon_asm), %function
#endif
	ALIGN4
ASM_NAME(synth_1to1_real_neon_asm):
	push		{r4-r5, lr}
	vpush		{q4-q7}

	/* WINDOW = window + 64 bytes - bo1 * 4 bytes */
	add			WINDOW, WINDOW, #64
	sub			WINDOW, WINDOW, r3, lsl #2

	/* r3 = iteration counter, r4 = WINDOW row stride (bytes),
	   r5 = B0 row stride (bytes) */
	mov			r3, #4
	mov			r4, #128
	mov			r5, #64
1:
	/* Elements 0-7 of four WINDOW rows (offsets 0, 128, 256, 384);
	   WINDOW is then rewound to offset 32 for the second half. */
	vld1.32		{q0,q1}, [WINDOW], r4
	vld1.32		{q2,q3}, [WINDOW], r4
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW]
	sub			WINDOW, WINDOW, #352
	/* Elements 0-7 of four B0 rows (offsets 0, 64, 128, 192). */
	vld1.32		{q8,q9}, [B0, :128], r5
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128]
	/* After the swaps q0/q2/q1/q3 hold elements 0-3 of rows 0/1/2/3
	   and q4/q6/q5/q7 hold elements 4-7 of the same rows. */
	vswp		q1, q4
	vswp		q3, q6
	/* B0 = iteration start + 32 bytes */
	sub			B0, B0, #160
	/* Accumulators: q0 = row 0, q2 = row 1, q1 = row 2, q3 = row 3. */
	vmul.f32	q0, q0, q8
	vmul.f32	q2, q2, q10
	vmul.f32	q1, q1, q12
	vmul.f32	q3, q3, q14
	vmla.f32	q0, q4, q9
	vmla.f32	q2, q6, q11
	vmla.f32	q1, q5, q13
	vmla.f32	q3, q7, q15
	/* Elements 8-15 of rows 0-2 from both buffers; the row 3 halves
	   are loaded further down into q4,q5 and q10,q11. */
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW], r4
	vld1.32		{q8,q9}, [WINDOW], r4
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128], r5
	vswp		q5, q6
	vswp		q11, q12
	vmla.f32	q0, q4, q10
	vmla.f32	q2, q5, q11
	vmla.f32	q1, q8, q14
	vld1.32		{q4,q5}, [WINDOW]
	/* The writeback leaves B0 at iteration start + 256 bytes. */
	vld1.32		{q10,q11}, [B0, :128]!
	/* WINDOW = iteration start + 512 bytes */
	add			WINDOW, WINDOW, #96
	vmla.f32	q3, q4, q10
	vmla.f32	q0, q6, q12
	vmla.f32	q2, q7, q13
	vmla.f32	q1, q9, q15
	vmla.f32	q3, q5, q11
	/* De-interleave 8 floats at SAMPLES: even elements to q4, odd to q5. */
	vld2.32		{q4,q5}, [SAMPLES]
	/* Horizontal sums: q0 = { row 0, row 1, row 2, row 3 }. */
	vpadd.f32	d0, d0, d1
	vpadd.f32	d4, d4, d5
	vpadd.f32	d2, d2, d3
	vpadd.f32	d6, d6, d7
	vpadd.f32	d0, d0, d4
	vpadd.f32	d1, d2, d6

	/* Scale the four sums, replace the even elements, store all 8
	   floats back and advance SAMPLES by 32 bytes. */
	vmov.i32	q1, #0x38000000
	vmul.f32	q4, q0, q1
	vst2.32		{q4,q5}, [SAMPLES]!

	subs		r3, r3, #1
	bne			1b

	/* Second loop: same computation, but B0 rows are visited with a
	   stride of -64 bytes. */
	mov			r3, #4
	mov			r5, #-64
1:
	/* Elements 0-7 of four WINDOW rows (offsets 0, 128, 256, 384). */
	vld1.32		{q0,q1}, [WINDOW], r4
	vld1.32		{q2,q3}, [WINDOW], r4
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW]
	sub			WINDOW, WINDOW, #352
	/* Elements 0-7 of four B0 rows (offsets 0, -64, -128, -192). */
	vld1.32		{q8,q9}, [B0, :128], r5
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128]
	vswp		q1, q4
	vswp		q3, q6
	/* B0 = iteration start + 32 bytes */
	add			B0, B0, #224
	/* Accumulators: q0 = row 0, q2 = row 1, q1 = row 2, q3 = row 3. */
	vmul.f32	q0, q0, q8
	vmul.f32	q2, q2, q10
	vmul.f32	q1, q1, q12
	vmul.f32	q3, q3, q14
	vmla.f32	q0, q4, q9
	vmla.f32	q2, q6, q11
	vmla.f32	q1, q5, q13
	vmla.f32	q3, q7, q15
	/* Elements 8-15 of rows 0-2 from both buffers. */
	vld1.32		{q4,q5}, [WINDOW], r4
	vld1.32		{q6,q7}, [WINDOW], r4
	vld1.32		{q8,q9}, [WINDOW], r4
	vld1.32		{q10,q11}, [B0, :128], r5
	vld1.32		{q12,q13}, [B0, :128], r5
	vld1.32		{q14,q15}, [B0, :128], r5
	vswp		q5, q6
	vswp		q11, q12
	vmla.f32	q0, q4, q10
	vmla.f32	q2, q5, q11
	vmla.f32	q1, q8, q14
	/* Elements 8-15 of row 3 from both buffers (no writeback here). */
	vld1.32		{q4,q5}, [WINDOW]
	vld1.32		{q10,q11}, [B0, :128]
	/* WINDOW = iteration start + 512 bytes,
	   B0 = iteration start - 256 bytes */
	add			WINDOW, WINDOW, #96
	sub			B0, B0, #96
	vmla.f32	q3, q4, q10
	vmla.f32	q0, q6, q12
	vmla.f32	q2, q7, q13
	vmla.f32	q1, q9, q15
	vmla.f32	q3, q5, q11
	/* De-interleave 8 floats at SAMPLES: even elements to q4, odd to q5. */
	vld2.32		{q4,q5}, [SAMPLES]
	/* Horizontal sums: q0 = { row 0, row 1, row 2, row 3 }. */
	vpadd.f32	d0, d0, d1
	vpadd.f32	d4, d4, d5
	vpadd.f32	d2, d2, d3
	vpadd.f32	d6, d6, d7
	vpadd.f32	d0, d0, d4
	vpadd.f32	d1, d2, d6

	/* Scale, replace the even elements, store and advance SAMPLES. */
	vmov.i32	q1, #0x38000000
	vmul.f32	q4, q0, q1
	vst2.32		{q4,q5}, [SAMPLES]!

	subs		r3, r3, #1
	bne			1b

	/* Return value is always 0. */
	mov			r0, #0

	vpop		{q4-q7}
	pop			{r4-r5, pc}

NONEXEC_STACK