/*
	synth_stereo_neon: ARM NEON optimized synth (stereo specific version)

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/* The four pointer arguments stay in their incoming argument registers. */
#define WINDOW r0
#define B0L r1
#define B0R r2
#define SAMPLES r3

/*
	int synth_1to1_s_neon_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);

	Produces 32 interleaved stereo frames (64 shorts, 128 bytes) at samples.
	Each output value is a 16-term dot product of 16-bit window coefficients
	with 16-bit values from b0l (first value of a frame) or b0r (second value
	of a frame), accumulated in 32 bit, then rounded, shifted right by 13 and
	saturated to 16 bit.

	in:
		r0      window   coefficient table; reading starts at window + 16 - bo1
		                 (in shorts) and every 16-short read advances 32 shorts
		r1      b0l      input for the first value of each frame
		r2      b0r      input for the second value of each frame
		r3      samples  output, written sequentially
		[sp]    bo1      fifth argument, passed on the stack

	memory requirements that follow from the loads below:
		b0l, b0r  16-byte aligned (loads carry the :128 alignment qualifier),
		          272 shorts (544 bytes) readable each
		window    no alignment qualifier is used on these loads
		samples   no alignment qualifier is used on these stores

	return value: always 0. Out-of-range results are saturated by vqrshrn,
	but the number of clipped samples is not counted.

	clobbers: r0-r3, q0-q3, q8-q15, flags.
	preserved: r4-r6 and q4-q7 (saved and restored here).
*/

#ifndef _M_ARM
	.code 32
#endif
#ifndef __APPLE__
	.fpu neon
#endif

	.text
	GLOBAL_SYMBOL ASM_NAME(synth_1to1_s_neon_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_s_neon_asm), %function
#endif
	ALIGN4
ASM_NAME(synth_1to1_s_neon_asm):
	/* r4-r6 are scratch below; q4-q7 (d8-d15) hold b0l/b0r data below. */
	push		{r4-r6, lr}
	vpush		{q4-q7}

	/* bo1 is the first stack argument: 16 bytes of core registers plus
	   64 bytes of q4-q7 were pushed above it, hence offset 80. */
	ldr			r4, [sp, #80]
	/* WINDOW = window + 16 - bo1, counted in shorts (32 - 2*bo1 bytes). */
	add			WINDOW, WINDOW, #32
	sub			WINDOW, WINDOW, r4, lsl #1

	/* First half: 4 iterations, 4 frames per iteration. */
	mov			r4, #4
	/* Window stride in bytes: each load reads 32 bytes but steps 64. */
	mov			r5, #64
1:
	/* Frames A and B: d0-d3 / d4-d7 are the window rows, d8-d11 / d16-d19
	   the b0l rows, d12-d15 / d20-d23 the b0r rows. B0L and B0R advance
	   by the 32 bytes just read. */
	vld1.16		{d0-d3}, [WINDOW], r5
	vld1.16		{d4-d7}, [WINDOW], r5
	vld1.16		{d8-d11}, [B0L, :128]!
	vld1.16		{d12-d15}, [B0R, :128]!
	vld1.16		{d16-d19}, [B0L, :128]!
	vld1.16		{d20-d23}, [B0R, :128]!

	/* Four independent 16-term multiply-accumulate chains, interleaved:
	   q12 = A/b0l, q13 = A/b0r, q14 = B/b0l, q15 = B/b0r
	   (four 32-bit partial sums in each). */
	vmull.s16	q12, d0, d8
	vmull.s16	q13, d0, d12
	vmull.s16	q14, d4, d16
	vmull.s16	q15, d4, d20
	vmlal.s16	q12, d1, d9
	vmlal.s16	q13, d1, d13
	vmlal.s16	q14, d5, d17
	vmlal.s16	q15, d5, d21
	vmlal.s16	q12, d2, d10
	vmlal.s16	q13, d2, d14
	vmlal.s16	q14, d6, d18
	vmlal.s16	q15, d6, d22
	vmlal.s16	q12, d3, d11
	vmlal.s16	q13, d3, d15
	vmlal.s16	q14, d7, d19
	vmlal.s16	q15, d7, d23
	/* Horizontal reduction: 4 partial sums -> 2 per accumulator ... */
	vpadd.i32	d24, d24, d25
	vpadd.i32	d26, d26, d27
	vpadd.i32	d28, d28, d29
	vpadd.i32	d30, d30, d31
	/* ... -> 1 per accumulator, packed as
	   q12 = { A/b0l, A/b0r, B/b0l, B/b0r }. */
	vpadd.i32	d24, d24, d26
	vpadd.i32	d25, d28, d30
	/* Round, shift right by 13, saturate to 16 bit; store two frames. */
	vqrshrn.s32	d0, q12, #13
	vst1.16		{d0}, [SAMPLES]!

	/* Next two frames, same scheme. */
	vld1.16		{d0-d3}, [WINDOW], r5
	vld1.16		{d4-d7}, [WINDOW], r5
	vld1.16		{d8-d11}, [B0L, :128]!
	vld1.16		{d12-d15}, [B0R, :128]!
	vld1.16		{d16-d19}, [B0L, :128]!
	vld1.16		{d20-d23}, [B0R, :128]!

	vmull.s16	q12, d0, d8
	vmull.s16	q13, d0, d12
	vmull.s16	q14, d4, d16
	vmull.s16	q15, d4, d20
	vmlal.s16	q12, d1, d9
	vmlal.s16	q13, d1, d13
	vmlal.s16	q14, d5, d17
	vmlal.s16	q15, d5, d21
	vmlal.s16	q12, d2, d10
	vmlal.s16	q13, d2, d14
	vmlal.s16	q14, d6, d18
	vmlal.s16	q15, d6, d22
	vmlal.s16	q12, d3, d11
	vmlal.s16	q13, d3, d15
	vmlal.s16	q14, d7, d19
	vmlal.s16	q15, d7, d23
	vpadd.i32	d24, d24, d25
	vpadd.i32	d26, d26, d27
	vpadd.i32	d28, d28, d29
	vpadd.i32	d30, d30, d31
	vpadd.i32	d24, d24, d26
	vpadd.i32	d25, d28, d30
	vqrshrn.s32	d0, q12, #13
	vst1.16		{d0}, [SAMPLES]!

	subs		r4, r4, #1
	bne			1b

	/* Second half: 4 more iterations, 4 frames each. WINDOW keeps stepping
	   forward by r5; B0L and B0R now step backwards by 32 bytes per load.
	   On entry they point 512 bytes past their initial values, so the first
	   loads here read the row directly after the last one used above. */
	mov			r4, #4
	mov			r6, #-32
1:
	vld1.16		{d0-d3}, [WINDOW], r5
	vld1.16		{d4-d7}, [WINDOW], r5
	vld1.16		{d8-d11}, [B0L, :128], r6
	vld1.16		{d12-d15}, [B0R, :128], r6
	vld1.16		{d16-d19}, [B0L, :128], r6
	vld1.16		{d20-d23}, [B0R, :128], r6

	/* Same multiply-accumulate, reduction and store as in the first half. */
	vmull.s16	q12, d0, d8
	vmull.s16	q13, d0, d12
	vmull.s16	q14, d4, d16
	vmull.s16	q15, d4, d20
	vmlal.s16	q12, d1, d9
	vmlal.s16	q13, d1, d13
	vmlal.s16	q14, d5, d17
	vmlal.s16	q15, d5, d21
	vmlal.s16	q12, d2, d10
	vmlal.s16	q13, d2, d14
	vmlal.s16	q14, d6, d18
	vmlal.s16	q15, d6, d22
	vmlal.s16	q12, d3, d11
	vmlal.s16	q13, d3, d15
	vmlal.s16	q14, d7, d19
	vmlal.s16	q15, d7, d23
	vpadd.i32	d24, d24, d25
	vpadd.i32	d26, d26, d27
	vpadd.i32	d28, d28, d29
	vpadd.i32	d30, d30, d31
	vpadd.i32	d24, d24, d26
	vpadd.i32	d25, d28, d30
	vqrshrn.s32	d0, q12, #13
	vst1.16		{d0}, [SAMPLES]!

	/* Next two frames, same scheme. */
	vld1.16		{d0-d3}, [WINDOW], r5
	vld1.16		{d4-d7}, [WINDOW], r5
	vld1.16		{d8-d11}, [B0L, :128], r6
	vld1.16		{d12-d15}, [B0R, :128], r6
	vld1.16		{d16-d19}, [B0L, :128], r6
	vld1.16		{d20-d23}, [B0R, :128], r6

	vmull.s16	q12, d0, d8
	vmull.s16	q13, d0, d12
	vmull.s16	q14, d4, d16
	vmull.s16	q15, d4, d20
	vmlal.s16	q12, d1, d9
	vmlal.s16	q13, d1, d13
	vmlal.s16	q14, d5, d17
	vmlal.s16	q15, d5, d21
	vmlal.s16	q12, d2, d10
	vmlal.s16	q13, d2, d14
	vmlal.s16	q14, d6, d18
	vmlal.s16	q15, d6, d22
	vmlal.s16	q12, d3, d11
	vmlal.s16	q13, d3, d15
	vmlal.s16	q14, d7, d19
	vmlal.s16	q15, d7, d23
	vpadd.i32	d24, d24, d25
	vpadd.i32	d26, d26, d27
	vpadd.i32	d28, d28, d29
	vpadd.i32	d30, d30, d31
	vpadd.i32	d24, d24, d26
	vpadd.i32	d25, d28, d30
	vqrshrn.s32	d0, q12, #13
	vst1.16		{d0}, [SAMPLES]!

	subs		r4, r4, #1
	bne			1b

	/* The clip count is not tracked in this routine: always report 0. */
	mov			r0, #0

	vpop		{q4-q7}
	pop			{r4-r6, pc}

NONEXEC_STACK