/*
	synth_neon: ARM NEON optimized synth

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases for the first three arguments of synth_1to1_neon_asm,
	in the order of the prototype below (r0, r1, r2 under the ARM procedure
	call standard).  The fourth argument, bo1, is used directly as r3.
*/
#define WINDOW r0
#define B0 r1
#define SAMPLES r2

/*
	int synth_1to1_neon_asm(short *window, short *b0, short *samples, int bo1);
	return value: number of clipped samples (this implementation always
	returns 0: the output is saturated, clipped samples are not counted)
*/

/* Select the 32-bit ARM instruction set unless _M_ARM is defined. */
#ifndef _M_ARM
	.code 32
#endif
/* Tell the assembler to accept NEON instructions; skipped on Apple toolchains. */
#ifndef __APPLE__
	.fpu neon
#endif

	.text
	GLOBAL_SYMBOL ASM_NAME(synth_1to1_neon_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_neon_asm), %function
#endif
	ALIGN4
/*
	synth_1to1_neon_asm

	C prototype:
		int synth_1to1_neon_asm(short *window, short *b0, short *samples, int bo1);

	On entry (ARM procedure call standard):
		r0 (WINDOW)  = window
		r1 (B0)      = b0; every load from it carries a :128 qualifier, so
		               it has to be 16-byte aligned
		r2 (SAMPLES) = samples
		r3           = bo1
	Returns r0 = 0 unconditionally: the final narrowing shift saturates and
	clipped samples are not counted.

	Saves and restores r4, r5, lr and q4-q7 (d8-d15).  All of d0-d31 are used
	as scratch.

	Work done: two loops of 4 iterations, 4 output samples per iteration,
	32 samples in total.  Each output sample is the dot product of 16 shorts
	read from WINDOW with 16 shorts read from B0, accumulated in 32 bits,
	then shifted right by 13 with rounding and saturated to 16 bits.
	Only every second halfword of SAMPLES is overwritten; the halfwords in
	between are loaded and stored back unchanged (presumably the other
	channel of an interleaved stereo buffer - confirm with the caller).
*/
ASM_NAME(synth_1to1_neon_asm):
	push		{r4-r5, lr}
	/* d8-d15 are overwritten below and are callee-saved, so keep a copy. */
	vpush		{q4-q7}

	/* WINDOW = &window[16 - bo1]: +32 bytes, then minus bo1 shorts (2 bytes each). */
	add			WINDOW, WINDOW, #32
	sub			WINDOW, WINDOW, r3, lsl #1

	/* r3 is reused as the loop counter (4 iterations). */
	mov			r3, #4
	/* r4 = byte stride between WINDOW rows: 64 bytes, of which 32 (16 shorts) are read. */
	mov			r4, #64
1:
	/*
		Load four 16-short rows from WINDOW (d0-d3, d8-d11, d16-d19, d24-d27)
		and four from B0 (d4-d7, d12-d15, d20-d23, d28-d31); B0 advances by
		the 32 bytes just read.  Each vswp moves the first B0 quarter-row
		next to the first WINDOW quarter-row, so that the vmull of that group
		overwrites exactly the two D registers it consumes.
	*/
	vld1.16		{d0-d3}, [WINDOW], r4
	vld1.16		{d4-d7}, [B0, :128]!
	vld1.16		{d8-d11}, [WINDOW], r4
	vswp		d1, d4
	vld1.16		{d12-d15}, [B0, :128]!
	vld1.16		{d16-d19}, [WINDOW], r4
	vld1.16		{d20-d23}, [B0, :128]!
	vswp		d9, d12
	vld1.16		{d24-d27}, [WINDOW], r4
	vld1.16		{d28-d31}, [B0, :128]!
	vswp		d17, d20
	vswp		d25, d28
	/*
		16x16 -> 32 bit multiply-accumulate, four lanes per instruction.
		q0, q4, q8 and q12 each end up holding four partial sums of one
		output sample.
	*/
	vmull.s16	q0, d0, d1
	vmull.s16	q4, d8, d9
	vmull.s16	q8, d16, d17
	vmull.s16	q12, d24, d25
	vmlal.s16	q0, d4, d5
	vmlal.s16	q4, d12, d13
	vmlal.s16	q8, d20, d21
	vmlal.s16	q12, d28, d29
	vmlal.s16	q0, d2, d6
	vmlal.s16	q4, d10, d14
	vmlal.s16	q8, d18, d22
	vmlal.s16	q12, d26, d30
	vmlal.s16	q0, d3, d7
	vmlal.s16	q4, d11, d15
	vmlal.s16	q8, d19, d23
	vmlal.s16	q12, d27, d31
	/* Pairwise adds reduce the partial sums; afterwards q0 = { sum0, sum1, sum2, sum3 }. */
	vpadd.i32	d0, d0, d1
	vpadd.i32	d8, d8, d9
	vpadd.i32	d16, d16, d17
	vpadd.i32	d24, d24, d25
	vpadd.i32	d0, d0, d8
	vpadd.i32	d1, d16, d24

	/*
		De-interleave 8 halfwords of SAMPLES (even positions -> d2, odd -> d3),
		narrow the four sums into d1 (rounding shift right by 13, saturated to
		16 bits), then store d1 and the untouched d3 back interleaved and
		advance SAMPLES by the 16 bytes written.
	*/
	vld2.16		{d2,d3}, [SAMPLES]
	vqrshrn.s32	d1, q0, #13
	vst2.16		{d1,d3}, [SAMPLES]!

	subs		r3, r3, #1
	bne			1b

	/*
		Second half: the same computation for 4 more iterations, except that
		B0 now steps backwards by one 32-byte row per load (r5 = -32).
		B0 is not reset, so the first row read here is the one immediately
		after the last row read by the loop above.  WINDOW keeps advancing
		by r4.
	*/
	mov			r3, #4
	mov			r5, #-32
1:
	vld1.16		{d0-d3}, [WINDOW], r4
	vld1.16		{d4-d7}, [B0, :128], r5
	vld1.16		{d8-d11}, [WINDOW], r4
	vswp		d1, d4
	vld1.16		{d12-d15}, [B0, :128], r5
	vld1.16		{d16-d19}, [WINDOW], r4
	vld1.16		{d20-d23}, [B0, :128], r5
	vswp		d9, d12
	vld1.16		{d24-d27}, [WINDOW], r4
	vld1.16		{d28-d31}, [B0, :128], r5
	vswp		d17, d20
	vswp		d25, d28
	vmull.s16	q0, d0, d1
	vmull.s16	q4, d8, d9
	vmull.s16	q8, d16, d17
	vmull.s16	q12, d24, d25
	vmlal.s16	q0, d4, d5
	vmlal.s16	q4, d12, d13
	vmlal.s16	q8, d20, d21
	vmlal.s16	q12, d28, d29
	vmlal.s16	q0, d2, d6
	vmlal.s16	q4, d10, d14
	vmlal.s16	q8, d18, d22
	vmlal.s16	q12, d26, d30
	vmlal.s16	q0, d3, d7
	vmlal.s16	q4, d11, d15
	vmlal.s16	q8, d19, d23
	vmlal.s16	q12, d27, d31
	vpadd.i32	d0, d0, d1
	vpadd.i32	d8, d8, d9
	vpadd.i32	d16, d16, d17
	vpadd.i32	d24, d24, d25
	vpadd.i32	d0, d0, d8
	vpadd.i32	d1, d16, d24

	vld2.16		{d2,d3}, [SAMPLES]
	vqrshrn.s32	d1, q0, #13
	vst2.16		{d1,d3}, [SAMPLES]!

	subs		r3, r3, #1
	bne			1b

	/* Return value: always 0. */
	mov			r0, #0

	vpop		{q4-q7}
	pop			{r4-r5, pc}

NONEXEC_STACK