Blame src/libmpg123/dct36_neon.S

Packit c32a2d
/*
Packit c32a2d
	dct36_neon: ARM NEON optimized dct36
Packit c32a2d
Packit c32a2d
	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Assembler setup. ".code 32" selects 32-bit ARM (not Thumb) encoding; it is
	skipped when _M_ARM is defined (presumably a toolchain that does not
	accept the directive -- verify against the build setup).
*/
#ifndef _M_ARM
Packit c32a2d
	.code 32
Packit c32a2d
#endif
Packit c32a2d
/*
	".fpu neon" tells the assembler to accept NEON instructions; it is not
	emitted when __APPLE__ is defined.
*/
#ifndef __APPLE__
Packit c32a2d
	.fpu neon
Packit c32a2d
#endif
Packit c32a2d
	
Packit c32a2d
/* The constant table and the routine below are both placed in the text section. */
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
/*
	dct36_neon_COS9: constant table of 26 single-precision (IEEE-754) bit
	patterns used by dct36_neon. ALIGN16 is a macro defined outside this file
	(presumably 16-byte alignment, which the ":128" load hints in dct36_neon
	rely on -- verify in mangle.h).

	Words  0-15: four 4-float vectors, each value duplicated pairwise. Per the
	             register comments in dct36_neon they are COS9_[3,3,6,6],
	             COS9_[1,1,2,2], COS9_[5,5,8,8] and COS9_[7,7,4,4]
	             (approx. 0.8660/0.5, 0.9848/0.9397, 0.6428/0.1736, 0.3420/0.7660).
	Words 16-23: two 4-float vectors of per-lane scale factors, loaded into
	             q5/q6 by the third table load of dct36_neon
	             (approx. 0.5019, 0.5176, 0.5517, 0.6104 and
	             5.7369, 1.9319, 1.1832, 0.8717).
	Words 24-25: 1.0 and approx. 0.70711, loaded into d0.
*/
dct36_neon_COS9:
Packit c32a2d
	.word 0x3f5db3d7
Packit c32a2d
	.word 0x3f5db3d7
Packit c32a2d
	.word 0x3f000000
Packit c32a2d
	.word 0x3f000000
Packit c32a2d
	.word 0x3f7c1c5c
Packit c32a2d
	.word 0x3f7c1c5c
Packit c32a2d
	.word 0x3f708fb2
Packit c32a2d
	.word 0x3f708fb2
Packit c32a2d
	.word 0x3f248dbb
Packit c32a2d
	.word 0x3f248dbb
Packit c32a2d
	.word 0x3e31d0d4
Packit c32a2d
	.word 0x3e31d0d4
Packit c32a2d
	.word 0x3eaf1d44
Packit c32a2d
	.word 0x3eaf1d44
Packit c32a2d
	.word 0x3f441b7d
Packit c32a2d
	.word 0x3f441b7d
Packit c32a2d
/* Per-lane scale factors for the second stage (words 16-23). */
	.word 0x3f007d2b
Packit c32a2d
	.word 0x3f0483ee
Packit c32a2d
	.word 0x3f0d3b7d
Packit c32a2d
	.word 0x3f1c4257
Packit c32a2d
	.word 0x40b79454
Packit c32a2d
	.word 0x3ff746ea
Packit c32a2d
	.word 0x3f976fd9
Packit c32a2d
	.word 0x3f5f2944
Packit c32a2d
/* 1.0 and approx. 0.70711, the multipliers for the d1 pair (words 24-25). */
	.word 0x3f800000
Packit c32a2d
	.word 0x3f3504f3
Packit c32a2d
	
Packit c32a2d
	ALIGN4
Packit c32a2d
/*
	dct36_neon -- 36-point DCT stage, NEON implementation (leaf routine, no calls).

	Arguments as used by the code below (the C prototype is not visible in
	this file; presumably it mirrors the decoder's generic
	dct36(in, out1, out2, window, ts) helper -- TODO confirm against the caller):
	  r0          read:  18 consecutive 32-bit floats (input samples in[0..17]).
	  r1          read:  18 floats (bytes 0..71); they are the accumulators the
	                     difference*window products are added onto.
	  r2          write: 18 floats (bytes 0..71), sum*window products.
	  r3          read:  36 floats (bytes 0..143), window coefficients.
	  [sp] (5th)  pointer; 18 floats are written through it, one every
	              128 bytes (element k at byte offset k*128).

	Saves/restores r4, r5, lr and q4-q7. Modifies r0-r3, q0-q3 and q8-q15.
*/
	GLOBAL_SYMBOL ASM_NAME(dct36_neon)
Packit c32a2d
#ifdef __ELF__
Packit c32a2d
	.type ASM_NAME(dct36_neon), %function
Packit c32a2d
#endif
Packit c32a2d
ASM_NAME(dct36_neon):
Packit c32a2d
/*
	Prologue: 3 core registers (12 bytes) + q4-q7 (64 bytes) are pushed, so the
	first stack-passed argument now sits at [sp, #76]. r5 = address of the
	constant table.
*/
	push		{r4-r5, lr}
Packit c32a2d
	vpush		{q4-q7}
Packit c32a2d
	ldr			r4, [sp, #76]
Packit c32a2d
	adr			r5, dct36_neon_COS9
Packit c32a2d
	
Packit c32a2d
/*
	q14 = all ones shifted left by 32 within each 64-bit lane, i.e. the
	32-bit-lane mask [0, ~0, 0, ~0] (keeps odd lanes); q15 = 0.
	Then load in[0..15] into q0-q3 and in[16,17] into d8.
*/
	vceq.i32	q14, q14, q14
Packit c32a2d
	veor		q15, q15, q15
Packit c32a2d
	vshl.i64	q14, q14, #32
Packit c32a2d
	vld1.32		{q0, q1}, [r0]!
Packit c32a2d
	vld1.32		{q2, q3}, [r0]!
Packit c32a2d
	vld1.32		{d8}, [r0]
Packit c32a2d
	
Packit c32a2d
/*
	First pass: every element gets its predecessor added (in[i] += in[i-1]).
	The vext instructions build the input shifted up by one float, with 0
	shifted in ahead of in[0].
*/
	vext.8		q5, q15, q0, #12
Packit c32a2d
	vext.8		q6, q0, q1, #12
Packit c32a2d
	vext.8		q7, q1, q2, #12
Packit c32a2d
	vext.8		q8, q2, q3, #12
Packit c32a2d
	vext.8		d18, d7, d8, #4
Packit c32a2d
	vadd.f32	q0, q0, q5
Packit c32a2d
	vadd.f32	q1, q1, q6
Packit c32a2d
	vadd.f32	q2, q2, q7
Packit c32a2d
	vadd.f32	q3, q3, q8
Packit c32a2d
	vadd.f32	d8, d8, d18
Packit c32a2d
	
Packit c32a2d
/*
	Second pass: odd-indexed elements additionally get the element two places
	earlier added (in[i] += in[i-2]); even-indexed elements pass through.
	This is done by masking the even lanes to 0 (q14) and adding the vector
	shifted by two floats. As a side effect the data moves up by two floats,
	giving the register layout documented below.
*/
	vext.8		q6, q0, q1, #8
Packit c32a2d
	vext.8		q7, q1, q2, #8
Packit c32a2d
	vext.8		q8, q2, q3, #8
Packit c32a2d
	vext.8		q9, q3, q4, #8
Packit c32a2d
	vand		q10, q0, q14
Packit c32a2d
	vext.8		q0, q15, q0, #8
Packit c32a2d
	vand		q11, q1, q14
Packit c32a2d
	vand		q12, q2, q14
Packit c32a2d
	vand		q13, q3, q14
Packit c32a2d
	vadd.f32	q1, q10, q6
Packit c32a2d
	vadd.f32	q2, q11, q7
Packit c32a2d
	vadd.f32	q3, q12, q8
Packit c32a2d
	vadd.f32	q4, q13, q9
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q0 in[-,-,0,1]
Packit c32a2d
q1 in[2,3,4,5]
Packit c32a2d
q2 in[6,7,8,9]
Packit c32a2d
q3 in[10,11,12,13]
Packit c32a2d
q4 in[14,15,16,17]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/* Rotate the upper halves of q2, q3, q4 so each vector pairs the inputs that share a coefficient. */
	vswp		d5, d7
Packit c32a2d
	vswp		d7, d9
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q2 in[6,7,12,13]
Packit c32a2d
q3 in[10,11,16,17]
Packit c32a2d
q4 in[14,15,8,9]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/*
	Load the four COS9 vectors (64 bytes, r5 advances past them) and start
	the common term: q9 = q0 + q2 * q5.
*/
	vld1.32		{q5, q6}, [r5, :128]!
Packit c32a2d
	vld1.32		{q7, q8}, [r5, :128]!
Packit c32a2d
	vmov		q9, q0
Packit c32a2d
	vmla.f32	q9, q2, q5
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q6 COS9_[1,1,2,2]
Packit c32a2d
q7 COS9_[5,5,8,8]
Packit c32a2d
q8 COS9_[7,7,4,4]
Packit c32a2d
q5 COS9_[3,3,6,6]
Packit c32a2d
q9 [ta33,tb33,ta66,tb66]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/*
	Build the four partial-result vectors q12-q15 (layout in the comment that
	follows). The computations of the four results are interleaved; q10 is
	reused as a scratch copy of q9, and q13 starts as q0 - [q0.lo, q2.hi].
*/
	vmov		q10, q9
Packit c32a2d
	vmov		d26, d0
Packit c32a2d
	vmov		d27, d5
Packit c32a2d
	vmul.f32	q12, q1, q6
Packit c32a2d
	vsub.f32	q11, q1, q3
Packit c32a2d
	vmla.f32	q10, q3, q7
Packit c32a2d
	vsub.f32	q13, q0, q13
Packit c32a2d
	vmla.f32	q12, q4, q8
Packit c32a2d
	vsub.f32	q11, q11, q4
Packit c32a2d
	vmul.f32	q14, q1, q7
Packit c32a2d
	vmul.f32	q15, q1, q8
Packit c32a2d
	vadd.f32	q12, q12, q10
Packit c32a2d
	vmov		q10, q9
Packit c32a2d
	vmla.f32	q13, q11, q5
Packit c32a2d
	vmla.f32	q10, q3, q8
Packit c32a2d
	vmla.f32	q14, q4, q6
Packit c32a2d
	vmla.f32	q9, q4, q7
Packit c32a2d
	vmla.f32	q15, q3, q6
Packit c32a2d
	vsub.f32	q14, q14, q10
Packit c32a2d
	vsub.f32	q15, q15, q9
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q12 [1a-0,1b-0, 2a-0, 2b-0]
Packit c32a2d
q13 [1a-1,1b-1, 2a-1, 2b-1]
Packit c32a2d
q14 [1a-2,1b-2,-2a-2,-2b-2]
Packit c32a2d
q15 [1a-3,1b-3,-2a-3,-2b-3]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/*
	Interleave lanes pairwise; after the zip q15 holds only the negated "2"
	terms, so negating it restores their sign.
*/
	vzip.32		q12, q13
Packit c32a2d
	vzip.32		q14, q15
Packit c32a2d
	vneg.f32	q15, q15
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
q12 [1a-0,1a-1,1b-0,1b-1]
Packit c32a2d
q13 [2a-0,2a-1,2b-0,2b-1]
Packit c32a2d
q14 [1a-2,1a-3,1b-2,1b-3]
Packit c32a2d
q15 [2a-2,2a-3,2b-2,2b-3]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/* Exchange 64-bit halves to finish the transpose: each q register ends up holding one tmp vector. */
	vswp		d25, d28
Packit c32a2d
	vswp		d27, d30
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q12 tmp1a
Packit c32a2d
q13 tmp2a
Packit c32a2d
q14 tmp1b
Packit c32a2d
q15 tmp2b
Packit c32a2d
*/
Packit c32a2d
/*
	Combine the tmp vectors into sums/differences and scale the "b" results
	by the per-lane factors from the table (q5, q6).
	In parallel, d1 accumulates in[0,1] - in[4,5] + in[16,17] + (in[8,9] - in[12,13])
	and is multiplied by the final table pair (1.0, ~0.70711) into d0.
*/
	vsub.f32	d1, d1, d3
Packit c32a2d
	vsub.f32	d9, d9, d5
Packit c32a2d
	vld1.32		{q5, q6}, [r5, :128]!
Packit c32a2d
	vld1.32		{d0}, [r5, :64]
Packit c32a2d
	vadd.f32	q10, q14, q15
Packit c32a2d
	vsub.f32	q8, q15, q14
Packit c32a2d
	vadd.f32	d1, d1, d7
Packit c32a2d
	vadd.f32	q9, q12, q13
Packit c32a2d
	vsub.f32	q7, q13, q12
Packit c32a2d
	vadd.f32	d1, d1, d9
Packit c32a2d
	vmul.f32	q10, q10, q5
Packit c32a2d
	vmul.f32	q8, q8, q6
Packit c32a2d
	vmul.f32	d0, d1, d0
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
q9 tmp[0,1,2,3]
Packit c32a2d
q10 tmp[17,16,15,14]
Packit c32a2d
q7 tmp[8,7,6,5]
Packit c32a2d
q8 tmp[9,10,11,12]
Packit c32a2d
d0 tmp[4,13]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
/*
	Output group 1, from q9/q10 (tmp[0..3] and tmp[17..14]):
	  q11 = sum, q12 = difference of the two tmp vectors.
	  r2 floats 5..12   = sum * window (r3 floats 23..30), first four lane-reversed.
	  strided floats 5..12 (via r0 = r4 + 5*128) =
	      r1 floats 5..12 + difference * window (r3 floats 5..12), first four lane-reversed.
	vrev64.32 followed by vswp of the two halves reverses all four lanes.
*/
	add			r0, r4, #640
Packit c32a2d
	add			r5, r3, #20
Packit c32a2d
	vld1.32		{q1,q2}, [r5]
Packit c32a2d
	add			r5, r3, #92
Packit c32a2d
	vld1.32		{q3,q4}, [r5]
Packit c32a2d
	add			r5, r1, #20
Packit c32a2d
	vld1.32		{q5,q6}, [r5] 
Packit c32a2d
	vadd.f32	q11, q9, q10
Packit c32a2d
	vsub.f32	q12, q9, q10
Packit c32a2d
	vmul.f32	q10, q11, q4
Packit c32a2d
	vmla.f32	q6, q12, q2
Packit c32a2d
	vrev64.32	q11, q11
Packit c32a2d
	vrev64.32	q12, q12
Packit c32a2d
	vswp		d22, d23
Packit c32a2d
	vswp		d24, d25
Packit c32a2d
	vmul.f32	q9, q11, q3
Packit c32a2d
	vmla.f32	q5, q12, q1
Packit c32a2d
	add			r5, r2, #20
Packit c32a2d
	vst1.32		{q9,q10}, [r5]
Packit c32a2d
/* Scatter q5/q6 one float at a time; r5 = 128 is the byte stride between stores. */
	mov			r5, #128
Packit c32a2d
	vst1.32		{d10[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d10[1]}, [r0], r5
Packit c32a2d
	vst1.32		{d11[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d11[1]}, [r0], r5
Packit c32a2d
	vst1.32		{d12[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d12[1]}, [r0], r5
Packit c32a2d
	vst1.32		{d13[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d13[1]}, [r0], r5
Packit c32a2d
	
Packit c32a2d
/*
	Output group 2, from q7/q8 (tmp[8..5] and tmp[9..12]), same scheme:
	  r2 floats 0..3 and 14..17 = sum * window (r3 floats 18..21 and 32..35).
	  strided floats 0..3 (via r4, which is post-incremented to r4 + 4*128) and
	  14..17 (via r0 = r4 + 14*128) =
	      r1 floats 0..3 / 14..17 + difference * window (r3 floats 0..3 / 14..17).
*/
	add			r0, r4, #1792
Packit c32a2d
	add			r5, r3, #56
Packit c32a2d
	vld1.32		{q1}, [r3]
Packit c32a2d
	vld1.32		{q2,q3}, [r5]
Packit c32a2d
	add			r5, r3, #128
Packit c32a2d
	vld1.32		{q4}, [r5]
Packit c32a2d
	add			r5, r1, #56
Packit c32a2d
	vld1.32		{q5}, [r1]
Packit c32a2d
	vld1.32		{q6}, [r5]
Packit c32a2d
	vadd.f32	q9, q7, q8
Packit c32a2d
	vsub.f32	q10, q7, q8
Packit c32a2d
	vmul.f32	q7, q9, q3
Packit c32a2d
	vmla.f32	q5, q10, q1
Packit c32a2d
	vrev64.32	q9, q9
Packit c32a2d
	vrev64.32	q10, q10
Packit c32a2d
	vswp		d18, d19
Packit c32a2d
	vswp		d20, d21
Packit c32a2d
	vmul.f32	q8, q9, q4
Packit c32a2d
	vmla.f32	q6, q10, q2
Packit c32a2d
	add			r5, r2, #56
Packit c32a2d
	vst1.32		{q7}, [r2]
Packit c32a2d
	vst1.32		{q8}, [r5]
Packit c32a2d
	mov			r5, #128
Packit c32a2d
	vst1.32		{d10[0]}, [r4], r5
Packit c32a2d
	vst1.32		{d10[1]}, [r4], r5
Packit c32a2d
	vst1.32		{d11[0]}, [r4], r5
Packit c32a2d
	vst1.32		{d11[1]}, [r4], r5
Packit c32a2d
	vst1.32		{d12[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d12[1]}, [r0], r5
Packit c32a2d
	vst1.32		{d13[0]}, [r0], r5
Packit c32a2d
	vst1.32		{d13[1]}, [r0], r5
Packit c32a2d
	
Packit c32a2d
/*
	Output group 3, the remaining pair tmp[4] / tmp[13] held in d0.
	vtrn puts tmp[4] in lane 0 of d0 and tmp[13] in lane 0 of d1; only lane 0
	of each result below is stored (the 64-bit loads fetch one extra float
	per register that is not used in the stored lanes).
	  r2 float 4 and 13     = (tmp4 + tmp13) * r3 float 22 / 31.
	  strided float 4 and 13 = r1 float 4 / 13 + (tmp4 - tmp13) * r3 float 4 / 13.
	r4 already points at strided element 4 here; +1152 bytes reaches element 13.
*/
	vtrn.32		d0, d1
Packit c32a2d
	add			r5, r3, #16
Packit c32a2d
	vld1.32		{d2}, [r5]
Packit c32a2d
	add			r5, r3, #52
Packit c32a2d
	vld1.32		{d3}, [r5]
Packit c32a2d
	add			r5, r3, #88
Packit c32a2d
	vld1.32		{d4}, [r5]
Packit c32a2d
	add			r3, r3, #124
Packit c32a2d
	vld1.32		{d5}, [r3]
Packit c32a2d
	add			r5, r1, #16
Packit c32a2d
	vld1.32		{d6}, [r5]
Packit c32a2d
	add			r1, r1, #52
Packit c32a2d
	vld1.32		{d7}, [r1]
Packit c32a2d
	vadd.f32	d8, d0, d1
Packit c32a2d
	vsub.f32	d9, d0, d1
Packit c32a2d
	vmul.f32	d4, d8, d4
Packit c32a2d
	vmul.f32	d5, d8, d5
Packit c32a2d
	vmla.f32	d6, d9, d2
Packit c32a2d
	vmla.f32	d7, d9, d3
Packit c32a2d
	add			r2, r2, #16
Packit c32a2d
	vst1.32		{d4[0]}, [r2]
Packit c32a2d
	add			r2, r2, #36
Packit c32a2d
	vst1.32		{d5[0]}, [r2]
Packit c32a2d
	vst1.32		{d6[0]}, [r4]
Packit c32a2d
	add			r4, r4, #1152
Packit c32a2d
	vst1.32		{d7[0]}, [r4]
Packit c32a2d
	
Packit c32a2d
/* Epilogue: restore q4-q7, r4, r5 and return by popping the saved lr into pc. */
	vpop		{q4-q7}
Packit c32a2d
	pop			{r4-r5, pc}
Packit c32a2d
Packit c32a2d
/*
	NONEXEC_STACK is a macro defined outside this file (presumably in mangle.h);
	judging by its name it marks the object as not requiring an executable
	stack -- verify against its definition.
*/
NONEXEC_STACK