/*
	dct64_neon_float: ARM NEON optimized dct64 (float output version)

	copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Assembler setup. The directives are skipped when the named target macro
	is defined (NOTE(review): presumably because those toolchains do not
	accept or need them - confirm).
*/
#ifndef _M_ARM
	/* Assemble as 32-bit ARM (not Thumb) instructions. */
	.code 32
#endif
#ifndef __APPLE__
	/* Allow NEON instructions in this file. */
	.fpu neon
#endif

	.text
/*
	costab_arm: table of 32 words read by dct64_real_neon.

	The words are loaded with vld1.32 and used directly as vmul.f32
	operands, so each .word is the raw bit pattern of a single-precision
	float (for example 1060439283 = 0x3F3504F3, about 0.70710678).
	NOTE(review): the values look like the usual 1/(2*cos(...)) DCT
	coefficients - confirm against the C cosine tables.

	Layout as consumed by dct64_real_neon:
	  words  0-15  multipliers of the first butterfly stage
	  words 16-23  second stage (applied to both halves)
	  words 24-27  third stage
	  words 28-29  fourth stage
	  word  30     fifth stage (broadcast to all lanes)
	  word  31     loaded but never used

	The table lives in .text so that the function can address it with a
	PC-relative adr, and it is 16-byte aligned because it is loaded with
	a :128 alignment qualifier.
*/
	ALIGN16
costab_arm:
	.word 1056974725
	.word 1057056395
	.word 1057223771
	.word 1057485416
	.word 1057855544
	.word 1058356026
	.word 1059019886
	.word 1059897405
	.word 1061067246
	.word 1062657950
	.word 1064892987
	.word 1066774581
	.word 1069414683
	.word 1073984175
	.word 1079645762
	.word 1092815430
	.word 1057005197
	.word 1057342072
	.word 1058087743
	.word 1059427869
	.word 1061799040
	.word 1065862217
	.word 1071413542
	.word 1084439708
	.word 1057128951
	.word 1058664893
	.word 1063675095
	.word 1076102863
	.word 1057655764
	.word 1067924853
	.word 1060439283
	.word 1060439283
/*
	dct64_real_neon

	NOTE(review): presumably called from C as
	    void dct64_real_neon(real *out0, real *out1, real *samples);
	with real = float - confirm against the declaration used by callers.

	In:
	  r0  base of the first output area; 17 single floats are stored,
	      64 bytes apart (must be 4-byte aligned, stores use :32)
	  r1  base of the second output area; 16 single floats are stored,
	      64 bytes apart (must be 4-byte aligned, stores use :32)
	  r2  32 consecutive 32-bit input values (loaded without an
	      alignment qualifier)
	Out:
	  nothing in registers; results are written through r0 and r1
	Clobbers:
	  r0-r3 (r0, r1, r2 are post-incremented, r3 is scratch),
	  q0-q3 and q8-q15
	Preserves:
	  q4-q7 (d8-d15, callee-saved under the ARM procedure call
	  standard) via vpush/vpop, using 64 bytes of stack

	The routine is straight-line code: five butterfly stages over the
	32 inputs held in q0-q7, followed by a chain of additions and a
	scatter store. Every butterfly stage has the same shape:
	differences go to q8-q11, sums replace one operand in place, and the
	differences are multiplied by entries of costab_arm.
*/
	ALIGN4
	GLOBAL_SYMBOL ASM_NAME(dct64_real_neon)
#ifdef __ELF__
	.type ASM_NAME(dct64_real_neon), %function
#endif
ASM_NAME(dct64_real_neon):
	vpush		{q4-q7}

	/* q0-q7 = the 32 inputs, q12-q15 = table words 0-15. */
	adr			r3, costab_arm
	vld1.32		{q0, q1}, [r2]!
	vld1.32		{q2, q3}, [r2]!
	vld1.32		{q4, q5}, [r2]!
	vld1.32		{q6, q7}, [r2]
	vld1.32		{q12, q13}, [r3, :128]!
	vld1.32		{q14, q15}, [r3, :128]!

	/*
		Stage 1. vrev64 swaps the two lanes inside each d register and
		vswp exchanges the d halves, so each of q4-q7 ends up with its four
		lanes in reverse order. Pairing q0 with q7, q1 with q6, ... then
		combines input i with input 31-i.
	*/
	vrev64.32	q4, q4
	vrev64.32	q5, q5
	vrev64.32	q6, q6
	vrev64.32	q7, q7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	/* Sums stay in q0-q3; scaled differences (table words 0-15) go to q4-q7. */
	vsub.f32	q8, q0, q7
	vsub.f32	q9, q1, q6
	vsub.f32	q10, q2, q5
	vsub.f32	q11, q3, q4
	vadd.f32	q0, q0, q7
	vadd.f32	q1, q1, q6
	vadd.f32	q2, q2, q5
	vadd.f32	q3, q3, q4
	vmul.f32	q4, q8, q12
	vmul.f32	q5, q9, q13
	vmul.f32	q6, q10, q14
	vmul.f32	q7, q11, q15

	/* q12-q15 = table words 16-31, used by all remaining stages. */
	vld1.32		{q12, q13}, [r3, :128]!
	vld1.32		{q14, q15}, [r3, :128]

	/*
		Stage 2. Same pattern on each group of 16 values (q0-q3 and q4-q7):
		lane-reverse the upper two registers of each group, then butterfly.
	*/
	vrev64.32	q2, q2
	vrev64.32	q3, q3
	vrev64.32	q6, q6
	vrev64.32	q7, q7
	vswp		d4, d5
	vswp		d6, d7
	vswp		d12, d13
	vswp		d14, d15

	/* Both groups are scaled by the same table words 16-23 (q12, q13). */
	vsub.f32	q8, q0, q3
	vsub.f32	q9, q1, q2
	vsub.f32	q10, q4, q7
	vsub.f32	q11, q5, q6
	vadd.f32	q0, q0, q3
	vadd.f32	q1, q1, q2
	vadd.f32	q4, q4, q7
	vadd.f32	q5, q5, q6
	vmul.f32	q2, q8, q12
	vmul.f32	q3, q9, q13
	vmul.f32	q6, q10, q12
	vmul.f32	q7, q11, q13

	/* Stage 3. Groups of 8 values: lane-reverse every odd register. */
	vrev64.32	q1, q1
	vrev64.32	q3, q3
	vrev64.32	q5, q5
	vrev64.32	q7, q7
	vswp		d2, d3
	vswp		d6, d7
	vswp		d10, d11
	vswp		d14, d15

	/* All four groups are scaled by table words 24-27 (q14). */
	vsub.f32	q8, q0, q1
	vsub.f32	q9, q2, q3
	vsub.f32	q10, q4, q5
	vsub.f32	q11, q6, q7
	vadd.f32	q0, q0, q1
	vadd.f32	q2, q2, q3
	vadd.f32	q4, q4, q5
	vadd.f32	q6, q6, q7
	vmul.f32	q1, q8, q14
	vmul.f32	q3, q9, q14
	vmul.f32	q5, q10, q14
	vmul.f32	q7, q11, q14

	/*
		Prepare the last two multiplier vectors from q15 (table words 28-31):
		q12 = word 30 in all four lanes, q15 = { w28, w29, w28, w29 }.
		The vdup must read d31 before the vmov overwrites it.
	*/
	vdup.32		q12, d31[0]
	vmov		d31, d30

	/*
		Stage 4. Groups of 4 values (one q register each). vswp moves the
		upper half of each even register into the odd partner and vice
		versa; vrev64 then reverses the lanes inside each d of the odd
		register, so that lane k of the even register faces its mirror
		element of the same 4-value group.
	*/
	vswp		d1, d2
	vswp		d5, d6
	vswp		d9, d10
	vswp		d13, d14
	vrev64.32	q1, q1
	vrev64.32	q3, q3
	vrev64.32	q5, q5
	vrev64.32	q7, q7

	/* Differences are scaled by q15 = { w28, w29, w28, w29 }. */
	vsub.f32	q8, q0, q1
	vsub.f32	q9, q2, q3
	vsub.f32	q10, q4, q5
	vsub.f32	q11, q6, q7
	vadd.f32	q0, q0, q1
	vadd.f32	q2, q2, q3
	vadd.f32	q4, q4, q5
	vadd.f32	q6, q6, q7
	vmul.f32	q1, q8, q15
	vmul.f32	q3, q9, q15
	vmul.f32	q5, q10, q15
	vmul.f32	q7, q11, q15

	/*
		Stage 5. vtrn.32 interleaves each register pair so that the two
		members of every adjacent value pair sit in the same lane of the
		even and the odd register.
	*/
	vtrn.32		q0, q1
	vtrn.32		q2, q3
	vtrn.32		q4, q5
	vtrn.32		q6, q7

	/* Differences are scaled by q12 (table word 30 in every lane). */
	vsub.f32	q8, q0, q1
	vsub.f32	q9, q2, q3
	vsub.f32	q10, q4, q5
	vsub.f32	q11, q6, q7
	vadd.f32	q0, q0, q1
	vadd.f32	q2, q2, q3
	vadd.f32	q4, q4, q5
	vadd.f32	q6, q6, q7
	vmul.f32	q1, q8, q12
	vmul.f32	q3, q9, q12
	vmul.f32	q5, q10, q12
	vmul.f32	q7, q11, q12

	/* Undo the interleave and regroup the d halves once more. */
	vtrn.32		q0, q1
	vtrn.32		q2, q3
	vtrn.32		q4, q5
	vtrn.32		q6, q7
	vswp		d1, d2
	vswp		d5, d6
	vswp		d9, d10
	vswp		d13, d14

	/*
		Output additions. From here on registers are updated in place and
		later instructions read the updated values, so the statement order
		is significant.

		vshr.u64 #32 copies lane 1 of a d register into lane 0 and zero
		fills lane 1; adding that to the source therefore adds lane 1 into
		lane 0 (lane 1 only gets +0.0 added).
	*/
	vshr.u64	d16, d1, #32
	vshr.u64	d17, d3, #32
	vshr.u64	d18, d5, #32
	vshr.u64	d19, d7, #32
	vadd.f32	d1, d1, d16
	vadd.f32	d3, d3, d17
	vadd.f32	d5, d5, d18
	vadd.f32	d7, d7, d19
	vshr.u64	d20, d9, #32
	vshr.u64	d21, d11, #32
	vshr.u64	d22, d13, #32
	vshr.u64	d23, d15, #32
	vadd.f32	d9, d9, d20
	vadd.f32	d11, d11, d21
	vadd.f32	d13, d13, d22
	vadd.f32	d15, d15, d23

	/*
		For each odd register build { upper d half, shifted lower d half }
		with vext (byte offset 8 = one d register) and add it in.
	*/
	vshr.u64	d16, d2, #32
	vshr.u64	d18, d6, #32
	vshr.u64	d20, d10, #32
	vshr.u64	d22, d14, #32
	vext.8		q8, q1, q8, #8
	vext.8		q9, q3, q9, #8
	vext.8		q10, q5, q10, #8
	vext.8		q11, q7, q11, #8
	vadd.f32	q1, q1, q8
	vadd.f32	q3, q3, q9
	vadd.f32	q5, q5, q10
	vadd.f32	q7, q7, q11

	/*
		Same construction from q2 and q6. q2 and q6 take the sums with the
		not yet updated q3 and q7 before q3 and q7 receive their own term.
	*/
	vshr.u64	d16, d4, #32
	vshr.u64	d18, d12, #32
	vext.8		q8, q2, q8, #8
	vext.8		q9, q6, q9, #8
	vadd.f32	q2, q2, q3
	vadd.f32	q6, q6, q7
	vadd.f32	q3, q3, q8
	vadd.f32	q7, q7, q9

	/*
		Build the addends q8 and q9 from q4 and q5, and bring q4-q7 into
		the lane arrangement expected by the final four additions.
	*/
	vrev64.32	q8, q4
	vshr.u64	d19, d9, #32
	vext.8		d17, d17, d16, #4
	vswp		d9, d10
	vswp		d13, d14
	vtrn.32		q4, q5
	vtrn.32		q6, q7
	vmov		d16, d9
	vmov		d18, d11

	/* Two-operand form: the first register is both source and destination. */
	vadd.f32	q4, q6
	vadd.f32	q5, q7
	vadd.f32	q6, q8
	vadd.f32	q7, q9

	/*
		Scatter the results. r3 is reused as the post-increment stride:
		64 bytes = 16 floats between consecutive stores.
		First output area: 17 values through r0.
	*/
	mov			r3, #64
	vst1.32		{d0[1]}, [r0, :32], r3
	vst1.32		{d13[1]}, [r0, :32], r3
	vst1.32		{d7[0]}, [r0, :32], r3
	vst1.32		{d9[1]}, [r0, :32], r3
	vst1.32		{d3[0]}, [r0, :32], r3
	vst1.32		{d12[1]}, [r0, :32], r3
	vst1.32		{d5[0]}, [r0, :32], r3
	vst1.32		{d8[1]}, [r0, :32], r3
	vst1.32		{d1[0]}, [r0, :32], r3
	vst1.32		{d13[0]}, [r0, :32], r3
	vst1.32		{d6[0]}, [r0, :32], r3
	vst1.32		{d9[0]}, [r0, :32], r3
	vst1.32		{d2[0]}, [r0, :32], r3
	vst1.32		{d12[0]}, [r0, :32], r3
	vst1.32		{d4[0]}, [r0, :32], r3
	vst1.32		{d8[0]}, [r0, :32], r3
	vst1.32		{d0[0]}, [r0, :32]

	/*
		Second output area: 16 values through r1. Its first element is the
		same value (d0[1]) that was stored first through r0.
	*/
	vst1.32		{d0[1]}, [r1, :32], r3
	vst1.32		{d10[0]}, [r1, :32], r3
	vst1.32		{d4[1]}, [r1, :32], r3
	vst1.32		{d14[0]}, [r1, :32], r3
	vst1.32		{d2[1]}, [r1, :32], r3
	vst1.32		{d11[0]}, [r1, :32], r3
	vst1.32		{d6[1]}, [r1, :32], r3
	vst1.32		{d15[0]}, [r1, :32], r3
	vst1.32		{d1[1]}, [r1, :32], r3
	vst1.32		{d10[1]}, [r1, :32], r3
	vst1.32		{d5[1]}, [r1, :32], r3
	vst1.32		{d14[1]}, [r1, :32], r3
	vst1.32		{d3[1]}, [r1, :32], r3
	vst1.32		{d11[1]}, [r1, :32], r3
	vst1.32		{d7[1]}, [r1, :32], r3
	vst1.32		{d15[1]}, [r1, :32]

	/* Restore the callee-saved q4-q7 and return. */
	vpop		{q4-q7}
	bx			lr
/*
	NONEXEC_STACK is a macro that is not defined in this file.
	NOTE(review): presumably supplied by mangle.h to mark the object as not
	needing an executable stack - confirm.
*/
NONEXEC_STACK