/*
	dct36_neon: ARM NEON optimized dct36

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Select the 32-bit ARM instruction set unless _M_ARM is defined
	(NOTE(review): presumably the MSVC ARM target macro, whose assembler
	does not take this directive - confirm).
*/
#ifndef _M_ARM
	.code 32
#endif
/*
	Enable NEON mnemonics for the assembler; skipped on Apple toolchains
	(NOTE(review): presumably because the directive is not accepted there - confirm).
*/
#ifndef __APPLE__
	.fpu neon
#endif

	.text
/*
	dct36_neon_COS9: constant table read by dct36_neon (26 single-precision
	values stored as raw IEEE-754 bit patterns).

	It lives in .text so the code can reach it with a PC-relative `adr`, and it
	is 16-byte aligned because it is loaded with `vld1.32 ... [r5, :128]`.

	Layout (word index -> register it is loaded into):
	  0.. 3  q5  COS9_[3,3,6,6]
	  4.. 7  q6  COS9_[1,1,2,2]
	  8..11  q7  COS9_[5,5,8,8]
	 12..15  q8  COS9_[7,7,4,4]
	 16..19  q5  scale factors for tmp[17,16,15,14]
	 20..23  q6  scale factors for tmp[9,10,11,12]
	 24..25  d0  scale factors for the two partial sums behind tmp[4]/tmp[13]
	The COS9_[k] values are numerically cos(k*pi/18); the scale factors are
	numerically 0.5/cos((2i+1)*pi/36)
	(NOTE(review): presumably the tfcos36[] table of the C implementation - confirm).
*/
	ALIGN16
dct36_neon_COS9:
	.word 0x3f5db3d7	/* 0.8660254  COS9_[3] */
	.word 0x3f5db3d7	/* 0.8660254  COS9_[3] */
	.word 0x3f000000	/* 0.5        COS9_[6] */
	.word 0x3f000000	/* 0.5        COS9_[6] */
	.word 0x3f7c1c5c	/* 0.9848078  COS9_[1] */
	.word 0x3f7c1c5c	/* 0.9848078  COS9_[1] */
	.word 0x3f708fb2	/* 0.9396926  COS9_[2] */
	.word 0x3f708fb2	/* 0.9396926  COS9_[2] */
	.word 0x3f248dbb	/* 0.6427876  COS9_[5] */
	.word 0x3f248dbb	/* 0.6427876  COS9_[5] */
	.word 0x3e31d0d4	/* 0.1736482  COS9_[8] */
	.word 0x3e31d0d4	/* 0.1736482  COS9_[8] */
	.word 0x3eaf1d44	/* 0.3420201  COS9_[7] */
	.word 0x3eaf1d44	/* 0.3420201  COS9_[7] */
	.word 0x3f441b7d	/* 0.7660444  COS9_[4] */
	.word 0x3f441b7d	/* 0.7660444  COS9_[4] */
	.word 0x3f007d2b	/* 0.5019099  0.5/cos( 1*pi/36) */
	.word 0x3f0483ee	/* 0.5176381  0.5/cos( 3*pi/36) */
	.word 0x3f0d3b7d	/* 0.5516890  0.5/cos( 5*pi/36) */
	.word 0x3f1c4257	/* 0.6103873  0.5/cos( 7*pi/36) */
	.word 0x40b79454	/* 5.7368566  0.5/cos(17*pi/36) */
	.word 0x3ff746ea	/* 1.9318517  0.5/cos(15*pi/36) */
	.word 0x3f976fd9	/* 1.1831008  0.5/cos(13*pi/36) */
	.word 0x3f5f2944	/* 0.8717234  0.5/cos(11*pi/36) */
	.word 0x3f800000	/* 1.0 */
	.word 0x3f3504f3	/* 0.7071068  0.5/cos( 9*pi/36) */

/*
	dct36_neon: 36-point DCT with windowing, ARM NEON version.

	NOTE(review): the C prototype is not visible in this file; presumably
	  void dct36_neon(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf)
	The argument roles below are derived from how the registers are accessed.

	In:
	  r0        input, 18 floats are read (in[0..17])
	  r1        18 floats are read (o1[0..17]) and added to the windowed result
	  r2        output, 18 floats are written (o2[0..17])
	  r3        window table, floats [0..35] are read
	  [sp, #0]  fifth argument (on the stack at entry): output pointer; 18 floats
	            are written with a stride of 128 bytes (32 floats)
	Results:
	  o2[i]         = (sum of a tmp pair)        * wintab[18 + i]
	  ts[i * 32]    = o1[i] + (difference of it) * wintab[i]
	Registers:
	  r4 holds the fifth argument, r5 is scratch (table pointer, addresses,
	  store stride). r0-r4 are modified. q4-q7 are saved and restored because
	  the routine overwrites them; q0-q3 and q8-q15 are used as scratch.
*/
	ALIGN4
	GLOBAL_SYMBOL ASM_NAME(dct36_neon)
#ifdef __ELF__
	.type ASM_NAME(dct36_neon), %function
#endif
ASM_NAME(dct36_neon):
	push {r4-r5, lr}
	vpush {q4-q7}
	/* 12 bytes (r4, r5, lr) + 64 bytes (q4-q7) were pushed: the first stack argument is now at sp + 76 */
	ldr r4, [sp, #76]
	adr r5, dct36_neon_COS9

	/* q15 = 0, q14 = 32-bit lanes [0, ~0, 0, ~0] (mask that keeps the odd lanes) */
	vceq.i32 q14, q14, q14
	veor q15, q15, q15
	vshl.i64 q14, q14, #32
	/* q0..q3 = in[0..15], d8 = in[16,17] */
	vld1.32 {q0, q1}, [r0]!
	vld1.32 {q2, q3}, [r0]!
	vld1.32 {d8}, [r0]

	/* in[i] += in[i-1] for i = 17..1: add a copy of the input shifted up by one element (zero shifted in) */
	vext.8 q5, q15, q0, #12
	vext.8 q6, q0, q1, #12
	vext.8 q7, q1, q2, #12
	vext.8 q8, q2, q3, #12
	vext.8 d18, d7, d8, #4
	vadd.f32 q0, q0, q5
	vadd.f32 q1, q1, q6
	vadd.f32 q2, q2, q7
	vadd.f32 q3, q3, q8
	vadd.f32 d8, d8, d18

	/*
		in[i] += in[i-2] for odd i = 17..3: the vectors are re-based to start at
		in[2] (q6..q9), and the masked odd lanes of the old vectors (q10..q13)
		supply in[i-2]. Only the d8 half of q4 is consumed by the vext into q9.
	*/
	vext.8 q6, q0, q1, #8
	vext.8 q7, q1, q2, #8
	vext.8 q8, q2, q3, #8
	vext.8 q9, q3, q4, #8
	vand q10, q0, q14
	vext.8 q0, q15, q0, #8
	vand q11, q1, q14
	vand q12, q2, q14
	vand q13, q3, q14
	vadd.f32 q1, q10, q6
	vadd.f32 q2, q11, q7
	vadd.f32 q3, q12, q8
	vadd.f32 q4, q13, q9

/*
	q0 in[-,-,0,1]
	q1 in[2,3,4,5]
	q2 in[6,7,8,9]
	q3 in[10,11,12,13]
	q4 in[14,15,16,17]
*/

	/* rotate the high halves of q2/q3/q4 */
	vswp d5, d7
	vswp d7, d9

/*
	q2 in[6,7,12,13]
	q3 in[10,11,16,17]
	q4 in[14,15,8,9]
*/

	/* load the 16 COS9 constants; r5 advances to the scale factors */
	vld1.32 {q5, q6}, [r5, :128]!
	vld1.32 {q7, q8}, [r5, :128]!
	vmov q9, q0
	vmla.f32 q9, q2, q5

/*
	q6 COS9_[1,1,2,2]
	q7 COS9_[5,5,8,8]
	q8 COS9_[7,7,4,4]
	q5 COS9_[3,3,6,6]
	q9 [ta33,tb33,ta66,tb66]
*/

	/*
		Build the four partial sums q12..q15 (layout in the comment below).
		The independent dependency chains are interleaved; keep the order as is.
	*/
	vmov q10, q9
	vmov d26, d0
	vmov d27, d5
	vmul.f32 q12, q1, q6
	vsub.f32 q11, q1, q3
	vmla.f32 q10, q3, q7
	vsub.f32 q13, q0, q13
	vmla.f32 q12, q4, q8
	vsub.f32 q11, q11, q4
	vmul.f32 q14, q1, q7
	vmul.f32 q15, q1, q8
	vadd.f32 q12, q12, q10
	vmov q10, q9
	vmla.f32 q13, q11, q5
	vmla.f32 q10, q3, q8
	vmla.f32 q14, q4, q6
	vmla.f32 q9, q4, q7
	vmla.f32 q15, q3, q6
	vsub.f32 q14, q14, q10
	vsub.f32 q15, q15, q9

/*
	q12 [1a-0,1b-0, 2a-0, 2b-0]
	q13 [1a-1,1b-1, 2a-1, 2b-1]
	q14 [1a-2,1b-2,-2a-2,-2b-2]
	q15 [1a-3,1b-3,-2a-3,-2b-3]
*/

	/* interleave the pairs; the vneg fixes the sign of the negated 2a/2b terms that end up in q15 */
	vzip.32 q12, q13
	vzip.32 q14, q15
	vneg.f32 q15, q15

/*
	q12 [1a-0,1a-1,1b-0,1b-1]
	q13 [2a-0,2a-1,2b-0,2b-1]
	q14 [1a-2,1a-3,1b-2,1b-3]
	q15 [2a-2,2a-3,2b-2,2b-3]
*/

	vswp d25, d28
	vswp d27, d30

/*
	q12 tmp1a
	q13 tmp2a
	q14 tmp1b
	q15 tmp2b
*/
	/*
		d1 accumulates [in0-in4+in8-in12+in16, in1-in5+in9-in13+in17] (interleaved
		with the tmp sums/differences); q5/q6/d0 are reloaded with the scale factors.
	*/
	vsub.f32 d1, d1, d3
	vsub.f32 d9, d9, d5
	vld1.32 {q5, q6}, [r5, :128]!
	vld1.32 {d0}, [r5, :64]
	vadd.f32 q10, q14, q15
	vsub.f32 q8, q15, q14
	vadd.f32 d1, d1, d7
	vadd.f32 q9, q12, q13
	vsub.f32 q7, q13, q12
	vadd.f32 d1, d1, d9
	vmul.f32 q10, q10, q5
	vmul.f32 q8, q8, q6
	vmul.f32 d0, d1, d0

/*
	q9 tmp[0,1,2,3]
	q10 tmp[17,16,15,14]
	q7 tmp[8,7,6,5]
	q8 tmp[9,10,11,12]
	d0 tmp[4,13]
*/

	/*
		Outputs 5..12: q1,q2 = wintab[5..12], q3,q4 = wintab[23..30], q5,q6 = o1[5..12].
		vrev64 + vswp reverses all four lanes of a q register.
		o2[5..12] is stored as one block; the ts values go out one lane at a
		time from r4 + 5*128 with a 128-byte stride.
	*/
	add r0, r4, #640
	add r5, r3, #20
	vld1.32 {q1,q2}, [r5]
	add r5, r3, #92
	vld1.32 {q3,q4}, [r5]
	add r5, r1, #20
	vld1.32 {q5,q6}, [r5]
	vadd.f32 q11, q9, q10
	vsub.f32 q12, q9, q10
	vmul.f32 q10, q11, q4
	vmla.f32 q6, q12, q2
	vrev64.32 q11, q11
	vrev64.32 q12, q12
	vswp d22, d23
	vswp d24, d25
	vmul.f32 q9, q11, q3
	vmla.f32 q5, q12, q1
	add r5, r2, #20
	vst1.32 {q9,q10}, [r5]
	mov r5, #128
	vst1.32 {d10[0]}, [r0], r5
	vst1.32 {d10[1]}, [r0], r5
	vst1.32 {d11[0]}, [r0], r5
	vst1.32 {d11[1]}, [r0], r5
	vst1.32 {d12[0]}, [r0], r5
	vst1.32 {d12[1]}, [r0], r5
	vst1.32 {d13[0]}, [r0], r5
	vst1.32 {d13[1]}, [r0], r5

	/*
		Outputs 0..3 and 14..17: q1 = wintab[0..3], q2,q3 = wintab[14..21],
		q4 = wintab[32..35], q5 = o1[0..3], q6 = o1[14..17].
		ts[0..3] are stored through r4 (which ends at r4 + 4*128),
		ts[14..17] through r0 = r4 + 14*128.
	*/
	add r0, r4, #1792
	add r5, r3, #56
	vld1.32 {q1}, [r3]
	vld1.32 {q2,q3}, [r5]
	add r5, r3, #128
	vld1.32 {q4}, [r5]
	add r5, r1, #56
	vld1.32 {q5}, [r1]
	vld1.32 {q6}, [r5]
	vadd.f32 q9, q7, q8
	vsub.f32 q10, q7, q8
	vmul.f32 q7, q9, q3
	vmla.f32 q5, q10, q1
	vrev64.32 q9, q9
	vrev64.32 q10, q10
	vswp d18, d19
	vswp d20, d21
	vmul.f32 q8, q9, q4
	vmla.f32 q6, q10, q2
	add r5, r2, #56
	vst1.32 {q7}, [r2]
	vst1.32 {q8}, [r5]
	mov r5, #128
	vst1.32 {d10[0]}, [r4], r5
	vst1.32 {d10[1]}, [r4], r5
	vst1.32 {d11[0]}, [r4], r5
	vst1.32 {d11[1]}, [r4], r5
	vst1.32 {d12[0]}, [r0], r5
	vst1.32 {d12[1]}, [r0], r5
	vst1.32 {d13[0]}, [r0], r5
	vst1.32 {d13[1]}, [r0], r5

	/*
		Outputs 4 and 13: vtrn moves the second scaled partial sum into lane 0 of
		d1, so lane 0 of d0 + d1 / d0 - d1 is the sum / difference; only lane 0
		of each result is stored. d2..d5 = wintab[4], [13], [22], [31] and
		d6, d7 = o1[4], o1[13] in lane 0 (lane 1 is loaded but unused).
	*/
	vtrn.32 d0, d1
	add r5, r3, #16
	vld1.32 {d2}, [r5]
	add r5, r3, #52
	vld1.32 {d3}, [r5]
	add r5, r3, #88
	vld1.32 {d4}, [r5]
	add r3, r3, #124
	vld1.32 {d5}, [r3]
	add r5, r1, #16
	vld1.32 {d6}, [r5]
	add r1, r1, #52
	vld1.32 {d7}, [r1]
	vadd.f32 d8, d0, d1
	vsub.f32 d9, d0, d1
	vmul.f32 d4, d8, d4
	vmul.f32 d5, d8, d5
	vmla.f32 d6, d9, d2
	vmla.f32 d7, d9, d3
	/* o2[4] and o2[13] (16 + 36 = byte offset 52) */
	add r2, r2, #16
	vst1.32 {d4[0]}, [r2]
	add r2, r2, #36
	vst1.32 {d5[0]}, [r2]
	/* r4 is at 4*128 after the four post-incremented stores above; + 1152 reaches 13*128 */
	vst1.32 {d6[0]}, [r4]
	add r4, r4, #1152
	vst1.32 {d7[0]}, [r4]

	vpop {q4-q7}
	pop {r4-r5, pc}

/* NOTE(review): macro presumably provided by mangle.h to mark the object as not needing an executable stack - confirm */
NONEXEC_STACK