/*
	synth_stereo_neon64_accurate: NEON optimized synth for AArch64 (stereo specific, MPEG-compliant 16bit output version)

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"
/*
	maxmin_s16: clip limits used by the synth routine in this file.

	Two consecutive 32-bit words holding IEEE-754 single-precision bit patterns:
	  1191181824 = 0x46FFFE00 =  32767.0f  (largest value that fits int16)
	  -956301312 = 0xC7000000 = -32768.0f  (smallest value that fits int16)
	The routine loads both with a single ld2r, so their order and adjacency matter.

	The data is placed in .rodata, except on Apple targets where .data is used.
	ALIGN16 comes from mangle.h (not visible here); presumably it requests
	16-byte alignment - confirm against that header.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
maxmin_s16:
	.word	1191181824
	.word	-956301312
/*
	synth_1to1_s_neon64_accurate_asm

	Computes 64 dot products of 16 floats each, converts them to saturated
	int16 and stores them, and returns how many results were out of int16 range.

	Register roles, as established by the code below (the C prototype is not
	visible in this file):
	  x0  in : base of a float table. Rebased on entry to x0 + 64 - 4*x4 bytes,
	           then read 16 floats per row with a 128-byte stride between rows.
	  x1  in : first float coefficient stream  (results go to even output slots).
	  x2  in : second float coefficient stream (results go to odd output slots).
	  x3  in : int16 output pointer; 16 bytes are written per loop iteration,
	           8 iterations in total (128 bytes).
	  x4  in : offset into the x0 table in floats; w4 is then reused as the
	           loop counter.
	  w0  out: number of results that were > 32767.0 or < -32768.0 before
	           conversion.
	NOTE(review): presumably x0 is the synthesis window, x1/x2 the left/right
	channel buffers and x4 the buffer offset - confirm against the C caller.

	Persistent vector registers:
	  v28 = 32767.0 in all lanes, v29 = -32768.0 in all lanes (from maxmin_s16)
	  v30 = running clip count, kept NEGATED (each clipped lane adds -1)
	  v31 = packed int16 output for the current iteration

	Clobbers: x0-x6, v0-v7, v16-v31, condition flags.
	v8-v15 are used as scratch; their low 64 bits are saved on the stack
	(64 bytes) and restored before returning, which is what AAPCS64 requires
	of a callee. The upper 64 bits are not preserved.
*/
	.text
	ALIGN4
	.globl ASM_NAME(synth_1to1_s_neon64_accurate_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_s_neon64_accurate_asm), %function
#endif
ASM_NAME(synth_1to1_s_neon64_accurate_asm):
	add		x0, x0, #64
	sub		x0, x0, x4, lsl #2			/* x0 = x0 + 64 - 4*x4 (bytes) */
	eor		v30.16b, v30.16b, v30.16b	/* clip accumulator = 0 */
	adrp	x5, AARCH64_PCREL_HI(maxmin_s16)
	add		x5, x5, AARCH64_PCREL_LO(maxmin_s16)
	ld2r	{v28.4s,v29.4s}, [x5]		/* v28 = splat(32767.0), v29 = splat(-32768.0) */

	/* Save the low 64 bits (.2s view) of callee-saved v8-v15. */
	sub		sp, sp, #32
	st1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp]
	sub		sp, sp, #32
	st1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp]

	mov		w4, #4						/* 4 iterations, 8 int16 results each */
	mov		x5, #128					/* byte stride between x0 rows */

	/*
		Loop 1: x1 and x2 are walked FORWARDS, 64 bytes per load.
		Each iteration consumes four x0 rows and produces four
		(x1-result, x2-result) pairs.
	*/
1:
	/* First half: rows 0 and 1 -> low four int16 lanes of v31. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5		/* row 0 */
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5		/* row 1 */
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64	/* x1 block for row 0 */
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64	/* x2 block for row 0 */
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64		/* x1 block for row 1 */
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64	/* x2 block for row 1 */

	/* Four independent 16-term dot products, kept as 4 lane-wise partial sums:
	   v24 = row0 * x1, v25 = row0 * x2, v26 = row1 * x1, v27 = row1 * x2. */
	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	/* Horizontal reduction: v0 = { sum(v24), sum(v25), sum(v26), sum(v27) }. */
	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fcvtns	v1.4s, v0.4s				/* float -> int32, round to nearest (ties to even) */
	fcmgt	v2.4s, v0.4s, v28.4s		/* lane = -1 where value >  32767.0 */
	fcmgt	v3.4s, v29.4s, v0.4s		/* lane = -1 where value < -32768.0 */
	sqxtn	v31.4h, v1.4s				/* saturating narrow into low half of v31 */
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s		/* accumulate (negative) clip count */

	/* Second half: rows 2 and 3 -> high four int16 lanes of v31. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fcvtns	v1.4s, v0.4s
	fcmgt	v2.4s, v0.4s, v28.4s
	fcmgt	v3.4s, v29.4s, v0.4s
	/* Macro from mangle.h; presumably emits sqxtn2 v31.8h, v1.4s (fills the high half). */
	AARCH64_SQXTN2_8H(v31, v1)
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s
	st1		{v31.4s}, [x3], #16			/* store 8 int16 results */

	subs	w4, w4, #1
	b.ne	1b

	mov		w4, #4
	mov		x6, #-64					/* x1/x2 step: 64 bytes backwards per load */

	/*
		Loop 2: same arithmetic as loop 1, but x1 and x2 are walked
		BACKWARDS from where loop 1 left them. x0 keeps advancing by x5.
	*/
2:
	/* First half -> low four int16 lanes of v31. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fcvtns	v1.4s, v0.4s
	fcmgt	v2.4s, v0.4s, v28.4s
	fcmgt	v3.4s, v29.4s, v0.4s
	sqxtn	v31.4h, v1.4s
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s

	/* Second half -> high four int16 lanes of v31. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fcvtns	v1.4s, v0.4s
	fcmgt	v2.4s, v0.4s, v28.4s
	fcmgt	v3.4s, v29.4s, v0.4s
	AARCH64_SQXTN2_8H(v31, v1)
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s
	st1		{v31.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	2b

	/*
		Fold the four lanes of v30 into lane 0. The DUP macros come from
		mangle.h; presumably they broadcast v30.d[1] and v0.s[1] respectively.
		v30 holds -1 per clipped value, so negate to return a positive count.
	*/
	AARCH64_DUP_2D(v0, v30, 1)
	add		v0.4s, v0.4s, v30.4s
	AARCH64_DUP_4S(v1, v0, 1)
	add		v0.4s, v0.4s, v1.4s
	umov	w0, v0.s[0]
	neg		w0, w0

	/* Restore v8-v15 in reverse order of the saves and release the 64 bytes. */
	ld1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp], #32
	ld1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp], #32

	ret
/* Macro from mangle.h (definition not visible here); presumably emits the
   section note that marks this object as not needing an executable stack. */
NONEXEC_STACK