/*
synth_neon: ARM NEON optimized synth

copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by Taihei Monma
*/

#include "mangle.h"

/* Names for the registers that hold the first three arguments on entry. */
#define WINDOW r0
#define B0 r1
#define SAMPLES r2

/*
	int synth_1to1_neon_asm(short *window, short *b0, short *samples, int bo1);
	return value: always 0. The results are saturated to 16 bit when they
	are narrowed, but the number of clipped samples is not counted.
*/

#ifndef _M_ARM
	.code 32	/* ARM (32-bit) instruction encoding, not Thumb */
#endif
#ifndef __APPLE__
	.fpu neon	/* let the assembler accept NEON mnemonics */
#endif

	.text
/*
	synth_1to1_neon_asm

	In:     r0 (WINDOW)  = window
	        r1 (B0)      = b0, loaded with a 128-bit alignment qualifier
	        r2 (SAMPLES) = samples
	        r3           = bo1
	Out:    r0 = 0
	Saved:  r4, r5, lr and q4-q7 are pushed on entry and restored on exit.
	Scratch: r3 (loop counter once bo1 has been consumed), r4 and r5
	        (post-increment strides), q0-q15, flags.

	Two passes of 4 iterations each; every iteration produces 4 output
	samples, so 32 samples are written in total. One output sample is the
	sum of 16 products of 16-bit window and b0 values, accumulated in
	32 bit, then rounded, shifted right by 13 and saturated to 16 bit.
	The first pass walks b0 forwards, the second pass walks it backwards;
	the window pointer advances by 64 bytes per sample in both passes.

	Results go into every other 16-bit slot of samples; the slots in
	between are read and written back unchanged (presumably the other
	channel of an interleaved stereo buffer - confirm against the caller).

	GLOBAL_SYMBOL, ASM_NAME, ALIGN4 and NONEXEC_STACK are not defined in
	this file (presumably they come from mangle.h).
*/
	GLOBAL_SYMBOL ASM_NAME(synth_1to1_neon_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_neon_asm), %function
#endif
	ALIGN4
ASM_NAME(synth_1to1_neon_asm):
	push		{r4-r5, lr}
	vpush		{q4-q7}

	/* WINDOW = window + 32 bytes - 2*bo1 bytes, i.e. window + 16 - bo1 in shorts */
	add		WINDOW, WINDOW, #32
	sub		WINDOW, WINDOW, r3, lsl #1

	mov		r3, #4			/* iterations of the first pass */
	mov		r4, #64			/* WINDOW stride in bytes; only 32 of them are loaded */
1:
	/*
		Load 16 window values and 16 b0 values for each of 4 samples.
		B0 advances by the 32 bytes just loaded. Each vswp pairs the
		first four window values with the first four b0 values in one
		q register so that the vmull below can overwrite that register.
	*/
	vld1.16		{d0-d3}, [WINDOW], r4
	vld1.16		{d4-d7}, [B0, :128]!
	vld1.16		{d8-d11}, [WINDOW], r4
	vswp		d1, d4
	vld1.16		{d12-d15}, [B0, :128]!
	vld1.16		{d16-d19}, [WINDOW], r4
	vld1.16		{d20-d23}, [B0, :128]!
	vswp		d9, d12
	vld1.16		{d24-d27}, [WINDOW], r4
	vld1.16		{d28-d31}, [B0, :128]!
	vswp		d17, d20
	vswp		d25, d28

	/* q0/q4/q8/q12: four 32-bit partial sums for each of the 4 samples */
	vmull.s16	q0, d0, d1
	vmull.s16	q4, d8, d9
	vmull.s16	q8, d16, d17
	vmull.s16	q12, d24, d25
	vmlal.s16	q0, d4, d5
	vmlal.s16	q4, d12, d13
	vmlal.s16	q8, d20, d21
	vmlal.s16	q12, d28, d29
	vmlal.s16	q0, d2, d6
	vmlal.s16	q4, d10, d14
	vmlal.s16	q8, d18, d22
	vmlal.s16	q12, d26, d30
	vmlal.s16	q0, d3, d7
	vmlal.s16	q4, d11, d15
	vmlal.s16	q8, d19, d23
	vmlal.s16	q12, d27, d31

	/* Horizontal adds: afterwards q0 holds the 4 sample sums in order */
	vpadd.i32	d0, d0, d1
	vpadd.i32	d8, d8, d9
	vpadd.i32	d16, d16, d17
	vpadd.i32	d24, d24, d25
	vpadd.i32	d0, d0, d8
	vpadd.i32	d1, d16, d24

	/*
		De-interleave 8 shorts from SAMPLES (even slots to d2, odd
		slots to d3), narrow the new sums into d1 and store d1 in the
		even slots together with the untouched odd slots from d3.
		SAMPLES advances by 16 bytes.
	*/
	vld2.16		{d2,d3}, [SAMPLES]
	vqrshrn.s32	d1, q0, #13		/* round, >> 13, saturate to s16 */
	vst2.16		{d1,d3}, [SAMPLES]!

	subs		r3, r3, #1
	bne		1b

	mov		r3, #4			/* iterations of the second pass */
	mov		r5, #-32		/* B0 now steps back 32 bytes after each load */
1:
	/* Same computation as the first pass, with B0 moving backwards */
	vld1.16		{d0-d3}, [WINDOW], r4
	vld1.16		{d4-d7}, [B0, :128], r5
	vld1.16		{d8-d11}, [WINDOW], r4
	vswp		d1, d4
	vld1.16		{d12-d15}, [B0, :128], r5
	vld1.16		{d16-d19}, [WINDOW], r4
	vld1.16		{d20-d23}, [B0, :128], r5
	vswp		d9, d12
	vld1.16		{d24-d27}, [WINDOW], r4
	vld1.16		{d28-d31}, [B0, :128], r5
	vswp		d17, d20
	vswp		d25, d28

	vmull.s16	q0, d0, d1
	vmull.s16	q4, d8, d9
	vmull.s16	q8, d16, d17
	vmull.s16	q12, d24, d25
	vmlal.s16	q0, d4, d5
	vmlal.s16	q4, d12, d13
	vmlal.s16	q8, d20, d21
	vmlal.s16	q12, d28, d29
	vmlal.s16	q0, d2, d6
	vmlal.s16	q4, d10, d14
	vmlal.s16	q8, d18, d22
	vmlal.s16	q12, d26, d30
	vmlal.s16	q0, d3, d7
	vmlal.s16	q4, d11, d15
	vmlal.s16	q8, d19, d23
	vmlal.s16	q12, d27, d31

	vpadd.i32	d0, d0, d1
	vpadd.i32	d8, d8, d9
	vpadd.i32	d16, d16, d17
	vpadd.i32	d24, d24, d25
	vpadd.i32	d0, d0, d8
	vpadd.i32	d1, d16, d24

	vld2.16		{d2,d3}, [SAMPLES]
	vqrshrn.s32	d1, q0, #13
	vst2.16		{d1,d3}, [SAMPLES]!

	subs		r3, r3, #1
	bne		1b

	mov		r0, #0			/* return value: no clip count is kept */

	vpop		{q4-q7}
	pop		{r4-r5, pc}		/* loading pc returns to the caller */

	NONEXEC_STACK