/*
	synth_stereo_neon64: NEON optimized synth for AArch64 (stereo specific version)

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"
/*
	maxmin_s16: the limits of a signed 16-bit sample, stored as two
	consecutive 32-bit words (+32767 first, -32768 second).
	synth_1to1_s_neon64_asm reads both words with one ld2r so that each
	value is replicated into all four 32-bit lanes of its own vector.
	The table goes into .rodata, except when __APPLE__ is defined, where
	the plain .data section is selected instead.
	ALIGN16 is not defined in this file; it is expected to come from
	mangle.h.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
maxmin_s16:
	.word   32767
	.word   -32768
/*
	synth_1to1_s_neon64_asm

	Produces 32 groups of two 16-bit values each (128 bytes in total) from
	16-bit multiply-accumulate sums, and returns how often the clip test
	fired.

	Registers on entry:
		x0	window base. The code adds 32 bytes and then subtracts
			2 * w4 bytes before the first load.
		x1	first coefficient buffer, 16-bit elements
			(feeds result lanes 0 and 2 of every group of four)
		x2	second coefficient buffer, 16-bit elements
			(feeds result lanes 1 and 3 of every group of four)
		x3	output pointer, advanced by 16 bytes per loop pass
		w4	window offset, counted in 16-bit elements
	Result:
		w0	number of 32-bit lanes for which one of the two cmgt
			comparisons against maxmin_s16 was true
	Clobbers:
		x0-x6, v0-v7, v16-v19, v24-v31, condition flags.
		v8-v15 are not touched and no stack is used.

	NOTE(review): the argument names and types are presumably those of
	int f(short *window, short *b0l, short *b0r, short *samples, int bo1);
	the C prototype is not visible in this file - confirm against the caller.
*/
	.text
	ALIGN4
	.globl ASM_NAME(synth_1to1_s_neon64_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_s_neon64_asm), %function
#endif
ASM_NAME(synth_1to1_s_neon64_asm):
	add		x0, x0, #32
	/* Only the low 32 bits of the offset are used (sign-extended), so the
	   result does not depend on bits 63:32 of x4. Assumes the offset is
	   passed as a 32-bit int - TODO confirm against the C prototype. */
	sub		x0, x0, w4, sxtw #1
	eor		v30.16b, v30.16b, v30.16b	/* v30 = 0: per-lane clip accumulator */
	adrp	x5, AARCH64_PCREL_HI(maxmin_s16)
	add		x5, x5, AARCH64_PCREL_LO(maxmin_s16)
	ld2r	{v28.4s,v29.4s}, [x5]		/* v28 = 4 x 32767, v29 = 4 x -32768 */

	mov		w4, #4				/* first loop: 4 passes */
	mov		x5, #64				/* byte step applied to x0 after each 32-byte load */
1:
	/* First half of the pass: two 16-element window rows, and 32
	   elements from each coefficient buffer read in ascending order. */
	ld1		{v0.8h,v1.8h}, [x0], x5
	ld1		{v2.8h,v3.8h}, [x0], x5
	ld1		{v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
	ld1		{v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64

	/* v24 = row0 * x1[0..15], v25 = row0 * x2[0..15],
	   v26 = row1 * x1[16..31], v27 = row1 * x2[16..31]
	   (four partial sums per register). */
	smull	v24.4s, v0.4h, v4.4h
	smull	v25.4s, v0.4h, v16.4h
	smull	v26.4s, v2.4h, v6.4h
	smull	v27.4s, v2.4h, v18.4h
	smlal2	v24.4s, v0.8h, v4.8h
	smlal2	v25.4s, v0.8h, v16.8h
	smlal2	v26.4s, v2.8h, v6.8h
	smlal2	v27.4s, v2.8h, v18.8h
	smlal	v24.4s, v1.4h, v5.4h
	smlal	v25.4s, v1.4h, v17.4h
	smlal	v26.4s, v3.4h, v7.4h
	smlal	v27.4s, v3.4h, v19.4h
	smlal2	v24.4s, v1.8h, v5.8h
	smlal2	v25.4s, v1.8h, v17.8h
	smlal2	v26.4s, v3.8h, v7.8h
	smlal2	v27.4s, v3.8h, v19.8h

	/* Horizontal reduction: v0 = { sum(v24), sum(v25), sum(v26), sum(v27) } */
	addp	v0.4s, v24.4s, v25.4s
	addp	v1.4s, v26.4s, v27.4s
	addp	v0.4s, v0.4s, v1.4s
	/* Rounding shift right by 13 with saturation to 16 bits; fills the
	   low half of v31 and leaves v0 unchanged. */
	sqrshrn	v31.4h, v0.4s, #13
	/* Each cmgt lane is all ones (-1) when true, so v30 collects minus the
	   number of hits.
	   NOTE(review): the comparison uses the 32-bit sums in v0 as they are
	   before the shift by 13, tested against +32767 / -32768. Confirm that
	   this is the intended clip criterion. */
	cmgt	v2.4s, v0.4s, v28.4s
	cmgt	v3.4s, v29.4s, v0.4s
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s

	/* Second half of the pass: same computation on the next rows. */
	ld1		{v0.8h,v1.8h}, [x0], x5
	ld1		{v2.8h,v3.8h}, [x0], x5
	ld1		{v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
	ld1		{v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64

	smull	v24.4s, v0.4h, v4.4h
	smull	v25.4s, v0.4h, v16.4h
	smull	v26.4s, v2.4h, v6.4h
	smull	v27.4s, v2.4h, v18.4h
	smlal2	v24.4s, v0.8h, v4.8h
	smlal2	v25.4s, v0.8h, v16.8h
	smlal2	v26.4s, v2.8h, v6.8h
	smlal2	v27.4s, v2.8h, v18.8h
	smlal	v24.4s, v1.4h, v5.4h
	smlal	v25.4s, v1.4h, v17.4h
	smlal	v26.4s, v3.4h, v7.4h
	smlal	v27.4s, v3.4h, v19.4h
	smlal2	v24.4s, v1.8h, v5.8h
	smlal2	v25.4s, v1.8h, v17.8h
	smlal2	v26.4s, v3.8h, v7.8h
	smlal2	v27.4s, v3.8h, v19.8h

	addp	v0.4s, v24.4s, v25.4s
	addp	v1.4s, v26.4s, v27.4s
	addp	v0.4s, v0.4s, v1.4s
	sqrshrn2	v31.8h, v0.4s, #13		/* fills the high half of v31 */
	cmgt	v2.4s, v0.4s, v28.4s
	cmgt	v3.4s, v29.4s, v0.4s
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s
	st1		{v31.4s}, [x3], #16		/* store eight 16-bit results */

	subs	w4, w4, #1
	b.ne	1b

	mov		w4, #4				/* second loop: 4 passes */
	mov		x6, #-32			/* x1 and x2 now step backwards, 32 bytes per load */
2:
	/* Same arithmetic as the first loop; only the coefficient buffers are
	   read differently: 16 elements per load, then the pointer moves
	   down by 32 bytes. x0 keeps advancing by 64 bytes per load. */
	ld1		{v0.8h,v1.8h}, [x0], x5
	ld1		{v2.8h,v3.8h}, [x0], x5
	ld1		{v4.8h,v5.8h}, [x1], x6
	ld1		{v6.8h,v7.8h}, [x1], x6
	ld1		{v16.8h,v17.8h}, [x2], x6
	ld1		{v18.8h,v19.8h}, [x2], x6

	smull	v24.4s, v0.4h, v4.4h
	smull	v25.4s, v0.4h, v16.4h
	smull	v26.4s, v2.4h, v6.4h
	smull	v27.4s, v2.4h, v18.4h
	smlal2	v24.4s, v0.8h, v4.8h
	smlal2	v25.4s, v0.8h, v16.8h
	smlal2	v26.4s, v2.8h, v6.8h
	smlal2	v27.4s, v2.8h, v18.8h
	smlal	v24.4s, v1.4h, v5.4h
	smlal	v25.4s, v1.4h, v17.4h
	smlal	v26.4s, v3.4h, v7.4h
	smlal	v27.4s, v3.4h, v19.4h
	smlal2	v24.4s, v1.8h, v5.8h
	smlal2	v25.4s, v1.8h, v17.8h
	smlal2	v26.4s, v3.8h, v7.8h
	smlal2	v27.4s, v3.8h, v19.8h

	addp	v0.4s, v24.4s, v25.4s
	addp	v1.4s, v26.4s, v27.4s
	addp	v0.4s, v0.4s, v1.4s
	sqrshrn	v31.4h, v0.4s, #13
	cmgt	v2.4s, v0.4s, v28.4s
	cmgt	v3.4s, v29.4s, v0.4s
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s

	ld1		{v0.8h,v1.8h}, [x0], x5
	ld1		{v2.8h,v3.8h}, [x0], x5
	ld1		{v4.8h,v5.8h}, [x1], x6
	ld1		{v6.8h,v7.8h}, [x1], x6
	ld1		{v16.8h,v17.8h}, [x2], x6
	ld1		{v18.8h,v19.8h}, [x2], x6

	smull	v24.4s, v0.4h, v4.4h
	smull	v25.4s, v0.4h, v16.4h
	smull	v26.4s, v2.4h, v6.4h
	smull	v27.4s, v2.4h, v18.4h
	smlal2	v24.4s, v0.8h, v4.8h
	smlal2	v25.4s, v0.8h, v16.8h
	smlal2	v26.4s, v2.8h, v6.8h
	smlal2	v27.4s, v2.8h, v18.8h
	smlal	v24.4s, v1.4h, v5.4h
	smlal	v25.4s, v1.4h, v17.4h
	smlal	v26.4s, v3.4h, v7.4h
	smlal	v27.4s, v3.4h, v19.4h
	smlal2	v24.4s, v1.8h, v5.8h
	smlal2	v25.4s, v1.8h, v17.8h
	smlal2	v26.4s, v3.8h, v7.8h
	smlal2	v27.4s, v3.8h, v19.8h

	addp	v0.4s, v24.4s, v25.4s
	addp	v1.4s, v26.4s, v27.4s
	addp	v0.4s, v0.4s, v1.4s
	sqrshrn2	v31.8h, v0.4s, #13
	cmgt	v2.4s, v0.4s, v28.4s
	cmgt	v3.4s, v29.4s, v0.4s
	add		v2.4s, v2.4s, v3.4s
	add		v30.4s, v30.4s, v2.4s
	st1		{v31.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	2b

	/* Fold the four lanes of v30 into lane 0. The AARCH64_DUP_* macros
	   are not defined in this file; presumably they broadcast the indexed
	   element (64-bit, then 32-bit) across the destination - confirm in
	   mangle.h. */
	AARCH64_DUP_2D(v0, v30, 1)
	add		v0.4s, v0.4s, v30.4s
	AARCH64_DUP_4S(v1, v0, 1)
	add		v0.4s, v0.4s, v1.4s
	umov	w0, v0.s[0]
	neg		w0, w0				/* hits were accumulated as -1 each */

	ret
/* NONEXEC_STACK is not defined in this file; it is expected to come from
   mangle.h and presumably marks the object as not needing an executable
   stack - confirm against mangle.h. */
NONEXEC_STACK