Blame src/libmpg123/synth_neon64_s32.S

Packit c32a2d
/*
Packit c32a2d
	synth_neon64_s32: NEON optimized synth for AArch64 (32-bit output version)
Packit c32a2d
Packit c32a2d
	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Constant table for synth_1to1_s32_neon64_asm. The routine loads the three
	words with ld3r (one word broadcast per vector register) and uses them as
	IEEE-754 single-precision values:
	  word 0 = 0x46FFFFFF =  32767.998046875  upper bound of the range test
	  word 1 = 0xC7000000 = -32768.0          lower bound of the range test
	  word 2 = 0x47800000 =  65536.0          factor applied before the
	                                          float -> int32 conversion
	The table goes into .rodata unless __APPLE__ is defined, in which case
	.data is used.
*/
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	/* ALIGN16 is a macro defined outside this file (presumably in mangle.h); assumed to request 16-byte alignment - confirm. */
	ALIGN16
Packit c32a2d
maxmin_s32:
Packit c32a2d
	.word   1191182335	/* 0x46FFFFFF: largest float below 32768.0 */
Packit c32a2d
	.word   -956301312	/* 0xC7000000: -32768.0 */
Packit c32a2d
	.word   1199570944	/* 0x47800000: 65536.0 */
Packit c32a2d
	.text
Packit c32a2d
	/* ALIGN4 is a macro defined outside this file (presumably in mangle.h); assumed to align the entry point - confirm. */
	ALIGN4
Packit c32a2d
	.globl ASM_NAME(synth_1to1_s32_neon64_asm)
Packit c32a2d
#ifdef __ELF__
Packit c32a2d
	.type ASM_NAME(synth_1to1_s32_neon64_asm), %function
Packit c32a2d
#endif
Packit c32a2d
/*
	synth_1to1_s32_neon64_asm

	Produces 32 signed 32-bit output values in two loops of 4 iterations, 4
	values per iteration. Each value is the sum of 16 products of
	single-precision floats read through x0 and x1. The sum is multiplied by
	word 2 of maxmin_s32 (65536.0), converted to int32 with fcvtns (round to
	nearest; the A64 conversion saturates out-of-range inputs) and stored
	through x2.

	In:
	  x0  pointer to float data. Reading starts at x0 + 64 - 4*x3 bytes and
	      moves forward one 16-float row per load, rows being 128 bytes apart.
	  x1  pointer to float data read as contiguous 64-byte rows: forward in
	      the first loop, backward in the second loop.
	  x2  pointer to 32-bit output words. Only words 0,2,4,... are replaced;
	      words 1,3,5,... are loaded and stored back unchanged (presumably
	      the other channel of an interleaved stereo buffer - confirm).
	      Advanced by 32 bytes per iteration, 256 bytes in total.
	  x3  element offset; 4*x3 bytes are subtracted from x0 at entry.
	Out:
	  w0  number of unscaled sums that were greater than word 0 or less than
	      word 1 of maxmin_s32, i.e. outside the tested range.
	Registers named in this code: x0-x6, v0-v7, v16-v31; subs changes the
	condition flags. The stack is not accessed.

	NOTE(review): the C prototype is not visible in this file; argument types
	and the meaning of the buffers should be confirmed against the caller.
	NOTE(review): x3 is consumed as a full 64-bit register. If the caller
	passes a 32-bit int, AAPCS64 does not define bits 63:32 of x3; confirm
	the prototype and consider an extending operand form (w3, sxtw #2).
*/
ASM_NAME(synth_1to1_s32_neon64_asm):
Packit c32a2d
	add		x0, x0, #64	/* x0 += 16 floats ... */
Packit c32a2d
	sub		x0, x0, x3, lsl #2	/* ... minus x3 floats: x0 = x0 + 64 - 4*x3 */
Packit c32a2d
	eor		v31.16b, v31.16b, v31.16b	/* v31 = 0: per-lane out-of-range counters (kept negated) */
Packit c32a2d
	/* x5 = address of maxmin_s32. AARCH64_PCREL_HI/LO are macros defined outside this file; presumably they expand to the page / low-12-bit relocation operators - confirm. */
	adrp	x5, AARCH64_PCREL_HI(maxmin_s32)
Packit c32a2d
	add		x5, x5, AARCH64_PCREL_LO(maxmin_s32)
Packit c32a2d
	/* Broadcast the three table words: v28 = upper bound, v29 = lower bound, v30 = scale, each in all four lanes. */
	ld3r	{v28.4s,v29.4s,v30.4s}, [x5]
Packit c32a2d
	
Packit c32a2d
	mov		w4, #4	/* loop 1 runs 4 times */
Packit c32a2d
	mov		x5, #128	/* x5 is reused as the byte stride between rows read via x0 */
Packit c32a2d
	/* Loop 1: 4 output values per iteration; x0 and x1 both move forward. */
1:
Packit c32a2d
	/* Rows 0 and 1 of this iteration: 16 floats from x0 (then x0 += 128) and 16 floats from x1 (then x1 += 64), per row. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
Packit c32a2d
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
Packit c32a2d
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
Packit c32a2d
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
Packit c32a2d
	
Packit c32a2d
	/* v24 / v25 = lane-wise partial sums of the 16 products of row 0 / row 1 (the two chains are interleaved). */
	fmul	v24.4s, v0.4s, v16.4s
Packit c32a2d
	fmul	v25.4s, v4.4s, v20.4s
Packit c32a2d
	fmla	v24.4s, v1.4s, v17.4s
Packit c32a2d
	fmla	v25.4s, v5.4s, v21.4s
Packit c32a2d
	fmla	v24.4s, v2.4s, v18.4s
Packit c32a2d
	fmla	v25.4s, v6.4s, v22.4s
Packit c32a2d
	fmla	v24.4s, v3.4s, v19.4s
Packit c32a2d
	fmla	v25.4s, v7.4s, v23.4s
Packit c32a2d
	
Packit c32a2d
	/* Rows 2 and 3 of this iteration, loaded the same way into the same registers. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
Packit c32a2d
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
Packit c32a2d
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
Packit c32a2d
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
Packit c32a2d
	
Packit c32a2d
	/* v26 / v27 = lane-wise partial sums for row 2 / row 3. */
	fmul	v26.4s, v0.4s, v16.4s
Packit c32a2d
	fmul	v27.4s, v4.4s, v20.4s
Packit c32a2d
	fmla	v26.4s, v1.4s, v17.4s
Packit c32a2d
	fmla	v27.4s, v5.4s, v21.4s
Packit c32a2d
	fmla	v26.4s, v2.4s, v18.4s
Packit c32a2d
	fmla	v27.4s, v6.4s, v22.4s
Packit c32a2d
	fmla	v26.4s, v3.4s, v19.4s
Packit c32a2d
	fmla	v27.4s, v7.4s, v23.4s
Packit c32a2d
	
Packit c32a2d
	/* Three pairwise adds collapse the partial sums: v0 = { sum(v24), sum(v25), sum(v26), sum(v27) }. */
	faddp	v0.4s, v24.4s, v25.4s
Packit c32a2d
	faddp	v1.4s, v26.4s, v27.4s
Packit c32a2d
	faddp	v0.4s, v0.4s, v1.4s
Packit c32a2d
	fmul	v1.4s, v0.4s, v30.4s	/* v1 = scaled sums; v0 keeps the unscaled sums for the range test */
Packit c32a2d
	ld2		{v4.4s,v5.4s}, [x2]	/* de-interleave 8 output words: v4 = words 0,2,4,6; v5 = words 1,3,5,7 */
Packit c32a2d
	fcvtns	v4.4s, v1.4s	/* replace v4 with the new int32 values (round to nearest); v5 is left as loaded */
Packit c32a2d
	fcmgt	v2.4s, v0.4s, v28.4s	/* lane = all ones (-1) where sum > upper bound */
Packit c32a2d
	fcmgt	v3.4s, v29.4s, v0.4s	/* lane = all ones (-1) where sum < lower bound */
Packit c32a2d
	add		v2.4s, v2.4s, v3.4s	/* -1 in every out-of-range lane (the two tests cannot both hit) */
Packit c32a2d
	add		v31.4s, v31.4s, v2.4s	/* accumulate: v31 lane = -(number of hits so far) */
Packit c32a2d
	st2		{v4.4s,v5.4s}, [x2], #32	/* store back interleaved, then x2 += 32 bytes */
Packit c32a2d
	
Packit c32a2d
	subs	w4, w4, #1
Packit c32a2d
	b.ne	1b
Packit c32a2d
	
Packit c32a2d
	mov		w4, #4	/* loop 2 also runs 4 times */
Packit c32a2d
	mov		x6, #-64	/* x1 now steps backward one 64-byte row per load */
Packit c32a2d
	/*
		Loop 2: identical arithmetic to loop 1. x0 keeps moving forward by
		128 bytes per row, while x1 moves backward. The first row read via
		x1 is the one at the position loop 1 left it, i.e. 1024 bytes past
		its value at entry (16 rows of 64 bytes were consumed by loop 1).
	*/
2:
Packit c32a2d
	/* Rows 0 and 1 of this iteration. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
Packit c32a2d
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
Packit c32a2d
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
Packit c32a2d
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
Packit c32a2d
	
Packit c32a2d
	/* v24 / v25 = lane-wise partial sums for row 0 / row 1. */
	fmul	v24.4s, v0.4s, v16.4s
Packit c32a2d
	fmul	v25.4s, v4.4s, v20.4s
Packit c32a2d
	fmla	v24.4s, v1.4s, v17.4s
Packit c32a2d
	fmla	v25.4s, v5.4s, v21.4s
Packit c32a2d
	fmla	v24.4s, v2.4s, v18.4s
Packit c32a2d
	fmla	v25.4s, v6.4s, v22.4s
Packit c32a2d
	fmla	v24.4s, v3.4s, v19.4s
Packit c32a2d
	fmla	v25.4s, v7.4s, v23.4s
Packit c32a2d
	
Packit c32a2d
	/* Rows 2 and 3 of this iteration. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
Packit c32a2d
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
Packit c32a2d
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
Packit c32a2d
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6
Packit c32a2d
	
Packit c32a2d
	/* v26 / v27 = lane-wise partial sums for row 2 / row 3. */
	fmul	v26.4s, v0.4s, v16.4s
Packit c32a2d
	fmul	v27.4s, v4.4s, v20.4s
Packit c32a2d
	fmla	v26.4s, v1.4s, v17.4s
Packit c32a2d
	fmla	v27.4s, v5.4s, v21.4s
Packit c32a2d
	fmla	v26.4s, v2.4s, v18.4s
Packit c32a2d
	fmla	v27.4s, v6.4s, v22.4s
Packit c32a2d
	fmla	v26.4s, v3.4s, v19.4s
Packit c32a2d
	fmla	v27.4s, v7.4s, v23.4s
Packit c32a2d
	
Packit c32a2d
	/* Horizontal reduction, scaling, conversion, range test and interleaved store, exactly as in loop 1. */
	faddp	v0.4s, v24.4s, v25.4s
Packit c32a2d
	faddp	v1.4s, v26.4s, v27.4s
Packit c32a2d
	faddp	v0.4s, v0.4s, v1.4s
Packit c32a2d
	fmul	v1.4s, v0.4s, v30.4s
Packit c32a2d
	ld2		{v4.4s,v5.4s}, [x2]
Packit c32a2d
	fcvtns	v4.4s, v1.4s
Packit c32a2d
	fcmgt	v2.4s, v0.4s, v28.4s
Packit c32a2d
	fcmgt	v3.4s, v29.4s, v0.4s
Packit c32a2d
	add		v2.4s, v2.4s, v3.4s
Packit c32a2d
	add		v31.4s, v31.4s, v2.4s
Packit c32a2d
	st2		{v4.4s,v5.4s}, [x2], #32
Packit c32a2d
	
Packit c32a2d
	subs	w4, w4, #1
Packit c32a2d
	b.ne	2b
Packit c32a2d
	
Packit c32a2d
	/*
		Fold the four per-lane counters in v31 into lane 0 of v0.
		AARCH64_DUP_2D / AARCH64_DUP_4S are macros defined outside this
		file; presumably they broadcast 64-bit lane 1 of v31 into v0 and
		32-bit lane 1 of v0 into v1 respectively - confirm. Under that
		assumption lane 0 of v0 ends up as the sum of all four counters.
	*/
	AARCH64_DUP_2D(v0, v31, 1)
Packit c32a2d
	add		v0.4s, v0.4s, v31.4s
Packit c32a2d
	AARCH64_DUP_4S(v1, v0, 1)
Packit c32a2d
	add		v0.4s, v0.4s, v1.4s
Packit c32a2d
	umov	w0, v0.s[0]	/* w0 = lane 0 of the folded counters */
Packit c32a2d
	neg		w0, w0	/* counters were accumulated as -1 per hit; negate to return a positive count */
Packit c32a2d
	
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/* NONEXEC_STACK is a macro defined outside this file; presumably it emits the marker for a non-executable stack on platforms that use one - confirm. */
NONEXEC_STACK