/*
	dct36_neon64: NEON optimized dct36 for AArch64

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	dct36_aarch64_COS9: constant table for dct36_neon64.

	All words are IEEE-754 single precision bit patterns. The routine reads
	the table front to back with post-incremented loads, so the order of the
	words is part of the interface between table and code:

	  bytes  0..63  -> v16, v17, v18, v19 (cosine factors, each value stored
	                   twice so that one vector lane pair covers two inputs)
	  bytes 64..95  -> v5, v6 (scale factors applied to the sums held in
	                   v17 / v19)
	  bytes 96..103 -> low half of v7 (scale factors for the middle pair)

	The COS9 indices below follow the lane comments inside the routine.
	NOTE(review): the last ten words look like the tfcos36 factors of the C
	reference implementation - confirm against the C source.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
dct36_aarch64_COS9:
	/* v16 = COS9[3,3,6,6] */
	.word 0x3f5db3d7	/* ~0.8660254 */
	.word 0x3f5db3d7
	.word 0x3f000000	/*  0.5 */
	.word 0x3f000000
	/* v17 = COS9[1,1,2,2] */
	.word 0x3f7c1c5c	/* ~0.9848077 */
	.word 0x3f7c1c5c
	.word 0x3f708fb2	/* ~0.9396926 */
	.word 0x3f708fb2
	/* v18 = COS9[5,5,8,8] */
	.word 0x3f248dbb	/* ~0.6427876 */
	.word 0x3f248dbb
	.word 0x3e31d0d4	/* ~0.1736482 */
	.word 0x3e31d0d4
	/* v19 = COS9[7,7,4,4] */
	.word 0x3eaf1d44	/* ~0.3420201 */
	.word 0x3eaf1d44
	.word 0x3f441b7d	/* ~0.7660444 */
	.word 0x3f441b7d
	/* v5: factors for output pairs 0,1,2,3 */
	.word 0x3f007d2b	/* ~0.5019099 */
	.word 0x3f0483ee	/* ~0.5176381 */
	.word 0x3f0d3b7d	/* ~0.5516890 */
	.word 0x3f1c4257	/* ~0.6103873 */
	/* v6: factors for output pairs 8,7,6,5 (stored in descending order) */
	.word 0x40b79454	/* ~5.7368566 */
	.word 0x3ff746ea	/* ~1.9318517 */
	.word 0x3f976fd9	/* ~1.1831008 */
	.word 0x3f5f2944	/* ~0.8717234 */
	/* v7 (low half): factors for the even / odd sum of output pair 4 */
	.word 0x3f800000	/*  1.0 */
	.word 0x3f3504f3	/* ~0.7071068 */
	
/*
	dct36_neon64

	Register interface, as used by the code below:
	  x0  in   pointer to 18 consecutive 32-bit floats; only read
	  x1  in   pointer to 18 floats; each is added to one product before
	           that result is stored through x4
	  x2  out  pointer to 18 floats; every element is written
	  x3  in   pointer to 36 float multipliers; every element is read
	  x4  out  base of 18 floats written with a stride of 128 bytes
	           (element k goes to byte offset 128*k)

	NOTE(review): presumably this matches the C prototype
	  void dct36(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf)
	of the generic implementation - confirm against the caller.

	Clobbers: x0, x4 (left advanced by 512), x5, x6, x7, x9, v0-v7, v16-v29.
	No stack access and no calls; v8-v15 and x19-x30 are not touched.

	In the lane comments, in[] means the input values after the two
	prefix-sum passes at the top of the routine.
*/
	.text
	ALIGN4
	.globl ASM_NAME(dct36_neon64)
#ifdef __ELF__
	.type ASM_NAME(dct36_neon64), %function
#endif
ASM_NAME(dct36_neon64):
	/* x5 = address of the constant table (the PCREL macros are defined outside this file) */
	adrp		x5, AARCH64_PCREL_HI(dct36_aarch64_COS9)
	add			x5, x5, AARCH64_PCREL_LO(dct36_aarch64_COS9)
	/* v28 = all ones, then shifted so only the odd 32-bit lanes (1 and 3) stay set; v29 = 0 */
	cmeq		v28.16b, v28.16b, v28.16b
	eor			v29.16b, v29.16b, v29.16b
	shl			v28.2d, v28.2d, #32
	/* v0-v3 = in[0..15], low half of v4 = in[16,17] */
	ld1			{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], #64
	ld1			{v4.2s}, [x0]
	
	/* pass 1: in[i] += in[i-1] for i = 1..17, using the vectors shifted up by one element */
	ext			v16.16b, v29.16b, v0.16b, #12
	ext			v17.16b, v0.16b, v1.16b, #12
	ext			v18.16b, v1.16b, v2.16b, #12
	ext			v19.16b, v2.16b, v3.16b, #12
	ext			v20.16b, v3.16b, v4.16b, #12
	fadd		v0.4s, v0.4s, v16.4s
	fadd		v1.4s, v1.4s, v17.4s
	fadd		v2.4s, v2.4s, v18.4s
	fadd		v3.4s, v3.4s, v19.4s
	fadd		v4.2s, v4.2s, v20.2s
	
	/*
	pass 2: in[i] += in[i-2] for odd i = 3..17.
	The masked copies keep only the odd lanes, so even elements get +0.
	The results are written two elements further up, which also moves
	in[0,1] into the upper half of v0 (lower half zero).
	*/
	ext			v16.16b, v0.16b, v1.16b, #8
	ext			v17.16b, v1.16b, v2.16b, #8
	ext			v18.16b, v2.16b, v3.16b, #8
	ext			v19.16b, v3.16b, v4.16b, #8
	and			v20.16b, v0.16b, v28.16b
	ext			v0.16b, v29.16b, v0.16b, #8
	and			v21.16b, v1.16b, v28.16b
	and			v22.16b, v2.16b, v28.16b
	and			v23.16b, v3.16b, v28.16b
	fadd		v1.4s, v20.4s, v16.4s
	fadd		v2.4s, v21.4s, v17.4s
	fadd		v3.4s, v22.4s, v18.4s
	fadd		v4.4s, v23.4s, v19.4s
	
/*
v0 in[-,-,0,1]       (the two "-" lanes are zero)
v1 in[2,3,4,5]
v2 in[6,7,8,9]
v3 in[10,11,12,13]
v4 in[14,15,16,17]
*/
	
	/* rotate the upper halves of v2/v3/v4; v5 keeps the old v2 for the last insert */
	orr			v5.16b, v2.16b, v2.16b
	ins			v2.d[1], v3.d[1]
	ins			v3.d[1], v4.d[1]
	ins			v4.d[1], v5.d[1]
	
/*
v2 in[6,7,12,13]
v3 in[10,11,16,17]
v4 in[14,15,8,9]
*/
	
	/* load the cosine factors and start the shared term v20 = v0 + v2 * v16 */
	ld1			{v16.4s,v17.4s,v18.4s,v19.4s}, [x5], #64
	orr			v20.16b, v0.16b, v0.16b
	fmla		v20.4s, v2.4s, v16.4s
	
/*
v17 COS9_[1,1,2,2]
v18 COS9_[5,5,8,8]
v19 COS9_[7,7,4,4]
v16 COS9_[3,3,6,6]
v20 [ta33,tb33,in[0]+ta66,in[1]+tb66]
    with ta33 = in[6]*COS9_[3], tb33 = in[7]*COS9_[3],
         ta66 = in[12]*COS9_[6], tb66 = in[13]*COS9_[6]
*/
	
	/*
	Four sets of partial sums are built in v24, v25, v26, v27. In each
	register the lower lane pair holds the even/odd input version of one
	sum and the upper lane pair the even/odd version of a second sum.
	v26 and v27 end up with their upper lane pairs negated; that is undone
	by the fneg after the first transposition step.
	*/
	orr			v21.16b, v20.16b, v20.16b
	orr			v23.16b, v20.16b, v20.16b
	zip2		v25.2d, v29.2d, v2.2d
	fsub		v22.4s, v1.4s, v3.4s
	fmul		v24.4s, v1.4s, v17.4s
	fmul		v26.4s, v1.4s, v18.4s
	fmul		v27.4s, v1.4s, v19.4s
	fmla		v21.4s, v3.4s, v18.4s
	fmla		v23.4s, v3.4s, v19.4s
	fmla		v20.4s, v4.4s, v18.4s
	fsub		v25.4s, v0.4s, v25.4s
	fsub		v22.4s, v22.4s, v4.4s
	fmla		v24.4s, v4.4s, v19.4s
	fmla		v26.4s, v4.4s, v17.4s
	fmla		v27.4s, v3.4s, v17.4s
	fmla		v25.4s, v22.4s, v16.4s
	fadd		v24.4s, v24.4s, v21.4s
	fsub		v26.4s, v26.4s, v23.4s
	fsub		v27.4s, v27.4s, v20.4s
	
	/*
	4x4 transpose of v24..v27 so that each of v20..v23 holds one kind of
	sum for all four sets: v20/v21 = lane 0/2 values (even inputs),
	v22/v23 = lane 1/3 values (odd inputs).
	*/
	zip1		v16.4s, v24.4s, v25.4s
	zip2		v17.4s, v24.4s, v25.4s
	zip1		v18.4s, v26.4s, v27.4s
	zip2		v19.4s, v26.4s, v27.4s
	fneg		v19.4s, v19.4s
	zip1		v20.2d, v16.2d, v18.2d
	zip1		v21.2d, v17.2d, v19.2d
	zip2		v22.2d, v16.2d, v18.2d
	zip2		v23.2d, v17.2d, v19.2d
	
	/* v5, v6, v7 = scale factors from the second part of the table */
	ld1			{v5.4s,v6.4s}, [x5], #32
	ld1			{v7.2s}, [x5]
	/*
	Sum/difference of the transposed vectors, odd-input parts scaled by
	v5 / v6. Interleaved with it, the upper half of v0 accumulates
	in[0]-in[4]+in[8]-in[12]+in[16] and in[1]-in[5]+in[9]-in[13]+in[17].
	*/
	fsub		v0.4s, v0.4s, v1.4s
	fsub		v4.4s, v4.4s, v2.4s
	fadd		v17.4s, v22.4s, v23.4s
	fsub		v19.4s, v23.4s, v22.4s
	fadd		v0.4s, v0.4s, v3.4s
	fadd		v16.4s, v20.4s, v21.4s
	fsub		v18.4s, v21.4s, v20.4s
	fadd		v0.4s, v0.4s, v4.4s
	fmul		v17.4s, v17.4s, v5.4s
	fmul		v19.4s, v19.4s, v6.4s
	/*
	The macro is defined outside this file; the code after it relies on
	lanes 0,1 of v0 holding what was in lanes 2,3 (presumably
	"dup v0.2d, v0.d[1]" - confirm in mangle.h).
	*/
	AARCH64_DUP_2D(v0, v0, 1)
	fmul		v0.2s, v0.2s, v7.2s
	
/*
v16 tmp[0,1,2,3]
v17 tmp[17,16,15,14]
v18 tmp[8,7,6,5]
v19 tmp[9,10,11,12]
v0 tmp[4,13]
Each pair (v16,v17), (v18,v19), (v0.s[0],v0.s[1]) is combined below by one
add and one subtract.
*/
	
	/*
	Outputs 5..12.
	x0 = x4 + 5*128, x5 = x3 + 5 floats, x6 = x3 + 23 floats, x7 = x1 + 5 floats
	*/
	add			x0, x4, #640
	add			x5, x3, #20
	add			x6, x3, #92
	add			x7, x1, #20
	ld1			{v1.4s,v2.4s}, [x5]
	ld1			{v3.4s,v4.4s}, [x6]
	ld1			{v5.4s,v6.4s}, [x7]
	fadd		v20.4s, v16.4s, v17.4s
	fsub		v21.4s, v16.4s, v17.4s
	fmul		v4.4s, v20.4s, v4.4s
	fmla		v6.4s, v21.4s, v2.4s
	/* rev64 + ext #8 reverses the four lanes for the mirrored half */
	rev64		v20.4s, v20.4s
	rev64		v21.4s, v21.4s
	ext			v20.16b, v20.16b, v20.16b, #8
	ext			v21.16b, v21.16b, v21.16b, #8
	fmul		v3.4s, v20.4s, v3.4s
	fmla		v5.4s, v21.4s, v1.4s
	add			x5, x2, #20
	/* x9 = byte stride between consecutive elements written through x4/x0 */
	mov			x9, #128
	st1			{v3.4s,v4.4s}, [x5]
	st1			{v5.s}[0], [x0], x9
	st1			{v5.s}[1], [x0], x9
	st1			{v5.s}[2], [x0], x9
	st1			{v5.s}[3], [x0], x9
	st1			{v6.s}[0], [x0], x9
	st1			{v6.s}[1], [x0], x9
	st1			{v6.s}[2], [x0], x9
	st1			{v6.s}[3], [x0], x9
	
	/*
	Outputs 0..3 and 14..17.
	x0 = x4 + 14*128, x5 = x3 + 14 floats, x6 = x3 + 32 floats, x7 = x1 + 14 floats
	*/
	add			x0, x4, #1792
	add			x5, x3, #56
	add			x6, x3, #128
	add			x7, x1, #56
	ld1			{v1.4s}, [x3]
	ld1			{v2.4s,v3.4s}, [x5]
	ld1			{v4.4s}, [x6]
	ld1			{v5.4s}, [x1]
	ld1			{v6.4s}, [x7]
	fadd		v20.4s, v18.4s, v19.4s
	fsub		v21.4s, v18.4s, v19.4s
	fmul		v3.4s, v20.4s, v3.4s
	fmla		v5.4s, v21.4s, v1.4s
	rev64		v20.4s, v20.4s
	rev64		v21.4s, v21.4s
	ext			v20.16b, v20.16b, v20.16b, #8
	ext			v21.16b, v21.16b, v21.16b, #8
	fmul		v4.4s, v20.4s, v4.4s
	fmla		v6.4s, v21.4s, v2.4s
	add			x5, x2, #56
	st1			{v3.4s}, [x2]
	st1			{v4.4s}, [x5]
	/* these four stores advance x4 itself by 4*128 = 512 bytes */
	st1			{v5.s}[0], [x4], x9
	st1			{v5.s}[1], [x4], x9
	st1			{v5.s}[2], [x4], x9
	st1			{v5.s}[3], [x4], x9
	st1			{v6.s}[0], [x0], x9
	st1			{v6.s}[1], [x0], x9
	st1			{v6.s}[2], [x0], x9
	st1			{v6.s}[3], [x0], x9
	
	/*
	Outputs 4 and 13, scalar. s0 / s1 = the two scaled sums from v0.
	x4 now points at element 4, so [x4] is element 4 and
	[x4, #1152] is element 13 (512 + 1152 = 13*128).
	*/
	ins			v1.s[0], v0.s[1]
	ldr			s2, [x3, #16]
	ldr			s3, [x3, #52]
	ldr			s4, [x3, #88]
	ldr			s5, [x3, #124]
	ldr			s6, [x1, #16]
	ldr			s7, [x1, #52]
	fadd		s16, s0, s1
	fsub		s17, s0, s1
	fmul		s4, s16, s4
	fmul		s5, s16, s5
	fmadd		s6, s17, s2, s6
	fmadd		s7, s17, s3, s7
	str			s4, [x2, #16]
	str			s5, [x2, #52]
	str			s6, [x4]
	str			s7, [x4, #1152]
	
	ret
	
/* macro defined outside this file; presumably emits the non-executable-stack note - confirm in mangle.h */
NONEXEC_STACK