Blame src/libmpg123/dct64_avx_float.S

Packit c32a2d
/*
Packit c32a2d
	dct64_avx_float: AVX optimized dct64 for x86-64 (float output version)
Packit c32a2d
Packit c32a2d
	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Register roles used throughout this file.
	out0, out1 and samples are the three function arguments, held in the
	registers the System V AMD64 ABI passes them in; when IS_MSABI is defined
	the function prologue moves the arguments into these same registers.
	costab is a scratch register that the function loads with the address of
	costab_avx.
*/
#define samples %rdx
Packit c32a2d
#define costab %rcx
Packit c32a2d
#define out0 %rdi
Packit c32a2d
#define out1 %rsi
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
	void dct64_real_avx(real *out0, real *out1, real *samples);
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
/*
	Constant table for dct64_real_avx.

	Each .long is the raw bit pattern of a 32-bit float; the function reads the
	table with packed single-precision instructions (vmulps / vmovaps).
	The table is 32 entries = 128 bytes, i.e. four 256-bit rows. The rows at
	byte offsets 64 and 96 are read with vmovaps, so the ALIGN32 below (a macro
	not defined in this file) is relied on to give 32-byte alignment.

	Layout by byte offset, using the names from the comments in the function:
	   0: cos64[0..7]
	  32: cos64[8..15]
	  64: cos32[0..7]
	  96: cos16[0..3], cos8[0..1], cos4[0], one zero word that only fills the row

	The table is placed in .rodata, or in .data when __APPLE__ is defined.
*/
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	ALIGN32
Packit c32a2d
costab_avx:
Packit c32a2d
/* offset 0: cos64[0..7] */
	.long 1056974725
Packit c32a2d
	.long 1057056395
Packit c32a2d
	.long 1057223771
Packit c32a2d
	.long 1057485416
Packit c32a2d
	.long 1057855544
Packit c32a2d
	.long 1058356026
Packit c32a2d
	.long 1059019886
Packit c32a2d
	.long 1059897405
Packit c32a2d
/* offset 32: cos64[8..15] */
	.long 1061067246
Packit c32a2d
	.long 1062657950
Packit c32a2d
	.long 1064892987
Packit c32a2d
	.long 1066774581
Packit c32a2d
	.long 1069414683
Packit c32a2d
	.long 1073984175
Packit c32a2d
	.long 1079645762
Packit c32a2d
	.long 1092815430
Packit c32a2d
/* offset 64: cos32[0..7] */
	.long 1057005197
Packit c32a2d
	.long 1057342072
Packit c32a2d
	.long 1058087743
Packit c32a2d
	.long 1059427869
Packit c32a2d
	.long 1061799040
Packit c32a2d
	.long 1065862217
Packit c32a2d
	.long 1071413542
Packit c32a2d
	.long 1084439708
Packit c32a2d
/* offset 96: cos16[0..3] */
	.long 1057128951
Packit c32a2d
	.long 1058664893
Packit c32a2d
	.long 1063675095
Packit c32a2d
	.long 1076102863
Packit c32a2d
/* offset 112: cos8[0..1] */
	.long 1057655764
Packit c32a2d
	.long 1067924853
Packit c32a2d
/* offset 120: cos4[0] */
	.long 1060439283
Packit c32a2d
/* offset 124: zero word; the function never uses this lane as a factor */
	.long 0
Packit c32a2d
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
/*
	void dct64_real_avx(real *out0, real *out1, real *samples);

	Reads 32 floats from samples (four 32-byte loads, no alignment required),
	runs five sum/difference stages scaled by the constants in costab_avx,
	applies a final set of additions, and scatters the results as single
	floats at a 64-byte stride:
	  out0: 17 stores at byte offsets 0, 64, ..., 1024
	  out1: 16 stores at byte offsets 0, 64, ..., 960
	The value at out0 + 0 and out1 + 0 is the same float.

	In (System V AMD64):  rdi = out0, rsi = out1, rdx = samples
	In (IS_MSABI):        rcx = out0, rdx = out1, r8 = samples; the prologue
	                      moves them to rdi/rsi/rdx and saves rbp, rdi, rsi
	                      and xmm6-xmm12, all restored before returning.
	Out:      nothing is returned in a register on purpose (rax ends up
	          holding the stride helper value).
	Clobbers: rax, rcx, rdi, rsi (System V; rdi/rsi are advanced while
	          storing), ymm0-ymm12, flags. Under IS_MSABI also rdx.

	The bufs[...] indices in the per-instruction comments name the lanes of
	the intermediate values; presumably they follow the working buffer of the
	C reference implementation, which is not visible here.
	Comments below of the form "-> out0 +N" give the byte offset of a store
	relative to the value the pointer had on entry.
*/
.globl ASM_NAME(dct64_real_avx)
Packit c32a2d
ASM_NAME(dct64_real_avx):
Packit c32a2d
/*
	Win64 prologue: xmm6-xmm12, rdi and rsi are callee-saved in that ABI and
	are all written below. 112 = 7 * 16 bytes of save area for xmm6-xmm12.
	After push %rbp the stack pointer is 16-byte aligned if the caller
	followed the ABI, which the movaps stores require.
*/
#ifdef IS_MSABI
Packit c32a2d
	push		%rbp
Packit c32a2d
	mov			%rsp, %rbp
Packit c32a2d
	sub			$112, %rsp
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm12, 96(%rsp)
Packit c32a2d
	push		%rdi
Packit c32a2d
	push		%rsi
Packit c32a2d
	mov			%rcx, %rdi	# out0: Win64 arg 1 -> rdi (must precede the costab load, which reuses rcx)
Packit c32a2d
	mov			%rdx, %rsi	# out1: Win64 arg 2 -> rsi
Packit c32a2d
	mov			%r8, %rdx	# samples: Win64 arg 3 -> rdx
Packit c32a2d
#endif
Packit c32a2d
	leaq		costab_avx(%rip), costab
Packit c32a2d
Packit c32a2d
/*
	Stage 1 (cos64): pair the first 16 inputs with the last 16 in mirrored
	order; sums go to ymm4/ymm5, differences times cos64 go to ymm7/ymm6.
	With selector 0x23 both lanes of the vperm2f128 result come from the
	memory operand (its high half, then its low half), so the register source
	contributes no data; together with the in-lane reversal by vshufps 0x1b
	this loads the 8 floats fully reversed.
*/
	vmovups		(samples), %ymm0			# input[0,1,2,3,4,5,6,7]
Packit c32a2d
	vmovups		32(samples), %ymm1			# input[8,9,10,11,12,13,14,15]
Packit c32a2d
	vperm2f128	$0x23, 64(samples), %ymm2, %ymm2
Packit c32a2d
	vperm2f128	$0x23, 96(samples), %ymm3, %ymm3
Packit c32a2d
	vshufps		$0x1b, %ymm2, %ymm2, %ymm2	# input[23,22,21,20,19,18,17,16]
Packit c32a2d
	vshufps		$0x1b, %ymm3, %ymm3, %ymm3	# input[31,30,29,28,27,26,25,24]
Packit c32a2d
	vsubps		%ymm2, %ymm1, %ymm6	# input[8..15] - input[23..16]
Packit c32a2d
	vsubps		%ymm3, %ymm0, %ymm7	# input[0..7] - input[31..24]
Packit c32a2d
	vaddps		%ymm0, %ymm3, %ymm4			# bufs[0,1,2,3,4,5,6,7]
Packit c32a2d
	vaddps		%ymm1, %ymm2, %ymm5			# bufs[8,9,10,11,12,13,14,15]
Packit c32a2d
	vmulps		(costab), %ymm7, %ymm7		# bufs[31,30,29,28,27,26,25,24] cos64[0,1,2,3,4,5,6,7]
Packit c32a2d
	vmulps		32(costab), %ymm6, %ymm6	# bufs[23,22,21,20,19,18,17,16] cos64[8,9,10,11,12,13,14,15]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 2 (cos32): reverse ymm5/ymm6 across the whole register (in-lane
	reversal, then lane swap), form sums and differences against ymm4/ymm7,
	and scale the differences by cos32.
*/
	vmovaps		64(costab), %ymm8			# cos32[0,1,2,3,4,5,6,7]
Packit c32a2d
	
Packit c32a2d
	vshufps		$0x1b, %ymm5, %ymm5, %ymm5
Packit c32a2d
	vshufps		$0x1b, %ymm6, %ymm6, %ymm6
Packit c32a2d
	vperm2f128	$0x01, %ymm5, %ymm5, %ymm5	# bufs[15,14,13,12,11,10,9,8]
Packit c32a2d
	vperm2f128	$0x01, %ymm6, %ymm6, %ymm6	# bufs[16,17,18,19,20,21,22,23]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1	# bufs[0..7] - bufs[15..8]
Packit c32a2d
	vsubps		%ymm6, %ymm7, %ymm3	# bufs[31..24] - bufs[16..23]
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[32,33,34,35,36,37,38,39]
Packit c32a2d
	vaddps		%ymm6, %ymm7, %ymm2			# bufs[48,49,50,51,52,53,54,55]
Packit c32a2d
	vmulps		%ymm1, %ymm8, %ymm1			# bufs[47,46,45,44,43,42,41,40]
Packit c32a2d
	vmulps		%ymm3, %ymm8, %ymm3			# bufs[63,62,61,60,59,58,57,56]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 3 (cos16): ymm8 keeps the whole last table row for the later stages;
	ymm9 gets cos16 broadcast to both lanes. The lane permutes regroup the
	stage 2 results so that each 4-float group meets its mirrored partner.
*/
	vmovaps		96(costab), %ymm8			# cos16[0,1,2,3]:cos8[0,1]:cos4[0]:-
Packit c32a2d
	vperm2f128	$0x00, %ymm8, %ymm8, %ymm9	# cos16[0,1,2,3,0,1,2,3]
Packit c32a2d
	
Packit c32a2d
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm4	# bufs[32,33,34,35,47,46,45,44]
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm5
Packit c32a2d
	vshufps		$0x1b, %ymm5, %ymm5, %ymm5	# bufs[39,38,37,36,40,41,42,43]
Packit c32a2d
	vperm2f128	$0x20, %ymm3, %ymm2, %ymm6	# bufs[48,49,50,51,63,62,61,60]
Packit c32a2d
	vperm2f128	$0x31, %ymm3, %ymm2, %ymm7
Packit c32a2d
	vshufps		$0x1b, %ymm7, %ymm7, %ymm7	# bufs[55,54,53,52,56,57,58,59]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[0,1,2,3,8,9,10,11]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[16,17,18,19,24,25,26,27]
Packit c32a2d
	vmulps		%ymm1, %ymm9, %ymm1			# bufs[7,6,5,4,15,14,13,12]
Packit c32a2d
	vmulps		%ymm3, %ymm9, %ymm3			# bufs[23,22,21,20,31,30,29,28]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 4 (cos8): copy the upper half of the last table row into both lanes
	of ymm8, then duplicate its first 64 bits (the cos8 pair) across ymm9.
	The unpack/shuffle sequence arranges pairs of floats against their
	mirrored partners within each lane.
*/
	vperm2f128	$0x11, %ymm8, %ymm8, %ymm8	# cos8[0,1]:cos4[0]:-:cos8[0,1]:cos4[0]:-
Packit c32a2d
	vmovddup	%ymm8, %ymm9				# cos8[0,1,0,1,0,1,0,1]
Packit c32a2d
	
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[0,7,1,6,8,15,9,14]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[2,5,3,4,10,13,11,12]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[16,23,17,22,24,31,25,30]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[18,21,19,20,26,29,27,28]
Packit c32a2d
	vshufps		$0xd8, %ymm4, %ymm4, %ymm4	# bufs[0,1,7,6,8,9,15,14]
Packit c32a2d
	vshufps		$0x72, %ymm5, %ymm5, %ymm5	# bufs[3,2,4,5,11,10,12,13]
Packit c32a2d
	vshufps		$0xd8, %ymm6, %ymm6, %ymm6	# bufs[16,17,23,22,24,25,31,30]
Packit c32a2d
	vshufps		$0x72, %ymm7, %ymm7, %ymm7	# bufs[19,18,20,21,27,26,28,29]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[32,33,36,37,40,41,44,45]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[48,49,52,53,56,57,60,61]
Packit c32a2d
	vmulps		%ymm1, %ymm9, %ymm1			# bufs[35,34,39,38,43,42,47,46]
Packit c32a2d
	vmulps		%ymm3, %ymm9, %ymm3			# bufs[51,50,55,54,59,58,63,62]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 5 (cos4): broadcast element 2 of each lane of ymm8 (cos4[0]) and
	combine neighbouring values; sums end up in ymm0/ymm2 and scaled
	differences in ymm1/ymm3.
*/
	vpermilps	$0xaa, %ymm8, %ymm8			# cos4[0,0,0,0,0,0,0,0]
Packit c32a2d
	
Packit c32a2d
	vshufps		$0xd8, %ymm0, %ymm0, %ymm0	# bufs[32,36,33,37,40,44,41,45]
Packit c32a2d
	vshufps		$0xd8, %ymm1, %ymm1, %ymm1	# bufs[35,39,34,38,43,47,42,46]
Packit c32a2d
	vshufps		$0xd8, %ymm2, %ymm2, %ymm2	# bufs[48,52,49,53,56,60,57,61]
Packit c32a2d
	vshufps		$0xd8, %ymm3, %ymm3, %ymm3	# bufs[51,55,50,54,59,63,58,62]
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[32,35,36,39,40,43,44,47]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[33,34,37,38,41,42,45,46]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[48,51,52,55,56,59,60,63]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[49,50,53,54,57,58,61,62]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[0,2,4,6,8,10,12,14]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[16,18,20,22,24,26,28,30]
Packit c32a2d
	vmulps		%ymm1, %ymm8, %ymm1			# bufs[1,3,5,7,9,11,13,15]
Packit c32a2d
	vmulps		%ymm3, %ymm8, %ymm3			# bufs[17,19,21,23,25,27,29,31]
Packit c32a2d
	
Packit c32a2d
/*
	Post-processing additions. ymm8 is zeroed and stays zero to the end; it is
	the source of the cleared lanes in every vblendps below.
	Mask 0xaa keeps the odd elements of ymm1/ymm3 and zeroes the even ones, so
	only the odd elements of the sums ymm0/ymm2 receive an addend. The unpacks
	then interleave sums and differences back into index order.
*/
	vxorps		%ymm8, %ymm8, %ymm8
Packit c32a2d
	vblendps	$0xaa, %ymm1, %ymm8, %ymm5
Packit c32a2d
	vblendps	$0xaa, %ymm3, %ymm8, %ymm6
Packit c32a2d
	vaddps		%ymm5, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[0,1,2,3,8,9,10,11]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[4,5,6,7,12,13,14,15]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[16,17,18,19,24,25,26,27]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[20,21,22,23,28,29,30,31]
Packit c32a2d
	
Packit c32a2d
/*
	From here on only 128-bit values are used: the upper lanes are extracted
	into xmm0-xmm3, the lower lanes remain in xmm4-xmm7.
*/
	vextractf128	$0x1, %ymm4, %xmm0		# bufs[8,9,10,11]
Packit c32a2d
	vextractf128	$0x1, %ymm5, %xmm1		# bufs[12,13,14,15]
Packit c32a2d
	vextractf128	$0x1, %ymm6, %xmm2		# bufs[24,25,26,27]
Packit c32a2d
	vextractf128	$0x1, %ymm7, %xmm3		# bufs[28,29,30,31]
Packit c32a2d
	
Packit c32a2d
/*
	Shuffle 0x1e reorders a group to elements [2,3,1,0]; blend mask 0x7 then
	keeps the first three and takes a zero from xmm8 for the last, so the last
	element of each group is left unchanged by the following add.
*/
	vshufps		$0x1e, %xmm5, %xmm5, %xmm9	# bufs[6,7,5,4]
Packit c32a2d
	vshufps		$0x1e, %xmm1, %xmm1, %xmm10	# bufs[14,15,13,12]
Packit c32a2d
	vshufps		$0x1e, %xmm7, %xmm7, %xmm11	# bufs[22,23,21,20]
Packit c32a2d
	vshufps		$0x1e, %xmm3, %xmm3, %xmm12	# bufs[30,31,29,28]
Packit c32a2d
	vblendps	$0x7, %xmm9, %xmm8, %xmm9	# bufs[6,7,5,-]
Packit c32a2d
	vblendps	$0x7, %xmm10, %xmm8, %xmm10 # bufs[14,15,13,-]
Packit c32a2d
	vblendps	$0x7, %xmm11, %xmm8, %xmm11	# bufs[22,23,21,-]
Packit c32a2d
	vblendps	$0x7, %xmm12, %xmm8, %xmm12	# bufs[30,31,29,-]
Packit c32a2d
	vaddps		%xmm5, %xmm9, %xmm5
Packit c32a2d
	vaddps		%xmm1, %xmm10, %xmm1
Packit c32a2d
	vaddps		%xmm7, %xmm11, %xmm7
Packit c32a2d
	vaddps		%xmm3, %xmm12, %xmm3
Packit c32a2d
	
Packit c32a2d
	prefetcht0	1024(out0)	# hint for out0 +1024, the first address stored below
Packit c32a2d
	
Packit c32a2d
/*
	The shuffled copies of xmm0/xmm2 are taken before xmm0/xmm2 are updated,
	and xmm0/xmm2 are updated before xmm1/xmm3 are, so each add sees the
	operand values from before this group.
*/
	vshufps		$0x1e, %xmm0, %xmm0, %xmm9	# bufs[10,11,9,8]
Packit c32a2d
	vshufps		$0x1e, %xmm2, %xmm2, %xmm10	# bufs[26,27,25,24]
Packit c32a2d
	vaddps		%xmm1, %xmm0, %xmm0
Packit c32a2d
	vaddps		%xmm3, %xmm2, %xmm2
Packit c32a2d
	vblendps	$0x7, %xmm9, %xmm8, %xmm9	# bufs[10,11,9,-]
Packit c32a2d
	vblendps	$0x7, %xmm10, %xmm8, %xmm10	# bufs[26,27,25,-]
Packit c32a2d
	vaddps		%xmm1, %xmm9, %xmm1
Packit c32a2d
	vaddps		%xmm3, %xmm10, %xmm3
Packit c32a2d
	
Packit c32a2d
/*
	The upper ymm halves are no longer needed. They are cleared here, ahead
	of the non-VEX SSE instructions (movss, movhlps, shufps, addps) used in
	the store sequence; the low 128 bits of every register are kept.
	NOTE(review): the prefetch below targets out1 +1024, which is not among
	the addresses this function stores to (out1 stores end at +960).
*/
	vzeroupper
Packit c32a2d
	prefetcht0	1024(out1)
Packit c32a2d
	
Packit c32a2d
/*
	Store sequence, part 1: xmm4, xmm0, xmm5, xmm1.
	rax holds -128 or +128 (two 64-byte output slots) and is used as a scaled
	index; out0 is walked downwards from +1024, out1 upwards from +0.
	For each register element 0 is stored first, then element 2 (via movhlps),
	then element 1 and element 3 (via the pair swap shufps 0xb1 and movhlps).
*/
	addq		$1024, out0	# out0 = entry +1024
Packit c32a2d
	movq		$-128, %rax
Packit c32a2d
	movss		%xmm4, (out0)	# -> out0 +1024
Packit c32a2d
	movss		%xmm0, (out0,%rax,1)	# -> out0 +896
Packit c32a2d
	movss		%xmm5, (out0,%rax,2)	# -> out0 +768
Packit c32a2d
	movss		%xmm1, -128(out0,%rax,2)	# -> out0 +640
Packit c32a2d
	leaq		(out0,%rax,4), out0	# out0 = entry +512
Packit c32a2d
	movhlps		%xmm4, %xmm9	# low half of xmm9 = elements 2,3 of xmm4 (same pattern below)
Packit c32a2d
	movhlps		%xmm0, %xmm10
Packit c32a2d
	movhlps		%xmm5, %xmm11
Packit c32a2d
	movhlps		%xmm1, %xmm12
Packit c32a2d
	vmovss		%xmm9, (out0)	# -> out0 +512
Packit c32a2d
	vmovss		%xmm10, (out0,%rax,1)	# -> out0 +384
Packit c32a2d
	vmovss		%xmm11, (out0,%rax,2)	# -> out0 +256
Packit c32a2d
	vmovss		%xmm12, -128(out0,%rax,2)	# -> out0 +128
Packit c32a2d
	leaq		(out0,%rax,4), out0	# out0 = entry +0
Packit c32a2d
	negq		%rax	# rax = +128
Packit c32a2d
	shufps		$0xb1, %xmm4, %xmm4	# swap elements within each pair: [1,0,3,2]
Packit c32a2d
	shufps		$0xb1, %xmm0, %xmm0
Packit c32a2d
	shufps		$0xb1, %xmm5, %xmm5
Packit c32a2d
	shufps		$0xb1, %xmm1, %xmm1
Packit c32a2d
	movss		%xmm4, (out0)	# -> out0 +0
Packit c32a2d
	movss		%xmm4, (out1)	# -> out1 +0 (same value as out0 +0)
Packit c32a2d
	leaq		(out1,%rax,1), out1	# out1 = entry +128
Packit c32a2d
	movss		%xmm0, (out1)	# -> out1 +128
Packit c32a2d
	movss		%xmm5, (out1,%rax,1)	# -> out1 +256
Packit c32a2d
	movss		%xmm1, (out1,%rax,2)	# -> out1 +384
Packit c32a2d
	leaq		(out1,%rax,4), out1	# out1 = entry +640
Packit c32a2d
	movhlps		%xmm4, %xmm4	# bring the upper pair down to elements 0,1
Packit c32a2d
	movhlps		%xmm0, %xmm0
Packit c32a2d
	movhlps		%xmm5, %xmm5
Packit c32a2d
	movhlps		%xmm1, %xmm1
Packit c32a2d
	movss		%xmm4, -128(out1)	# -> out1 +512
Packit c32a2d
	movss		%xmm0, (out1)	# -> out1 +640
Packit c32a2d
	movss		%xmm5, (out1,%rax,1)	# -> out1 +768
Packit c32a2d
	movss		%xmm1, (out1,%rax,2)	# -> out1 +896
Packit c32a2d
	
Packit c32a2d
/*
	Store sequence, part 2: xmm6, xmm2, xmm7, xmm3 fill the remaining (odd)
	64-byte slots. Before storing, each register gets its neighbour added in
	a chain; every addps reads a source that has not been updated yet, and
	the last link uses a shifted copy of the original xmm6 with a zero in
	element 3.
*/
	leaq		-64(out0,%rax,8), out0	# out0 = entry +960 (rax is +128 here)
Packit c32a2d
	negq		%rax	# rax = -128
Packit c32a2d
	vshufps		$0x1e, %xmm6, %xmm6, %xmm0	# bufs[18,19,17,16]
Packit c32a2d
	vblendps	$0x7, %xmm0, %xmm8, %xmm0	# bufs[18,19,17,-]
Packit c32a2d
	addps		%xmm2, %xmm6
Packit c32a2d
	addps		%xmm7, %xmm2
Packit c32a2d
	addps		%xmm3, %xmm7
Packit c32a2d
	addps		%xmm0, %xmm3
Packit c32a2d
	movss		%xmm6, (out0)	# -> out0 +960
Packit c32a2d
	movss		%xmm2, (out0,%rax,1)	# -> out0 +832
Packit c32a2d
	movss		%xmm7, (out0,%rax,2)	# -> out0 +704
Packit c32a2d
	movss		%xmm3, -128(out0,%rax,2)	# -> out0 +576
Packit c32a2d
	leaq		(out0,%rax,4), out0	# out0 = entry +448
Packit c32a2d
	movhlps		%xmm6, %xmm0
Packit c32a2d
	movhlps		%xmm2, %xmm1
Packit c32a2d
	movhlps		%xmm7, %xmm4
Packit c32a2d
	movhlps		%xmm3, %xmm5
Packit c32a2d
	movss		%xmm0, (out0)	# -> out0 +448
Packit c32a2d
	movss		%xmm1, (out0,%rax,1)	# -> out0 +320
Packit c32a2d
	movss		%xmm4, (out0,%rax,2)	# -> out0 +192
Packit c32a2d
	movss		%xmm5, -128(out0,%rax,2)	# -> out0 +64
Packit c32a2d
	leaq		64(out1,%rax,4), out1	# out1 = entry +192 (640 + 64 - 512)
Packit c32a2d
	negq		%rax	# rax = +128
Packit c32a2d
	shufps		$0xb1, %xmm6, %xmm6
Packit c32a2d
	shufps		$0xb1, %xmm2, %xmm2
Packit c32a2d
	shufps		$0xb1, %xmm7, %xmm7
Packit c32a2d
	shufps		$0xb1, %xmm3, %xmm3
Packit c32a2d
	movss		%xmm6, -128(out1)	# -> out1 +64
Packit c32a2d
	movss		%xmm2, (out1)	# -> out1 +192
Packit c32a2d
	movss		%xmm7, (out1,%rax,1)	# -> out1 +320
Packit c32a2d
	movss		%xmm3, (out1,%rax,2)	# -> out1 +448
Packit c32a2d
	leaq		(out1,%rax,4), out1	# out1 = entry +704
Packit c32a2d
	movhlps		%xmm6, %xmm6
Packit c32a2d
	movhlps		%xmm2, %xmm2
Packit c32a2d
	movhlps		%xmm7, %xmm7
Packit c32a2d
	movhlps		%xmm3, %xmm3
Packit c32a2d
	movss		%xmm6, -128(out1)	# -> out1 +576
Packit c32a2d
	movss		%xmm2, (out1)	# -> out1 +704
Packit c32a2d
	movss		%xmm7, (out1,%rax,1)	# -> out1 +832
Packit c32a2d
	movss		%xmm3, (out1,%rax,2)	# -> out1 +960
Packit c32a2d
Packit c32a2d
/*
	Win64 epilogue: undo the prologue in reverse order. The two pops bring
	rsp back to the xmm save area before the registers are reloaded.
*/
#ifdef IS_MSABI
Packit c32a2d
	pop			%rsi
Packit c32a2d
	pop			%rdi
Packit c32a2d
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm12
Packit c32a2d
	mov			%rbp, %rsp
Packit c32a2d
	pop			%rbp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/*
	NONEXEC_STACK is a macro that is not defined in this file (mangle.h is the
	only include). Presumably it emits the marker that tells the linker this
	object does not need an executable stack -- confirm in mangle.h.
*/
NONEXEC_STACK