Blame src/libmpg123/dct64_avx.S

Packit c32a2d
/*
Packit c32a2d
	dct64_avx: AVX optimized dct64 for x86-64
Packit c32a2d
Packit c32a2d
	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Register aliases used throughout dct64_avx.
	out0, out1 and samples are the first three integer argument registers
	of the System V AMD64 convention, in the order of the C prototype
	(out0, out1, samples). costab is not an argument: it is loaded with
	the address of costab_avx at the start of the function.
	When IS_MSABI is defined, the function prologue copies the incoming
	rcx/rdx/r8 arguments into these registers before any alias is used.
*/
#define samples %rdx
Packit c32a2d
#define costab %rcx
Packit c32a2d
#define out0 %rdi
Packit c32a2d
#define out1 %rsi
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
	void dct64_avx(short *out0, short *out1, real *samples);
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
/*
	costab_avx: constant table for dct64_avx.
	32 words of 32 bits each. dct64_avx reads them as packed
	single-precision floats, so each .long is the raw bit pattern of a
	float (e.g. 1056974725 = 0x3F002785, roughly 0.5006).
	Layout by byte offset, using the names from the comments in dct64_avx:
	    0 ..  63   cos64[0..15]
	   64 ..  95   cos32[0..7]
	   96 .. 111   cos16[0..3]
	  112 .. 119   cos8[0..1]
	  120 .. 123   cos4[0]
	  124 .. 127   zero (unused filler completing the last 32 bytes)
	The table is read with 32-byte vmovaps loads, which fault on
	unaligned addresses; ALIGN32 (a macro defined outside this file) is
	presumably what provides that alignment.
	The table goes into .rodata, except when __APPLE__ is defined, where
	.data is used instead.
*/
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	ALIGN32
Packit c32a2d
costab_avx:
Packit c32a2d
/* cos64[0..15] */
	.long 1056974725
Packit c32a2d
	.long 1057056395
Packit c32a2d
	.long 1057223771
Packit c32a2d
	.long 1057485416
Packit c32a2d
	.long 1057855544
Packit c32a2d
	.long 1058356026
Packit c32a2d
	.long 1059019886
Packit c32a2d
	.long 1059897405
Packit c32a2d
	.long 1061067246
Packit c32a2d
	.long 1062657950
Packit c32a2d
	.long 1064892987
Packit c32a2d
	.long 1066774581
Packit c32a2d
	.long 1069414683
Packit c32a2d
	.long 1073984175
Packit c32a2d
	.long 1079645762
Packit c32a2d
	.long 1092815430
Packit c32a2d
/* cos32[0..7] */
	.long 1057005197
Packit c32a2d
	.long 1057342072
Packit c32a2d
	.long 1058087743
Packit c32a2d
	.long 1059427869
Packit c32a2d
	.long 1061799040
Packit c32a2d
	.long 1065862217
Packit c32a2d
	.long 1071413542
Packit c32a2d
	.long 1084439708
Packit c32a2d
/* cos16[0..3] */
	.long 1057128951
Packit c32a2d
	.long 1058664893
Packit c32a2d
	.long 1063675095
Packit c32a2d
	.long 1076102863
Packit c32a2d
/* cos8[0..1] */
	.long 1057655764
Packit c32a2d
	.long 1067924853
Packit c32a2d
/* cos4[0] */
	.long 1060439283
Packit c32a2d
/* filler word */
	.long 0
Packit c32a2d
/*
	dct64_avx(out0, out1, samples)

	Reads 32 floats from samples (vmovups / vperm2f128 memory operands,
	so no alignment is required of samples), runs them through five
	sum/difference stages whose differences are scaled by the rows of
	costab_avx, applies a final set of additions between neighbouring
	results, converts the 32 results to 16-bit integers (cvtps2dq, then
	packssdw with signed saturation) and writes each one with its own
	16-bit store:
	  out0: byte offsets 0, 32, 64, ..., 512   (17 stores)
	  out1: byte offsets 0, 32, 64, ..., 480   (16 stores)
	The value written to out0+0 is the same one written to out1+0.

	Modified: rax, rcx, rdx, r8, r9, rdi (out0), rsi (out1), flags and
	ymm0-ymm12. When IS_MSABI is defined the prologue/epilogue save and
	restore rbp, rdi, rsi and xmm6-xmm12.
*/
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
.globl ASM_NAME(dct64_avx)
Packit c32a2d
ASM_NAME(dct64_avx):
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
/*
	Windows x64 entry. Reserve 7 * 16 = 112 bytes for xmm6-xmm12, which
	this function overwrites. movaps needs 16-byte aligned slots: with
	the ABI's entry alignment, push %rbp leaves rsp 16-byte aligned and
	subtracting 112 keeps it so. rdi/rsi are saved with push because they
	are about to be reused as out0/out1; the arguments arriving in
	rcx, rdx, r8 are then moved into the registers the aliases name.
*/
	push		%rbp
Packit c32a2d
	mov			%rsp, %rbp
Packit c32a2d
	sub			$112, %rsp
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm12, 96(%rsp)
Packit c32a2d
	push		%rdi
Packit c32a2d
	push		%rsi
Packit c32a2d
	mov			%rcx, %rdi
Packit c32a2d
	mov			%rdx, %rsi
Packit c32a2d
	mov			%r8, %rdx
Packit c32a2d
#endif
Packit c32a2d
/* costab = &costab_avx, RIP-relative. Must come after the MSABI moves above, which still read rcx. */
	leaq		costab_avx(%rip), costab
Packit c32a2d
Packit c32a2d
/*
	Stage 1: pair input[i] with input[31-i]. The upper 16 inputs are
	loaded with their 128-bit halves swapped and then reversed within
	each half, giving fully reversed order. Sums go to ymm4/ymm5; the
	differences, scaled by the first 16 table entries, go to ymm7/ymm6.
*/
	vmovups		(samples), %ymm0			# input[0,1,2,3,4,5,6,7]
Packit c32a2d
	vmovups		32(samples), %ymm1			# input[8,9,10,11,12,13,14,15]
Packit c32a2d
	vperm2f128	$0x23, 64(samples), %ymm2, %ymm2
Packit c32a2d
	vperm2f128	$0x23, 96(samples), %ymm3, %ymm3
Packit c32a2d
	vshufps		$0x1b, %ymm2, %ymm2, %ymm2	# input[23,22,21,20,19,18,17,16]
Packit c32a2d
	vshufps		$0x1b, %ymm3, %ymm3, %ymm3	# input[31,30,29,28,27,26,25,24]
Packit c32a2d
	vsubps		%ymm2, %ymm1, %ymm6
Packit c32a2d
	vsubps		%ymm3, %ymm0, %ymm7
Packit c32a2d
	vaddps		%ymm0, %ymm3, %ymm4			# bufs[0,1,2,3,4,5,6,7]
Packit c32a2d
	vaddps		%ymm1, %ymm2, %ymm5			# bufs[8,9,10,11,12,13,14,15]
Packit c32a2d
	vmulps		(costab), %ymm7, %ymm7		# bufs[31,30,29,28,27,26,25,24] cos64[0,1,2,3,4,5,6,7]
Packit c32a2d
	vmulps		32(costab), %ymm6, %ymm6	# bufs[23,22,21,20,19,18,17,16] cos64[8,9,10,11,12,13,14,15]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 2: reverse ymm5 and ymm6 across the whole register (in-lane
	reverse plus lane swap), then take sum/difference against ymm4 and
	ymm7. Differences are scaled by the 8 floats at costab+64.
*/
	vmovaps		64(costab), %ymm8			# cos32[0,1,2,3,4,5,6,7]
Packit c32a2d
	
Packit c32a2d
	vshufps		$0x1b, %ymm5, %ymm5, %ymm5
Packit c32a2d
	vshufps		$0x1b, %ymm6, %ymm6, %ymm6
Packit c32a2d
	vperm2f128	$0x01, %ymm5, %ymm5, %ymm5	# bufs[15,14,13,12,11,10,9,8]
Packit c32a2d
	vperm2f128	$0x01, %ymm6, %ymm6, %ymm6	# bufs[16,17,18,19,20,21,22,23]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm6, %ymm7, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[32,33,34,35,36,37,38,39]
Packit c32a2d
	vaddps		%ymm6, %ymm7, %ymm2			# bufs[48,49,50,51,52,53,54,55]
Packit c32a2d
	vmulps		%ymm1, %ymm8, %ymm1			# bufs[47,46,45,44,43,42,41,40]
Packit c32a2d
	vmulps		%ymm3, %ymm8, %ymm3			# bufs[63,62,61,60,59,58,57,56]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 3: ymm8 takes the last 32 bytes of the table; its low 128 bits
	are broadcast to both lanes of ymm9. The 128-bit halves of the
	previous results are regrouped so that each lane holds one group of
	four, then sum/difference again with the differences scaled by ymm9.
*/
	vmovaps		96(costab), %ymm8			# cos16[0,1,2,3]:cos8[0,1]:cos4[0]:-
Packit c32a2d
	vperm2f128	$0x00, %ymm8, %ymm8, %ymm9	# cos16[0,1,2,3,0,1,2,3]
Packit c32a2d
	
Packit c32a2d
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm4	# bufs[32,33,34,35,47,46,45,44]
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm5
Packit c32a2d
	vshufps		$0x1b, %ymm5, %ymm5, %ymm5	# bufs[39,38,37,36,40,41,42,43]
Packit c32a2d
	vperm2f128	$0x20, %ymm3, %ymm2, %ymm6	# bufs[48,49,50,51,63,62,61,60]
Packit c32a2d
	vperm2f128	$0x31, %ymm3, %ymm2, %ymm7
Packit c32a2d
	vshufps		$0x1b, %ymm7, %ymm7, %ymm7	# bufs[55,54,53,52,56,57,58,59]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[0,1,2,3,8,9,10,11]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[16,17,18,19,24,25,26,27]
Packit c32a2d
	vmulps		%ymm1, %ymm9, %ymm1			# bufs[7,6,5,4,15,14,13,12]
Packit c32a2d
	vmulps		%ymm3, %ymm9, %ymm3			# bufs[23,22,21,20,31,30,29,28]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 4: copy the upper 128 bits of ymm8 into both lanes, then
	vmovddup repeats the low 64 bits of each lane (two floats) to build
	the scale vector. Inputs are interleaved and shuffled into pairs
	before the sum/difference.
*/
	vperm2f128	$0x11, %ymm8, %ymm8, %ymm8	# cos8[0,1]:cos4[0]:-:cos8[0,1]:cos4[0]:-
Packit c32a2d
	vmovddup	%ymm8, %ymm9				# cos8[0,1,0,1,0,1,0,1]
Packit c32a2d
	
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[0,7,1,6,8,15,9,14]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[2,5,3,4,10,13,11,12]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[16,23,17,22,24,31,25,30]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[18,21,19,20,26,29,27,28]
Packit c32a2d
	vshufps		$0xd8, %ymm4, %ymm4, %ymm4	# bufs[0,1,7,6,8,9,15,14]
Packit c32a2d
	vshufps		$0x72, %ymm5, %ymm5, %ymm5	# bufs[3,2,4,5,11,10,12,13]
Packit c32a2d
	vshufps		$0xd8, %ymm6, %ymm6, %ymm6	# bufs[16,17,23,22,24,25,31,30]
Packit c32a2d
	vshufps		$0x72, %ymm7, %ymm7, %ymm7	# bufs[19,18,20,21,27,26,28,29]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[32,33,36,37,40,41,44,45]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[48,49,52,53,56,57,60,61]
Packit c32a2d
	vmulps		%ymm1, %ymm9, %ymm1			# bufs[35,34,39,38,43,42,47,46]
Packit c32a2d
	vmulps		%ymm3, %ymm9, %ymm3			# bufs[51,50,55,54,59,58,63,62]
Packit c32a2d
	
Packit c32a2d
/*
	Stage 5: broadcast element 2 of each lane of ymm8 (the single
	remaining coefficient) to all elements, then one more sum/difference
	on adjacent pairs.
*/
	vpermilps	$0xaa, %ymm8, %ymm8			# cos4[0,0,0,0,0,0,0,0]
Packit c32a2d
	
Packit c32a2d
	vshufps		$0xd8, %ymm0, %ymm0, %ymm0	# bufs[32,36,33,37,40,44,41,45]
Packit c32a2d
	vshufps		$0xd8, %ymm1, %ymm1, %ymm1	# bufs[35,39,34,38,43,47,42,46]
Packit c32a2d
	vshufps		$0xd8, %ymm2, %ymm2, %ymm2	# bufs[48,52,49,53,56,60,57,61]
Packit c32a2d
	vshufps		$0xd8, %ymm3, %ymm3, %ymm3	# bufs[51,55,50,54,59,63,58,62]
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[32,35,36,39,40,43,44,47]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[33,34,37,38,41,42,45,46]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[48,51,52,55,56,59,60,63]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[49,50,53,54,57,58,61,62]
Packit c32a2d
	vsubps		%ymm5, %ymm4, %ymm1
Packit c32a2d
	vsubps		%ymm7, %ymm6, %ymm3
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm0			# bufs[0,2,4,6,8,10,12,14]
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm2			# bufs[16,18,20,22,24,26,28,30]
Packit c32a2d
	vmulps		%ymm1, %ymm8, %ymm1			# bufs[1,3,5,7,9,11,13,15]
Packit c32a2d
	vmulps		%ymm3, %ymm8, %ymm3			# bufs[17,19,21,23,25,27,29,31]
Packit c32a2d
	
Packit c32a2d
/*
	ymm8 becomes all zeros and stays zero for the rest of the function;
	it is the "blank" source for every blend below. Here the odd-indexed
	elements of ymm1/ymm3 are added into ymm0/ymm2 (blend mask 0xaa), and
	the unpacks then interleave the even/odd vectors back into
	consecutive order.
*/
	vxorps		%ymm8, %ymm8, %ymm8
Packit c32a2d
	vblendps	$0xaa, %ymm1, %ymm8, %ymm5
Packit c32a2d
	vblendps	$0xaa, %ymm3, %ymm8, %ymm6
Packit c32a2d
	vaddps		%ymm5, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm1, %ymm0, %ymm4			# bufs[0,1,2,3,8,9,10,11]
Packit c32a2d
	vunpckhps	%ymm1, %ymm0, %ymm5			# bufs[4,5,6,7,12,13,14,15]
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6			# bufs[16,17,18,19,24,25,26,27]
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm7			# bufs[20,21,22,23,28,29,30,31]
Packit c32a2d
	
Packit c32a2d
/* From here on only 128-bit operations are used: split off the upper lanes into xmm0-xmm3. */
	vextractf128	$0x1, %ymm4, %xmm0		# bufs[8,9,10,11]
Packit c32a2d
	vextractf128	$0x1, %ymm5, %xmm1		# bufs[12,13,14,15]
Packit c32a2d
	vextractf128	$0x1, %ymm6, %xmm2		# bufs[24,25,26,27]
Packit c32a2d
	vextractf128	$0x1, %ymm7, %xmm3		# bufs[28,29,30,31]
Packit c32a2d
	
Packit c32a2d
/*
	For each group of four: build a copy with elements ordered 2,3,1 and
	the top element replaced by zero (blend mask 0x7 keeps the low three,
	takes the fourth from the zeroed xmm8), then add it to the group.
*/
	vshufps		$0x1e, %xmm5, %xmm5, %xmm9	# bufs[6,7,5,4]
Packit c32a2d
	vshufps		$0x1e, %xmm1, %xmm1, %xmm10	# bufs[14,15,13,12]
Packit c32a2d
	vshufps		$0x1e, %xmm7, %xmm7, %xmm11	# bufs[22,23,21,20]
Packit c32a2d
	vshufps		$0x1e, %xmm3, %xmm3, %xmm12	# bufs[30,31,29,28]
Packit c32a2d
	vblendps	$0x7, %xmm9, %xmm8, %xmm9	# bufs[6,7,5,-]
Packit c32a2d
	vblendps	$0x7, %xmm10, %xmm8, %xmm10 # bufs[14,15,13,-]
Packit c32a2d
	vblendps	$0x7, %xmm11, %xmm8, %xmm11	# bufs[22,23,21,-]
Packit c32a2d
	vblendps	$0x7, %xmm12, %xmm8, %xmm12	# bufs[30,31,29,-]
Packit c32a2d
	vaddps		%xmm5, %xmm9, %xmm5
Packit c32a2d
	vaddps		%xmm1, %xmm10, %xmm1
Packit c32a2d
	vaddps		%xmm7, %xmm11, %xmm7
Packit c32a2d
	vaddps		%xmm3, %xmm12, %xmm3
Packit c32a2d
	
Packit c32a2d
/* Prefetch hint for the first store target (out0 + 512). */
	prefetcht0	512(out0)
Packit c32a2d
	
Packit c32a2d
/*
	The permuted copies of xmm0/xmm2 are taken before those registers are
	updated; xmm0/xmm2 then absorb xmm1/xmm3, and xmm1/xmm3 absorb the
	masked permuted copies.
*/
	vshufps		$0x1e, %xmm0, %xmm0, %xmm9	# bufs[10,11,9,8]
Packit c32a2d
	vshufps		$0x1e, %xmm2, %xmm2, %xmm10	# bufs[26,27,25,24]
Packit c32a2d
	vaddps		%xmm1, %xmm0, %xmm0
Packit c32a2d
	vaddps		%xmm3, %xmm2, %xmm2
Packit c32a2d
	vblendps	$0x7, %xmm9, %xmm8, %xmm9	# bufs[10,11,9,-]
Packit c32a2d
	vblendps	$0x7, %xmm10, %xmm8, %xmm10	# bufs[26,27,25,-]
Packit c32a2d
	vaddps		%xmm1, %xmm9, %xmm1
Packit c32a2d
	vaddps		%xmm3, %xmm10, %xmm3
Packit c32a2d
	
Packit c32a2d
/* Clear the upper halves of the ymm registers before the legacy-SSE encoded instructions that follow. */
	vzeroupper
Packit c32a2d
	prefetcht0	512(out1)
Packit c32a2d
	
Packit c32a2d
/*
	Convert the first 16 results to 32-bit integers (cvtps2dq rounds
	according to MXCSR) and pack to 16 bits with signed saturation.
	After packing, xmm4 holds results 0-7 and xmm0 holds results 8-15;
	the low/high 64-bit halves are moved to rcx/r8 and rdx/r9.
*/
	cvtps2dq	%xmm4, %xmm4
Packit c32a2d
	cvtps2dq	%xmm0, %xmm0
Packit c32a2d
	cvtps2dq	%xmm5, %xmm5
Packit c32a2d
	cvtps2dq	%xmm1, %xmm1
Packit c32a2d
	packssdw	%xmm5, %xmm4
Packit c32a2d
	packssdw	%xmm1, %xmm0
Packit c32a2d
	movq		%xmm4, %rcx
Packit c32a2d
	pshufd		$0x4e, %xmm4, %xmm5
Packit c32a2d
	movq		%xmm0, %rdx
Packit c32a2d
	pshufd		$0x4e, %xmm0, %xmm1
Packit c32a2d
	movq		%xmm5, %r8
Packit c32a2d
	movq		%xmm1, %r9
Packit c32a2d
	
Packit c32a2d
/*
	Scatter pass 1. rcx, rdx, r8 and r9 each hold four packed 16-bit
	values; every round stores the low word of each and then shifts right
	by 16 to expose the next. rax is a 64-byte stride whose sign is
	flipped between the out0 rounds (descending addresses) and the out1
	rounds (ascending addresses). This pass covers the even 32-byte
	slots: out0+512 down to out0+0, and out1+0 up to out1+448.
*/
	addq		$512, out0
Packit c32a2d
	movq		$-64, %rax
Packit c32a2d
	
Packit c32a2d
	movw		%cx, (out0)
Packit c32a2d
	movw		%dx, (out0,%rax,1)
Packit c32a2d
	movw		%r8w, (out0,%rax,2)
Packit c32a2d
	movw		%r9w, -64(out0,%rax,2)
Packit c32a2d
	leaq		(out0,%rax,4), out0
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
/* out0 is now 256 bytes past its entry value, so this store lands on the original out0+0; the same word goes to out1+0 just below. */
	movw		%cx, (out0,%rax,4)
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out1)
Packit c32a2d
	movw		%dx, (out1,%rax,1)
Packit c32a2d
	movw		%r8w, (out1,%rax,2)
Packit c32a2d
	movw		%r9w, 64(out1,%rax,2)
Packit c32a2d
	leaq		(out1,%rax,4), out1
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out0)
Packit c32a2d
	movw		%dx, (out0,%rax,1)
Packit c32a2d
	movw		%r8w, (out0,%rax,2)
Packit c32a2d
	movw		%r9w, -64(out0,%rax,2)
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out1)
Packit c32a2d
	movw		%dx, (out1,%rax,1)
Packit c32a2d
	movw		%r8w, (out1,%rax,2)
Packit c32a2d
	movw		%r9w, 64(out1,%rax,2)
Packit c32a2d
	
Packit c32a2d
/*
	Re-base for pass 2. rax is +64 here, so out0 ends up at its entry
	value + 480; after the negation out1 ends up at its entry value + 32.
	Pass 2 therefore fills the odd 32-byte slots between those written
	in pass 1.
*/
	leaq		-32(out0,%rax,4), out0
Packit c32a2d
	negq		%rax
Packit c32a2d
	leaq		32(out1,%rax,4), out1
Packit c32a2d
	
Packit c32a2d
/*
	Final additions for the second group of 16 results. xmm0 receives the
	permuted, top-zeroed copy of xmm6 before xmm6 is modified; then each
	register absorbs the next one in the chain xmm6 <- xmm2 <- xmm7 <-
	xmm3 <- xmm0. Conversion and packing mirror pass 1.
*/
	vshufps		$0x1e, %xmm6, %xmm6, %xmm0
Packit c32a2d
	vblendps	$0x7, %xmm0, %xmm8, %xmm0
Packit c32a2d
	addps		%xmm2, %xmm6
Packit c32a2d
	addps		%xmm7, %xmm2
Packit c32a2d
	addps		%xmm3, %xmm7
Packit c32a2d
	addps		%xmm0, %xmm3
Packit c32a2d
	cvtps2dq	%xmm6, %xmm6
Packit c32a2d
	cvtps2dq	%xmm2, %xmm2
Packit c32a2d
	cvtps2dq	%xmm7, %xmm7
Packit c32a2d
	cvtps2dq	%xmm3, %xmm3
Packit c32a2d
	packssdw	%xmm7, %xmm6
Packit c32a2d
	packssdw	%xmm3, %xmm2
Packit c32a2d
	movq		%xmm6, %rcx
Packit c32a2d
	pshufd		$0x4e, %xmm6, %xmm7
Packit c32a2d
	movq		%xmm2, %rdx
Packit c32a2d
	pshufd		$0x4e, %xmm2, %xmm3
Packit c32a2d
	movq		%xmm7, %r8
Packit c32a2d
	movq		%xmm3, %r9
Packit c32a2d
	
Packit c32a2d
/* Scatter pass 2: same store/shift rounds as pass 1, without the extra shared store. */
	movw		%cx, (out0)
Packit c32a2d
	movw		%dx, (out0,%rax,1)
Packit c32a2d
	movw		%r8w, (out0,%rax,2)
Packit c32a2d
	movw		%r9w, -64(out0,%rax,2)
Packit c32a2d
	leaq		(out0,%rax,4), out0
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out1)
Packit c32a2d
	movw		%dx, (out1,%rax,1)
Packit c32a2d
	movw		%r8w, (out1,%rax,2)
Packit c32a2d
	movw		%r9w, 64(out1,%rax,2)
Packit c32a2d
	leaq		(out1,%rax,4), out1
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out0)
Packit c32a2d
	movw		%dx, (out0,%rax,1)
Packit c32a2d
	movw		%r8w, (out0,%rax,2)
Packit c32a2d
	movw		%r9w, -64(out0,%rax,2)
Packit c32a2d
	shr			$16, %rcx
Packit c32a2d
	shr			$16, %rdx
Packit c32a2d
	shr			$16, %r8
Packit c32a2d
	shr			$16, %r9
Packit c32a2d
	negq		%rax
Packit c32a2d
	movw		%cx, (out1)
Packit c32a2d
	movw		%dx, (out1,%rax,1)
Packit c32a2d
	movw		%r8w, (out1,%rax,2)
Packit c32a2d
	movw		%r9w, 64(out1,%rax,2)
Packit c32a2d
	
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
/* Windows x64 exit: undo the prologue in reverse order (rsi, rdi, then xmm6-xmm12, then the frame). */
	pop			%rsi
Packit c32a2d
	pop			%rdi
Packit c32a2d
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm12
Packit c32a2d
	mov			%rbp, %rsp
Packit c32a2d
	pop			%rbp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/* NONEXEC_STACK is a macro defined outside this file; presumably it marks the object as not needing an executable stack -- confirm in mangle.h. */
NONEXEC_STACK