/*
	dct36_avx: AVX optimized dct36 for x86-64

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifdef IS_MSABI
#define in %rcx
#define out1 %rdx
#define out2 %r8
#define w  %r9
#define ts %r10
#define COS9_ %rax
#define tfcos36_ %r11
#else
#define in %rdi
#define out1 %rsi
#define out2 %rdx
#define w  %rcx
#define ts %r8
#define COS9_ %rax
#define tfcos36_ %r9
#endif

/*
	void dct36_avx(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf);
*/
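
/*
	dct36 is the 36-point IMDCT for MPEG layer III long blocks, built
	around a 9-point DCT core plus windowing: inbuf holds 18 hybrid
	filterbank samples, wintab the 36-tap window, o1/o2 the overlap
	buffers (o1 is read, o2 is written for the next round) and tsbuf
	the synthesis input, written at a stride of 32 floats (SBLIMIT).

	A rough sketch of the windowing step performed below, in the spirit
	of the generic C implementation (tmp[0..17] are the DCT outputs;
	the variable names here are illustrative only):

		for(v = 0; v < 9; ++v)
		{
			sum  = tmp[v] + tmp[17-v];
			diff = tmp[v] - tmp[17-v];
			out2[9+v]    = sum * w[27+v];
			out2[8-v]    = sum * w[26-v];
			ts[32*(9+v)] = out1[9+v] + diff * w[9+v];
			ts[32*(8-v)] = out1[8-v] + diff * w[8-v];
		}
*/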
	
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
dct36_avx_COS9: /* cos(k*pi/18) as duplicated float pairs, k = 3,6,1,2,5,8,7,4 */
	.long 0x3f5db3d7 /* cos(3*pi/18) */
	.long 0x3f5db3d7
	.long 0x3f000000 /* cos(6*pi/18) = 0.5 */
	.long 0x3f000000
	.long 0x3f7c1c5c /* cos(1*pi/18) */
	.long 0x3f7c1c5c
	.long 0x3f708fb2 /* cos(2*pi/18) */
	.long 0x3f708fb2
	.long 0x3f248dbb /* cos(5*pi/18) */
	.long 0x3f248dbb
	.long 0x3e31d0d4 /* cos(8*pi/18) */
	.long 0x3e31d0d4
	.long 0x3eaf1d44 /* cos(7*pi/18) */
	.long 0x3eaf1d44
	.long 0x3f441b7d /* cos(4*pi/18) */
	.long 0x3f441b7d
	ALIGN16
dct36_avx_tfcos36: /* 0.5/cos(pi*(2k+1)/36), stored in the order k = 0,1,2,3,8,7,6,5,4 */
	.long 0x3f007d2b /* k=0 */
	.long 0x3f0483ee /* k=1 */
	.long 0x3f0d3b7d /* k=2 */
	.long 0x3f1c4257 /* k=3 */
	.long 0x40b79454 /* k=8 */
	.long 0x3ff746ea /* k=7 */
	.long 0x3f976fd9 /* k=6 */
	.long 0x3f5f2944 /* k=5 */
	.long 0x3f3504f3 /* k=4, = sqrt(0.5) */
	ALIGN16
dct36_avx_sign: /* sign-bit mask for negating packed floats via xorps */
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
	ALIGN16
	.globl ASM_NAME(dct36_avx)
ASM_NAME(dct36_avx):
#ifdef IS_MSABI
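	/* Win64 ABI: xmm6-xmm15 are callee-saved, and the fifth argument
	   (tsbuf) is passed on the stack; after the push below it sits at
	   48(%rbp), past the return address and the 32-byte shadow space. */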
	push		%rbp
	mov			%rsp, %rbp
	sub			$160, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	movaps		%xmm15, 144(%rsp)
	movq		48(%rbp), ts
#endif
	lea			dct36_avx_COS9(%rip), COS9_
	lea			dct36_avx_tfcos36(%rip), tfcos36_
	
	/* First input pass: in[i] += in[i-1] for i = 17..1 (in[0] unchanged). */
	xorps		%xmm4, %xmm4
	movups		(in), %xmm0
	movups		16(in), %xmm1
	movups		32(in), %xmm2
	movups		48(in), %xmm3
	movlps		64(in), %xmm4
	vshufps		$0x93, %xmm0, %xmm0, %xmm5
	vshufps		$0x93, %xmm1, %xmm1, %xmm6
	vshufps		$0x93, %xmm2, %xmm2, %xmm7
	vshufps		$0x93, %xmm3, %xmm3, %xmm8
	vshufps		$0xe1, %xmm4, %xmm4, %xmm9
	movss		%xmm8, %xmm9 #[fg--]
	addps		%xmm9, %xmm4 #[gh--]
	movss		%xmm7, %xmm8
	addps		%xmm8, %xmm3 #[cdef]
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm2 #[89ab]
	movss		%xmm5, %xmm6
	addps		%xmm6, %xmm1 #[4567]
	xorps		%xmm6, %xmm6
	movss		%xmm6, %xmm5
	addps		%xmm5, %xmm0 #[0123]
	
	/* Second input pass: in[i] += in[i-2] for odd i = 17..3. */
	vblendps	$0x5, %xmm6, %xmm3, %xmm7
	vshufps		$0x4e, %xmm4, %xmm3, %xmm4
	addps		%xmm7, %xmm4
	vblendps	$0x5, %xmm6, %xmm2, %xmm7
	vshufps		$0x4e, %xmm3, %xmm2, %xmm3
	addps		%xmm7, %xmm3
	vblendps	$0x5, %xmm6, %xmm1, %xmm7
	vshufps		$0x4e, %xmm2, %xmm1, %xmm2
	addps		%xmm7, %xmm2
	vblendps	$0x5, %xmm6, %xmm0, %xmm7
	vshufps		$0x4e, %xmm1, %xmm0, %xmm1
	addps		%xmm7, %xmm1
	vmovlhps	%xmm0, %xmm6, %xmm0

/*
xmm0 in[-,-,0,1]
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/

	vblendps	$0xc, %xmm3, %xmm2, %xmm5
	blendps		$0xc, %xmm4, %xmm3
	blendps		$0xc, %xmm2, %xmm4
	movaps		%xmm5, %xmm2

/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/

	/* 9-point DCT core: accumulate the cosine-weighted combinations
	   into xmm12-xmm15 (see the register maps below). */
	movaps		(COS9_), %xmm15
	movaps		16(COS9_), %xmm6
	movaps		32(COS9_), %xmm7
	movaps		48(COS9_), %xmm8
	vmulps		%xmm2, %xmm15, %xmm5
	addps		%xmm0, %xmm5
	
/*
xmm5 [ta33,tb33,ta66,tb66]
xmm6 COS9_[1,1,2,2]
xmm7 COS9_[5,5,8,8]
xmm8 COS9_[7,7,4,4]
xmm15 COS9_[3,3,6,6]
*/
	
	vmulps		%xmm1, %xmm6, %xmm9
	vmulps		%xmm3, %xmm7, %xmm12
	vmulps		%xmm4, %xmm8, %xmm13
	addps		%xmm5, %xmm9
	addps		%xmm13, %xmm12
	addps		%xmm9, %xmm12
	
	vsubps		%xmm3, %xmm1, %xmm13
	vshufps		$0xe0, %xmm2, %xmm0, %xmm14
	vsubps		%xmm14, %xmm0, %xmm14
	subps		%xmm4, %xmm13
	mulps		%xmm15, %xmm13
	addps		%xmm14, %xmm13
	
	vmulps		%xmm1, %xmm7, %xmm9
	vmulps		%xmm3, %xmm8, %xmm15
	vmulps		%xmm4, %xmm6, %xmm14
	subps		%xmm5, %xmm9
	subps		%xmm15, %xmm14
	addps		%xmm9, %xmm14
	
	mulps		%xmm1, %xmm8
	mulps		%xmm3, %xmm6
	mulps		%xmm4, %xmm7
	subps		%xmm5, %xmm8
	subps		%xmm7, %xmm6
	vaddps		%xmm6, %xmm8, %xmm15
	
	movss		32(tfcos36_), %xmm5
	subps		%xmm1, %xmm0
	subps		%xmm2, %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		$0xaf, %xmm0, %xmm0
	vmulss		%xmm5, %xmm0, %xmm11

/*
xmm12 [1a-0,1b-0, 2a-0, 2b-0]
xmm13 [1a-1,1b-1, 2a-1, 2b-1]
xmm14 [1a-2,1b-2,-2a-2,-2b-2]
xmm15 [1a-3,1b-3,-2a-3,-2b-3]
*/
	vunpckhps	%xmm13, %xmm12, %xmm5
	vunpcklps	%xmm13, %xmm12, %xmm12
	vunpckhps	%xmm15, %xmm14, %xmm6
	vunpcklps	%xmm15, %xmm14, %xmm14
	xorps		dct36_avx_sign(%rip), %xmm6 # negate: these lanes held -2a/-2b values

/*
xmm12 [1a-0,1a-1,1b-0,1b-1]
xmm5  [2a-0,2a-1,2b-0,2b-1]
xmm14 [1a-2,1a-3,1b-2,1b-3]
xmm6 [2a-2,2a-3,2b-2,2b-3]
*/

	vmovlhps	%xmm14, %xmm12, %xmm0
	movhlps		%xmm12, %xmm14
	vmovlhps	%xmm6, %xmm5, %xmm1
	vmovhlps	%xmm5, %xmm6, %xmm15

/*
xmm0 tmp1a
xmm1 tmp2a
xmm14 tmp1b
xmm15 tmp2b
*/

	movaps		(tfcos36_), %xmm6
	movaps		16(tfcos36_), %xmm7
	vsubps		%xmm14, %xmm15, %xmm10
	addps		%xmm14, %xmm15
	vsubps		%xmm0, %xmm1, %xmm14
	addps		%xmm1, %xmm0
	vmulps		%xmm6, %xmm15, %xmm1
	mulps		%xmm10, %xmm7

/*
%xmm0 tmp[0,1,2,3]
%xmm1 tmp[17,16,15,14]
%xmm14 tmp[8,7,6,5]
%xmm7 tmp[9,10,11,12]
%xmm11 tmp[13,-,4,-]
*/

	/* Windowing and overlap-add, v = 0..3: (tmp[v]+tmp[17-v])*window into
	   out2, (tmp[v]-tmp[17-v])*window plus out1 into ts (stride 32 floats). */
	movups		108(w), %xmm2
	movups		92(w), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	movups		36(w), %xmm4
	movups		20(w), %xmm5
	shufps		$0x1b, %xmm5, %xmm5
	vsubps		%xmm1, %xmm0, %xmm6
	addps		%xmm1, %xmm0
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(out1), %xmm1
	movups		20(out1), %xmm3
	shufps		$0x1b, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		$0x1b, %xmm0, %xmm0
	movups		%xmm2, 36(out2)
	movups		%xmm0, 20(out2)
	movss		%xmm1, 32*36(ts)
	movss		%xmm3, 32*20(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(ts)
	movss		%xmm4, 32*28(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*40(ts)
	movss		%xmm3, 32*24(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(ts)
	movss		%xmm4, 32*32(ts)
	
	/* v = 4: the middle pair tmp[4]/tmp[13], done with scalar ops. */
	movhlps		%xmm11, %xmm0
	movss		124(w), %xmm2
	movss		88(w), %xmm3
	movss		52(w), %xmm4
	movss		16(w), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm11, %xmm0
	subss		%xmm11, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(out1), %xmm4
	addss		16(out1), %xmm6
	movss		%xmm2, 52(out2)
	movss		%xmm0, 16(out2)
	movss		%xmm4, 32*52(ts)
	movss		%xmm6, 32*16(ts)
	
	/* v = 5..8 of the windowing step. */
	movaps		%xmm14, %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(w), %xmm2
	movups		72(w), %xmm3
	shufps		$0x1b, %xmm2, %xmm2
	movlps		56(w), %xmm4
	movhps		64(w), %xmm4
	MOVUAPS		(w), %xmm5
	shufps		$0x1b, %xmm4, %xmm4
	vsubps		%xmm1, %xmm0, %xmm6
	addps		%xmm1, %xmm0
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(out1), %xmm1
	movhps		64(out1), %xmm1
	movups		(out1), %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		$0x1b, %xmm2, %xmm2
	movups		%xmm0, (out2)
	movlps		%xmm2, 56(out2)
	movhps		%xmm2, 64(out2)
	movss		%xmm1, 32*56(ts)
	movss		%xmm3, (ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(ts)
	movss		%xmm4, 32*8(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*60(ts)
	movss		%xmm3, 32*4(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(ts)
	movss		%xmm4, 32*12(ts)
	
#ifdef IS_MSABI
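	/* Restore the Win64 callee-saved xmm registers and unwind the frame. */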
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	movaps		144(%rsp), %xmm15
	mov			%rbp, %rsp
	pop			%rbp
#endif
	ret

NONEXEC_STACK