/*
	dct36_x86_64: SSE optimized dct36 for x86-64

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifdef IS_MSABI
#define in %rcx
#define out1 %rdx
#define out2 %r8
#define w  %r9
#define ts %r10
#define COS9_ %rax
#define tfcos36_ %r11
#else
#define in %rdi
#define out1 %rsi
#define out2 %rdx
#define w  %rcx
#define ts %r8
#define COS9_ %rax
#define tfcos36_ %r9
#endif

/*
	void dct36_x86_64(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/
	
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
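/* COS9 coefficients cos(i*pi/18) as IEEE-754 hex, each value duplicated for packed use, in the order 3,3,6,6, 1,1,2,2, 5,5,8,8, 7,7,4,4 */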
dct36_x86_64_COS9:
	.long 0x3f5db3d7
	.long 0x3f5db3d7
	.long 0x3f000000
	.long 0x3f000000
	.long 0x3f7c1c5c
	.long 0x3f7c1c5c
	.long 0x3f708fb2
	.long 0x3f708fb2
	.long 0x3f248dbb
	.long 0x3f248dbb
	.long 0x3e31d0d4
	.long 0x3e31d0d4
	.long 0x3eaf1d44
	.long 0x3eaf1d44
	.long 0x3f441b7d
	.long 0x3f441b7d
	ALIGN16
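/* tfcos36 twiddle factors 0.5/cos(pi*(2k+1)/36), stored in the order k = 0,1,2,3, 8,7,6,5, 4 */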
dct36_x86_64_tfcos36:
	.long 0x3f007d2b
	.long 0x3f0483ee
	.long 0x3f0d3b7d
	.long 0x3f1c4257
	.long 0x40b79454
	.long 0x3ff746ea
	.long 0x3f976fd9
	.long 0x3f5f2944
	.long 0x3f3504f3
	ALIGN16
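/* mask that keeps only the odd (1 and 3) float lanes of a vector */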
dct36_x86_64_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
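/* sign bits for negating all four lanes with xorps */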
dct36_x86_64_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
	ALIGN16
	.globl ASM_NAME(dct36_x86_64)
ASM_NAME(dct36_x86_64):
#ifdef IS_MSABI
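/*
	Win64 prologue: xmm6-xmm15 are callee-saved in the Microsoft ABI, so they
	are spilled to a local frame; the fifth argument (tsbuf) is taken from the
	stack at 48(%rbp) (saved rbp + return address + 32-byte shadow space).
*/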
	push		%rbp
	mov			%rsp, %rbp
	sub			$160, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	movaps		%xmm15, 144(%rsp)
	movq		48(%rbp), ts
#endif
	lea			dct36_x86_64_COS9(%rip), COS9_
	lea			dct36_x86_64_tfcos36(%rip), tfcos36_
	
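/*
	First accumulation pass: in[i] += in[i-1], each sum using the original
	(pre-pass) neighbour. Every 4-float group is rotated by one lane with
	shufps $0x93 and its lowest lane is patched via movss with the top element
	of the group below (zero for in[0]) before the add.
*/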
	xorps		%xmm5, %xmm5
	movups		(in), %xmm1
	movups		16(in), %xmm2
	movups		32(in), %xmm3
	movups		48(in), %xmm4
	movlps		64(in), %xmm5
	xorps		%xmm6, %xmm6
	movaps		%xmm1, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movaps		%xmm2, %xmm8
	shufps		$0x93, %xmm8, %xmm8
	movaps		%xmm3, %xmm9
	shufps		$0x93, %xmm9, %xmm9
	movaps		%xmm4, %xmm10
	shufps		$0x93, %xmm10, %xmm10
	movaps		%xmm5, %xmm11
	shufps		$0xe1, %xmm11, %xmm11
	movss		%xmm10, %xmm11
	addps		%xmm11, %xmm5
	movss		%xmm9, %xmm10
	addps		%xmm10, %xmm4
	movss		%xmm8, %xmm9
	addps		%xmm9, %xmm3
	movss		%xmm7, %xmm8
	addps		%xmm8, %xmm2
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm1
	
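/*
	Second accumulation pass fused with a repack: the mask keeps only the odd
	lanes of the shifted copies, so the odd elements get in[i] += in[i-2], and
	the 18 values are realigned into the layout shown below.
*/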
	movaps		dct36_x86_64_mask(%rip), %xmm0
	movaps		%xmm4, %xmm6
	shufps		$0x4e, %xmm5, %xmm4
	movaps		%xmm3, %xmm7
	shufps		$0x4e, %xmm6, %xmm3
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm4
	movaps		%xmm2, %xmm6
	shufps		$0x4e, %xmm7, %xmm2
	andps		%xmm0, %xmm7
	addps		%xmm7, %xmm3
	movaps		%xmm1, %xmm7
	shufps		$0x4e, %xmm6, %xmm1
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm2
	movaps		%xmm7, %xmm6
	andps		%xmm0, %xmm7
	xorps		%xmm0, %xmm0
	addps		%xmm7, %xmm1
	movlhps		%xmm6, %xmm0

/*
xmm0 in[-,-,0,1]
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/
	
	movaps		%xmm2, %xmm5
	shufps		$0xe4, %xmm3, %xmm5
	shufps		$0xe4, %xmm4, %xmm3
	shufps		$0xe4, %xmm2, %xmm4
	movaps		%xmm5, %xmm2
/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/

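/*
	9-point DCT core: multiply the repacked input pairs by the duplicated COS9
	coefficients and accumulate the sum/difference terms; the register maps
	below give the resulting layout.
*/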
	movaps		(COS9_), %xmm15
	movaps		16(COS9_), %xmm6
	movaps		32(COS9_), %xmm7
	movaps		48(COS9_), %xmm8
	mulps		%xmm15, %xmm5
	addps		%xmm0, %xmm5
	
/*
xmm5 [ta33,tb33,ta66,tb66]
xmm6 COS9_[1,1,2,2]
xmm7 COS9_[5,5,8,8]
xmm8 COS9_[7,7,4,4]
xmm15 COS9_[3,3,6,6]
*/
	movaps		%xmm6, %xmm9
	movaps		%xmm7, %xmm12
	movaps		%xmm8, %xmm13
	mulps		%xmm1, %xmm9
	mulps		%xmm3, %xmm12
	mulps		%xmm4, %xmm13
	addps		%xmm5, %xmm9
	addps		%xmm13, %xmm12
	addps		%xmm9, %xmm12
	
	movaps		%xmm1, %xmm13
	subps		%xmm3, %xmm13
	movaps		%xmm0, %xmm10
	shufps		$0xe0, %xmm2, %xmm10
	movaps		%xmm0, %xmm14
	subps		%xmm10, %xmm14
	subps		%xmm4, %xmm13
	mulps		%xmm15, %xmm13
	addps		%xmm14, %xmm13
	
	movaps		%xmm7, %xmm9
	movaps		%xmm8, %xmm15
	movaps		%xmm6, %xmm14
	mulps		%xmm1, %xmm9
	mulps		%xmm3, %xmm15
	mulps		%xmm4, %xmm14
	subps		%xmm5, %xmm9
	subps		%xmm15, %xmm14
	addps		%xmm9, %xmm14
	
	mulps		%xmm1, %xmm8
	mulps		%xmm3, %xmm6
	mulps		%xmm4, %xmm7
	subps		%xmm5, %xmm8
	subps		%xmm7, %xmm6
	addps		%xmm6, %xmm8
	movaps		%xmm8, %xmm15
	
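/*
	Middle terms (xmm11 below): in[0]-in[4]+in[8]-in[12]+in[16] and
	in[1]-in[5]+in[9]-in[13]+in[17], the latter scaled by
	tfcos36_[8] = 0.5/cos(pi/4) = 1/sqrt(2).
*/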
	movss		32(tfcos36_), %xmm5
	subps		%xmm1, %xmm0
	subps		%xmm2, %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		$0xaf, %xmm0, %xmm0
	mulss		%xmm5, %xmm0
	movaps		%xmm0, %xmm11

/*
xmm12 [1a-0,1b-0, 2a-0, 2b-0]
xmm13 [1a-1,1b-1, 2a-1, 2b-1]
xmm14 [1a-2,1b-2,-2a-2,-2b-2]
xmm15 [1a-3,1b-3,-2a-3,-2b-3]
*/
	movaps		%xmm12, %xmm5
	unpckhps	%xmm13, %xmm5
	unpcklps	%xmm13, %xmm12
	movaps		%xmm14, %xmm6
	unpckhps	%xmm15, %xmm6
	unpcklps	%xmm15, %xmm14
	xorps		dct36_x86_64_sign(%rip), %xmm6

/*
xmm12 [1a-0,1a-1,1b-0,1b-1]
xmm5  [2a-0,2a-1,2b-0,2b-1]
xmm14 [1a-2,1a-3,1b-2,1b-3]
xmm6 [2a-2,2a-3,2b-2,2b-3]
*/

	movaps		%xmm12, %xmm0
	movlhps		%xmm14, %xmm12
	movhlps		%xmm0, %xmm14
	movaps		%xmm5, %xmm0
	movlhps		%xmm6, %xmm0
	movhlps		%xmm5, %xmm6
	movaps		%xmm6, %xmm15

/*
xmm12 tmp1a
xmm0 tmp2a
xmm14 tmp1b
xmm15 tmp2b
*/

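/*
	Butterfly of the tmp1/tmp2 halves: sums and differences of tmp1b/tmp2b are
	scaled by the first and second four tfcos36 twiddles, while the tmp1a/tmp2a
	sums and differences stay unscaled; the resulting tmp[] layout is below.
*/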
	movaps		(tfcos36_), %xmm6
	movaps		16(tfcos36_), %xmm7
	movaps		%xmm15, %xmm10
	addps		%xmm14, %xmm15
	subps		%xmm14, %xmm10
	movaps		%xmm0, %xmm14
	addps		%xmm12, %xmm0
	subps		%xmm12, %xmm14
	mulps		%xmm6, %xmm15
	mulps		%xmm10, %xmm7

/*
%xmm0  tmp[0,1,2,3]
%xmm15 tmp[17,16,15,14]
%xmm14 tmp[8,7,6,5]
%xmm7  tmp[9,10,11,12]
%xmm11 tmp[13,-,4,-]
*/

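/*
	Windowing and output, four tmp[k]/tmp[17-k] pairs at a time: the sums are
	windowed and stored to out2, the differences are windowed, added to out1
	and scattered into ts with a stride of 32 floats (SBLIMIT), i.e. 128 bytes
	between consecutive stores.
*/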
	movaps		%xmm15, %xmm1
	movups		108(w), %xmm2
	movups		92(w), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	movups		36(w), %xmm4
	movups		20(w), %xmm5
	shufps		$0x1b, %xmm5, %xmm5
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(out1), %xmm1
	movups		20(out1), %xmm3
	shufps		$0x1b, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		$0x1b, %xmm0, %xmm0
	movups		%xmm2, 36(out2)
	movups		%xmm0, 20(out2)
	movss		%xmm1, 32*36(ts)
	movss		%xmm3, 32*20(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(ts)
	movss		%xmm4, 32*28(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*40(ts)
	movss		%xmm3, 32*24(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(ts)
	movss		%xmm4, 32*32(ts)
	
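/* same windowing for the middle pair, done with scalar ops: writes out2[13], out2[4], ts[SBLIMIT*13] and ts[SBLIMIT*4] */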
	movhlps		%xmm11, %xmm0
	movaps		%xmm11, %xmm1
	movss		124(w), %xmm2
	movss		88(w), %xmm3
	movss		52(w), %xmm4
	movss		16(w), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm1, %xmm0
	subss		%xmm1, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(out1), %xmm4
	addss		16(out1), %xmm6
	movss		%xmm2, 52(out2)
	movss		%xmm0, 16(out2)
	movss		%xmm4, 32*52(ts)
	movss		%xmm6, 32*16(ts)
	
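/* remaining pairs: writes out2[0..3], out2[14..17], ts[SBLIMIT*0..3] and ts[SBLIMIT*14..17] */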
	movaps		%xmm14, %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(w), %xmm2
	movups		72(w), %xmm3
	shufps		$0x1b, %xmm2, %xmm2
	movlps		56(w), %xmm4
	movhps		64(w), %xmm4
	MOVUAPS		(w), %xmm5
	shufps		$0x1b, %xmm4, %xmm4
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(out1), %xmm1
	movhps		64(out1), %xmm1
	movups		(out1), %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		$0x1b, %xmm2, %xmm2
	movups		%xmm0, (out2)
	movlps		%xmm2, 56(out2)
	movhps		%xmm2, 64(out2)
	movss		%xmm1, 32*56(ts)
	movss		%xmm3, (ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(ts)
	movss		%xmm4, 32*8(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*60(ts)
	movss		%xmm3, 32*4(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(ts)
	movss		%xmm4, 32*12(ts)
	
#ifdef IS_MSABI
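/* Win64 epilogue: restore the callee-saved xmm6-xmm15 and the stack frame */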
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	movaps		144(%rsp), %xmm15
	mov			%rbp, %rsp
	pop			%rbp
#endif
	ret

NONEXEC_STACK