/*
	dct36_sse: SSE optimized dct36

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#define in %edi
#define out1 %edi
#define out2 %edx
#define w  %ecx
#define ts %eax
#define tmp %esi

/*
	void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/
	
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
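/*
	Constants are raw IEEE-754 single-precision bit patterns.
	dct36_sse_COS9: the COS9 cosine coefficients, each stored twice so one
	packed multiply applies a coefficient to a pair of lanes.
*/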
	ALIGN16
dct36_sse_COS9:
	.long 0x3f5db3d7
	.long 0x3f5db3d7
	.long 0x3f000000
	.long 0x3f000000
	.long 0x3f7c1c5c
	.long 0x3f7c1c5c
	.long 0x3f708fb2
	.long 0x3f708fb2
	.long 0x3f248dbb
	.long 0x3f248dbb
	.long 0x3e31d0d4
	.long 0x3e31d0d4
	.long 0x3eaf1d44
	.long 0x3eaf1d44
	.long 0x3f441b7d
	.long 0x3f441b7d
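/*
	dct36_sse_tfcos36: the nine tfcos36 factors used for the final outputs;
	the last entry, 0x3f3504f3, is sqrt(0.5).
*/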
	ALIGN16
dct36_sse_tfcos36:
	.long 0x3f007d2b
	.long 0x3f0483ee
	.long 0x3f0d3b7d
	.long 0x3f1c4257
	.long 0x40b79454
	.long 0x3ff746ea
	.long 0x3f976fd9
	.long 0x3f5f2944
	.long 0x3f3504f3
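/*
	dct36_sse_mask keeps only lanes 1 and 3 of a packed register;
	dct36_sse_sign flips the sign of all four packed floats when XORed in.
*/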
	ALIGN16
dct36_sse_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
dct36_sse_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
	ALIGN16
	.globl ASM_NAME(dct36_sse)
ASM_NAME(dct36_sse):
	push		%ebp
	mov			%esp, %ebp
	and			$-16, %esp
	sub			$80, %esp
	push		%ebx
	push		%esi
	push		%edi
	lea			12(%esp), tmp
	movl		8(%ebp), in
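/*
	Prologue: %esp is aligned to 16 bytes, 80 bytes of scratch are reserved
	and the callee-saved registers are saved. tmp points at the 16-byte
	aligned scratch area (the three pushes account for the 12-byte offset);
	in holds the first argument, inbuf.
*/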

	GET_GOT
	
	lea			LOCAL_VAR(dct36_sse_COS9), %eax
	lea			LOCAL_VAR(dct36_sse_tfcos36), %edx
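/*
	%eax -> dct36_sse_COS9, %edx -> dct36_sse_tfcos36 (GET_GOT/LOCAL_VAR from
	mangle.h keep the addressing position independent).
*/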

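/*
	First accumulation pass: in[i] += in[i-1] for i = 1..17,
	computed four elements at a time against shifted copies.
*/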
	xorps		%xmm0, %xmm0
	xorps		%xmm5, %xmm5
	movlps		64(in), %xmm5
	movups		48(in), %xmm4
	movups		32(in), %xmm3
	movups		16(in), %xmm2
	movups		(in), %xmm1
	movaps		%xmm5, %xmm6
	shufps		$0xe1, %xmm6, %xmm6
	movaps		%xmm4, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm5
	movaps		%xmm3, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm4
	movaps		%xmm2, %xmm7
	shufps		$0x93, %xmm7, %xmm7
	movss		%xmm7, %xmm6
	addps		%xmm6, %xmm3
	movaps		%xmm1, %xmm6
	shufps		$0x93, %xmm6, %xmm6
	movss		%xmm6, %xmm7
	addps		%xmm7, %xmm2
	movss		%xmm0, %xmm6
	addps		%xmm6, %xmm1
	
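/*
	Second pass: in[i] += in[i-2] for odd i = 3,5,...,17, using the lane mask;
	the registers are then rearranged as noted below.
*/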
	movaps		LOCAL_VAR(dct36_sse_mask), %xmm0
	movaps		%xmm4, %xmm6
	shufps		$0x4e, %xmm5, %xmm4
	movaps		%xmm3, %xmm7
	shufps		$0x4e, %xmm6, %xmm3
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm4
	movaps		%xmm2, %xmm6
	shufps		$0x4e, %xmm7, %xmm2
	andps		%xmm0, %xmm7
	addps		%xmm7, %xmm3
	movaps		%xmm1, %xmm7
	shufps		$0x4e, %xmm6, %xmm1
	andps		%xmm0, %xmm6
	addps		%xmm6, %xmm2
	movaps		%xmm7, %xmm6
	andps		%xmm0, %xmm7
	xorps		%xmm0, %xmm0
	addps		%xmm7, %xmm1
	movlhps		%xmm6, %xmm0

/*
xmm0 in[-,-,0,1]
xmm1 in[2,3,4,5]
xmm2 in[6,7,8,9]
xmm3 in[10,11,12,13]
xmm4 in[14,15,16,17]
*/

	movaps		%xmm2, %xmm5
	shufps		$0xe4, %xmm3, %xmm5
	shufps		$0xe4, %xmm4, %xmm3
	shufps		$0xe4, %xmm2, %xmm4
	movaps		%xmm5, %xmm2

/*
xmm2 in[6,7,12,13]
xmm3 in[10,11,16,17]
xmm4 in[14,15,8,9]
*/

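/*
	COS9-weighted partial sums of the rearranged inputs: three of the four
	results are parked at 32/48/64(tmp), the fourth stays in %xmm6.
*/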
	mulps		(%eax), %xmm5
	addps		%xmm0, %xmm5
	
	movaps		%xmm0, (tmp)
	movaps		%xmm2, 16(tmp)

/*
0(tmp) in[-,-,0,1]
xmm5 [ta33,tb33,ta66,tb66]
*/

	movaps		%xmm1, %xmm6
	subps		%xmm3, %xmm6
	subps		%xmm4, %xmm6
	xorps		%xmm7, %xmm7
	shufps		$0xe0, %xmm2, %xmm7
	mulps		(%eax), %xmm6
	subps		%xmm7, %xmm0
	addps		%xmm0, %xmm6
	movaps		%xmm6, 48(tmp)
	
	movaps		16(%eax), %xmm2

	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		%xmm2, %xmm0
	mulps		32(%eax), %xmm6
	mulps		48(%eax), %xmm7
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm6
	addps		%xmm6, %xmm0
	movaps		%xmm0, 32(tmp)
	
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm6
	movaps		%xmm4, %xmm7
	mulps		32(%eax), %xmm0
	mulps		48(%eax), %xmm6
	mulps		%xmm2, %xmm7
	subps		%xmm5, %xmm0
	subps		%xmm6, %xmm7
	addps		%xmm7, %xmm0
	movaps		%xmm0, 64(tmp)
	
	movaps		%xmm1, %xmm6
	movaps		%xmm4, %xmm7
	mulps		48(%eax), %xmm6
	mulps		%xmm3, %xmm2
	mulps		32(%eax), %xmm7
	subps		%xmm5, %xmm6
	subps		%xmm7, %xmm2
	addps		%xmm2, %xmm6
	
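/*
	Centre terms: the alternating-sign sums in[1]-in[5]+in[9]-in[13]+in[17]
	(scaled by sqrt(0.5)) and in[0]-in[4]+in[8]-in[12]+in[16],
	parked at 0(tmp) and 8(tmp) for the scalar output pair below.
*/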
	movaps		(tmp), %xmm0
	movss		32(%edx), %xmm5
	subps		%xmm1, %xmm0
	subps		16(tmp), %xmm4
	addps		%xmm3, %xmm0
	addps		%xmm4, %xmm0
	shufps		$0xaf, %xmm0, %xmm0
	mulss		%xmm5, %xmm0
	movaps		%xmm0, (tmp)
	
	movaps		32(tmp), %xmm0
	movaps		48(tmp), %xmm1
	movaps		64(tmp), %xmm2

/*
xmm0 [1a-0,1b-0, 2a-0, 2b-0]
xmm1 [1a-1,1b-1, 2a-1, 2b-1]
xmm2 [1a-2,1b-2,-2a-2,-2b-2]
xmm6 [1a-3,1b-3,-2a-3,-2b-3]
*/

	movaps		%xmm0, %xmm3
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm3
	movaps		%xmm2, %xmm5
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm5
	xorps		LOCAL_VAR(dct36_sse_sign), %xmm5

/*
xmm0 [1a-0,1a-1,1b-0,1b-1]
xmm3 [2a-0,2a-1,2b-0,2b-1]
xmm2 [1a-2,1a-3,1b-2,1b-3]
xmm5 [2a-2,2a-3,2b-2,2b-3]
*/

	movaps		%xmm0, %xmm1
	movlhps		%xmm2, %xmm0
	movhlps		%xmm1, %xmm2
	movaps		%xmm3, %xmm4
	movlhps		%xmm5, %xmm3
	movhlps		%xmm4, %xmm5

/*
xmm0 tmp1a
xmm3 tmp2a
xmm2 tmp1b
xmm5 tmp2b
*/

	movaps		(%edx), %xmm6
	movaps		16(%edx), %xmm7
	movaps		%xmm5, %xmm1
	addps		%xmm2, %xmm5
	subps		%xmm2, %xmm1
	movaps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	subps		%xmm0, %xmm2
	mulps		%xmm6, %xmm5
	mulps		%xmm1, %xmm7
	
	movaps		%xmm2, 16(tmp)

/*
%xmm3 tmp[0,1,2,3]
%xmm5 tmp[17,16,15,14]
16(tmp) tmp[8,7,6,5]
%xmm7 tmp[9,10,11,12]
0(tmp) tmp[13,-,4,-]
*/

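/*
	Load the remaining arguments: o1 is only read (its contents are added
	into the time samples), o2 only receives the windowed values for later
	use, w is the window table and ts the time-sample output buffer.
*/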
	movl		12(%ebp), out1
	movl		16(%ebp), out2
	movl		20(%ebp), w
	movl		24(%ebp), ts

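/*
	Output samples 5..12: the sums go windowed to out2, the differences are
	windowed, added to the values from out1 and scattered into ts with a
	stride of 32 floats.
*/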
	movaps		%xmm3, %xmm0
	movaps		%xmm5, %xmm1
	movups		108(w), %xmm2
	movups		92(w), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	movups		36(w), %xmm4
	movups		20(w), %xmm5
	shufps		$0x1b, %xmm5, %xmm5
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movups		36(out1), %xmm1
	movups		20(out1), %xmm3
	shufps		$0x1b, %xmm6, %xmm6
	addps		%xmm4, %xmm1
	addps		%xmm6, %xmm3
	shufps		$0x1b, %xmm0, %xmm0
	movups		%xmm2, 36(out2)
	movups		%xmm0, 20(out2)
	movss		%xmm1, 32*36(ts)
	movss		%xmm3, 32*20(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*44(ts)
	movss		%xmm4, 32*28(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*40(ts)
	movss		%xmm3, 32*24(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*48(ts)
	movss		%xmm4, 32*32(ts)
	
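/*
	Output samples 4 and 13: the scalar pair saved at 0(tmp)/8(tmp), windowed
	with w[4], w[13], w[22] and w[31].
*/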
	movss		8(tmp), %xmm0
	movss		(tmp), %xmm1
	movss		124(w), %xmm2
	movss		88(w), %xmm3
	movss		52(w), %xmm4
	movss		16(w), %xmm5
	movss		%xmm0, %xmm6
	addss		%xmm1, %xmm0
	subss		%xmm1, %xmm6
	mulss		%xmm0, %xmm2
	mulss		%xmm3, %xmm0
	mulss		%xmm6, %xmm4
	mulss		%xmm5, %xmm6
	addss		52(out1), %xmm4
	addss		16(out1), %xmm6
	movss		%xmm2, 52(out2)
	movss		%xmm0, 16(out2)
	movss		%xmm4, 32*52(ts)
	movss		%xmm6, 32*16(ts)
	
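/*
	Output samples 0..3 and 14..17: same window/accumulate/scatter pattern,
	this time on the values held in 16(tmp) and %xmm7.
*/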
	movaps		16(tmp), %xmm0
	movaps		%xmm7, %xmm1
	MOVUAPS		128(w), %xmm2
	movups		72(w), %xmm3
	shufps		$0x1b, %xmm2, %xmm2
	movlps		56(w), %xmm4
	movhps		64(w), %xmm4
	MOVUAPS		(w), %xmm5
	shufps		$0x1b, %xmm4, %xmm4
	movaps		%xmm0, %xmm6
	addps		%xmm1, %xmm0
	subps		%xmm1, %xmm6
	mulps		%xmm0, %xmm2
	mulps		%xmm3, %xmm0
	mulps		%xmm6, %xmm4
	mulps		%xmm5, %xmm6
	movlps		56(out1), %xmm1
	movhps		64(out1), %xmm1
	movups		(out1), %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	addps		%xmm6, %xmm3
	addps		%xmm4, %xmm1
	shufps		$0x1b, %xmm2, %xmm2
	movups		%xmm0, (out2)
	movlps		%xmm2, 56(out2)
	movhps		%xmm2, 64(out2)
	movss		%xmm1, 32*56(ts)
	movss		%xmm3, (ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*64(ts)
	movss		%xmm4, 32*8(ts)
	shufps		$0xb1, %xmm1, %xmm1
	shufps		$0xb1, %xmm3, %xmm3
	movss		%xmm1, 32*60(ts)
	movss		%xmm3, 32*4(ts)
	movhlps		%xmm1, %xmm2
	movhlps		%xmm3, %xmm4
	movss		%xmm2, 32*68(ts)
	movss		%xmm4, 32*12(ts)
	
	pop			%edi
	pop			%esi
	pop			%ebx
	mov			%ebp, %esp
	pop			%ebp
	
	ret

NONEXEC_STACK