/*
	dct36_sse: SSE optimized dct36

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"
/*
	Register aliases used by dct36_sse (AT&T syntax, 32-bit x86).

	in   - input pointer (1st argument); only read while the 18 inputs are loaded.
	out1 - 2nd argument; shares %edi with "in", which is safe because out1 is
	       loaded only after the last read through "in".
	out2 - 3rd argument; %edx holds the tfcos36 table address before that.
	w    - 4th argument (window table).
	ts   - 5th argument; %eax holds the COS9 table address before that.
	tmp  - 16-byte aligned, 80-byte scratch area on the stack.
*/
#define in %edi
#define out1 %edi
#define out2 %edx
#define w  %ecx
#define ts %eax
#define tmp %esi

/*
	void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/

/*
	Constant tables for dct36_sse. All values are IEEE-754 single precision
	bit patterns; the decimal equivalents are given next to each entry.
	Every table is 16-byte aligned because it is read with movaps / used as
	a memory operand of packed SSE instructions.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/*
	Four rows of four floats, each row holding two cosine values duplicated
	pairwise (lanes 0/1 and lanes 2/3), read at offsets 0, 16, 32 and 48.
*/
dct36_sse_COS9:
	.long 0x3f5db3d7	/* ~0.866025 = cos(30 deg) */
	.long 0x3f5db3d7
	.long 0x3f000000	/*  0.5      = cos(60 deg) */
	.long 0x3f000000
	.long 0x3f7c1c5c	/* ~0.984808 = cos(10 deg) */
	.long 0x3f7c1c5c
	.long 0x3f708fb2	/* ~0.939693 = cos(20 deg) */
	.long 0x3f708fb2
	.long 0x3f248dbb	/* ~0.642788 = cos(50 deg) */
	.long 0x3f248dbb
	.long 0x3e31d0d4	/* ~0.173648 = cos(80 deg) */
	.long 0x3e31d0d4
	.long 0x3eaf1d44	/* ~0.342020 = cos(70 deg) */
	.long 0x3eaf1d44
	.long 0x3f441b7d	/* ~0.766044 = cos(40 deg) */
	.long 0x3f441b7d
	ALIGN16
/*
	Nine scale factors: two rows of four (read with movaps at offsets 0 and 16)
	followed by a single value read with movss at offset 32.
*/
dct36_sse_tfcos36:
	.long 0x3f007d2b	/* ~0.501910 */
	.long 0x3f0483ee	/* ~0.517638 */
	.long 0x3f0d3b7d	/* ~0.551689 */
	.long 0x3f1c4257	/* ~0.610387 */
	.long 0x40b79454	/* ~5.736857 */
	.long 0x3ff746ea	/* ~1.931852 */
	.long 0x3f976fd9	/* ~1.183101 */
	.long 0x3f5f2944	/* ~0.871723 */
	.long 0x3f3504f3	/* ~0.707107 */
	ALIGN16
/* andps mask that keeps lanes 1 and 3 and clears lanes 0 and 2. */
dct36_sse_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
/* Sign bit of every lane; xorps with this negates all four floats. */
dct36_sse_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
	.globl ASM_NAME(dct36_sse)
Packit c32a2d
ASM_NAME(dct36_sse):
Packit c32a2d
	push		%ebp
Packit c32a2d
	mov			%esp, %ebp
Packit c32a2d
	and			$-16, %esp
Packit c32a2d
	sub			$80, %esp
Packit c32a2d
	push		%ebx
Packit c32a2d
	push		%esi
Packit c32a2d
	push		%edi
Packit c32a2d
	lea			12(%esp), tmp
Packit c32a2d
	movl		8(%ebp), in
Packit c32a2d
Packit c32a2d
	GET_GOT
Packit c32a2d
	
Packit c32a2d
	lea			LOCAL_VAR(dct36_sse_COS9), %eax
Packit c32a2d
	lea			LOCAL_VAR(dct36_sse_tfcos36), %edx
Packit c32a2d
Packit c32a2d
	xorps		%xmm0, %xmm0
Packit c32a2d
	xorps		%xmm5, %xmm5
Packit c32a2d
	movlps		64(in), %xmm5
Packit c32a2d
	movups		48(in), %xmm4
Packit c32a2d
	movups		32(in), %xmm3
Packit c32a2d
	movups		16(in), %xmm2
Packit c32a2d
	movups		(in), %xmm1
Packit c32a2d
	movaps		%xmm5, %xmm6
Packit c32a2d
	shufps		$0xe1, %xmm6, %xmm6
Packit c32a2d
	movaps		%xmm4, %xmm7
Packit c32a2d
	shufps		$0x93, %xmm7, %xmm7
Packit c32a2d
	movss		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm5
Packit c32a2d
	movaps		%xmm3, %xmm6
Packit c32a2d
	shufps		$0x93, %xmm6, %xmm6
Packit c32a2d
	movss		%xmm6, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm4
Packit c32a2d
	movaps		%xmm2, %xmm7
Packit c32a2d
	shufps		$0x93, %xmm7, %xmm7
Packit c32a2d
	movss		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm3
Packit c32a2d
	movaps		%xmm1, %xmm6
Packit c32a2d
	shufps		$0x93, %xmm6, %xmm6
Packit c32a2d
	movss		%xmm6, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm2
Packit c32a2d
	movss		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm1
Packit c32a2d
	
Packit c32a2d
	movaps		LOCAL_VAR(dct36_sse_mask), %xmm0
Packit c32a2d
	movaps		%xmm4, %xmm6
Packit c32a2d
	shufps		$0x4e, %xmm5, %xmm4
Packit c32a2d
	movaps		%xmm3, %xmm7
Packit c32a2d
	shufps		$0x4e, %xmm6, %xmm3
Packit c32a2d
	andps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm4
Packit c32a2d
	movaps		%xmm2, %xmm6
Packit c32a2d
	shufps		$0x4e, %xmm7, %xmm2
Packit c32a2d
	andps		%xmm0, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm3
Packit c32a2d
	movaps		%xmm1, %xmm7
Packit c32a2d
	shufps		$0x4e, %xmm6, %xmm1
Packit c32a2d
	andps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm2
Packit c32a2d
	movaps		%xmm7, %xmm6
Packit c32a2d
	andps		%xmm0, %xmm7
Packit c32a2d
	xorps		%xmm0, %xmm0
Packit c32a2d
	addps		%xmm7, %xmm1
Packit c32a2d
	movlhps		%xmm6, %xmm0
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm0 in[-,-,0,1]
Packit c32a2d
xmm1 in[2,3,4,5]
Packit c32a2d
xmm2 in[6,7,8,9]
Packit c32a2d
xmm3 in[10,11,12,13]
Packit c32a2d
xmm4 in[14,15,16,17]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm2, %xmm5
Packit c32a2d
	shufps		$0xe4, %xmm3, %xmm5
Packit c32a2d
	shufps		$0xe4, %xmm4, %xmm3
Packit c32a2d
	shufps		$0xe4, %xmm2, %xmm4
Packit c32a2d
	movaps		%xmm5, %xmm2
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm2 in[6,7,12,13]
Packit c32a2d
xmm3 in[10,11,16,17]
Packit c32a2d
xmm4 in[14,15,8,9]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	mulps		(%eax), %xmm5
Packit c32a2d
	addps		%xmm0, %xmm5
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm0, (tmp)
Packit c32a2d
	movaps		%xmm2, 16(tmp)
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
0(tmp) in[-,-,0,1]
Packit c32a2d
xmm5 [ta33,tb33,ta66,tb66]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm1, %xmm6
Packit c32a2d
	subps		%xmm3, %xmm6
Packit c32a2d
	subps		%xmm4, %xmm6
Packit c32a2d
	xorps		%xmm7, %xmm7
Packit c32a2d
	shufps		$0xe0, %xmm2, %xmm7
Packit c32a2d
	mulps		(%eax), %xmm6
Packit c32a2d
	subps		%xmm7, %xmm0
Packit c32a2d
	addps		%xmm0, %xmm6
Packit c32a2d
	movaps		%xmm6, 48(tmp)
Packit c32a2d
	
Packit c32a2d
	movaps		16(%eax), %xmm2
Packit c32a2d
Packit c32a2d
	movaps		%xmm1, %xmm0
Packit c32a2d
	movaps		%xmm3, %xmm6
Packit c32a2d
	movaps		%xmm4, %xmm7
Packit c32a2d
	mulps		%xmm2, %xmm0
Packit c32a2d
	mulps		32(%eax), %xmm6
Packit c32a2d
	mulps		48(%eax), %xmm7
Packit c32a2d
	addps		%xmm5, %xmm0
Packit c32a2d
	addps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm0
Packit c32a2d
	movaps		%xmm0, 32(tmp)
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm1, %xmm0
Packit c32a2d
	movaps		%xmm3, %xmm6
Packit c32a2d
	movaps		%xmm4, %xmm7
Packit c32a2d
	mulps		32(%eax), %xmm0
Packit c32a2d
	mulps		48(%eax), %xmm6
Packit c32a2d
	mulps		%xmm2, %xmm7
Packit c32a2d
	subps		%xmm5, %xmm0
Packit c32a2d
	subps		%xmm6, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm0
Packit c32a2d
	movaps		%xmm0, 64(tmp)
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm1, %xmm6
Packit c32a2d
	movaps		%xmm4, %xmm7
Packit c32a2d
	mulps		48(%eax), %xmm6
Packit c32a2d
	mulps		%xmm3, %xmm2
Packit c32a2d
	mulps		32(%eax), %xmm7
Packit c32a2d
	subps		%xmm5, %xmm6
Packit c32a2d
	subps		%xmm7, %xmm2
Packit c32a2d
	addps		%xmm2, %xmm6
Packit c32a2d
	
Packit c32a2d
	movaps		(tmp), %xmm0
Packit c32a2d
	movss		32(%edx), %xmm5
Packit c32a2d
	subps		%xmm1, %xmm0
Packit c32a2d
	subps		16(tmp), %xmm4
Packit c32a2d
	addps		%xmm3, %xmm0
Packit c32a2d
	addps		%xmm4, %xmm0
Packit c32a2d
	shufps		$0xaf, %xmm0, %xmm0
Packit c32a2d
	mulss		%xmm5, %xmm0
Packit c32a2d
	movaps		%xmm0, (tmp)
Packit c32a2d
	
Packit c32a2d
	movaps		32(tmp), %xmm0
Packit c32a2d
	movaps		48(tmp), %xmm1
Packit c32a2d
	movaps		64(tmp), %xmm2
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm0 [1a-0,1b-0, 2a-0, 2b-0]
Packit c32a2d
xmm1 [1a-1,1b-1, 2a-1, 2b-1]
Packit c32a2d
xmm2 [1a-2,1b-2,-2a-2,-2b-2]
Packit c32a2d
xmm6 [1a-3,1b-3,-2a-3,-2b-3]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm0, %xmm3
Packit c32a2d
	unpcklps	%xmm1, %xmm0
Packit c32a2d
	unpckhps	%xmm1, %xmm3
Packit c32a2d
	movaps		%xmm2, %xmm5
Packit c32a2d
	unpcklps	%xmm6, %xmm2
Packit c32a2d
	unpckhps	%xmm6, %xmm5
Packit c32a2d
	xorps		LOCAL_VAR(dct36_sse_sign), %xmm5
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm0 [1a-0,1a-1,1b-0,1b-1]
Packit c32a2d
xmm3 [2a-0,2a-1,2b-0,2b-1]
Packit c32a2d
xmm2 [1a-2,1a-3,1b-2,1b-3]
Packit c32a2d
xmm5 [2a-2,2a-3,2b-2,2b-3]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm0, %xmm1
Packit c32a2d
	movlhps		%xmm2, %xmm0
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movaps		%xmm3, %xmm4
Packit c32a2d
	movlhps		%xmm5, %xmm3
Packit c32a2d
	movhlps		%xmm4, %xmm5
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm0 tmp1a
Packit c32a2d
xmm3 tmp2a
Packit c32a2d
xmm2 tmp1b
Packit c32a2d
xmm5 tmp2b
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		(%edx), %xmm6
Packit c32a2d
	movaps		16(%edx), %xmm7
Packit c32a2d
	movaps		%xmm5, %xmm1
Packit c32a2d
	addps		%xmm2, %xmm5
Packit c32a2d
	subps		%xmm2, %xmm1
Packit c32a2d
	movaps		%xmm3, %xmm2
Packit c32a2d
	addps		%xmm0, %xmm3
Packit c32a2d
	subps		%xmm0, %xmm2
Packit c32a2d
	mulps		%xmm6, %xmm5
Packit c32a2d
	mulps		%xmm1, %xmm7
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm2, 16(tmp)
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
%xmm3 tmp[0,1,2,3]
Packit c32a2d
%xmm5 tmp[17,16,15,14]
Packit c32a2d
16(tmp) tmp[8,7,6,5]
Packit c32a2d
%xmm7 tmp[9,10,11,12]
Packit c32a2d
0(tmp) tmp[13,-,4,-]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movl		12(%ebp), out1
Packit c32a2d
	movl		16(%ebp), out2
Packit c32a2d
	movl		20(%ebp), w
Packit c32a2d
	movl		24(%ebp), ts
Packit c32a2d
Packit c32a2d
	movaps		%xmm3, %xmm0
Packit c32a2d
	movaps		%xmm5, %xmm1
Packit c32a2d
	movups		108(w), %xmm2
Packit c32a2d
	movups		92(w), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm3, %xmm3
Packit c32a2d
	movups		36(w), %xmm4
Packit c32a2d
	movups		20(w), %xmm5
Packit c32a2d
	shufps		$0x1b, %xmm5, %xmm5
Packit c32a2d
	movaps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm1, %xmm0
Packit c32a2d
	subps		%xmm1, %xmm6
Packit c32a2d
	mulps		%xmm0, %xmm2
Packit c32a2d
	mulps		%xmm3, %xmm0
Packit c32a2d
	mulps		%xmm6, %xmm4
Packit c32a2d
	mulps		%xmm5, %xmm6
Packit c32a2d
	movups		36(out1), %xmm1
Packit c32a2d
	movups		20(out1), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm6, %xmm6
Packit c32a2d
	addps		%xmm4, %xmm1
Packit c32a2d
	addps		%xmm6, %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm0, %xmm0
Packit c32a2d
	movups		%xmm2, 36(out2)
Packit c32a2d
	movups		%xmm0, 20(out2)
Packit c32a2d
	movss		%xmm1, 32*36(ts)
Packit c32a2d
	movss		%xmm3, 32*20(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*44(ts)
Packit c32a2d
	movss		%xmm4, 32*28(ts)
Packit c32a2d
	shufps		$0xb1, %xmm1, %xmm1
Packit c32a2d
	shufps		$0xb1, %xmm3, %xmm3
Packit c32a2d
	movss		%xmm1, 32*40(ts)
Packit c32a2d
	movss		%xmm3, 32*24(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*48(ts)
Packit c32a2d
	movss		%xmm4, 32*32(ts)
Packit c32a2d
	
Packit c32a2d
	movss		8(tmp), %xmm0
Packit c32a2d
	movss		(tmp), %xmm1
Packit c32a2d
	movss		124(w), %xmm2
Packit c32a2d
	movss		88(w), %xmm3
Packit c32a2d
	movss		52(w), %xmm4
Packit c32a2d
	movss		16(w), %xmm5
Packit c32a2d
	movss		%xmm0, %xmm6
Packit c32a2d
	addss		%xmm1, %xmm0
Packit c32a2d
	subss		%xmm1, %xmm6
Packit c32a2d
	mulss		%xmm0, %xmm2
Packit c32a2d
	mulss		%xmm3, %xmm0
Packit c32a2d
	mulss		%xmm6, %xmm4
Packit c32a2d
	mulss		%xmm5, %xmm6
Packit c32a2d
	addss		52(out1), %xmm4
Packit c32a2d
	addss		16(out1), %xmm6
Packit c32a2d
	movss		%xmm2, 52(out2)
Packit c32a2d
	movss		%xmm0, 16(out2)
Packit c32a2d
	movss		%xmm4, 32*52(ts)
Packit c32a2d
	movss		%xmm6, 32*16(ts)
Packit c32a2d
	
Packit c32a2d
	movaps		16(tmp), %xmm0
Packit c32a2d
	movaps		%xmm7, %xmm1
Packit c32a2d
	MOVUAPS		128(w), %xmm2
Packit c32a2d
	movups		72(w), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm2, %xmm2
Packit c32a2d
	movlps		56(w), %xmm4
Packit c32a2d
	movhps		64(w), %xmm4
Packit c32a2d
	MOVUAPS		(w), %xmm5
Packit c32a2d
	shufps		$0x1b, %xmm4, %xmm4
Packit c32a2d
	movaps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm1, %xmm0
Packit c32a2d
	subps		%xmm1, %xmm6
Packit c32a2d
	mulps		%xmm0, %xmm2
Packit c32a2d
	mulps		%xmm3, %xmm0
Packit c32a2d
	mulps		%xmm6, %xmm4
Packit c32a2d
	mulps		%xmm5, %xmm6
Packit c32a2d
	movlps		56(out1), %xmm1
Packit c32a2d
	movhps		64(out1), %xmm1
Packit c32a2d
	movups		(out1), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm4, %xmm4
Packit c32a2d
	addps		%xmm6, %xmm3
Packit c32a2d
	addps		%xmm4, %xmm1
Packit c32a2d
	shufps		$0x1b, %xmm2, %xmm2
Packit c32a2d
	movups		%xmm0, (out2)
Packit c32a2d
	movlps		%xmm2, 56(out2)
Packit c32a2d
	movhps		%xmm2, 64(out2)
Packit c32a2d
	movss		%xmm1, 32*56(ts)
Packit c32a2d
	movss		%xmm3, (ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*64(ts)
Packit c32a2d
	movss		%xmm4, 32*8(ts)
Packit c32a2d
	shufps		$0xb1, %xmm1, %xmm1
Packit c32a2d
	shufps		$0xb1, %xmm3, %xmm3
Packit c32a2d
	movss		%xmm1, 32*60(ts)
Packit c32a2d
	movss		%xmm3, 32*4(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*68(ts)
Packit c32a2d
	movss		%xmm4, 32*12(ts)
Packit c32a2d
	
Packit c32a2d
	pop			%edi
Packit c32a2d
	pop			%esi
Packit c32a2d
	pop			%ebx
Packit c32a2d
	mov			%ebp, %esp
Packit c32a2d
	pop			%ebp
Packit c32a2d
	
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
NONEXEC_STACK