/*
	dct36_x86_64: SSE optimized dct36 for x86-64

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases used throughout this file.

	in, out1, out2, w, ts  - the five pointer arguments of dct36_x86_64
	COS9_, tfcos36_        - scratch registers that the function loads with
	                         the addresses of its two coefficient tables

	With IS_MSABI defined the first four arguments are taken from the
	Microsoft x64 argument registers (rcx, rdx, r8, r9); the fifth one is
	passed on the stack, so ts is only a scratch register (r10) that the
	function prologue fills from the stack.
	Otherwise the System V AMD64 argument registers (rdi, rsi, rdx, rcx, r8)
	are used directly.
*/
#ifdef IS_MSABI
#define in %rcx
#define out1 %rdx
#define out2 %r8
#define w  %r9
#define ts %r10
#define COS9_ %rax
#define tfcos36_ %r11
#else
#define in %rdi
#define out1 %rsi
#define out2 %rdx
#define w  %rcx
#define ts %r8
#define COS9_ %rax
#define tfcos36_ %r9
#endif

/*
	C prototype of the routine implemented in this file:
	void dct36_x86_64(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf);
*/

/*
	Constant tables for dct36_x86_64.

	All values are single-precision floats written as raw IEEE-754 bit
	patterns; the decimal values in the comments are rounded.
	The tables are placed in .rodata, except when __APPLE__ is defined,
	where .data is used instead.
	Every table starts on a 16-byte boundary (ALIGN16) because the function
	reads them with movaps / aligned memory operands.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/*
	Cosine coefficients, each one duplicated so that a 16-byte load yields
	a pair-wise vector [c,c,d,d].  The function loads the four vectors at
	byte offsets 0, 16, 32 and 48 and labels them COS9_[3,3,6,6],
	COS9_[1,1,2,2], COS9_[5,5,8,8] and COS9_[7,7,4,4].
*/
dct36_x86_64_COS9:
	.long 0x3f5db3d7	/* 0.866025 */
	.long 0x3f5db3d7
	.long 0x3f000000	/* 0.5 */
	.long 0x3f000000
	.long 0x3f7c1c5c	/* 0.984808 */
	.long 0x3f7c1c5c
	.long 0x3f708fb2	/* 0.939693 */
	.long 0x3f708fb2
	.long 0x3f248dbb	/* 0.642788 */
	.long 0x3f248dbb
	.long 0x3e31d0d4	/* 0.173648 */
	.long 0x3e31d0d4
	.long 0x3eaf1d44	/* 0.342020 */
	.long 0x3eaf1d44
	.long 0x3f441b7d	/* 0.766044 */
	.long 0x3f441b7d
	ALIGN16
/*
	Nine scale factors.  The function reads the first four as one vector
	(offset 0), the next four as a second vector (offset 16) and the last
	one as a scalar (offset 32).
*/
dct36_x86_64_tfcos36:
	.long 0x3f007d2b	/* 0.501910 */
	.long 0x3f0483ee	/* 0.517638 */
	.long 0x3f0d3b7d	/* 0.551689 */
	.long 0x3f1c4257	/* 0.610387 */
	.long 0x40b79454	/* 5.736857 */
	.long 0x3ff746ea	/* 1.931852 */
	.long 0x3f976fd9	/* 1.183101 */
	.long 0x3f5f2944	/* 0.871723 */
	.long 0x3f3504f3	/* 0.707107 */
	ALIGN16
/* andps mask that keeps lanes 1 and 3 and clears lanes 0 and 2 */
dct36_x86_64_mask:
	.long 0,0xffffffff,0,0xffffffff
	ALIGN16
/* xorps mask that flips the sign bit of all four lanes */
dct36_x86_64_sign:
	.long 0x80000000,0x80000000,0x80000000,0x80000000
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
	.globl ASM_NAME(dct36_x86_64)
Packit c32a2d
ASM_NAME(dct36_x86_64):
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	push		%rbp
Packit c32a2d
	mov			%rsp, %rbp
Packit c32a2d
	sub			$160, %rsp
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm12, 96(%rsp)
Packit c32a2d
	movaps		%xmm13, 112(%rsp)
Packit c32a2d
	movaps		%xmm14, 128(%rsp)
Packit c32a2d
	movaps		%xmm15, 144(%rsp)
Packit c32a2d
	movq		48(%rbp), ts
Packit c32a2d
#endif
Packit c32a2d
	lea			dct36_x86_64_COS9(%rip), COS9_
Packit c32a2d
	lea			dct36_x86_64_tfcos36(%rip), tfcos36_
Packit c32a2d
	
Packit c32a2d
	xorps		%xmm5, %xmm5
Packit c32a2d
	movups		(in), %xmm1
Packit c32a2d
	movups		16(in), %xmm2
Packit c32a2d
	movups		32(in), %xmm3
Packit c32a2d
	movups		48(in), %xmm4
Packit c32a2d
	movlps		64(in), %xmm5
Packit c32a2d
	xorps		%xmm6, %xmm6
Packit c32a2d
	movaps		%xmm1, %xmm7
Packit c32a2d
	shufps		$0x93, %xmm7, %xmm7
Packit c32a2d
	movaps		%xmm2, %xmm8
Packit c32a2d
	shufps		$0x93, %xmm8, %xmm8
Packit c32a2d
	movaps		%xmm3, %xmm9
Packit c32a2d
	shufps		$0x93, %xmm9, %xmm9
Packit c32a2d
	movaps		%xmm4, %xmm10
Packit c32a2d
	shufps		$0x93, %xmm10, %xmm10
Packit c32a2d
	movaps		%xmm5, %xmm11
Packit c32a2d
	shufps		$0xe1, %xmm11, %xmm11
Packit c32a2d
	movss		%xmm10, %xmm11
Packit c32a2d
	addps		%xmm11, %xmm5
Packit c32a2d
	movss		%xmm9, %xmm10
Packit c32a2d
	addps		%xmm10, %xmm4
Packit c32a2d
	movss		%xmm8, %xmm9
Packit c32a2d
	addps		%xmm9, %xmm3
Packit c32a2d
	movss		%xmm7, %xmm8
Packit c32a2d
	addps		%xmm8, %xmm2
Packit c32a2d
	movss		%xmm6, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm1
Packit c32a2d
	
Packit c32a2d
	movaps		dct36_x86_64_mask(%rip), %xmm0
Packit c32a2d
	movaps		%xmm4, %xmm6
Packit c32a2d
	shufps		$0x4e, %xmm5, %xmm4
Packit c32a2d
	movaps		%xmm3, %xmm7
Packit c32a2d
	shufps		$0x4e, %xmm6, %xmm3
Packit c32a2d
	andps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm4
Packit c32a2d
	movaps		%xmm2, %xmm6
Packit c32a2d
	shufps		$0x4e, %xmm7, %xmm2
Packit c32a2d
	andps		%xmm0, %xmm7
Packit c32a2d
	addps		%xmm7, %xmm3
Packit c32a2d
	movaps		%xmm1, %xmm7
Packit c32a2d
	shufps		$0x4e, %xmm6, %xmm1
Packit c32a2d
	andps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm2
Packit c32a2d
	movaps		%xmm7, %xmm6
Packit c32a2d
	andps		%xmm0, %xmm7
Packit c32a2d
	xorps		%xmm0, %xmm0
Packit c32a2d
	addps		%xmm7, %xmm1
Packit c32a2d
	movlhps		%xmm6, %xmm0
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm0 in[-,-,0,1]
Packit c32a2d
xmm1 in[2,3,4,5]
Packit c32a2d
xmm2 in[6,7,8,9]
Packit c32a2d
xmm3 in[10,11,12,13]
Packit c32a2d
xmm4 in[14,15,16,17]
Packit c32a2d
*/
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm2, %xmm5
Packit c32a2d
	shufps		$0xe4, %xmm3, %xmm5
Packit c32a2d
	shufps		$0xe4, %xmm4, %xmm3
Packit c32a2d
	shufps		$0xe4, %xmm2, %xmm4
Packit c32a2d
	movaps		%xmm5, %xmm2
Packit c32a2d
/*
Packit c32a2d
xmm2 in[6,7,12,13]
Packit c32a2d
xmm3 in[10,11,16,17]
Packit c32a2d
xmm4 in[14,15,8,9]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		(COS9_), %xmm15
Packit c32a2d
	movaps		16(COS9_), %xmm6
Packit c32a2d
	movaps		32(COS9_), %xmm7
Packit c32a2d
	movaps		48(COS9_), %xmm8
Packit c32a2d
	mulps		%xmm15, %xmm5
Packit c32a2d
	addps		%xmm0, %xmm5
Packit c32a2d
	
Packit c32a2d
/*
Packit c32a2d
xmm5 [ta33,tb33,ta66,tb66]
Packit c32a2d
xmm6 COS9_[1,1,2,2]
Packit c32a2d
xmm7 COS9_[5,5,8,8]
Packit c32a2d
xmm8 COS9_[7,7,4,4]
Packit c32a2d
xmm15 COS9_[3,3,6,6]
Packit c32a2d
*/
Packit c32a2d
	movaps		%xmm6, %xmm9
Packit c32a2d
	movaps		%xmm7, %xmm12
Packit c32a2d
	movaps		%xmm8, %xmm13
Packit c32a2d
	mulps		%xmm1, %xmm9
Packit c32a2d
	mulps		%xmm3, %xmm12
Packit c32a2d
	mulps		%xmm4, %xmm13
Packit c32a2d
	addps		%xmm5, %xmm9
Packit c32a2d
	addps		%xmm13, %xmm12
Packit c32a2d
	addps		%xmm9, %xmm12
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm1, %xmm13
Packit c32a2d
	subps		%xmm3, %xmm13
Packit c32a2d
	movaps		%xmm0, %xmm10
Packit c32a2d
	shufps		$0xe0, %xmm2, %xmm10
Packit c32a2d
	movaps		%xmm0, %xmm14
Packit c32a2d
	subps		%xmm10, %xmm14
Packit c32a2d
	subps		%xmm4, %xmm13
Packit c32a2d
	mulps		%xmm15, %xmm13
Packit c32a2d
	addps		%xmm14, %xmm13
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm7, %xmm9
Packit c32a2d
	movaps		%xmm8, %xmm15
Packit c32a2d
	movaps		%xmm6, %xmm14
Packit c32a2d
	mulps		%xmm1, %xmm9
Packit c32a2d
	mulps		%xmm3, %xmm15
Packit c32a2d
	mulps		%xmm4, %xmm14
Packit c32a2d
	subps		%xmm5, %xmm9
Packit c32a2d
	subps		%xmm15, %xmm14
Packit c32a2d
	addps		%xmm9, %xmm14
Packit c32a2d
	
Packit c32a2d
	mulps		%xmm1, %xmm8
Packit c32a2d
	mulps		%xmm3, %xmm6
Packit c32a2d
	mulps		%xmm4, %xmm7
Packit c32a2d
	subps		%xmm5, %xmm8
Packit c32a2d
	subps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm6, %xmm8
Packit c32a2d
	movaps		%xmm8, %xmm15
Packit c32a2d
	
Packit c32a2d
	movss		32(tfcos36_), %xmm5
Packit c32a2d
	subps		%xmm1, %xmm0
Packit c32a2d
	subps		%xmm2, %xmm4
Packit c32a2d
	addps		%xmm3, %xmm0
Packit c32a2d
	addps		%xmm4, %xmm0
Packit c32a2d
	shufps		$0xaf, %xmm0, %xmm0
Packit c32a2d
	mulss		%xmm5, %xmm0
Packit c32a2d
	movaps		%xmm0, %xmm11
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm12 [1a-0,1b-0, 2a-0, 2b-0]
Packit c32a2d
xmm13 [1a-1,1b-1, 2a-1, 2b-1]
Packit c32a2d
xmm14 [1a-2,1b-2,-2a-2,-2b-2]
Packit c32a2d
xmm15 [1a-3,1b-3,-2a-3,-2b-3]
Packit c32a2d
*/
Packit c32a2d
	movaps		%xmm12, %xmm5
Packit c32a2d
	unpckhps	%xmm13, %xmm5
Packit c32a2d
	unpcklps	%xmm13, %xmm12
Packit c32a2d
	movaps		%xmm14, %xmm6
Packit c32a2d
	unpckhps	%xmm15, %xmm6
Packit c32a2d
	unpcklps	%xmm15, %xmm14
Packit c32a2d
	xorps		dct36_x86_64_sign(%rip), %xmm6
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm12 [1a-0,1a-1,1b-0,1b-1]
Packit c32a2d
xmm5  [2a-0,2a-1,2b-0,2b-1]
Packit c32a2d
xmm14 [1a-2,1a-3,1b-2,1b-3]
Packit c32a2d
xmm6 [2a-2,2a-3,2b-2,2b-3]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm12, %xmm0
Packit c32a2d
	movlhps		%xmm14, %xmm12
Packit c32a2d
	movhlps		%xmm0, %xmm14
Packit c32a2d
	movaps		%xmm5, %xmm0
Packit c32a2d
	movlhps		%xmm6, %xmm0
Packit c32a2d
	movhlps		%xmm5, %xmm6
Packit c32a2d
	movaps		%xmm6, %xmm15
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
xmm12 tmp1a
Packit c32a2d
xmm0 tmp2a
Packit c32a2d
xmm14 tmp1b
Packit c32a2d
xmm15 tmp2b
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		(tfcos36_), %xmm6
Packit c32a2d
	movaps		16(tfcos36_), %xmm7
Packit c32a2d
	movaps		%xmm15, %xmm10
Packit c32a2d
	addps		%xmm14, %xmm15
Packit c32a2d
	subps		%xmm14, %xmm10
Packit c32a2d
	movaps		%xmm0, %xmm14
Packit c32a2d
	addps		%xmm12, %xmm0
Packit c32a2d
	subps		%xmm12, %xmm14
Packit c32a2d
	mulps		%xmm6, %xmm15
Packit c32a2d
	mulps		%xmm10, %xmm7
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
%xmm0  tmp[0,1,2,3]
Packit c32a2d
%xmm15 tmp[17,16,15,14]
Packit c32a2d
%xmm14 tmp[8,7,6,5]
Packit c32a2d
%xmm7  tmp[9,10,11,12]
Packit c32a2d
%xmm11 tmp[13,-,4,-]
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
	movaps		%xmm15, %xmm1
Packit c32a2d
	movups		108(w), %xmm2
Packit c32a2d
	movups		92(w), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm3, %xmm3
Packit c32a2d
	movups		36(w), %xmm4
Packit c32a2d
	movups		20(w), %xmm5
Packit c32a2d
	shufps		$0x1b, %xmm5, %xmm5
Packit c32a2d
	movaps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm1, %xmm0
Packit c32a2d
	subps		%xmm1, %xmm6
Packit c32a2d
	mulps		%xmm0, %xmm2
Packit c32a2d
	mulps		%xmm3, %xmm0
Packit c32a2d
	mulps		%xmm6, %xmm4
Packit c32a2d
	mulps		%xmm5, %xmm6
Packit c32a2d
	movups		36(out1), %xmm1
Packit c32a2d
	movups		20(out1), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm6, %xmm6
Packit c32a2d
	addps		%xmm4, %xmm1
Packit c32a2d
	addps		%xmm6, %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm0, %xmm0
Packit c32a2d
	movups		%xmm2, 36(out2)
Packit c32a2d
	movups		%xmm0, 20(out2)
Packit c32a2d
	movss		%xmm1, 32*36(ts)
Packit c32a2d
	movss		%xmm3, 32*20(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*44(ts)
Packit c32a2d
	movss		%xmm4, 32*28(ts)
Packit c32a2d
	shufps		$0xb1, %xmm1, %xmm1
Packit c32a2d
	shufps		$0xb1, %xmm3, %xmm3
Packit c32a2d
	movss		%xmm1, 32*40(ts)
Packit c32a2d
	movss		%xmm3, 32*24(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*48(ts)
Packit c32a2d
	movss		%xmm4, 32*32(ts)
Packit c32a2d
	
Packit c32a2d
	movhlps		%xmm11, %xmm0
Packit c32a2d
	movaps		%xmm11, %xmm1
Packit c32a2d
	movss		124(w), %xmm2
Packit c32a2d
	movss		88(w), %xmm3
Packit c32a2d
	movss		52(w), %xmm4
Packit c32a2d
	movss		16(w), %xmm5
Packit c32a2d
	movss		%xmm0, %xmm6
Packit c32a2d
	addss		%xmm1, %xmm0
Packit c32a2d
	subss		%xmm1, %xmm6
Packit c32a2d
	mulss		%xmm0, %xmm2
Packit c32a2d
	mulss		%xmm3, %xmm0
Packit c32a2d
	mulss		%xmm6, %xmm4
Packit c32a2d
	mulss		%xmm5, %xmm6
Packit c32a2d
	addss		52(out1), %xmm4
Packit c32a2d
	addss		16(out1), %xmm6
Packit c32a2d
	movss		%xmm2, 52(out2)
Packit c32a2d
	movss		%xmm0, 16(out2)
Packit c32a2d
	movss		%xmm4, 32*52(ts)
Packit c32a2d
	movss		%xmm6, 32*16(ts)
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm14, %xmm0
Packit c32a2d
	movaps		%xmm7, %xmm1
Packit c32a2d
	MOVUAPS		128(w), %xmm2
Packit c32a2d
	movups		72(w), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm2, %xmm2
Packit c32a2d
	movlps		56(w), %xmm4
Packit c32a2d
	movhps		64(w), %xmm4
Packit c32a2d
	MOVUAPS		(w), %xmm5
Packit c32a2d
	shufps		$0x1b, %xmm4, %xmm4
Packit c32a2d
	movaps		%xmm0, %xmm6
Packit c32a2d
	addps		%xmm1, %xmm0
Packit c32a2d
	subps		%xmm1, %xmm6
Packit c32a2d
	mulps		%xmm0, %xmm2
Packit c32a2d
	mulps		%xmm3, %xmm0
Packit c32a2d
	mulps		%xmm6, %xmm4
Packit c32a2d
	mulps		%xmm5, %xmm6
Packit c32a2d
	movlps		56(out1), %xmm1
Packit c32a2d
	movhps		64(out1), %xmm1
Packit c32a2d
	movups		(out1), %xmm3
Packit c32a2d
	shufps		$0x1b, %xmm4, %xmm4
Packit c32a2d
	addps		%xmm6, %xmm3
Packit c32a2d
	addps		%xmm4, %xmm1
Packit c32a2d
	shufps		$0x1b, %xmm2, %xmm2
Packit c32a2d
	movups		%xmm0, (out2)
Packit c32a2d
	movlps		%xmm2, 56(out2)
Packit c32a2d
	movhps		%xmm2, 64(out2)
Packit c32a2d
	movss		%xmm1, 32*56(ts)
Packit c32a2d
	movss		%xmm3, (ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*64(ts)
Packit c32a2d
	movss		%xmm4, 32*8(ts)
Packit c32a2d
	shufps		$0xb1, %xmm1, %xmm1
Packit c32a2d
	shufps		$0xb1, %xmm3, %xmm3
Packit c32a2d
	movss		%xmm1, 32*60(ts)
Packit c32a2d
	movss		%xmm3, 32*4(ts)
Packit c32a2d
	movhlps		%xmm1, %xmm2
Packit c32a2d
	movhlps		%xmm3, %xmm4
Packit c32a2d
	movss		%xmm2, 32*68(ts)
Packit c32a2d
	movss		%xmm4, 32*12(ts)
Packit c32a2d
	
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm12
Packit c32a2d
	movaps		112(%rsp), %xmm13
Packit c32a2d
	movaps		128(%rsp), %xmm14
Packit c32a2d
	movaps		144(%rsp), %xmm15
Packit c32a2d
	mov			%rbp, %rsp
Packit c32a2d
	pop			%rbp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
NONEXEC_STACK