/*
	synth_x86_64_float: SSE optimized synth for x86-64 (float output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Argument registers of synth_1to1_real_x86_64_asm() under the two supported
	calling conventions. The function body only refers to ARG0..ARG3, so the
	same instruction stream serves both ABIs.
*/
#ifdef IS_MSABI
/* real *window; (arrives in %rcx; the function prologue copies it to %r10, as %ecx is used as loop counter) */
#define ARG0 %r10
/* real *b0; */
#define ARG1 %rdx
/* real *samples; */
#define ARG2 %r8
/* int bo1; */
#define ARG3 %r9
#else
/* real *window; */
#define ARG0 %rdi
/* real *b0; */
#define ARG1 %rsi
/* real *samples; */
#define ARG2 %rdx
/* int bo1; (consumed before %ecx is reused as loop counter) */
#define ARG3 %rcx
#endif

#define XMMREG_SCALE %xmm15  /* {1/32768.0, 1/32768.0, 1/32768.0, 1/32768.0} */

/*
	int synth_1to1_real_x86_64_asm(real *window, real *b0, real *samples, int bo1);
	return value: number of clipped samples (always 0; this float version does no clipping)
*/

/*
	Output scale factor, one copy per SSE lane.
	939524096 == 0x38000000, the IEEE-754 single precision bit pattern of
	2^-15, i.e. 1/32768.0. The table is loaded with movaps, which requires
	16-byte alignment; ALIGN32 (from mangle.h) is used to provide it.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
ASM_NAME(scale_x86_64):
	.long   939524096
	.long   939524096
	.long   939524096
	.long   939524096
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
.globl ASM_NAME(synth_1to1_real_x86_64_asm)
Packit c32a2d
ASM_NAME(synth_1to1_real_x86_64_asm):
Packit c32a2d
#ifdef IS_MSABI /* should save xmm6-15 */
Packit c32a2d
	movq		%rcx, ARG0
Packit c32a2d
	subq		$120, %rsp /* stack alignment + 7 xmm registers */
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm15, 96(%rsp)
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
	leaq		ASM_NAME(scale_x86_64)(%rip), %rax
Packit c32a2d
	movaps		(%rax), XMMREG_SCALE
Packit c32a2d
	
Packit c32a2d
	andq		$0xf, ARG3
Packit c32a2d
	shlq		$2, ARG3
Packit c32a2d
	leaq		64(ARG0), ARG0
Packit c32a2d
	subq		ARG3, ARG0
Packit c32a2d
Packit c32a2d
	movl		$4, %ecx
Packit c32a2d
	
Packit c32a2d
	ALIGN16
Packit c32a2d
1:
Packit c32a2d
	movups		(ARG0), %xmm8
Packit c32a2d
	movups		16(ARG0), %xmm1
Packit c32a2d
	movups		32(ARG0), %xmm2
Packit c32a2d
	movups		48(ARG0), %xmm3
Packit c32a2d
	movups		128(ARG0), %xmm9
Packit c32a2d
	movups		144(ARG0), %xmm5
Packit c32a2d
	movups		160(ARG0), %xmm6
Packit c32a2d
	movups		176(ARG0), %xmm7
Packit c32a2d
	mulps		(ARG1), %xmm8
Packit c32a2d
	mulps		16(ARG1), %xmm1
Packit c32a2d
	mulps		32(ARG1), %xmm2
Packit c32a2d
	mulps		48(ARG1), %xmm3
Packit c32a2d
	mulps		64(ARG1), %xmm9
Packit c32a2d
	mulps		80(ARG1), %xmm5
Packit c32a2d
	mulps		96(ARG1), %xmm6
Packit c32a2d
	mulps		112(ARG1), %xmm7
Packit c32a2d
	
Packit c32a2d
	addps		%xmm1, %xmm8
Packit c32a2d
	addps		%xmm2, %xmm3
Packit c32a2d
	addps		%xmm5, %xmm9
Packit c32a2d
	addps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm3, %xmm8
Packit c32a2d
	addps		%xmm6, %xmm9
Packit c32a2d
	leaq		256(ARG0), ARG0
Packit c32a2d
	leaq		128(ARG1), ARG1
Packit c32a2d
	
Packit c32a2d
	movups		(ARG0), %xmm10
Packit c32a2d
	movups		16(ARG0), %xmm1
Packit c32a2d
	movups		32(ARG0), %xmm2
Packit c32a2d
	movups		48(ARG0), %xmm3
Packit c32a2d
	movups		128(ARG0), %xmm11
Packit c32a2d
	movups		144(ARG0), %xmm5
Packit c32a2d
	movups		160(ARG0), %xmm6
Packit c32a2d
	movups		176(ARG0), %xmm7
Packit c32a2d
	mulps		(ARG1), %xmm10
Packit c32a2d
	mulps		16(ARG1), %xmm1
Packit c32a2d
	mulps		32(ARG1), %xmm2
Packit c32a2d
	mulps		48(ARG1), %xmm3
Packit c32a2d
	mulps		64(ARG1), %xmm11
Packit c32a2d
	mulps		80(ARG1), %xmm5
Packit c32a2d
	mulps		96(ARG1), %xmm6
Packit c32a2d
	mulps		112(ARG1), %xmm7
Packit c32a2d
	
Packit c32a2d
	addps		%xmm1, %xmm10
Packit c32a2d
	addps		%xmm2, %xmm3
Packit c32a2d
	addps		%xmm5, %xmm11
Packit c32a2d
	addps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm3, %xmm10
Packit c32a2d
	addps		%xmm6, %xmm11
Packit c32a2d
	leaq		256(ARG0), ARG0
Packit c32a2d
	leaq		128(ARG1), ARG1
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm8, %xmm0
Packit c32a2d
	movaps		%xmm10, %xmm1
Packit c32a2d
	unpcklps	%xmm9, %xmm8
Packit c32a2d
	unpcklps	%xmm11, %xmm10
Packit c32a2d
	unpckhps	%xmm9, %xmm0
Packit c32a2d
	unpckhps	%xmm11, %xmm1
Packit c32a2d
	movaps		%xmm8, %xmm2
Packit c32a2d
	movaps		%xmm0, %xmm3
Packit c32a2d
	movlhps		%xmm10, %xmm8
Packit c32a2d
	movhlps		%xmm2, %xmm10
Packit c32a2d
	movlhps		%xmm1, %xmm0
Packit c32a2d
	movhlps		%xmm3, %xmm1
Packit c32a2d
	subps		%xmm10, %xmm8
Packit c32a2d
	subps		%xmm1, %xmm0
Packit c32a2d
	addps		%xmm8, %xmm0
Packit c32a2d
	
Packit c32a2d
	movups		(ARG2), %xmm1
Packit c32a2d
	movups		16(ARG2), %xmm2
Packit c32a2d
	mulps		XMMREG_SCALE, %xmm0
Packit c32a2d
	shufps		$0xdd, %xmm2, %xmm1
Packit c32a2d
	movaps		%xmm0, %xmm2
Packit c32a2d
	unpcklps	%xmm1, %xmm0
Packit c32a2d
	unpckhps	%xmm1, %xmm2
Packit c32a2d
	movups		%xmm0, (ARG2)
Packit c32a2d
	movups		%xmm2, 16(ARG2)
Packit c32a2d
	
Packit c32a2d
	leaq		32(ARG2), ARG2
Packit c32a2d
	decl		%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
Packit c32a2d
	movl		$4, %ecx
Packit c32a2d
	
Packit c32a2d
	ALIGN16
Packit c32a2d
1:
Packit c32a2d
	movups		(ARG0), %xmm8
Packit c32a2d
	movups		16(ARG0), %xmm1
Packit c32a2d
	movups		32(ARG0), %xmm2
Packit c32a2d
	movups		48(ARG0), %xmm3
Packit c32a2d
	movups		128(ARG0), %xmm9
Packit c32a2d
	movups		144(ARG0), %xmm5
Packit c32a2d
	movups		160(ARG0), %xmm6
Packit c32a2d
	movups		176(ARG0), %xmm7
Packit c32a2d
	mulps		(ARG1), %xmm8
Packit c32a2d
	mulps		16(ARG1), %xmm1
Packit c32a2d
	mulps		32(ARG1), %xmm2
Packit c32a2d
	mulps		48(ARG1), %xmm3
Packit c32a2d
	mulps		-64(ARG1), %xmm9
Packit c32a2d
	mulps		-48(ARG1), %xmm5
Packit c32a2d
	mulps		-32(ARG1), %xmm6
Packit c32a2d
	mulps		-16(ARG1), %xmm7
Packit c32a2d
	
Packit c32a2d
	addps		%xmm1, %xmm8
Packit c32a2d
	addps		%xmm2, %xmm3
Packit c32a2d
	addps		%xmm5, %xmm9
Packit c32a2d
	addps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm3, %xmm8
Packit c32a2d
	addps		%xmm6, %xmm9
Packit c32a2d
	leaq		256(ARG0), ARG0
Packit c32a2d
	leaq		-128(ARG1), ARG1
Packit c32a2d
	
Packit c32a2d
	movups		(ARG0), %xmm10
Packit c32a2d
	movups		16(ARG0), %xmm1
Packit c32a2d
	movups		32(ARG0), %xmm2
Packit c32a2d
	movups		48(ARG0), %xmm3
Packit c32a2d
	movups		128(ARG0), %xmm11
Packit c32a2d
	movups		144(ARG0), %xmm5
Packit c32a2d
	movups		160(ARG0), %xmm6
Packit c32a2d
	movups		176(ARG0), %xmm7
Packit c32a2d
	mulps		(ARG1), %xmm10
Packit c32a2d
	mulps		16(ARG1), %xmm1
Packit c32a2d
	mulps		32(ARG1), %xmm2
Packit c32a2d
	mulps		48(ARG1), %xmm3
Packit c32a2d
	mulps		-64(ARG1), %xmm11
Packit c32a2d
	mulps		-48(ARG1), %xmm5
Packit c32a2d
	mulps		-32(ARG1), %xmm6
Packit c32a2d
	mulps		-16(ARG1), %xmm7
Packit c32a2d
	
Packit c32a2d
	addps		%xmm1, %xmm10
Packit c32a2d
	addps		%xmm2, %xmm3
Packit c32a2d
	addps		%xmm5, %xmm11
Packit c32a2d
	addps		%xmm7, %xmm6
Packit c32a2d
	addps		%xmm3, %xmm10
Packit c32a2d
	addps		%xmm6, %xmm11
Packit c32a2d
	leaq		256(ARG0), ARG0
Packit c32a2d
	leaq		-128(ARG1), ARG1
Packit c32a2d
	
Packit c32a2d
	movaps		%xmm8, %xmm0
Packit c32a2d
	movaps		%xmm10, %xmm1
Packit c32a2d
	unpcklps	%xmm9, %xmm8
Packit c32a2d
	unpcklps	%xmm11, %xmm10
Packit c32a2d
	unpckhps	%xmm9, %xmm0
Packit c32a2d
	unpckhps	%xmm11, %xmm1
Packit c32a2d
	movaps		%xmm8, %xmm2
Packit c32a2d
	movaps		%xmm0, %xmm3
Packit c32a2d
	movlhps		%xmm10, %xmm8
Packit c32a2d
	movhlps		%xmm2, %xmm10
Packit c32a2d
	movlhps		%xmm1, %xmm0
Packit c32a2d
	movhlps		%xmm3, %xmm1
Packit c32a2d
	addps		%xmm10, %xmm8
Packit c32a2d
	addps		%xmm1, %xmm0
Packit c32a2d
	addps		%xmm8, %xmm0
Packit c32a2d
	
Packit c32a2d
	movups		(ARG2), %xmm1
Packit c32a2d
	movups		16(ARG2), %xmm2
Packit c32a2d
	mulps		XMMREG_SCALE, %xmm0
Packit c32a2d
	shufps		$0xdd, %xmm2, %xmm1
Packit c32a2d
	movaps		%xmm0, %xmm2
Packit c32a2d
	unpcklps	%xmm1, %xmm0
Packit c32a2d
	unpckhps	%xmm1, %xmm2
Packit c32a2d
	movups		%xmm0, (ARG2)
Packit c32a2d
	movups		%xmm2, 16(ARG2)
Packit c32a2d
	
Packit c32a2d
	leaq		32(ARG2), ARG2
Packit c32a2d
	decl		%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
Packit c32a2d
	xorl		%eax, %eax
Packit c32a2d
	
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm15
Packit c32a2d
	addq		$120, %rsp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
NONEXEC_STACK