Blame src/libmpg123/synth_stereo_avx_float.S

Packit c32a2d
/*
Packit c32a2d
	synth_stereo_avx_float: AVX optimized synth for x86-64 (stereo specific, float output version)
Packit c32a2d
Packit c32a2d
	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Register aliases for the four pointer arguments of
	synth_1to1_real_s_avx_asm(window, b0l, b0r, samples, bo1).
	Two variants are selected by IS_MSABI (Microsoft x64 vs. the other
	calling convention, which takes arguments in rdi, rsi, rdx, rcx, r8).
	In both variants %rcx and %rax are left free: the function body uses
	%ecx as loop counter and %rax as byte offset / stride.
*/
#ifdef IS_MSABI
Packit c32a2d
/* real *window; 1st argument, arrives in %rcx and is copied to %r10 at function entry */
Packit c32a2d
#define WINDOW %r10
Packit c32a2d
/* real *b0l; 2nd argument, used in place */
Packit c32a2d
#define B0L %rdx
Packit c32a2d
/* real *b0r; 3rd argument, used in place */
Packit c32a2d
#define B0R %r8
Packit c32a2d
/* real *samples; 4th argument, used in place */
Packit c32a2d
#define SAMPLES %r9
Packit c32a2d
#else
Packit c32a2d
/* real *window; 1st argument, used in place */
Packit c32a2d
#define WINDOW %rdi
Packit c32a2d
/* real *b0l; 2nd argument, used in place */
Packit c32a2d
#define B0L %rsi
Packit c32a2d
/* real *b0r; 3rd argument, used in place */
Packit c32a2d
#define B0R %rdx
Packit c32a2d
/* real *samples; 4th argument, arrives in %rcx and is copied to %r9 at function entry */
Packit c32a2d
#define SAMPLES %r9
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
	int synth_1to1_real_s_avx_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);
Packit c32a2d
	return value: number of clipped samples (0)
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
/* Constant data: placed in .rodata, except when __APPLE__ is defined, where .data is used. */
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	ALIGN16
Packit c32a2d
/*
	Output scale factor. 939524096 = 0x38000000 is the IEEE-754
	single-precision bit pattern of 2^-15 (1.0/32768). The function
	broadcasts it to all lanes of %ymm14 and multiplies every output
	vector by it before storing.
*/
scale_avx:
Packit c32a2d
	.long   939524096
Packit c32a2d
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
/*
	int synth_1to1_real_s_avx_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);

	Writes 32 interleaved stereo frames (L,R,L,R,... = 64 floats, 256 bytes)
	to samples. Each output value is a 16-term dot product of a window row
	with a b0l (left) or b0r (right) row, multiplied by scale_avx.
	Two loops of 4 iterations each; every iteration produces 4 frames.
	  loop 1: b0 rows walk forward,  products combined as even-index minus odd-index
	  loop 2: b0 rows walk backward, products combined as a plain sum
	The window pointer advances by 32 floats per frame in both loops.

	Register roles inside the loops:
	  WINDOW, B0L, B0R, SAMPLES  running pointers (see the #defines at file top)
	  %rax   = 128, byte stride (one window row, two b0 rows)
	  %ecx   = iterations remaining
	  %ymm14 = broadcast scale factor
	  %ymm0-%ymm13 scratch
	Returns 0 in %eax. All loads/stores use unaligned moves (vmovups) or
	AVX memory operands, so no alignment is required of the buffers.
*/
	.globl ASM_NAME(synth_1to1_real_s_avx_asm)
Packit c32a2d
ASM_NAME(synth_1to1_real_s_avx_asm):
Packit c32a2d
#ifdef IS_MSABI /* should save xmm6-15 */
Packit c32a2d
	/* Frame: rbp-based so the 5th argument can be addressed at a fixed offset. */
	push		%rbp
Packit c32a2d
	mov			%rsp, %rbp
Packit c32a2d
	/* 144 = 9 * 16 bytes: save slots for xmm6..xmm14. xmm15 is never touched by this function. */
	/* After push %rbp the stack is 16-byte aligned (assuming an ABI-conformant caller), so movaps is usable. */
	sub			$144, %rsp
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm12, 96(%rsp)
Packit c32a2d
	movaps		%xmm13, 112(%rsp)
Packit c32a2d
	movaps		%xmm14, 128(%rsp)
Packit c32a2d
	/* 48 = 8 (saved rbp) + 8 (return address) + 32 (shadow space); 32-bit load zero-extends into %rax. */
	movl		48(%rbp), %eax /* 5th argument; placed after 32-byte shadow space */
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
	/* ymm14 = scale factor replicated into all 8 float lanes. */
	vbroadcastss	scale_avx(%rip), %ymm14
Packit c32a2d
	
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	/* eax = bo1 * 4 (byte offset for 4-byte elements); window moves out of %rcx so %ecx can be the loop counter. */
	shl			$2, %eax
Packit c32a2d
	mov			%rcx, WINDOW
Packit c32a2d
#else
Packit c32a2d
	/* bo1 arrives in %r8d; eax = bo1 * 4 (byte offset for 4-byte elements). */
	mov			%r8d, %eax
Packit c32a2d
	shl			$2, %eax
Packit c32a2d
	/* samples moves out of %rcx so %ecx can be the loop counter. */
	mov			%rcx, SAMPLES
Packit c32a2d
#endif
Packit c32a2d
	/* WINDOW = window + 16 - bo1, counted in 4-byte elements. */
	add			$64, WINDOW
Packit c32a2d
	sub			%rax, WINDOW
Packit c32a2d
Packit c32a2d
	/* rax = 128-byte stride; ecx = 4 iterations of the first loop. */
	mov			$128, %rax
Packit c32a2d
	mov			$4, %ecx
Packit c32a2d
	
Packit c32a2d
	ALIGN16
Packit c32a2d
/* ---- Loop 1: 4 iterations x 4 frames, b0 pointers advance ---- */
1:
Packit c32a2d
	/* Frames 0 and 1 of this group: window rows at WINDOW and WINDOW+128, b0 rows at +0 and +64. */
	vmovups		(WINDOW), %ymm8
Packit c32a2d
	vmovups		32(WINDOW), %ymm9
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm10
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm11
Packit c32a2d
	vmulps		(B0L), %ymm8, %ymm0
Packit c32a2d
	vmulps		32(B0L), %ymm9, %ymm1
Packit c32a2d
	vmulps		(B0R), %ymm8, %ymm2
Packit c32a2d
	vmulps		32(B0R), %ymm9, %ymm3
Packit c32a2d
	vmulps		64(B0L), %ymm10, %ymm4
Packit c32a2d
	vmulps		96(B0L), %ymm11, %ymm5
Packit c32a2d
	vmulps		64(B0R), %ymm10, %ymm6
Packit c32a2d
	vmulps		96(B0R), %ymm11, %ymm7
Packit c32a2d
	/* Fold the 16 products of each row into 8 lanes (lane i = p[i] + p[i+8]): */
	/* ymm8 = left frame 0, ymm0 = right frame 0, ymm9 = left frame 1, ymm1 = right frame 1. */
	vaddps		%ymm1, %ymm0, %ymm8
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm9
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm1
Packit c32a2d
	/* Advance: window by 256 bytes (two rows), b0 by 128 bytes (two rows). */
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	add			%rax, B0L
Packit c32a2d
	add			%rax, B0R
Packit c32a2d
	
Packit c32a2d
	/* Frames 2 and 3 of this group, same pattern. */
	vmovups		(WINDOW), %ymm10
Packit c32a2d
	vmovups		32(WINDOW), %ymm11
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm12
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm13
Packit c32a2d
	vmulps		(B0L), %ymm10, %ymm2
Packit c32a2d
	vmulps		32(B0L), %ymm11, %ymm3
Packit c32a2d
	vmulps		(B0R), %ymm10, %ymm4
Packit c32a2d
	vmulps		32(B0R), %ymm11, %ymm5
Packit c32a2d
	vmulps		64(B0L), %ymm12, %ymm6
Packit c32a2d
	/* ymm10/ymm11 are dead as window data from here on and are reused as product destinations. */
	vmulps		96(B0L), %ymm13, %ymm10
Packit c32a2d
	vmulps		64(B0R), %ymm12, %ymm7
Packit c32a2d
	vmulps		96(B0R), %ymm13, %ymm11
Packit c32a2d
	/* ymm2 = left frame 2, ymm3 = right frame 2, ymm4 = left frame 3, ymm5 = right frame 3. */
	vaddps		%ymm3, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm10, %ymm4
Packit c32a2d
	vaddps		%ymm7, %ymm11, %ymm5
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	add			%rax, B0L
Packit c32a2d
	add			%rax, B0R
Packit c32a2d
	
Packit c32a2d
	/* Reduction step 1: interleave left/right and add, per 128-bit half: */
	/* result = [L0+L2, R0+R2, L1+L3, R1+R3] (indices are lane positions within the half). */
	/* ymm0 = frame 0, ymm1 = frame 1, ymm2 = frame 2, ymm3 = frame 3. */
	vunpcklps	%ymm0, %ymm8, %ymm6
Packit c32a2d
	vunpckhps	%ymm0, %ymm8, %ymm0
Packit c32a2d
	vunpcklps	%ymm1, %ymm9, %ymm7
Packit c32a2d
	vunpckhps	%ymm1, %ymm9, %ymm1
Packit c32a2d
	vaddps		%ymm6, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm7, %ymm1, %ymm1
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm5, %ymm4, %ymm7
Packit c32a2d
	vunpckhps	%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm7, %ymm3, %ymm3
Packit c32a2d
	
Packit c32a2d
	/* Reduction step 2: gather the even-index sums (unpcklpd) and odd-index sums (unpckhpd) */
	/* of two frames side by side as [L,R,L,R], then take even minus odd. */
	vunpcklpd	%ymm1, %ymm0, %ymm4
Packit c32a2d
	vunpckhpd	%ymm1, %ymm0, %ymm0
Packit c32a2d
	vunpcklpd	%ymm3, %ymm2, %ymm5
Packit c32a2d
	vunpckhpd	%ymm3, %ymm2, %ymm1
Packit c32a2d
	/* AT&T operand order: ymm0 = ymm4 - ymm0, ymm1 = ymm5 - ymm1. */
	vsubps		%ymm0, %ymm4, %ymm0
Packit c32a2d
	vsubps		%ymm1, %ymm5, %ymm1
Packit c32a2d
	/* Reduction step 3: add the two 128-bit halves. ymm2 = low halves of (ymm0, ymm1), */
	/* ymm3 = high halves; the sum is [L0 R0 L1 R1 L2 R2 L3 R3] for frames 0..3. */
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	/* Apply the scale factor. */
	vmulps		%ymm14, %ymm0, %ymm0
Packit c32a2d
	
Packit c32a2d
	/* Store 4 interleaved stereo frames (32 bytes) and advance the output pointer. */
	vmovups		%ymm0, (SAMPLES)
Packit c32a2d
	add			$32, SAMPLES
Packit c32a2d
	dec			%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
Packit c32a2d
	/* Second loop also runs 4 iterations; WINDOW, B0L, B0R and SAMPLES continue from where loop 1 stopped. */
	mov			$4, %ecx
Packit c32a2d
	
Packit c32a2d
	ALIGN16
Packit c32a2d
/* ---- Loop 2: 4 iterations x 4 frames, b0 pointers retreat ---- */
1:
Packit c32a2d
	/* Frames 0 and 1 of this group: window rows at WINDOW and WINDOW+128, b0 rows at +0 and -64. */
	vmovups		(WINDOW), %ymm8
Packit c32a2d
	vmovups		32(WINDOW), %ymm9
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm10
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm11
Packit c32a2d
	vmulps		(B0L), %ymm8, %ymm0
Packit c32a2d
	vmulps		32(B0L), %ymm9, %ymm1
Packit c32a2d
	vmulps		(B0R), %ymm8, %ymm2
Packit c32a2d
	vmulps		32(B0R), %ymm9, %ymm3
Packit c32a2d
	vmulps		-64(B0L), %ymm10, %ymm4
Packit c32a2d
	vmulps		-32(B0L), %ymm11, %ymm5
Packit c32a2d
	vmulps		-64(B0R), %ymm10, %ymm6
Packit c32a2d
	vmulps		-32(B0R), %ymm11, %ymm7
Packit c32a2d
	/* ymm8 = left frame 0, ymm0 = right frame 0, ymm9 = left frame 1, ymm1 = right frame 1. */
	vaddps		%ymm1, %ymm0, %ymm8
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm9
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm1
Packit c32a2d
	/* Advance window by 256 bytes; step b0 back by 128 bytes (two rows). */
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	sub			%rax, B0L
Packit c32a2d
	sub			%rax, B0R
Packit c32a2d
	
Packit c32a2d
	/* Frames 2 and 3 of this group, same pattern. */
	vmovups		(WINDOW), %ymm10
Packit c32a2d
	vmovups		32(WINDOW), %ymm11
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm12
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm13
Packit c32a2d
	vmulps		(B0L), %ymm10, %ymm2
Packit c32a2d
	vmulps		32(B0L), %ymm11, %ymm3
Packit c32a2d
	vmulps		(B0R), %ymm10, %ymm4
Packit c32a2d
	vmulps		32(B0R), %ymm11, %ymm5
Packit c32a2d
	vmulps		-64(B0L), %ymm12, %ymm6
Packit c32a2d
	/* ymm10/ymm11 are reused as product destinations, as in loop 1. */
	vmulps		-32(B0L), %ymm13, %ymm10
Packit c32a2d
	vmulps		-64(B0R), %ymm12, %ymm7
Packit c32a2d
	vmulps		-32(B0R), %ymm13, %ymm11
Packit c32a2d
	/* ymm2 = left frame 2, ymm3 = right frame 2, ymm4 = left frame 3, ymm5 = right frame 3. */
	vaddps		%ymm3, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm10, %ymm4
Packit c32a2d
	vaddps		%ymm7, %ymm11, %ymm5
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	sub			%rax, B0L
Packit c32a2d
	sub			%rax, B0R
Packit c32a2d
	
Packit c32a2d
	/* Reduction step 1: interleave left/right and add (identical to loop 1). */
	vunpcklps	%ymm0, %ymm8, %ymm6
Packit c32a2d
	vunpckhps	%ymm0, %ymm8, %ymm0
Packit c32a2d
	vunpcklps	%ymm1, %ymm9, %ymm7
Packit c32a2d
	vunpckhps	%ymm1, %ymm9, %ymm1
Packit c32a2d
	vaddps		%ymm6, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm7, %ymm1, %ymm1
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm5, %ymm4, %ymm7
Packit c32a2d
	vunpckhps	%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm7, %ymm3, %ymm3
Packit c32a2d
	
Packit c32a2d
	/* Reduction step 2: the only difference from loop 1 - even-index and odd-index sums are ADDED here, not subtracted. */
	vunpcklpd	%ymm1, %ymm0, %ymm4
Packit c32a2d
	vunpckhpd	%ymm1, %ymm0, %ymm0
Packit c32a2d
	vunpcklpd	%ymm3, %ymm2, %ymm5
Packit c32a2d
	vunpckhpd	%ymm3, %ymm2, %ymm1
Packit c32a2d
	vaddps		%ymm0, %ymm4, %ymm0
Packit c32a2d
	vaddps		%ymm1, %ymm5, %ymm1
Packit c32a2d
	/* Reduction step 3: add the two 128-bit halves, giving [L0 R0 L1 R1 L2 R2 L3 R3]. */
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	vmulps		%ymm14, %ymm0, %ymm0
Packit c32a2d
	
Packit c32a2d
	vmovups		%ymm0, (SAMPLES)
Packit c32a2d
	add			$32, SAMPLES
Packit c32a2d
	dec			%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
Packit c32a2d
	/* Clear the upper ymm halves before the legacy-SSE movaps restores and before returning to the caller. */
	vzeroupper
Packit c32a2d
	
Packit c32a2d
	/* Return value: always 0 (documented in the prototype comment as the number of clipped samples). */
	xor			%eax, %eax
Packit c32a2d
	
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	/* Restore the callee-saved xmm6..xmm14 from their save slots and tear down the frame. */
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm12
Packit c32a2d
	movaps		112(%rsp), %xmm13
Packit c32a2d
	movaps		128(%rsp), %xmm14
Packit c32a2d
	mov			%rbp, %rsp
Packit c32a2d
	pop			%rbp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/* NONEXEC_STACK is a macro from mangle.h; presumably it emits the non-executable-stack note - confirm against mangle.h. */
NONEXEC_STACK