Blame src/libmpg123/synth_stereo_avx_s32.S

Packit c32a2d
/*
Packit c32a2d
	synth_stereo_avx_s32: AVX optimized synth for x86-64 (stereo specific, s32 output version)
Packit c32a2d
Packit c32a2d
	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Taihei Monma
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Register aliases for the four pointer arguments, per calling convention.
	Microsoft x64 passes arguments 1-4 in rcx, rdx, r8, r9; System V AMD64
	passes them in rdi, rsi, rdx, rcx.  The function below uses %ecx as its
	loop counter, so the argument that arrives in %rcx is copied to another
	register at function entry: window -> %r10 in the IS_MSABI build,
	samples -> %r9 otherwise.
*/
#ifdef IS_MSABI
Packit c32a2d
/* real *window; */
Packit c32a2d
#define WINDOW %r10
Packit c32a2d
/* real *b0l; */
Packit c32a2d
#define B0L %rdx
Packit c32a2d
/* real *b0r; */
Packit c32a2d
#define B0R %r8
Packit c32a2d
/* real *samples; */
Packit c32a2d
#define SAMPLES %r9
Packit c32a2d
#else
Packit c32a2d
/* real *window; */
Packit c32a2d
#define WINDOW %rdi
Packit c32a2d
/* real *b0l; */
Packit c32a2d
#define B0L %rsi
Packit c32a2d
/* real *b0r; */
Packit c32a2d
#define B0R %rdx
Packit c32a2d
/* real *samples; */
Packit c32a2d
#define SAMPLES %r9
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
/*
Packit c32a2d
	int synth_1to1_s32_s_avx_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);
Packit c32a2d
	return value: number of clipped samples
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
/*
	Constants used by the synth function below (single-precision bit patterns).
	  maxmin_avx +0:  8 x 0x46FFFFFF = 32767.998046875, the largest float
	                  below 32768.0; upper clip bound (vcmpnleps operand).
	  maxmin_avx +32: 8 x 0xC7000000 = -32768.0; lower clip bound
	                  (vcmpltps operand, addressed as 32+maxmin_avx).
	  scale_avx:      0x47800000 = 65536.0; broadcast to all lanes and
	                  multiplied into the sums before integer conversion.
	The table is placed in .rodata, or in .data when __APPLE__ is defined.
*/
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	ALIGN32
Packit c32a2d
maxmin_avx:
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   1191182335
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
	.long   -956301312
Packit c32a2d
scale_avx:
Packit c32a2d
	.long   1199570944
Packit c32a2d
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
	.globl ASM_NAME(synth_1to1_s32_s_avx_asm)
Packit c32a2d
ASM_NAME(synth_1to1_s32_s_avx_asm):
/*
	Register roles inside this function:
	  WINDOW   window pointer, rebased below to window + 64 - 4*bo1 (bytes)
	  B0L/B0R  left / right input rows; one row = 64 bytes (two ymm operands)
	  SAMPLES  output pointer, advanced by 32 bytes per loop iteration
	  %rax     first 4*bo1, afterwards the constant 128 (byte stride)
	  %ecx     loop counter, 4 iterations for each of the two loops
	  %ymm14   scale_avx broadcast to all eight lanes
	  %xmm15   eight 16-bit accumulators of clip-compare masks (0 or -1 each)
	Each loop iteration consumes four window rows and four b0 rows per
	channel and stores eight 32-bit integers (four left/right pairs);
	2 loops x 4 iterations write 256 bytes to samples in total.
	Return value in %eax: number of lanes flagged by the clip compares.
*/
Packit c32a2d
#ifdef IS_MSABI /* should save xmm6-15 */
Packit c32a2d
	push		%rbp
Packit c32a2d
	mov			%rsp, %rbp
/* 160 bytes = 10 x 16 bytes of save area for xmm6-xmm15 */
Packit c32a2d
	sub			$160, %rsp
Packit c32a2d
	movaps		%xmm6, (%rsp)
Packit c32a2d
	movaps		%xmm7, 16(%rsp)
Packit c32a2d
	movaps		%xmm8, 32(%rsp)
Packit c32a2d
	movaps		%xmm9, 48(%rsp)
Packit c32a2d
	movaps		%xmm10, 64(%rsp)
Packit c32a2d
	movaps		%xmm11, 80(%rsp)
Packit c32a2d
	movaps		%xmm12, 96(%rsp)
Packit c32a2d
	movaps		%xmm13, 112(%rsp)
Packit c32a2d
	movaps		%xmm14, 128(%rsp)
Packit c32a2d
	movaps		%xmm15, 144(%rsp)
/* 48(%rbp) = saved rbp (8) + return address (8) + shadow space (32) */
Packit c32a2d
	movl		48(%rbp), %eax /* 5th argument; placed after 32-byte shadow space */
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
	vbroadcastss	scale_avx(%rip), %ymm14
Packit c32a2d
	
/*
	%eax = bo1 * 4 (byte offset; the 32-bit write zero-extends into %rax).
	The pointer argument that arrived in %rcx is moved away because %ecx
	becomes the loop counter.
*/
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	shl			$2, %eax
Packit c32a2d
	mov			%rcx, WINDOW
Packit c32a2d
#else
Packit c32a2d
	mov			%r8d, %eax
Packit c32a2d
	shl			$2, %eax
Packit c32a2d
	mov			%rcx, SAMPLES
Packit c32a2d
#endif
/* WINDOW = window + 64 bytes - 4*bo1 bytes */
Packit c32a2d
	add			$64, WINDOW
Packit c32a2d
	sub			%rax, WINDOW
Packit c32a2d
Packit c32a2d
/* %rax = 128: window row stride and per-half-iteration b0 step, in bytes */
	mov			$128, %rax
Packit c32a2d
	mov			$4, %ecx
/* clear the clip mask accumulator */
Packit c32a2d
	vpxor		%xmm15, %xmm15, %xmm15
Packit c32a2d
	
Packit c32a2d
	ALIGN16
/*
	First loop: window and b0 pointers both move forward.
	Window rows are 128 bytes apart, b0 rows 64 bytes apart; 64 bytes
	(16 floats) of each row are multiplied element-wise.
*/
Packit c32a2d
1:
/* rows 0 and 1 of this iteration: load window, multiply with left and right */
Packit c32a2d
	vmovups		(WINDOW), %ymm8
Packit c32a2d
	vmovups		32(WINDOW), %ymm9
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm10
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm11
Packit c32a2d
	vmulps		(B0L), %ymm8, %ymm0
Packit c32a2d
	vmulps		32(B0L), %ymm9, %ymm1
Packit c32a2d
	vmulps		(B0R), %ymm8, %ymm2
Packit c32a2d
	vmulps		32(B0R), %ymm9, %ymm3
Packit c32a2d
	vmulps		64(B0L), %ymm10, %ymm4
Packit c32a2d
	vmulps		96(B0L), %ymm11, %ymm5
Packit c32a2d
	vmulps		64(B0R), %ymm10, %ymm6
Packit c32a2d
	vmulps		96(B0R), %ymm11, %ymm7
/* fold 16 products to 8 lanes: ymm8 = row0 left, ymm0 = row0 right, ymm9 = row1 left, ymm1 = row1 right */
Packit c32a2d
	vaddps		%ymm1, %ymm0, %ymm8
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm9
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm1
/* advance: two window rows (2 * 128 bytes), two b0 rows (128 bytes) */
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	add			%rax, B0L
Packit c32a2d
	add			%rax, B0R
Packit c32a2d
	
/* rows 2 and 3 of this iteration, same scheme */
Packit c32a2d
	vmovups		(WINDOW), %ymm10
Packit c32a2d
	vmovups		32(WINDOW), %ymm11
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm12
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm13
Packit c32a2d
	vmulps		(B0L), %ymm10, %ymm2
Packit c32a2d
	vmulps		32(B0L), %ymm11, %ymm3
Packit c32a2d
	vmulps		(B0R), %ymm10, %ymm4
Packit c32a2d
	vmulps		32(B0R), %ymm11, %ymm5
Packit c32a2d
	vmulps		64(B0L), %ymm12, %ymm6
Packit c32a2d
	vmulps		96(B0L), %ymm13, %ymm10
Packit c32a2d
	vmulps		64(B0R), %ymm12, %ymm7
Packit c32a2d
	vmulps		96(B0R), %ymm13, %ymm11
/* ymm2 = row2 left, ymm3 = row2 right, ymm4 = row3 left, ymm5 = row3 right */
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm10, %ymm4
Packit c32a2d
	vaddps		%ymm7, %ymm11, %ymm5
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	add			%rax, B0L
Packit c32a2d
	add			%rax, B0R
Packit c32a2d
	
/*
	Horizontal reduction, step 1: interleave left/right per row and add
	lanes two apart.  Per 128-bit half, each of ymm0..ymm3 (rows 0..3)
	then holds [L even, R even, L odd, R odd] partial sums.
*/
Packit c32a2d
	vunpcklps	%ymm0, %ymm8, %ymm6
Packit c32a2d
	vunpckhps	%ymm0, %ymm8, %ymm0
Packit c32a2d
	vunpcklps	%ymm1, %ymm9, %ymm7
Packit c32a2d
	vunpckhps	%ymm1, %ymm9, %ymm1
Packit c32a2d
	vaddps		%ymm6, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm7, %ymm1, %ymm1
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm5, %ymm4, %ymm7
Packit c32a2d
	vunpckhps	%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm7, %ymm3, %ymm3
Packit c32a2d
	
/*
	Step 2: pair up rows and take (even-index sums) - (odd-index sums),
	i.e. the products enter the total with alternating sign.
	ymm0 = rows 0,1 and ymm1 = rows 2,3 as [L, R, L, R] per 128-bit half.
*/
Packit c32a2d
	vunpcklpd	%ymm1, %ymm0, %ymm4
Packit c32a2d
	vunpckhpd	%ymm1, %ymm0, %ymm0
Packit c32a2d
	vunpcklpd	%ymm3, %ymm2, %ymm5
Packit c32a2d
	vunpckhpd	%ymm3, %ymm2, %ymm1
Packit c32a2d
	vsubps		%ymm0, %ymm4, %ymm0
Packit c32a2d
	vsubps		%ymm1, %ymm5, %ymm1
/* Step 3: add low and high 128-bit halves; ymm0 = L,R of rows 0..3 in output order */
Packit c32a2d
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
/*
	Clip detection on the unscaled sums: ymm1 = all-ones where the value
	is not <= the upper bound, ymm2 = all-ones where it is < the lower bound.
*/
Packit c32a2d
	vcmpnleps	maxmin_avx(%rip), %ymm0, %ymm1
Packit c32a2d
	vcmpltps	32+maxmin_avx(%rip), %ymm0, %ymm2
Packit c32a2d
	vmulps		%ymm14, %ymm0, %ymm0
/* narrow the 16 dword masks to words (0 / -1) and accumulate them in xmm15 */
Packit c32a2d
	vextractf128	$0x1, %ymm1, %xmm3
Packit c32a2d
	vextractf128	$0x1, %ymm2, %xmm4
Packit c32a2d
	vpackssdw	%xmm2, %xmm1, %xmm5
Packit c32a2d
	vpackssdw	%xmm4, %xmm3, %xmm3
Packit c32a2d
	vcvtps2dq	%ymm0, %ymm0
Packit c32a2d
	vpaddw		%xmm3, %xmm5, %xmm5
Packit c32a2d
	vpaddw		%xmm5, %xmm15, %xmm15
/*
	Out-of-range conversions yield 0x80000000 (see Intel SDM, CVTPS2DQ);
	xor with the all-ones "above upper bound" mask turns that into
	0x7fffffff for positive overflow.  Other lanes are xored with 0.
*/
Packit c32a2d
	vxorps		%ymm1, %ymm0, %ymm0
Packit c32a2d
	
Packit c32a2d
	vmovups		%ymm0, (SAMPLES)
Packit c32a2d
	add			$32, SAMPLES
Packit c32a2d
	dec			%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
Packit c32a2d
	mov			$4, %ecx
Packit c32a2d
	
Packit c32a2d
	ALIGN16
/*
	Second loop: the window pointer keeps moving forward, but the b0
	pointers now walk backwards (second row of each pair at -64 bytes,
	pointers decremented by 128), and the even/odd partial sums are
	added instead of subtracted.  Everything else matches the first loop.
*/
Packit c32a2d
1:
Packit c32a2d
	vmovups		(WINDOW), %ymm8
Packit c32a2d
	vmovups		32(WINDOW), %ymm9
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm10
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm11
Packit c32a2d
	vmulps		(B0L), %ymm8, %ymm0
Packit c32a2d
	vmulps		32(B0L), %ymm9, %ymm1
Packit c32a2d
	vmulps		(B0R), %ymm8, %ymm2
Packit c32a2d
	vmulps		32(B0R), %ymm9, %ymm3
Packit c32a2d
	vmulps		-64(B0L), %ymm10, %ymm4
Packit c32a2d
	vmulps		-32(B0L), %ymm11, %ymm5
Packit c32a2d
	vmulps		-64(B0R), %ymm10, %ymm6
Packit c32a2d
	vmulps		-32(B0R), %ymm11, %ymm7
Packit c32a2d
	vaddps		%ymm1, %ymm0, %ymm8
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm9
Packit c32a2d
	vaddps		%ymm7, %ymm6, %ymm1
/* window forward by two rows, b0 backward by two rows */
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	sub			%rax, B0L
Packit c32a2d
	sub			%rax, B0R
Packit c32a2d
	
Packit c32a2d
	vmovups		(WINDOW), %ymm10
Packit c32a2d
	vmovups		32(WINDOW), %ymm11
Packit c32a2d
	vmovups		(WINDOW,%rax), %ymm12
Packit c32a2d
	vmovups		32(WINDOW,%rax), %ymm13
Packit c32a2d
	vmulps		(B0L), %ymm10, %ymm2
Packit c32a2d
	vmulps		32(B0L), %ymm11, %ymm3
Packit c32a2d
	vmulps		(B0R), %ymm10, %ymm4
Packit c32a2d
	vmulps		32(B0R), %ymm11, %ymm5
Packit c32a2d
	vmulps		-64(B0L), %ymm12, %ymm6
Packit c32a2d
	vmulps		-32(B0L), %ymm13, %ymm10
Packit c32a2d
	vmulps		-64(B0R), %ymm12, %ymm7
Packit c32a2d
	vmulps		-32(B0R), %ymm13, %ymm11
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm10, %ymm4
Packit c32a2d
	vaddps		%ymm7, %ymm11, %ymm5
Packit c32a2d
	lea			(WINDOW,%rax,2), WINDOW
Packit c32a2d
	sub			%rax, B0L
Packit c32a2d
	sub			%rax, B0R
Packit c32a2d
	
/* horizontal reduction, step 1: interleave left/right, add lanes two apart */
Packit c32a2d
	vunpcklps	%ymm0, %ymm8, %ymm6
Packit c32a2d
	vunpckhps	%ymm0, %ymm8, %ymm0
Packit c32a2d
	vunpcklps	%ymm1, %ymm9, %ymm7
Packit c32a2d
	vunpckhps	%ymm1, %ymm9, %ymm1
Packit c32a2d
	vaddps		%ymm6, %ymm0, %ymm0
Packit c32a2d
	vaddps		%ymm7, %ymm1, %ymm1
Packit c32a2d
	vunpcklps	%ymm3, %ymm2, %ymm6
Packit c32a2d
	vunpckhps	%ymm3, %ymm2, %ymm2
Packit c32a2d
	vunpcklps	%ymm5, %ymm4, %ymm7
Packit c32a2d
	vunpckhps	%ymm5, %ymm4, %ymm3
Packit c32a2d
	vaddps		%ymm6, %ymm2, %ymm2
Packit c32a2d
	vaddps		%ymm7, %ymm3, %ymm3
Packit c32a2d
	
/* step 2: here even and odd partial sums are added (no alternating sign) */
Packit c32a2d
	vunpcklpd	%ymm1, %ymm0, %ymm4
Packit c32a2d
	vunpckhpd	%ymm1, %ymm0, %ymm0
Packit c32a2d
	vunpcklpd	%ymm3, %ymm2, %ymm5
Packit c32a2d
	vunpckhpd	%ymm3, %ymm2, %ymm1
Packit c32a2d
	vaddps		%ymm0, %ymm4, %ymm0
Packit c32a2d
	vaddps		%ymm1, %ymm5, %ymm1
/* step 3: add low and high 128-bit halves into eight interleaved L,R sums */
Packit c32a2d
	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
Packit c32a2d
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
Packit c32a2d
	vaddps		%ymm3, %ymm2, %ymm0
/* clip detection, scaling, conversion and saturation fix-up as in the first loop */
Packit c32a2d
	vcmpnleps	maxmin_avx(%rip), %ymm0, %ymm1
Packit c32a2d
	vcmpltps	32+maxmin_avx(%rip), %ymm0, %ymm2
Packit c32a2d
	vmulps		%ymm14, %ymm0, %ymm0
Packit c32a2d
	vextractf128	$0x1, %ymm1, %xmm3
Packit c32a2d
	vextractf128	$0x1, %ymm2, %xmm4
Packit c32a2d
	vpackssdw	%xmm2, %xmm1, %xmm5
Packit c32a2d
	vpackssdw	%xmm4, %xmm3, %xmm3
Packit c32a2d
	vcvtps2dq	%ymm0, %ymm0
Packit c32a2d
	vpaddw		%xmm3, %xmm5, %xmm5
Packit c32a2d
	vpaddw		%xmm5, %xmm15, %xmm15
Packit c32a2d
	vxorps		%ymm1, %ymm0, %ymm0
Packit c32a2d
	
Packit c32a2d
	vmovups		%ymm0, (SAMPLES)
Packit c32a2d
	add			$32, SAMPLES
Packit c32a2d
	dec			%ecx
Packit c32a2d
	jnz			1b
Packit c32a2d
	
/* clear upper ymm halves before the legacy-SSE instructions that follow */
Packit c32a2d
	vzeroupper
Packit c32a2d
	
/*
	Clip count: xmm15 holds eight words of accumulated -1 masks.
	Negate (0 - xmm15), then sum the eight words into word 0:
	swap 64-bit halves and add, swap dwords of the low half and add,
	swap words and add.
*/
Packit c32a2d
	pxor		%xmm1, %xmm1
Packit c32a2d
	psubw		%xmm15, %xmm1
Packit c32a2d
	pshufd		$0x4e, %xmm1, %xmm0
Packit c32a2d
	paddw		%xmm1, %xmm0
Packit c32a2d
	pshuflw		$0x4e, %xmm0, %xmm1
Packit c32a2d
	paddw		%xmm1, %xmm0
Packit c32a2d
	pshuflw		$0x11, %xmm0, %xmm1
Packit c32a2d
	paddw		%xmm1, %xmm0
Packit c32a2d
	movd		%xmm0, %eax
/*
	Keep only the count: 8 iterations x 8 lanes = 64 values are tested,
	so the total fits in 7 bits; the mask also drops the upper word of %eax.
*/
Packit c32a2d
	and			$0x7f, %eax
Packit c32a2d
	
/* restore the callee-saved xmm6-xmm15 and the frame (MS ABI build only) */
Packit c32a2d
#ifdef IS_MSABI
Packit c32a2d
	movaps		(%rsp), %xmm6
Packit c32a2d
	movaps		16(%rsp), %xmm7
Packit c32a2d
	movaps		32(%rsp), %xmm8
Packit c32a2d
	movaps		48(%rsp), %xmm9
Packit c32a2d
	movaps		64(%rsp), %xmm10
Packit c32a2d
	movaps		80(%rsp), %xmm11
Packit c32a2d
	movaps		96(%rsp), %xmm12
Packit c32a2d
	movaps		112(%rsp), %xmm13
Packit c32a2d
	movaps		128(%rsp), %xmm14
Packit c32a2d
	movaps		144(%rsp), %xmm15
Packit c32a2d
	mov			%rbp, %rsp
Packit c32a2d
	pop			%rbp
Packit c32a2d
#endif
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/* NONEXEC_STACK is not defined in this file; presumably a mangle.h macro that marks the stack non-executable - confirm there. */
NONEXEC_STACK