/*
	synth_stereo_avx: AVX optimized synth for x86-64 (stereo specific version)

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"
/*
	Register aliases for the four pointer arguments of synth_1to1_s_avx_asm.

	%ecx is used as the loop counter inside the routine, so whichever
	argument the ABI delivers in %rcx is copied to a scratch register by
	the function prologue and accessed through that copy:
	  - MS x64: window arrives in %rcx and is moved to %r10.
	  - SysV:   samples arrives in %rcx and is moved to %r9.
	The other aliases name the ABI argument registers directly.
*/
#ifdef IS_MSABI
/* short *window; (copied from %rcx by the prologue) */
#define WINDOW %r10
/* short *b0l; */
#define B0L %rdx
/* short *b0r; */
#define B0R %r8
/* short *samples; */
#define SAMPLES %r9
#else
/* short *window; */
#define WINDOW %rdi
/* short *b0l; */
#define B0L %rsi
/* short *b0r; */
#define B0R %rdx
/* short *samples; (copied from %rcx by the prologue) */
#define SAMPLES %r9
#endif

/*
	int synth_1to1_s_avx_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
	return value: number of clipped samples

	Writes 32 interleaved left/right 16-bit sample pairs (64 shorts,
	128 bytes) to samples. Every output sample is a 16-term dot product of
	one row of window (16 shorts, rows spaced 64 bytes apart) with 16 shorts
	taken from b0l (left) or b0r (right), arithmetically shifted right by 13
	and saturated to the signed 16-bit range. A sample counts as "clipped"
	when saturation changed its value.

	The window pointer is first adjusted to window + 32 - 2*bo1 bytes.
	The first 16 rows walk b0l/b0r forward (32 bytes per row); the last
	16 rows walk them backward (32 bytes per row), continuing from where
	the forward pass stopped.

	Arguments on entry:
	  SysV:   window=%rdi  b0l=%rsi  b0r=%rdx  samples=%rcx  bo1=%r8d
	  MS x64: window=%rcx  b0l=%rdx  b0r=%r8   samples=%r9   bo1=5th arg on stack

	Register use inside the routine:
	  %rax    scratch for 2*bo1, then the constant 64 (row stride in bytes),
	          finally the return value
	  %ecx    loop counter (4 iterations per loop, 4 rows per iteration)
	  %xmm14  per-lane clip accumulator (holds minus the count in each word)
	  %xmm0-%xmm13 scratch

	Modifies: %rax, %rcx, WINDOW, B0L, B0R, SAMPLES, %xmm0-%xmm14, flags.
	Under IS_MSABI %xmm6-%xmm14 are saved and restored around the body.

	ALIGN16, ASM_NAME and NONEXEC_STACK are not defined in this file;
	presumably they are provided by mangle.h.
*/

	ALIGN16
.globl ASM_NAME(synth_1to1_s_avx_asm)
ASM_NAME(synth_1to1_s_avx_asm):
#ifdef IS_MSABI /* xmm6-15 are callee-saved; this routine touches xmm6-14 */
	push		%rbp
	mov			%rsp, %rbp
	/* 9 registers * 16 bytes; keeps %rsp 16-byte aligned for movaps */
	sub			$144, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	/* saved %rbp (8) + return address (8) + shadow space (32) = 48 */
	movl		48(%rbp), %eax /* 5th argument; placed after 32-byte shadow space */
#endif

	/* %rax = 2*bo1 (bo1 in bytes); move the %rcx argument out of the way of the loop counter */
#ifdef IS_MSABI
	shl			$1, %eax
	mov			%rcx, WINDOW
#else
	mov			%r8d, %eax
	shl			$1, %eax
	movq		%rcx, SAMPLES
#endif
	/* WINDOW = window + 32 - 2*bo1 (bytes) */
	add			$32, WINDOW
	sub			%rax, WINDOW

	mov			$64, %rax /* byte stride used for WINDOW rows and for B0L/B0R stepping */
	movl		$4, %ecx
	vpxor		%xmm14, %xmm14, %xmm14 /* clear the clip accumulator */

	/*
		Loop 1: 4 iterations, 4 rows each, B0L/B0R advancing.
		Each iteration emits 4 stereo pairs (16 bytes).
	*/
	ALIGN16
1:
	/* rows 0 and 1: window rows at WINDOW and WINDOW+64, b0 at +0 and +32 */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	32(B0L), %xmm10, %xmm4
	vpmaddwd	48(B0L), %xmm11, %xmm5
	vpmaddwd	32(B0R), %xmm10, %xmm6
	vpmaddwd	48(B0R), %xmm11, %xmm7
	/* 4 partial dword sums per row/channel: xmm8=L row0, xmm0=R row0, xmm9=L row1, xmm1=R row1 */
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea			(WINDOW,%rax,2), WINDOW /* WINDOW += 128: two rows consumed */
	add			%rax, B0L
	add			%rax, B0R

	/* rows 2 and 3, same pattern */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	32(B0L), %xmm12, %xmm6
	vpmaddwd	48(B0L), %xmm13, %xmm10
	vpmaddwd	32(B0R), %xmm12, %xmm7
	vpmaddwd	48(B0R), %xmm13, %xmm11
	/* xmm2=L row2, xmm3=R row2, xmm4=L row3, xmm5=R row3 */
	vpaddd		%xmm3, %xmm2, %xmm2
	vpaddd		%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm10, %xmm4
	vpaddd		%xmm7, %xmm11, %xmm5
	lea			(WINDOW,%rax,2), WINDOW
	add			%rax, B0L
	add			%rax, B0R

	/* horizontal reduction, step 1: interleave L/R dwords and fold 4 partials to 2 per channel */
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

	/* step 2: fold the remaining 2 partials; xmm0 = {L0,R0,L1,R1}, xmm1 = {L2,R2,L3,R3} as dwords */
	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2 /* xmm2 = 8 output shorts, signed-saturated */
	/*
		Clip detection: rebuild the plain low 16 bits of every dword
		(shift left then logical right by 16, pack without saturation
		taking effect) and compare with the saturated result. Lanes that
		differ were clipped; after the xor with all-ones they hold -1.
	*/
	vpcmpeqd	%xmm3, %xmm3, %xmm3 /* all ones */
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14 /* accumulate -1 per clipped lane */

	movups		%xmm2, (SAMPLES)
	add			$16, SAMPLES
	dec			%ecx
	jnz			1b

	movl		$4, %ecx

	/*
		Loop 2: identical arithmetic, but B0L/B0R are walked backward:
		the second row of each pair is read at -32/-16 and the pointers
		are decremented by 64 after every pair of rows.
	*/
	ALIGN16
1:
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	-32(B0L), %xmm10, %xmm4
	vpmaddwd	-16(B0L), %xmm11, %xmm5
	vpmaddwd	-32(B0R), %xmm10, %xmm6
	vpmaddwd	-16(B0R), %xmm11, %xmm7
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea			(WINDOW,%rax,2), WINDOW
	sub			%rax, B0L
	sub			%rax, B0R

	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	-32(B0L), %xmm12, %xmm6
	vpmaddwd	-16(B0L), %xmm13, %xmm10
	vpmaddwd	-32(B0R), %xmm12, %xmm7
	vpmaddwd	-16(B0R), %xmm13, %xmm11
	vpaddd		%xmm3, %xmm2, %xmm2
	vpaddd		%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm10, %xmm4
	vpaddd		%xmm7, %xmm11, %xmm5
	lea			(WINDOW,%rax,2), WINDOW
	sub			%rax, B0L
	sub			%rax, B0R

	/* same reduction, scaling, saturation and clip accounting as in loop 1 */
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2
	vpcmpeqd	%xmm3, %xmm3, %xmm3
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14

	movups		%xmm2, (SAMPLES)
	add			$16, SAMPLES
	dec			%ecx
	jnz			1b

	/*
		Return value: negate the accumulator to get positive per-lane
		counts, then add the 8 word lanes together (swap 64-bit halves,
		swap dword pairs, swap adjacent words). The total ends up in
		word 0; the mask drops the other word that movd brings along.
	*/
	pxor		%xmm1, %xmm1
	psubw		%xmm14, %xmm1
	pshufd		$0x4e, %xmm1, %xmm0
	paddw		%xmm1, %xmm0
	pshuflw		$0x4e, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	pshuflw		$0x11, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	movd		%xmm0, %eax
	and			$0x7f, %eax /* at most 64 samples can clip, so 7 bits suffice */

#ifdef IS_MSABI
	/* restore the callee-saved vector registers and tear down the frame */
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	mov			%rbp, %rsp
	pop			%rbp
#endif
	ret

/* NOTE(review): presumably marks the object as not requiring an executable stack - confirm in mangle.h */
NONEXEC_STACK