/*
	synth_stereo_x86_64_s32: SSE optimized synth for x86-64 (stereo specific, s32 output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Register roles used by the synth loops.  They are valid once the prologue
	of synth_1to1_s32_s_x86_64_asm has moved the incoming arguments into
	place.  Pointer types follow the C prototype:
	(real *window, real *b0l, real *b0r, int32_t *samples, int bo1).
*/
#ifndef IS_MSABI
/* System V AMD64: window, b0l and b0r stay in their argument registers,
   samples is moved from %rcx to %r8 by the prologue. */
#define WINDOW %rdi   /* real *window */
#define B0L %rsi      /* real *b0l */
#define B0R %rdx      /* real *b0r */
#define SAMPLES %r8   /* int32_t *samples */
#else
/* Microsoft x64: b0l and b0r stay in their argument registers, window and
   samples are moved to %rsi / %rdi (both saved and restored by the function). */
#define WINDOW %rsi   /* real *window */
#define B0L %rdx      /* real *b0l */
#define B0R %r8       /* real *b0r */
#define SAMPLES %rdi  /* int32_t *samples */
#endif

/* Memory operands: %r9, %r10 and %r11 are loaded with the addresses of the
   constant vectors, the clip counters live at the bottom of the stack frame. */
#define XMMREG_SCALE (%r9)  /* {65536.0, 65536.0, 65536.0, 65536.0} */
#define XMMREG_MAX (%r10)  /* {32767.998, ...}: largest float below 32768.0, four times */
#define XMMREG_MIN (%r11)  /* {-32768.0, -32768.0, -32768.0, -32768.0} */
#define TEMP_CLIP (%rsp)  /* 8 x 16-bit clip counters, 16 bytes */
Packit c32a2d
Packit c32a2d
/*
	int synth_1to1_s32_s_x86_64_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
	return value: number of clipped samples
*/
Packit c32a2d
Packit c32a2d
/*
	Constant vectors for the float -> s32 conversion, stored as IEEE-754
	single precision bit patterns.  maxmin_s32 holds the upper clip bound in
	its first 16 bytes and the lower clip bound in the following 16 bytes.
*/
#ifdef __APPLE__
	.data
#else
	.section	.rodata
#endif
	ALIGN32
ASM_NAME(scale_s32):
	/* 4 x 65536.0 */
	.long   0x47800000
	.long   0x47800000
	.long   0x47800000
	.long   0x47800000
	ALIGN16
ASM_NAME(maxmin_s32):
	/* 4 x 32767.998046875 (largest float below 32768.0) */
	.long   0x46ffffff
	.long   0x46ffffff
	.long   0x46ffffff
	.long   0x46ffffff
	/* 4 x -32768.0 */
	.long   0xc7000000
	.long   0xc7000000
	.long   0xc7000000
	.long   0xc7000000
Packit c32a2d
	.text
	ALIGN16
/*
	int synth_1to1_s32_s_x86_64_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);

	Writes 64 int32_t values (256 bytes) to samples, interleaved as
	b0l-derived / b0r-derived pairs, and returns the number of clipped
	samples in %eax.

	Incoming arguments
	  System V AMD64: %rdi window, %rsi b0l, %rdx b0r, %rcx samples, %r8d bo1
	  Microsoft x64:  %rcx window, %rdx b0l, %r8 b0r, %r9 samples, bo1 at 40(%rsp)

	Working registers
	  WINDOW/B0L/B0R/SAMPLES  running pointers (see the #defines above)
	  %r9, %r10, %r11         addresses of scale_s32, maxmin_s32, maxmin_s32+16
	  %ecx                    loop counter
	  %xmm0-%xmm15            all used as scratch

	Stack frame
	  (%rsp)                  TEMP_CLIP: eight 16-bit clip counters
	  16(%rsp)..175(%rsp)     Microsoft x64 only: saved %xmm6..%xmm15

	Alignment
	  b0l and b0r are used as mulps memory operands and therefore have to be
	  16-byte aligned; window and samples are accessed with movups.
*/
.globl ASM_NAME(synth_1to1_s32_s_x86_64_asm)
ASM_NAME(synth_1to1_s32_s_x86_64_asm):
#ifdef IS_MSABI /* %rsi, %rdi and xmm6-15 are callee-saved and must be preserved */
	movl		40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	pushq		%rsi
	pushq		%rdi
	subq		$184, %rsp /* stack alignment + 10 xmm registers + temp */
	/* Slot 0..15 is TEMP_CLIP, so the register save area starts at 16. */
	movaps		%xmm6, 16(%rsp)
	movaps		%xmm7, 32(%rsp)
	movaps		%xmm8, 48(%rsp)
	movaps		%xmm9, 64(%rsp)
	movaps		%xmm10, 80(%rsp)
	movaps		%xmm11, 96(%rsp)
	movaps		%xmm12, 112(%rsp)
	movaps		%xmm13, 128(%rsp)
	movaps		%xmm14, 144(%rsp)
	movaps		%xmm15, 160(%rsp)
#else
	subq		$24, %rsp  /* stack alignment + temp */
#endif
	
	/* %rax = (unsigned 32-bit bo1) * 4: the shift pair drops whatever is in
	   the upper half of the register and scales to a byte offset. */
#ifdef IS_MSABI
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %rsi /* window  -> WINDOW */
	movq		%r9, %rdi /* samples -> SAMPLES */
#else
	movq		%r8, %rax
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %r8 /* samples -> SAMPLES */
#endif
	/* WINDOW = window + 64 - 4*bo1 bytes */
	leaq		64(WINDOW), WINDOW
	subq		%rax, WINDOW
	
	leaq		ASM_NAME(scale_s32)(%rip), %r9
	leaq		ASM_NAME(maxmin_s32)(%rip), %r10
	leaq		16(%r10), %r11 /* lower bound follows the upper bound */
	xorps		%xmm0, %xmm0
	movaps		%xmm0, TEMP_CLIP /* clip counters = 0 */

	movl		$4, %ecx
	
	/*
		First loop, 4 iterations.  Each iteration handles four rows: a row is
		16 floats of window (rows are 128 bytes apart) multiplied with
		16 floats of b0l and of b0r (rows are 64 bytes apart, walking
		upwards).  It emits four b0l/b0r sample pairs.
	*/
	ALIGN16
1:
	/* Rows 0 and 1: 4-lane partial sums end up in xmm8/xmm9 (b0l) and
	   xmm12/xmm13 (b0r). */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		64(B0L), %xmm9
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		64(B0R), %xmm12
	mulps		80(B0R), %xmm13
	mulps		96(B0R), %xmm14
	mulps		112(B0R), %xmm15
	
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12
	movaps		%xmm14, %xmm13
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R
	
	/* Rows 2 and 3: partial sums end up in xmm10/xmm11 (b0l) and
	   xmm14/xmm15 (b0r). */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		64(B0L), %xmm11
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	mulps		64(B0R), %xmm1
	mulps		80(B0R), %xmm2
	mulps		96(B0R), %xmm4
	mulps		112(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4
	
	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	addps		%xmm0, %xmm14
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R
	
	/* 4x4 transpose of the row sums so that every register holds one lane
	   of all four rows: lanes 0..3 land in xmm8, xmm10, xmm0, xmm1 (b0l)
	   and in xmm12, xmm14, xmm4, xmm5 (b0r). */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* First loop: sample = (lane0 - lane1) + (lane2 - lane3) */
	subps		%xmm10, %xmm8
	subps		%xmm1, %xmm0
	subps		%xmm14, %xmm12
	subps		%xmm5, %xmm4
	addps		%xmm8, %xmm0
	addps		%xmm12, %xmm4
	
	/* xmm0 = four b0l-derived samples, xmm4 = four b0r-derived samples.
	   Build the "> MAX" masks (xmm2, xmm5) and "< MIN" masks (xmm3, xmm6)
	   from the unscaled values, then scale and convert. */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	mulps		XMMREG_SCALE, %xmm0
	mulps		XMMREG_SCALE, %xmm4
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	/* cvtps2dq yields 0x80000000 for anything outside the int32 range; the
	   xor with the all-ones "> MAX" mask turns that into 0x7fffffff for
	   positive overflow and leaves every other lane untouched. */
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	xorps		%xmm2, %xmm0
	xorps		%xmm5, %xmm4
	/* Interleave the two channels and store 8 x int32. */
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	movups		%xmm0, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)
	
	/* Turn the 32-bit masks into 16-bit 0/1 flags and add them to the
	   running clip counters. */
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP
	
	leaq		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz			1b
	
	movl		$4, %ecx
	
	/*
		Second loop, 4 iterations.  Same structure as the first loop, but the
		b0l/b0r rows are walked downwards (offsets 0 and -64, step -128) and
		the four lanes are all added when the sample is formed.
	*/
	ALIGN16
1:
	/* Rows 0 and 1 -> xmm8/xmm9 (b0l), xmm12/xmm13 (b0r) */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		-64(B0L), %xmm9
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		-64(B0R), %xmm12
	mulps		-48(B0R), %xmm13
	mulps		-32(B0R), %xmm14
	mulps		-16(B0R), %xmm15
	
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12
	movaps		%xmm14, %xmm13
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R
	
	/* Rows 2 and 3 -> xmm10/xmm11 (b0l), xmm14/xmm15 (b0r) */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		-64(B0L), %xmm11
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	mulps		-64(B0R), %xmm1
	mulps		-48(B0R), %xmm2
	mulps		-32(B0R), %xmm4
	mulps		-16(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4
	
	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	addps		%xmm0, %xmm14
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R
	
	/* 4x4 transpose, same register assignment as in the first loop */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* Second loop: sample = (lane0 + lane1) + (lane2 + lane3) */
	addps		%xmm10, %xmm8
	addps		%xmm1, %xmm0
	addps		%xmm14, %xmm12
	addps		%xmm5, %xmm4
	addps		%xmm8, %xmm0
	addps		%xmm12, %xmm4
	
	/* Clip detection, scaling, conversion and store: identical to the
	   first loop. */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	mulps		XMMREG_SCALE, %xmm0
	mulps		XMMREG_SCALE, %xmm4
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	xorps		%xmm2, %xmm0
	xorps		%xmm5, %xmm4
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	movups		%xmm0, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)
	
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP
	
	leaq		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz			1b
	
	/* Horizontal sum of the eight 16-bit clip counters into word 0 of xmm0;
	   only that word is returned. */
	movaps		TEMP_CLIP, %xmm4
	movhlps		%xmm4, %xmm0
	paddw		%xmm4, %xmm0
	pshuflw		$0x55, %xmm0, %xmm1
	pshuflw		$0xaa, %xmm0, %xmm2
	pshuflw		$0xff, %xmm0, %xmm3
	paddw		%xmm1, %xmm0
	paddw		%xmm2, %xmm0
	paddw		%xmm3, %xmm0
	
	movd		%xmm0, %eax
	andl		$0xffff, %eax
	
#ifdef IS_MSABI
	/* Reload the callee-saved xmm registers from the slots the prologue
	   wrote them to (16..160; offset 0 is TEMP_CLIP, not a saved register). */
	movaps		16(%rsp), %xmm6
	movaps		32(%rsp), %xmm7
	movaps		48(%rsp), %xmm8
	movaps		64(%rsp), %xmm9
	movaps		80(%rsp), %xmm10
	movaps		96(%rsp), %xmm11
	movaps		112(%rsp), %xmm12
	movaps		128(%rsp), %xmm13
	movaps		144(%rsp), %xmm14
	movaps		160(%rsp), %xmm15
	addq		$184, %rsp
	popq		%rdi
	popq		%rsi
#else
	addq		$24, %rsp
#endif
	ret
Packit c32a2d
Packit c32a2d
NONEXEC_STACK