Blame src/libmpg123/synth_3dnow.S

Packit c32a2d
/*
Packit c32a2d
	decode_3dnow.s - 3DNow! optimized synth_1to1()
Packit c32a2d
Packit c32a2d
	copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1
Packit c32a2d
	see COPYING and AUTHORS files in distribution or http://mpg123.org
Packit c32a2d
	initially written by Syuuhei Kashiyama
Packit c32a2d
Packit c32a2d
	This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama
Packit c32a2d
	<squash@mb.kcom.ne.jp>; only three types of changes have been made:
Packit c32a2d
Packit c32a2d
	- remove PREFETCH instruction for speedup
Packit c32a2d
	- change function name to support 3DNow! automatic detection
Packit c32a2d
	- femms moved to before 'call dct64_3dnow'
Packit c32a2d
Packit c32a2d
	You can find Kashiyama's original 3dnow! support patch
Packit c32a2d
	(for mpg123-0.59o) at
Packit c32a2d
	http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).
Packit c32a2d
Packit c32a2d
	by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999
Packit c32a2d
                  	<kim@comtec.co.jp>               - after  1.Apr.1999
Packit c32a2d
Packit c32a2d
Packit c32a2d
Packit c32a2d
	Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support
Packit c32a2d
Packit c32a2d
	Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
Packit c32a2d
Packit c32a2d
	The author of this program disclaim whole expressed or implied
Packit c32a2d
	warranties with regard to this program, and in no event shall the
Packit c32a2d
	author of this program liable to whatever resulted from the use of
Packit c32a2d
	this program. Use it at your own risk.
Packit c32a2d
*/
Packit c32a2d
Packit c32a2d
#include "mangle.h"
Packit c32a2d
Packit c32a2d
/*
	Constants for the ACCURATE_ROUNDING output path only.
	Each label holds the same 32-bit single-precision bit pattern twice,
	so that it can be used as a 64-bit memory operand of the packed
	3DNow! instructions (pfmin/pfmax/pfadd) in the synth routine.
	Placed in .rodata, except on Apple targets where .data is used.
*/
#ifdef ACCURATE_ROUNDING
Packit c32a2d
#ifndef __APPLE__
Packit c32a2d
	.section	.rodata
Packit c32a2d
#else
Packit c32a2d
	.data
Packit c32a2d
#endif
Packit c32a2d
	ALIGN8
Packit c32a2d
/* upper clamp bound; bit pattern 0x46FFFE00 in both lanes */
max_s16:
Packit c32a2d
	.long   1191181824 /* 32767.0 */
Packit c32a2d
	.long   1191181824
Packit c32a2d
/* lower clamp bound; bit pattern 0xC7000000 in both lanes */
min_s16:
Packit c32a2d
	.long   -956301312 /* -32768.0 */
Packit c32a2d
	.long   -956301312
Packit c32a2d
/*
	Bias added after clamping; bit pattern 0x4B400000 in both lanes.
	The synth routine stores the low 16 bits of the biased value's raw
	bit pattern as the sample, instead of using pf2id.
*/
ftoi_magic:
Packit c32a2d
	.long	1262485504 /* 2^23 + 2^22 */
Packit c32a2d
	.long	1262485504
Packit c32a2d
#endif
Packit c32a2d
	.text
Packit c32a2d
	ALIGN16
Packit c32a2d
.globl ASM_NAME(synth_1to1_3dnow_asm)
Packit c32a2d
/* int synth_1to1_3dnow_asm(real *bandPtr, int channel, unsigned char *out, unsigned char *buffs, int *bo, real *decwin); */
/*
	32-bit x86, AT&T syntax, run through the C preprocessor.
	All six arguments are read from the stack (see the offset macros below).

	Writes 32 16-bit values starting at out (out + 2 when channel != 0),
	one every 4 bytes: 16 from loop .L33, one "middle" value, and 15
	from loop .L46.

	Side effects:
	- when channel == 0, *bo is replaced by (*bo - 1) & 15;
	- dct64_3dnow is called once with three stack arguments;
	- femms is executed on entry and before returning.

	Return value (%eax): the final value of %ebp.
	NOTE(review): %ebp is incremented unconditionally once per output
	value and is reset to 15 before the second loop, which then runs 15
	times, so as written the result is always 30 and does not depend on
	the sample data.

	Saves and restores %ebx, %esi, %edi and %ebp.
*/
Packit c32a2d
ASM_NAME(synth_1to1_3dnow_asm):
Packit c32a2d
	/* reserve 24 bytes of locals, then save the four registers restored at exit */
	subl $24,%esp
Packit c32a2d
	pushl %ebp
Packit c32a2d
	pushl %edi
Packit c32a2d
	/* %ebp = 0: counter that ends up as the return value */
	xorl %ebp,%ebp
Packit c32a2d
	pushl %esi
Packit c32a2d
	pushl %ebx
Packit c32a2d
/* stack old: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=pnt */
Packit c32a2d
/* stack new: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=buffs 60=bo 64=decwin */
/* The macros below are %esp-relative and only valid while nothing extra is pushed. */
Packit c32a2d
#define OUT     52(%esp)
Packit c32a2d
#define CHANNEL 48(%esp)
Packit c32a2d
#define BANDPTR 44(%esp)
Packit c32a2d
#define BUFFS   56(%esp)
Packit c32a2d
#define BO      60(%esp)
Packit c32a2d
#define DECWIN  64(%esp)
Packit c32a2d
#define LOCAL0  16(%esp)
Packit c32a2d
#define LOCAL1  20(%esp)
Packit c32a2d
#define EBXSAVE 24(%esp)
Packit c32a2d
#define LOCAL5  36(%esp)
Packit c32a2d
Packit c32a2d
/*
	ACCURATE_ROUNDING needs to address the constants max_s16, min_s16 and
	ftoi_magic through LOCAL_VAR(). Here _EBX_ is redefined to %eax, and
	GET_GOT (from mangle.h, not visible in this file) is presumably what
	loads the base register -- confirm against mangle.h. For PIC builds
	the value is parked in EBXSAVE because %eax is overwritten by the
	movd at the end of every sample computation.
*/
#ifdef ACCURATE_ROUNDING
Packit c32a2d
	#undef _EBX_
Packit c32a2d
	#define _EBX_ %eax
Packit c32a2d
	GET_GOT
Packit c32a2d
	/* FIXME */
Packit c32a2d
#if PIC
Packit c32a2d
	movl _EBX_, EBXSAVE
Packit c32a2d
#endif
Packit c32a2d
#endif
Packit c32a2d
	movl OUT,%esi
Packit c32a2d
	movl %esi,LOCAL0 /* save buffer start (samples pointer) to another local var */
Packit c32a2d
	movl CHANNEL,%ebx
Packit c32a2d
	movl BO,%esi     /* bo address */
Packit c32a2d
	movl (%esi),%edx /* bo value */
Packit c32a2d
Packit c32a2d
	/* reset MMX/3DNow! state before dct64_3dnow is called (see file header) */
	femms
Packit c32a2d
	testl %ebx,%ebx
Packit c32a2d
	jne .L26
Packit c32a2d
/* if(!channel) */
Packit c32a2d
	decl %edx   /* --bo */
Packit c32a2d
	andl $15,%edx
Packit c32a2d
	movl %edx,(%esi) /* save bo */
Packit c32a2d
	/* %ecx = buffs: base of the buffer area used for this channel */
	movl BUFFS,%ecx
Packit c32a2d
	jmp .L27
Packit c32a2d
.L26: /* if(channel) */
Packit c32a2d
	addl $2,LOCAL0   /* samples++ */
Packit c32a2d
	movl BUFFS,%ecx
Packit c32a2d
	/* %ecx = buffs + 2176 bytes (2176 = 2 * 1088) */
	addl $2176,%ecx
Packit c32a2d
.L27:
Packit c32a2d
/* edx (and its low byte, %dl) still holds the bo value */
Packit c32a2d
	testb $1,%dl  /* bo & 0x1 */
Packit c32a2d
	je .L28
Packit c32a2d
	/*
		bo is odd:
		LOCAL5 = bo, LOCAL1 = 4*bo, %ebx = base (%ecx).
		Pushed for dct64_3dnow, in push order:
		bandPtr, base + 4*bo, base + 1088 + 4*((bo+1) & 15).
	*/
	movl %edx,LOCAL5
Packit c32a2d
	movl %ecx,%ebx
Packit c32a2d
	movl BANDPTR,%esi
Packit c32a2d
	movl %edx,%edi
Packit c32a2d
	pushl %esi
Packit c32a2d
	sall $2,%edi
Packit c32a2d
	movl %ebx,%eax
Packit c32a2d
	/* one push is outstanding, so LOCAL1 (20(%esp)) is at 24(%esp) here */
	movl %edi,24(%esp) /* LOCAL1, actually */
Packit c32a2d
	addl %edi,%eax
Packit c32a2d
	pushl %eax
Packit c32a2d
	movl %edx,%eax
Packit c32a2d
	incl %eax
Packit c32a2d
	andl $15,%eax
Packit c32a2d
	leal 1088(,%eax,4),%eax
Packit c32a2d
	addl %ebx,%eax
Packit c32a2d
	pushl %eax
Packit c32a2d
	call FUNC(dct64_3dnow)
Packit c32a2d
	/* drop the three pushed arguments; %ebx is used afterwards, so the callee is assumed to preserve it */
	addl $12,%esp
Packit c32a2d
	jmp .L29
Packit c32a2d
.L28:
Packit c32a2d
	/*
		bo is even:
		LOCAL5 = bo + 1, LOCAL1 = 4*(bo+1) (stored after the call),
		%ebx = base + 1088.
		Pushed for dct64_3dnow, in push order:
		bandPtr, base + 1088 + 4*(bo+1), base + 4*bo.
	*/
	leal 1(%edx),%esi
Packit c32a2d
	movl BANDPTR,%edi
Packit c32a2d
	movl %esi,LOCAL5
Packit c32a2d
	leal 1092(%ecx,%edx,4),%eax
Packit c32a2d
	pushl %edi
Packit c32a2d
	leal 1088(%ecx),%ebx
Packit c32a2d
	pushl %eax
Packit c32a2d
	sall $2,%esi
Packit c32a2d
	leal (%ecx,%edx,4),%eax
Packit c32a2d
	pushl %eax
Packit c32a2d
	call FUNC(dct64_3dnow)
Packit c32a2d
	addl $12,%esp
Packit c32a2d
	/* %esi (= 4*(bo+1)) is read after the call, so the callee is assumed to preserve it */
	movl %esi,LOCAL1
Packit c32a2d
.L29:
Packit c32a2d
	/*
		Set up the first loop:
		%edx = decwin + 64 - LOCAL1 (window pointer),
		%ebx = buffer pointer chosen above,
		%edi = output pointer, %ecx = 16 iterations.
	*/
	movl DECWIN,%edx
Packit c32a2d
	addl $64,%edx
Packit c32a2d
	movl $16,%ecx
Packit c32a2d
	subl LOCAL1,%edx
Packit c32a2d
	movl LOCAL0,%edi
Packit c32a2d
Packit c32a2d
	/* %mm7 = 0x80000000 in both dwords: sign-bit mask used for negation in the second loop */
	pcmpeqb %mm7,%mm7
Packit c32a2d
	pslld $31,%mm7
Packit c32a2d
	/* preload the first window/buffer pair for the pipelined loop */
	movq (%edx),%mm0
Packit c32a2d
	movq (%ebx),%mm1
Packit c32a2d
	ALIGN32
Packit c32a2d
/*
	First loop, 16 output values.
	Each iteration multiplies 16 consecutive floats at (%edx) with 16
	consecutive floats at (%ebx), two lanes at a time, and accumulates
	the products per lane. The result is (sum of low-lane products) minus
	(sum of high-lane products), i.e. the products are combined with
	alternating signs. Then %ebx advances by 64 bytes and %edx by 128.
*/
.L33:
Packit c32a2d
#if defined(ACCURATE_ROUNDING) && defined(PIC)
Packit c32a2d
	/* reload the base register: %eax was overwritten by the previous movd */
	movl EBXSAVE, _EBX_
Packit c32a2d
#endif
Packit c32a2d
	movq 8(%edx),%mm3
Packit c32a2d
	pfmul %mm1,%mm0
Packit c32a2d
	movq 8(%ebx),%mm4
Packit c32a2d
	movq 16(%edx),%mm5
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	movq 16(%ebx),%mm6
Packit c32a2d
	pfadd %mm3,%mm0
Packit c32a2d
	movq 24(%edx),%mm1
Packit c32a2d
	pfmul %mm6,%mm5
Packit c32a2d
	movq 24(%ebx),%mm2
Packit c32a2d
	pfadd %mm5,%mm0
Packit c32a2d
	movq 32(%edx),%mm3
Packit c32a2d
	pfmul %mm2,%mm1
Packit c32a2d
	movq 32(%ebx),%mm4
Packit c32a2d
	pfadd %mm1,%mm0
Packit c32a2d
	movq 40(%edx),%mm5
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	movq 40(%ebx),%mm6
Packit c32a2d
	pfadd %mm3,%mm0
Packit c32a2d
	movq 48(%edx),%mm1
Packit c32a2d
	pfmul %mm6,%mm5
Packit c32a2d
	movq 48(%ebx),%mm2
Packit c32a2d
	/* from here the running sum lives in %mm5, then %mm2, freeing %mm0/%mm1 for the next preload */
	pfadd %mm0,%mm5
Packit c32a2d
	movq 56(%edx),%mm3
Packit c32a2d
	pfmul %mm1,%mm2
Packit c32a2d
	movq 56(%ebx),%mm4
Packit c32a2d
	pfadd %mm5,%mm2
Packit c32a2d
	addl $64,%ebx
Packit c32a2d
	/* %edx += 128, written as a subtraction of -128 */
	subl $-128,%edx
Packit c32a2d
	/* preload the first pair of the next iteration */
	movq (%edx),%mm0
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	movq (%ebx),%mm1
Packit c32a2d
	pfadd %mm3,%mm2
Packit c32a2d
	/* low lane of %mm2 = low-lane sum - high-lane sum */
	movq %mm2,%mm3
Packit c32a2d
	psrlq $32,%mm3
Packit c32a2d
	pfsub %mm3,%mm2
Packit c32a2d
	incl %ebp
Packit c32a2d
	/*
		Convert to a 16-bit value: either clamp to [min_s16, max_s16] and
		add ftoi_magic, or convert with pf2id and saturate with packssdw.
		In both cases the low 16 bits of %mm2 are what gets stored.
	*/
#ifdef ACCURATE_ROUNDING
Packit c32a2d
	pfmin LOCAL_VAR(max_s16),%mm2
Packit c32a2d
	pfmax LOCAL_VAR(min_s16),%mm2
Packit c32a2d
	pfadd LOCAL_VAR(ftoi_magic),%mm2
Packit c32a2d
#else
Packit c32a2d
	pf2id %mm2,%mm2
Packit c32a2d
	packssdw %mm2,%mm2
Packit c32a2d
#endif
Packit c32a2d
	movd %mm2,%eax
Packit c32a2d
	movw %ax,0(%edi)
Packit c32a2d
	/* output stride is 4 bytes: every other 16-bit slot is left untouched */
	addl $4,%edi
Packit c32a2d
	decl %ecx
Packit c32a2d
	jnz .L33
Packit c32a2d
Packit c32a2d
/*
	Middle value: sum of 8 products using every second float
	(byte offsets 0, 8, ..., 56) from both (%ebx) and (%edx);
	all products are added, none subtracted.
*/
#if defined(ACCURATE_ROUNDING) && defined(PIC)
Packit c32a2d
	movl EBXSAVE, _EBX_
Packit c32a2d
#endif
Packit c32a2d
	movd (%ebx),%mm0
Packit c32a2d
	movd (%edx),%mm1
Packit c32a2d
	punpckldq 8(%ebx),%mm0
Packit c32a2d
	punpckldq 8(%edx),%mm1
Packit c32a2d
	movd 16(%ebx),%mm3
Packit c32a2d
	movd 16(%edx),%mm4
Packit c32a2d
	pfmul %mm1,%mm0
Packit c32a2d
	punpckldq 24(%ebx),%mm3
Packit c32a2d
	punpckldq 24(%edx),%mm4
Packit c32a2d
	movd 32(%ebx),%mm5
Packit c32a2d
	movd 32(%edx),%mm6
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	punpckldq 40(%ebx),%mm5
Packit c32a2d
	punpckldq 40(%edx),%mm6
Packit c32a2d
	pfadd %mm3,%mm0
Packit c32a2d
	movd 48(%ebx),%mm1
Packit c32a2d
	movd 48(%edx),%mm2
Packit c32a2d
	pfmul %mm6,%mm5
Packit c32a2d
	punpckldq 56(%ebx),%mm1
Packit c32a2d
	punpckldq 56(%edx),%mm2
Packit c32a2d
	pfadd %mm5,%mm0
Packit c32a2d
	pfmul %mm2,%mm1
Packit c32a2d
	pfadd %mm1,%mm0
Packit c32a2d
	/* horizontal add: low lane of %mm0 = low + high lane of the running sum */
	pfacc %mm1,%mm0
Packit c32a2d
#ifdef ACCURATE_ROUNDING
Packit c32a2d
	pfmin LOCAL_VAR(max_s16),%mm0
Packit c32a2d
	pfmax LOCAL_VAR(min_s16),%mm0
Packit c32a2d
	pfadd LOCAL_VAR(ftoi_magic),%mm0
Packit c32a2d
#else
Packit c32a2d
	pf2id %mm0,%mm0
Packit c32a2d
	packssdw %mm0,%mm0
Packit c32a2d
#endif
Packit c32a2d
	movd %mm0,%eax
Packit c32a2d
	movw %ax,0(%edi)
Packit c32a2d
	incl %ebp
Packit c32a2d
	/*
		Set up the second loop:
		%ebx steps back 64 bytes; %edx = %edx - 128 + 8*LOCAL5.
		NOTE(review): %ebp is overwritten with 15 here, discarding the
		17 increments made so far.
	*/
	movl LOCAL5,%esi
Packit c32a2d
	addl $-64,%ebx
Packit c32a2d
	movl $15,%ebp
Packit c32a2d
	addl $4,%edi
Packit c32a2d
	leal -128(%edx,%esi,8),%edx
Packit c32a2d
Packit c32a2d
	movl $15,%ecx
Packit c32a2d
	/* preload: buffer floats at +0/+4, window floats at -4/-8 (window is read backwards) */
	movd (%ebx),%mm0
Packit c32a2d
	movd -4(%edx),%mm1
Packit c32a2d
	punpckldq 4(%ebx),%mm0
Packit c32a2d
	punpckldq -8(%edx),%mm1
Packit c32a2d
	ALIGN32
Packit c32a2d
/*
	Second loop, 15 output values.
	Each iteration pairs 16 consecutive floats at (%ebx), ascending,
	with window floats at -4, -8, ..., -60(%edx) and finally 0(%edx),
	adds all 16 products and negates the total. Both pointers then
	move backwards: %edx by 128 bytes, %ebx by 64 bytes.
*/
.L46:
Packit c32a2d
#if defined(ACCURATE_ROUNDING) && defined(PIC)
Packit c32a2d
	movl EBXSAVE, _EBX_
Packit c32a2d
#endif
Packit c32a2d
	movd 8(%ebx),%mm3
Packit c32a2d
	movd -12(%edx),%mm4
Packit c32a2d
	pfmul %mm1,%mm0
Packit c32a2d
	punpckldq 12(%ebx),%mm3
Packit c32a2d
	punpckldq -16(%edx),%mm4
Packit c32a2d
	movd 16(%ebx),%mm5
Packit c32a2d
	movd -20(%edx),%mm6
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	punpckldq 20(%ebx),%mm5
Packit c32a2d
	punpckldq -24(%edx),%mm6
Packit c32a2d
	pfadd %mm3,%mm0
Packit c32a2d
	movd 24(%ebx),%mm1
Packit c32a2d
	movd -28(%edx),%mm2
Packit c32a2d
	pfmul %mm6,%mm5
Packit c32a2d
	punpckldq 28(%ebx),%mm1
Packit c32a2d
	punpckldq -32(%edx),%mm2
Packit c32a2d
	pfadd %mm5,%mm0
Packit c32a2d
	movd 32(%ebx),%mm3
Packit c32a2d
	movd -36(%edx),%mm4
Packit c32a2d
	pfmul %mm2,%mm1
Packit c32a2d
	punpckldq 36(%ebx),%mm3
Packit c32a2d
	punpckldq -40(%edx),%mm4
Packit c32a2d
	pfadd %mm1,%mm0
Packit c32a2d
	movd 40(%ebx),%mm5
Packit c32a2d
	movd -44(%edx),%mm6
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	punpckldq 44(%ebx),%mm5
Packit c32a2d
	punpckldq -48(%edx),%mm6
Packit c32a2d
	pfadd %mm3,%mm0
Packit c32a2d
	movd 48(%ebx),%mm1
Packit c32a2d
	movd -52(%edx),%mm2
Packit c32a2d
	pfmul %mm6,%mm5
Packit c32a2d
	punpckldq 52(%ebx),%mm1
Packit c32a2d
	punpckldq -56(%edx),%mm2
Packit c32a2d
	/* running sum moves to %mm5, then %mm3, freeing %mm0/%mm1 for the next preload */
	pfadd %mm0,%mm5
Packit c32a2d
	movd 56(%ebx),%mm3
Packit c32a2d
	movd -60(%edx),%mm4
Packit c32a2d
	pfmul %mm2,%mm1
Packit c32a2d
	punpckldq 60(%ebx),%mm3
Packit c32a2d
	/* the last buffer float is paired with the window float at offset 0, not -64 */
	punpckldq (%edx),%mm4
Packit c32a2d
	pfadd %mm1,%mm5
Packit c32a2d
	addl $-128,%edx
Packit c32a2d
	addl $-64,%ebx
Packit c32a2d
	/* preload the first pair of the next iteration */
	movd (%ebx),%mm0
Packit c32a2d
	movd -4(%edx),%mm1
Packit c32a2d
	pfmul %mm4,%mm3
Packit c32a2d
	punpckldq 4(%ebx),%mm0
Packit c32a2d
	punpckldq -8(%edx),%mm1
Packit c32a2d
	pfadd %mm5,%mm3
Packit c32a2d
	/* horizontal add of both lanes, then flip the sign bit via %mm7 to negate */
	pfacc %mm3,%mm3
Packit c32a2d
	incl %ebp
Packit c32a2d
	pxor %mm7,%mm3
Packit c32a2d
#ifdef ACCURATE_ROUNDING
Packit c32a2d
	pfmin LOCAL_VAR(max_s16),%mm3
Packit c32a2d
	pfmax LOCAL_VAR(min_s16),%mm3
Packit c32a2d
	pfadd LOCAL_VAR(ftoi_magic),%mm3
Packit c32a2d
#else
Packit c32a2d
	pf2id %mm3,%mm3
Packit c32a2d
	packssdw %mm3,%mm3
Packit c32a2d
#endif
Packit c32a2d
	movd %mm3,%eax
Packit c32a2d
	movw %ax,(%edi)
Packit c32a2d
	addl $4,%edi
Packit c32a2d
	decl %ecx
Packit c32a2d
	jnz .L46
Packit c32a2d
Packit c32a2d
	/* leave MMX/3DNow! state, return %ebp in %eax, and undo the prologue in reverse order */
	femms
Packit c32a2d
	movl %ebp,%eax
Packit c32a2d
	popl %ebx
Packit c32a2d
	popl %esi
Packit c32a2d
	popl %edi
Packit c32a2d
	popl %ebp
Packit c32a2d
	addl $24,%esp
Packit c32a2d
	ret
Packit c32a2d
Packit c32a2d
/*
	Apple PIC builds only: a 5-byte entry in the __IMPORT,__jump_table
	symbol-stub section that refers indirectly to dct64_3dnow.
	The five hlt bytes are placeholders matching the stub size of 5
	given in the .section directive; the section is flagged
	self_modifying_code, so the entry is presumably overwritten at load
	time -- confirm against the Mach-O documentation.
	NOTE(review): whether FUNC(dct64_3dnow) resolves to this label is
	decided in mangle.h, which is not visible here.
*/
#if defined(PIC) && defined(__APPLE__)
Packit c32a2d
	.section __IMPORT,__jump_table,symbol_stubs,self_modifying_code+pure_instructions,5
Packit c32a2d
L_dct64_3dnow:
Packit c32a2d
	.indirect_symbol ASM_NAME(dct64_3dnow)
Packit c32a2d
	hlt ; hlt ; hlt ; hlt ; hlt
Packit c32a2d
#endif
Packit c32a2d
Packit c32a2d
/* Macro from mangle.h (not visible here); presumably marks the object as not needing an executable stack -- confirm. */
NONEXEC_STACK