/* dct36_3dnowext: extended 3DNow optimized DCT36 copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1 see COPYING and AUTHORS files in distribution or http://mpg123.org Transformed back into standalone asm, with help of gcc -S -DHAVE_CONFIG_H -I. -march=k6-3 -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o dct36_3dnowext.{S,c} MPlayer comment follows. */ /* * dct36_3dnow.c - 3DNow! optimized dct36() * * This code based 'dct36_3dnow.s' by Syuuhei Kashiyama * , only two types of changes have been made: * * - removed PREFETCH instruction for speedup * - changed function name for support 3DNow! automatic detection * * You can find Kashiyama's original 3dnow! support patch * (for mpg123-0.59o) at * http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). * * by KIMURA Takuhiro - until 31.Mar.1999 * - after 1.Apr.1999 * * Modified for use with MPlayer, for details see the changelog at * http://svn.mplayerhq.hu/mplayer/trunk/ * $Id: dct36_3dnow.c 18786 2006-06-22 13:34:00Z diego $ * * Original disclaimer: * The author of this program disclaim whole expressed or implied * warranties with regard to this program, and in no event shall the * author of this program liable to whatever resulted from the use of * this program. Use it at your own risk. 
* * 2003/06/21: Moved to GCC inline assembly - Alex Beregszaszi */

#include "mangle.h"

/*
 * dct36_3dnowext -- 36-point DCT stage written with MMX, 3DNow! and
 * extended 3DNow! (pswapd) instructions.  AT&T syntax, 32-bit x86.
 *
 * Five 32-bit stack arguments are read relative to the frame pointer.
 * Names below are notation for these comments only; all elements are
 * 32-bit floats (the arithmetic is pfadd/pfsub/pfmul/pfacc):
 *
 *    8(%ebp) -> %eax  "in"  18 floats, read AND modified in place
 *   12(%ebp) -> %esi  "o1"  18 floats, read only (added to results)
 *   16(%ebp) -> %ecx  "o2"  18 floats, written
 *   20(%ebp) -> %edx  "w"   36 floats, read only (multipliers)
 *   24(%ebp) -> %ebx  "ts"  18 floats written at a 128-byte stride
 *                           (byte offsets 0, 128, ..., 2176)
 *
 * NOTE(review): this presumably mirrors a C routine of the form
 * dct36(in, o1, o2, wintab, tsbuf) -- confirm against the C prototype.
 *
 * External tables: COS9 (dword indices 1..8 used, "c[k]") and tfcos36
 * (dword indices 0..8 used, "t[k]").
 *
 * Every output stage ends with the same tail, parameterised by v.
 * With mm4 = { a, b } (low, high lane) on entry to the tail:
 *
 *   mm4 = { a, b * t[v] }
 *   s   = mm4.lo + mm4.hi          d = mm4.lo - mm4.hi
 *   o2[9+v]      = s * w[27+v]     o2[8-v]      = s * w[26-v]
 *   ts[32*(8-v)] = o1[8-v] + d * w[8-v]
 *   ts[32*(9+v)] = o1[9+v] + d * w[9+v]
 *
 * Registers live across stages: mm0 = in[6..7] * c[3],
 * mm1 = in[12..13] * c[6], mm7 = { 1.0, 0.0 }.
 *
 * %ebp, %esi and %ebx are saved and restored here.  %edi is used as the
 * GOT register (_EBX_); presumably PREPARE_GOT / RESTORE_GOT from
 * mangle.h save and restore it -- TODO confirm in mangle.h.
 * The MMX/3DNow! state is left via femms before returning.
 */
	.text
	ALIGN32
.globl ASM_NAME(dct36_3dnowext)
/* .type ASM_NAME(dct36_3dnowext), @function */
ASM_NAME(dct36_3dnowext):
	pushl %ebp
	movl %esp, %ebp
/* Apple PIC only: reserve one local slot, -4(%ebp), for the tfcos36 pointer. */
#if defined(PIC) && defined(__APPLE__)
	sub $4, %esp
#endif
	pushl %esi
	pushl %ebx
/* Make the GOT macros use %edi, since %ebx carries the fifth argument. */
#undef _EBX_
#define _EBX_ %edi
	PREPARE_GOT
	GET_GOT
/*
 * Table addressing.  On Apple PIC both tables are reached through
 * pointers: COS9 stays in %edi, while tfcos36 is parked in -4(%ebp) and
 * swapped into %eax only around each tfcos36 access (so %eax alternates
 * between "in" and the tfcos36 base).  tfcos36 must be fetched first
 * because loading the COS9 pointer overwrites the GOT base in %edi.
 */
#if defined(PIC) && defined(__APPLE__)
#define _COS9_ 0(%edi)
#define _tfcos36_ 0(%eax)
	mov GLOBAL_VAR_PTR(tfcos36), %eax
	mov GLOBAL_VAR_PTR(COS9), %edi
	mov %eax, -4(%ebp)
#else
#define _COS9_ GLOBAL_VAR(COS9)
#define _tfcos36_ GLOBAL_VAR(tfcos36)
#endif
/* Load the five arguments (see the header comment for the roles). */
	movl 8(%ebp), %eax
	movl 12(%ebp), %esi
	movl 16(%ebp), %ecx
	movl 20(%ebp), %edx
	movl 24(%ebp), %ebx
/* APP */
/*
 * Pass 1: in[k] += in[k-1] for k = 1..17, two floats per step.
 * Each pair is loaded before the store that overwrites its neighbour,
 * so every sum uses the values from before this pass.
 */
	movq (%eax),%mm0
	movq 4(%eax),%mm1
	pfadd %mm1,%mm0
	movq %mm0,4(%eax)
	psrlq $32,%mm1
	movq 12(%eax),%mm2
	punpckldq %mm2,%mm1
	pfadd %mm2,%mm1
	movq %mm1,12(%eax)
	psrlq $32,%mm2
	movq 20(%eax),%mm3
	punpckldq %mm3,%mm2
	pfadd %mm3,%mm2
	movq %mm2,20(%eax)
	psrlq $32,%mm3
	movq 28(%eax),%mm4
	punpckldq %mm4,%mm3
	pfadd %mm4,%mm3
	movq %mm3,28(%eax)
	psrlq $32,%mm4
	movq 36(%eax),%mm5
	punpckldq %mm5,%mm4
	pfadd %mm5,%mm4
	movq %mm4,36(%eax)
	psrlq $32,%mm5
	movq 44(%eax),%mm6
	punpckldq %mm6,%mm5
	pfadd %mm6,%mm5
	movq %mm5,44(%eax)
	psrlq $32,%mm6
	movq 52(%eax),%mm7
	punpckldq %mm7,%mm6
	pfadd %mm7,%mm6
	movq %mm6,52(%eax)
	psrlq $32,%mm7
	movq 60(%eax),%mm0
	punpckldq %mm0,%mm7
	pfadd %mm0,%mm7
	movq %mm7,60(%eax)
	psrlq $32,%mm0
	/* last element: in[17] += in[16] */
	movd 68(%eax),%mm1
	pfadd %mm1,%mm0
	movd %mm0,68(%eax)
/*
 * Pass 2: in[k] += in[k-2] for odd k = 3..17, again using the values
 * the odd elements had before this pass.
 */
	movd 4(%eax),%mm0
	movd 12(%eax),%mm1
	punpckldq %mm1,%mm0
	punpckldq 20(%eax),%mm1
	pfadd %mm1,%mm0
	movd %mm0,12(%eax)
	psrlq $32,%mm0
	movd %mm0,20(%eax)
	psrlq $32,%mm1
	movd 28(%eax),%mm2
	punpckldq %mm2,%mm1
	punpckldq 36(%eax),%mm2
	pfadd %mm2,%mm1
	movd %mm1,28(%eax)
	psrlq $32,%mm1
	movd %mm1,36(%eax)
	psrlq $32,%mm2
	movd 44(%eax),%mm3
	punpckldq %mm3,%mm2
	punpckldq 52(%eax),%mm3
	pfadd %mm3,%mm2
	movd %mm2,44(%eax)
	psrlq $32,%mm2
	movd %mm2,52(%eax)
	psrlq $32,%mm3
	movd 60(%eax),%mm4
	punpckldq %mm4,%mm3
	punpckldq 68(%eax),%mm4
	pfadd %mm4,%mm3
	movd %mm3,60(%eax)
	psrlq $32,%mm3
	movd %mm3,68(%eax)
/*
 * Shared products kept for the stages below:
 *   mm0 = in[6..7] * c[3]      mm1 = in[12..13] * c[6]
 */
	movq 24(%eax),%mm0
	movq 48(%eax),%mm1
	movd 12+_COS9_,%mm2
	punpckldq %mm2,%mm2
	movd 24+_COS9_,%mm3
	punpckldq %mm3,%mm3
	pfmul %mm2,%mm0
	pfmul %mm3,%mm1
/* mm7 = { 1.0, 0.0 }: integer 1 converted to float; %eax is only borrowed. */
	pushl %eax
	movl $1,%eax
	movd %eax,%mm7
	pi2fd %mm7,%mm7
	popl %eax
/*
 * Stages v = 0 and v = 8.
 *   mm2 = in[2..3]*c[1] + mm0 + in[10..11]*c[5] + in[14..15]*c[7]
 *   mm3 = in[0..1] + in[4..5]*c[2] + in[8..9]*c[4] + mm1 + in[16..17]*c[8]
 */
	movq 8(%eax),%mm2
	movd 4+_COS9_,%mm3
	punpckldq %mm3,%mm3
	pfmul %mm3,%mm2
	pfadd %mm0,%mm2
	movq 40(%eax),%mm3
	movd 20+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfadd %mm3,%mm2
	movq 56(%eax),%mm3
	movd 28+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfadd %mm3,%mm2
	movq (%eax),%mm3
	movq 16(%eax),%mm4
	movd 8+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfadd %mm4,%mm3
	movq 32(%eax),%mm4
	movd 16+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfadd %mm4,%mm3
	pfadd %mm1,%mm3
	movq 64(%eax),%mm4
	movd 32+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfadd %mm4,%mm3
	/* tail, v = 0, on mm4 = mm2 + mm3 */
	movq %mm2,%mm4
	pfadd %mm3,%mm4
	movq %mm7,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov -4(%ebp),%eax
#endif
	punpckldq 0+_tfcos36_,%mm5
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 108(%edx),%mm6
	punpckldq 104(%edx),%mm6
	pfmul %mm6,%mm5
	/* o2[8] and o2[9] are adjacent: swap the lanes and store both at once */
	pswapd %mm5,%mm5
	movq %mm5,32(%ecx)
	/* both lanes of mm5 become d = mm4.lo - mm4.hi */
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 32(%edx),%mm6
	punpckldq 36(%edx),%mm6
	pfmul %mm6,%mm5
	movd 32(%esi),%mm6
	punpckldq 36(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,1024(%ebx)
	psrlq $32,%mm5
	movd %mm5,1152(%ebx)
	/* tail, v = 8, on mm4 = mm3 - mm2 */
	movq %mm3,%mm4
	pfsub %mm2,%mm4
	movq %mm7,%mm5
	punpckldq 32+_tfcos36_,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov 8(%ebp),%eax
#endif
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 140(%edx),%mm6
	punpckldq 72(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,68(%ecx)
	psrlq $32,%mm5
	movd %mm5,0(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 0(%edx),%mm6
	punpckldq 68(%edx),%mm6
	pfmul %mm6,%mm5
	movd 0(%esi),%mm6
	punpckldq 68(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,0(%ebx)
	psrlq $32,%mm5
	movd %mm5,2176(%ebx)
/*
 * Stages v = 1 and v = 7.
 *   mm2 = (in[2..3] - in[10..11] - in[14..15]) * c[3]
 *   mm3 = (in[4..5] - in[8..9] - in[16..17]) * c[6] - in[12..13] + in[0..1]
 */
	movq 8(%eax),%mm2
	movq 40(%eax),%mm3
	pfsub %mm3,%mm2
	movq 56(%eax),%mm3
	pfsub %mm3,%mm2
	movd 12+_COS9_,%mm3
	punpckldq %mm3,%mm3
	pfmul %mm3,%mm2
	movq 16(%eax),%mm3
	movq 32(%eax),%mm4
	pfsub %mm4,%mm3
	movq 64(%eax),%mm4
	pfsub %mm4,%mm3
	movd 24+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	movq 48(%eax),%mm4
	pfsub %mm4,%mm3
	movq (%eax),%mm4
	pfadd %mm4,%mm3
	/* tail, v = 1, on mm4 = mm2 + mm3 */
	movq %mm2,%mm4
	pfadd %mm3,%mm4
	movq %mm7,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov -4(%ebp),%eax
#endif
	punpckldq 4+_tfcos36_,%mm5
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 112(%edx),%mm6
	punpckldq 100(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,40(%ecx)
	psrlq $32,%mm5
	movd %mm5,28(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 28(%edx),%mm6
	punpckldq 40(%edx),%mm6
	pfmul %mm6,%mm5
	movd 28(%esi),%mm6
	punpckldq 40(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,896(%ebx)
	psrlq $32,%mm5
	movd %mm5,1280(%ebx)
	/* tail, v = 7, on mm4 = mm3 - mm2 */
	movq %mm3,%mm4
	pfsub %mm2,%mm4
	movq %mm7,%mm5
	punpckldq 28+_tfcos36_,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov 8(%ebp),%eax
#endif
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 136(%edx),%mm6
	punpckldq 76(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,64(%ecx)
	psrlq $32,%mm5
	movd %mm5,4(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 4(%edx),%mm6
	punpckldq 64(%edx),%mm6
	pfmul %mm6,%mm5
	movd 4(%esi),%mm6
	punpckldq 64(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,128(%ebx)
	psrlq $32,%mm5
	movd %mm5,2048(%ebx)
/*
 * Stages v = 2 and v = 6.
 *   mm2 = in[2..3]*c[5] - mm0 - in[10..11]*c[7] + in[14..15]*c[1]
 *   mm3 = in[0..1] - in[4..5]*c[8] - in[8..9]*c[2] + mm1 + in[16..17]*c[4]
 */
	movq 8(%eax),%mm2
	movd 20+_COS9_,%mm3
	punpckldq %mm3,%mm3
	pfmul %mm3,%mm2
	pfsub %mm0,%mm2
	movq 40(%eax),%mm3
	movd 28+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfsub %mm3,%mm2
	movq 56(%eax),%mm3
	movd 4+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfadd %mm3,%mm2
	movq (%eax),%mm3
	movq 16(%eax),%mm4
	movd 32+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfsub %mm4,%mm3
	movq 32(%eax),%mm4
	movd 8+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfsub %mm4,%mm3
	pfadd %mm1,%mm3
	movq 64(%eax),%mm4
	movd 16+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfadd %mm4,%mm3
	/* tail, v = 2, on mm4 = mm2 + mm3 */
	movq %mm2,%mm4
	pfadd %mm3,%mm4
	movq %mm7,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov -4(%ebp),%eax
#endif
	punpckldq 8+_tfcos36_,%mm5
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 116(%edx),%mm6
	punpckldq 96(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,44(%ecx)
	psrlq $32,%mm5
	movd %mm5,24(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 24(%edx),%mm6
	punpckldq 44(%edx),%mm6
	pfmul %mm6,%mm5
	movd 24(%esi),%mm6
	punpckldq 44(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,768(%ebx)
	psrlq $32,%mm5
	movd %mm5,1408(%ebx)
	/* tail, v = 6, on mm4 = mm3 - mm2 */
	movq %mm3,%mm4
	pfsub %mm2,%mm4
	movq %mm7,%mm5
	punpckldq 24+_tfcos36_,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov 8(%ebp),%eax
#endif
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 132(%edx),%mm6
	punpckldq 80(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,60(%ecx)
	psrlq $32,%mm5
	movd %mm5,8(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 8(%edx),%mm6
	punpckldq 60(%edx),%mm6
	pfmul %mm6,%mm5
	movd 8(%esi),%mm6
	punpckldq 60(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,256(%ebx)
	psrlq $32,%mm5
	movd %mm5,1920(%ebx)
/*
 * Stages v = 3 and v = 5.
 *   mm2 = in[2..3]*c[7] - mm0 + in[10..11]*c[1] - in[14..15]*c[5]
 *   mm3 = in[0..1] - in[4..5]*c[4] + in[8..9]*c[8] + mm1 - in[16..17]*c[2]
 */
	movq 8(%eax),%mm2
	movd 28+_COS9_,%mm3
	punpckldq %mm3,%mm3
	pfmul %mm3,%mm2
	pfsub %mm0,%mm2
	movq 40(%eax),%mm3
	movd 4+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfadd %mm3,%mm2
	movq 56(%eax),%mm3
	movd 20+_COS9_,%mm4
	punpckldq %mm4,%mm4
	pfmul %mm4,%mm3
	pfsub %mm3,%mm2
	movq (%eax),%mm3
	movq 16(%eax),%mm4
	movd 16+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfsub %mm4,%mm3
	movq 32(%eax),%mm4
	movd 32+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfadd %mm4,%mm3
	pfadd %mm1,%mm3
	movq 64(%eax),%mm4
	movd 8+_COS9_,%mm5
	punpckldq %mm5,%mm5
	pfmul %mm5,%mm4
	pfsub %mm4,%mm3
	/* tail, v = 3, on mm4 = mm2 + mm3 */
	movq %mm2,%mm4
	pfadd %mm3,%mm4
	movq %mm7,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov -4(%ebp),%eax
#endif
	punpckldq 12+_tfcos36_,%mm5
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 120(%edx),%mm6
	punpckldq 92(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,48(%ecx)
	psrlq $32,%mm5
	movd %mm5,20(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 20(%edx),%mm6
	punpckldq 48(%edx),%mm6
	pfmul %mm6,%mm5
	movd 20(%esi),%mm6
	punpckldq 48(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,640(%ebx)
	psrlq $32,%mm5
	movd %mm5,1536(%ebx)
	/* tail, v = 5, on mm4 = mm3 - mm2 */
	movq %mm3,%mm4
	pfsub %mm2,%mm4
	movq %mm7,%mm5
	punpckldq 20+_tfcos36_,%mm5
#if defined(PIC) && defined(__APPLE__)
	mov 8(%ebp),%eax
#endif
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 128(%edx),%mm6
	punpckldq 84(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,56(%ecx)
	psrlq $32,%mm5
	movd %mm5,12(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 12(%edx),%mm6
	punpckldq 56(%edx),%mm6
	pfmul %mm6,%mm5
	movd 12(%esi),%mm6
	punpckldq 56(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,384(%ebx)
	psrlq $32,%mm5
	movd %mm5,1792(%ebx)
/*
 * Stage v = 4 (single tail, no mirrored partner).
 *   mm4 = in[0..1] - in[4..5] + in[8..9] - in[12..13] + in[16..17]
 */
	movq (%eax),%mm4
	movq 16(%eax),%mm3
	pfsub %mm3,%mm4
	movq 32(%eax),%mm3
	pfadd %mm3,%mm4
	movq 48(%eax),%mm3
	pfsub %mm3,%mm4
	movq 64(%eax),%mm3
	pfadd %mm3,%mm4
	movq %mm7,%mm5
	/* "in" is not read again after this, so %eax is not reloaded below */
#if defined(PIC) && defined(__APPLE__)
	mov -4(%ebp),%eax
#endif
	punpckldq 16+_tfcos36_,%mm5
	pfmul %mm5,%mm4
	movq %mm4,%mm5
	pfacc %mm5,%mm5
	movd 124(%edx),%mm6
	punpckldq 88(%edx),%mm6
	pfmul %mm6,%mm5
	movd %mm5,52(%ecx)
	psrlq $32,%mm5
	movd %mm5,16(%ecx)
	movq %mm4,%mm6
	punpckldq %mm6,%mm5
	pfsub %mm6,%mm5
	punpckhdq %mm5,%mm5
	movd 16(%edx),%mm6
	punpckldq 52(%edx),%mm6
	pfmul %mm6,%mm5
	movd 16(%esi),%mm6
	punpckldq 52(%esi),%mm6
	pfadd %mm6,%mm5
	movd %mm5,512(%ebx)
	psrlq $32,%mm5
	movd %mm5,1664(%ebx)
	/* leave MMX/3DNow! state before returning to the caller */
	femms
/* NO_APP */
/* Epilogue: undo the pushes in reverse order; leave also drops the Apple local slot. */
	RESTORE_GOT
	popl %ebx
	popl %esi
	leave
	ret
/* .size ASM_NAME(dct36_3dnowext), .-ASM_NAME(dct36_3dnowext) */
/*
 * Apple PIC only: non-lazy symbol pointer slots for the two tables.
 * NOTE(review): presumably these are what GLOBAL_VAR_PTR(tfcos36) and
 * GLOBAL_VAR_PTR(COS9) resolve to -- confirm in mangle.h.
 */
#if defined(PIC) && defined(__APPLE__)
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_tfcos36:
	.indirect_symbol ASM_NAME(tfcos36)
	.long 0
L_COS9:
	.indirect_symbol ASM_NAME(COS9)
	.long 0
#endif
/* NONEXEC_STACK is a macro not defined in this file (presumably from mangle.h). */
NONEXEC_STACK