; libmp3lame/i386/fft3dn.nas
;
; from a new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI, URURI
; hacked and back-ported to LAME
;	 by Takehiro TOMINAGA Nov 2000

%include "nasm.h"
;-----------------------------------------------------------------------
; Exported symbol, constant tables and section setup.
; globaldef, segment_data, segment_code and PIC_OFFSETTABLE are macros
; that are not defined in this file (presumably they come from nasm.h).
;-----------------------------------------------------------------------
	globaldef fht_3DN

	segment_data
	align	16
; costab layout -- every row below is two dwords, i.e. one 64-bit operand
; as loaded by the routines in this file:
;   costab+0  : sign mask (low dword 0x80000000, high dword 0); XORing it
;               into a register flips the sign of the low float only
;   costab+8  : 1.414213562 (~sqrt(2)) in both lanes
;   costab+16 : four pairs of rows; each .do1 pass advances the table
;               pointer by 16 bytes, i.e. by one pair.  Every value is
;               duplicated into both lanes.
; Four pairs are provided, so only four .do1 passes stay inside the table
; (with the loop bounds used below that means nn <= 512).
costab	dd	0x80000000, 0
	dd	1.414213562,1.414213562
	dd	9.238795283293805e-01, 9.238795283293805e-01	; ~cos(pi/8)
	dd	3.826834424611044e-01, 3.826834424611044e-01	; ~sin(pi/8)
	dd	9.951847264044178e-01, 9.951847264044178e-01	; ~cos(pi/32)
	dd	9.801714304836734e-02, 9.801714304836734e-02	; ~sin(pi/32)
	dd	9.996988186794428e-01, 9.996988186794428e-01	; ~cos(pi/128)
	dd	2.454122920569705e-02, 2.454122920569705e-02	; ~sin(pi/128)
	dd	9.999811752815535e-01, 9.999811752815535e-01	; ~cos(pi/512)
	dd	6.135884819898878e-03, 6.135884819898878e-03	; ~sin(pi/512)
; 0.0 in the low dword, 1.0 in the high dword: used as a pfsub operand in
; the .for loops so that 1.0 is subtracted from the high lane only.
D_1_0_0_0	dd	0.0		, 1.0

	segment_code

PIC_OFFSETTABLE


;-----------------------------------------------------------------------
; void fht_3DN(float *fz, int nn);
;
; In-place transform of the float array fz using 3DNow! instructions
; (presumably a fast Hartley transform, going by the name).
;
; In:    [esp+40] = fz, [esp+44] = nn after the prologue below
;        (offsets assume pushd pushes four dwords -- confirm in nasm.h).
;        The routine processes the range fz .. fz + nn*8 bytes.
; Out:   fz is overwritten with the result; nothing is returned.
; Stack: 20 bytes of locals
;          [esp+0]  .. [esp+7]   saved mm0 twiddle pair (s2 | c2)
;          [esp+8]  .. [esp+15]  saved mm1 twiddle pair (-c2 | s2)
;          [esp+16]              end pointer fz + nn*8
; Regs:  r0..r5 are register aliases that are not defined in this file.
;          r0 = fi pointer          r1 = gi pointer (nn during setup)
;          r2 = 3*r4                r3 = current costab row
;          r4 = kx in bytes         r5 = i in bytes
;        ebp holds the PIC base; mm0-mm7 are all clobbered.
;
; Lane comments are written "high | low".  They assume that pmov, pmovd,
; pupldq and puphdq are aliases for movq, movd, punpckldq and punpckhdq
; -- TODO confirm against nasm.h.
;
; Every loop tests its condition at the bottom, so each body runs at
; least once.  The cmp that feeds each jb is issued early; the MMX/3DNow!
; instructions and lea between cmp and jb do not modify EFLAGS.
;-----------------------------------------------------------------------
proc	fht_3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			;locals, see header

	call	get_pc.bp
	add	ebp, PIC_BASE()

	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2

	pmov	mm7, [r3]		;mm7 = sign mask (0 | 0x80000000)

	loopalign 16
.do1:
	lea	r3, [r3+16]	;tri += 2;
	pmov	mm6, [PIC_EBP_REL(costab+8)]	;SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
	;f
	pmov	mm0, [r0]	;fi0
	pmov	mm1, [r0+r4*2]	;fi1
	pmov	mm2, [r0+r2*2]	;fi3
	pmov	mm3, [r0+r4*4]	;fi2

	pupldq	mm0, mm0	;fi0 | fi0
	pupldq	mm1, mm1	;fi1 | fi1
	pupldq	mm2, mm2	;fi3 | fi3
	pupldq	mm3, mm3	;fi2 | fi2

	pxor	mm1, mm7	;fi1 | -fi1
	pxor	mm3, mm7	;fi2 | -fi2

	pfsub	mm0, mm1	;fi0-fi1 | fi0+fi1
	pfsub	mm2, mm3	;fi3-fi2 | fi3+fi2

	pmov	mm4, mm0
	pfadd	mm0, mm2	;new fi[k3] | new fi[0]
	pfsub	mm4, mm2	;new fi[k1] | new fi[k2]

	pmovd	[r0], mm0	;fi[0]
	puphdq	mm0, mm0
	pmovd	[r0+r4*4], mm4	;fi[k2]
	puphdq	mm4, mm4

	pmovd	[r0+r4*2], mm4	;fi[k1]
	pmovd	[r0+r2*2], mm0	;fi[k3]
	lea	r0, [r0+r4*8]	;fi += k4

	;g
	pmov	mm0, [r1]	;gi0
	pmov	mm1, [r1+r4*2]	;gi1
	pmov	mm2, [r1+r4*4]	;gi2
	pmov	mm3, [r1+r2*2]	;gi3

	pupldq	mm1, mm1	;gi1 | gi1
	pupldq	mm0, mm0	;gi0 | gi0
	pupldq	mm2, mm3	;gi3 | gi2

	pxor	mm1, mm7	;gi1 | -gi1

	pfsub	mm0, mm1	;gi0-gi1|gi0+gi1 = g1 | g0
	pfmul	mm2, mm6	;gi3*SQRT2|gi2*SQRT2 = g3 | g2

	pmov	mm4, mm0
	pfadd	mm0, mm2	;g1+g3|g0+g2 = gi1 | gi0
	pfsub	mm4, mm2	;g1-g3|g0-g2 = gi3 | gi2

	pmovd	[r1], mm0	;gi[0]
	puphdq	mm0, mm0
	pmovd	[r1+r4*4], mm4	;gi[k2]
	puphdq	mm4, mm4

	cmp	r0, [esp + 16]	;fi < fn ?  (flags consumed by jb below)
	pmovd	[r1+r4*2], mm0	;gi[k1]
	pmovd	[r1+r2*2], mm4	;gi[k3]

	jb near .do2

	pmov	mm6, [r3+r5]	; this is not aligned address!!
				; (r5 = 4: straddles two table rows)

	loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = 0 | 0x80000000	(sign mask for the low lane)
;
	pmov	mm1, mm6
	mov	r0, [esp+40]	; fz
	puphdq	mm1, mm1	; c1 | c1
	lea	r1, [r0+r4*2]
	pfadd	mm1, mm1	; c1+c1 | c1+c1
	pfmul	mm1, mm6	; 2*c1*c1 | 2*c1*s1
	pfsub	mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

	pmov	mm0, mm1
	pxor	mm7, mm6	; c1 | -s1

	pupldq	mm2, mm0	; s2 | **
	pupldq	mm3, mm6	; s1 | **
	puphdq	mm0, mm2	; s2 | -c2
	puphdq	mm6, mm3	; s1 | c1

	pxor	mm0, [PIC_EBP_REL(costab)]	; s2 | c2

; NOTE(review): from here until the lane swap at the bottom of .for, the
; summary below and the comments inside .do3 label the mm6/mm7 lanes with
; c1 and s1 exchanged relative to the loop-top labelling above.
; mm0 =  s2| c2
; mm1 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	pmov	[esp], mm0	; .do3 reuses mm0/mm1 as scratch, so keep copies
	pmov	[esp+8], mm1

	sub	r1, r5		;r1 = gi
	add	r0, r5		;r0 = fi

	loopalign 16
.do3:
	pmov	mm2, [r0+r4*2] ; fi[k1]
	pmov	mm4, [r1+r4*2] ; gi[k1]
	pmov	mm3, [r0+r2*2] ; fi[k3]
	pmov	mm5, [r1+r2*2] ; gi[k3]

	pupldq	mm2, mm2	; fi1 | fi1
	pupldq	mm4, mm4	; gi1 | gi1
	pupldq	mm3, mm3	; fi3 | fi3
	pupldq	mm5, mm5	; gi3 | gi3

	pfmul	mm2, mm0	; s2 * fi1 | c2 * fi1
	pfmul	mm4, mm1	;-c2 * gi1 | s2 * gi1
	pfmul	mm3, mm0	; s2 * fi3 | c2 * fi3
	pfmul	mm5, mm1	;-c2 * gi3 | s2 * gi3

	pfadd	mm2, mm4		;b | a
	pfadd	mm3, mm5		;d | c

	pmov	mm0, [r0]
	pmov	mm4, [r1]
	pmov	mm1, [r0+r4*4]
	pmov	mm5, [r1+r4*4]

	pupldq	mm0, mm4		;gi0 | fi0
	pupldq	mm1, mm5		;gi2 | fi2

	pmov	mm4, mm2
	pmov	mm5, mm3

	pfadd	mm2, mm0		;g0 | f0
	pfadd	mm3, mm1		;g2 | f2

	pfsub	mm0, mm4		;g1 | f1
	pfsub	mm1, mm5		;g3 | f3

	pmov	mm4, mm3
	pmov	mm5, mm1

	pupldq	mm4, mm4		;f2 | f2
	puphdq	mm5, mm5		;g3 | g3
	puphdq	mm3, mm3		;g2 | g2
	pupldq	mm1, mm1		;f3 | f3

	pfmul	mm4, mm6		;f2 * c1 | f2 * s1
	pfmul	mm5, mm7		;g3 * s1 | g3 *-c1
	pfmul	mm3, mm6		;g2 * c1 | g2 * s1
	pfmul	mm1, mm7		;f3 * s1 | f3 *-c1

	pfadd	mm4, mm5		;a | b
	pfsub	mm3, mm1		;d | c

	pmov	mm5, mm2
	pmov	mm1, mm0

	pupldq	mm2, mm2		;f0 | f0
	pupldq	mm0, mm0		;f1 | f1

	puphdq	mm1, mm2		;f0 | g1
	puphdq	mm5, mm0		;f1 | g0

	pmov	mm2, mm4
	pmov	mm0, mm3

	pfadd	mm4, mm1		;fi0 | gi1
	pfadd	mm3, mm5		;fi1 | gi0
	pfsub	mm1, mm2		;fi2 | gi3
	pfsub	mm5, mm0		;fi3 | gi2

	pmovd	[r1+r4*2], mm4	;gi[k1]
	puphdq	mm4, mm4
	pmovd	[r1], mm3		;gi[0]
	puphdq	mm3, mm3
	pmovd	[r1+r2*2], mm1	;gi[k3]
	puphdq	mm1, mm1
	pmovd	[r1+r4*4], mm5	;gi[k2]
	puphdq	mm5, mm5

	pmovd	[r0], mm4	;fi[0]
	pmovd	[r0+r4*2], mm3	;fi[k1]
	pmovd	[r0+r4*4], mm1	;fi[k2]
	pmovd	[r0+r2*2], mm5	;fi[k3]

	lea	r0, [r0+r4*8]
	lea	r1, [r1+r4*8]
	cmp	r0, [esp + 16]	;fi < fn ?  (flags consumed by jb below)
	pmov	mm0, [esp]	;reload twiddles for the next iteration
	pmov	mm1, [esp+8]

	jb near	.do3

	add	r5, 4		;i++
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
	pfmul	mm6, [r3]	; c1*a | s1*a
	pfmul	mm7, [r3+8]	; s1*b |-c1*b
	cmp	r5, r4		;i < kx ?  (flags consumed by jb below)

	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pupldq	mm7,mm6
	puphdq	mm6,mm7		; swap the two lanes of mm6 (mm7 is scratch)
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the sign mask
	jb near	.for

	mov	r0, [esp+40]	;fi
	cmp	r4, [esp+40+4]	;kx (bytes) < nn ?  lea below keeps the flags
	lea	r4, [r4*4]	;kx *= 4

	jb near	.do1
.exitttt:
	femms			; leave MMX/3DNow! state before returning
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc


;-----------------------------------------------------------------------
; void fht_E3DN(float *fz, int nn);
;
; Same in-place transform as fht_3DN, but written with the pfpnacc and
; pswapd instructions (Extended 3DNow!) in addition to the base 3DNow!
; set.
;
; In:    [esp+40] = fz, [esp+44] = nn after the prologue below
;        (offsets assume pushd pushes four dwords -- confirm in nasm.h).
;        The routine processes the range fz .. fz + nn*8 bytes.
; Out:   fz is overwritten with the result; nothing is returned.
; Stack: 20 bytes of locals
;          [esp+0]  .. [esp+7]   saved mm4 twiddle pair (s2 | c2)
;          [esp+8]  .. [esp+15]  saved mm5 twiddle pair (-c2 | s2)
;          [esp+16]              end pointer fz + nn*8
; Regs:  r0..r5 are register aliases that are not defined in this file.
;          r0 = fi pointer          r1 = gi pointer (nn during setup)
;          r2 = 3*r4                r3 = current costab row
;          r4 = kx in bytes         r5 = i in bytes
;        ebp holds the PIC base; mm0-mm7 are all clobbered.
;
; Lane comments are written "high | low".  They assume that pmov, pmovd,
; pupldq and puphdq are aliases for movq, movd, punpckldq and punpckhdq
; -- TODO confirm against nasm.h.
;
; Every loop tests its condition at the bottom, so each body runs at
; least once.  The cmp that feeds each jb is issued early; the MMX/3DNow!
; instructions and lea between cmp and jb do not modify EFLAGS.
;-----------------------------------------------------------------------
proc	fht_E3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			;locals, see header

	call	get_pc.bp
	add	ebp, PIC_BASE()

	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2

	pmov	mm7, [r3]		;mm7 = sign mask (0 | 0x80000000)

	loopalign 16
.do1:
	lea	r3, [r3+16]	;tri += 2;
	pmov	mm6, [PIC_EBP_REL(costab+8)]	;SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
;f
	pmov	mm0, [r0]	; X  | fi0
	pmov	mm1, [r0+r4*4]	; X  | fi2
	pupldq	mm0, [r0+r4*2]	;fi1 | fi0
	pupldq	mm1, [r0+r2*2]	;fi3 | fi2
	pfpnacc	mm0, mm0	;fi0+fi1 | fi0-fi1 = f0|f1
	pfpnacc	mm1, mm1	;fi2+fi3 | fi2-fi3 = f2|f3

	pmov	mm2, mm0
	pfadd	mm0, mm1	;f0+f2|f1+f3 = fi0 | fi1
	pfsub	mm2, mm1	;f0-f2|f1-f3 = fi2 | fi3

	pmovd	[r0+r4*2], mm0	;fi[k1]
	pmovd	[r0+r2*2], mm2	;fi[k3]

	puphdq	mm0, mm0
	puphdq	mm2, mm2
	pmovd	[r0], mm0	;fi[0]
	pmovd	[r0+r4*4], mm2	;fi[k2]

	lea	r0, [r0+r4*8]	;fi += k4
;g
	pmov	mm3, [r1]	;    gi0
	pmov	mm4, [r1+r2*2]	;    gi3
	pupldq	mm3, [r1+r4*2]	;gi1|gi0
	pupldq	mm4, [r1+r4*4]	;gi2|gi3

	pfpnacc	mm3, mm3	;gi0+gi1  |gi0-gi1   = f0|f1
	pfmul	mm4, mm6	;gi2*SQRT2|gi3*SQRT2 = f2|f3

	pmov	mm5, mm3
	pfadd	mm3, mm4	;f0+f2|f1+f3
	pfsub	mm5, mm4	;f0-f2|f1-f3

	cmp	r0, [esp + 16]	;fi < fn ?  (flags consumed by jb below)
	pmovd	[r1+r4*2], mm3	;gi[k1]
	pmovd	[r1+r2*2], mm5	;gi[k3]
	puphdq	mm3, mm3
	puphdq	mm5, mm5
	pmovd	[r1], mm3	;gi[0]
	pmovd	[r1+r4*4], mm5	;gi[k2]

	jb near .do2

	pmov	mm6, [r3+r5]	; this is not aligned address!!
				; (r5 = 4: straddles two table rows)

	loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = 0 | 0x80000000	(sign mask for the low lane)
;
	pmov	mm5, mm6
	mov	r0, [esp+40]	; fz
	puphdq	mm5, mm5	; c1 | c1
	lea	r1, [r0+r4*2]
	pfadd	mm5, mm5	; c1+c1 | c1+c1
	pfmul	mm5, mm6	; 2*c1*c1 | 2*c1*s1
	pfsub	mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

	pswapd	mm4, mm5	; s2 |-c2
	pxor	mm4, mm7	; s2 | c2
	pxor	mm7, mm6	; c1 |-s1
	pswapd	mm6, mm6	; s1 | c1

; NOTE(review): from here until the pswapd at the bottom of .for, the
; summary below and the comments inside .do3 label the mm6/mm7 lanes with
; c1 and s1 exchanged relative to the per-instruction comments above.
; mm4 =  s2| c2
; mm5 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	pmov	[esp], mm4	; .do3 reuses mm4/mm5 as scratch, so keep copies
	pmov	[esp+8], mm5

	sub	r1, r5		;r1 = gi
	add	r0, r5		;r0 = fi

	loopalign 16
.do3:
	pmov	mm0, [r0+r2*2] ; fi[k3]
	pmov	mm2, [r1+r2*2] ; gi[k3]
	pmov	mm1, [r0+r4*2] ; fi[k1]
	pmov	mm3, [r1+r4*2] ; gi[k1]

	pupldq	mm0, mm0	; fi3 | fi3
	pupldq	mm2, mm2	; gi3 | gi3
	pupldq	mm1, mm1	; fi1 | fi1
	pupldq	mm3, mm3	; gi1 | gi1

	pfmul	mm0, mm4	; s2 * fi3 | c2 * fi3
	pfmul	mm2, mm5	;-c2 * gi3 | s2 * gi3
	pfmul	mm1, mm4	; s2 * fi1 | c2 * fi1
	pfmul	mm3, mm5	;-c2 * gi1 | s2 * gi1

	pfadd	mm0, mm2		;d | c
	pfadd	mm1, mm3		;b | a

	pmov	mm2, [r0+r4*4]		;fi2
	pupldq	mm3, [r1+r4*4]		;gi2 | -
	pmov	mm4, [r0]		;fi0
	pupldq	mm5, [r1]		;gi0 | -

	pupldq	mm2, mm0		;c | fi2
	puphdq	mm3, mm0		;d | gi2
	pupldq	mm4, mm1		;a | fi0
	puphdq	mm5, mm1		;b | gi0

	pfpnacc	mm2, mm2		;f2 | f3
	pfpnacc	mm3, mm3		;g2 | g3
	pfpnacc	mm4, mm4		;f0 | f1
	pfpnacc	mm5, mm5		;g0 | g1

	pmov	mm0, mm2
	pmov	mm1, mm3
	pupldq	mm2, mm2		;f3 | f3
	pupldq	mm3, mm3		;g3 | g3
	puphdq	mm0, mm0		;f2 | f2
	puphdq	mm1, mm1		;g2 | g2

	pswapd	mm4, mm4		;f1 | f0
	pswapd	mm5, mm5		;g1 | g0

	pfmul	mm0, mm7		;f2 * s1 | f2 *-c1
	pfmul	mm3, mm6		;g3 * c1 | g3 * s1
	pfmul	mm1, mm6		;g2 * c1 | g2 * s1
	pfmul	mm2, mm7		;f3 * s1 | f3 *-c1

	pfsub	mm0, mm3		; b |-a
	pfsub	mm1, mm2		; d | c

	pmov	mm2, mm5
	pmov	mm3, mm4
	pupldq	mm4, mm0		;-a | f0
	pupldq	mm5, mm1		; c | g0
	puphdq	mm2, mm0		; b | g1
	puphdq	mm3, mm1		; d | f1

	pfpnacc	mm4, mm4		;fi2 | fi0
	pfpnacc	mm5, mm5		;gi0 | gi2
	pfpnacc	mm2, mm2		;gi1 | gi3
	pfpnacc	mm3, mm3		;fi1 | fi3

	pmovd	[r0], mm4		;fi[0]
	pmovd	[r1+r4*4], mm5		;gi[k2]
	pmovd	[r1+r2*2], mm2		;gi[k3]
	pmovd	[r0+r2*2], mm3		;fi[k3]

	puphdq	mm4, mm4
	puphdq	mm5, mm5
	puphdq	mm2, mm2
	puphdq	mm3, mm3
	pmovd	[r0+r4*4], mm4		;fi[k2]
	pmovd	[r1], mm5		;gi[0]
	pmovd	[r1+r4*2], mm2		;gi[k1]
	pmovd	[r0+r4*2], mm3		;fi[k1]

	lea	r0, [r0+r4*8]
	lea	r1, [r1+r4*8]
	cmp	r0, [esp + 16]	;fi < fn ?  (flags consumed by jb below)
	pmov	mm4, [esp]	;reload twiddles for the next iteration
	pmov	mm5, [esp+8]

	jb near	.do3

	add	r5, 4		;i++
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
	pfmul	mm6, [r3]	; c1*a | s1*a
	pfmul	mm7, [r3+8]	; s1*b |-c1*b
	cmp	r5, r4		;i < kx ?  (flags consumed by jb below)

	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pswapd	mm6, mm6 ; ???	; s1*a+c1*b | c1*a-s1*b
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the sign mask
	jb near	.for

	mov	r0, [esp+40]	;fi
	cmp	r4, [esp+40+4]	;kx (bytes) < nn ?  lea below keeps the flags
	lea	r4, [r4*4]	;kx *= 4

	jb near	.do1
.exitttt:
	femms			; leave MMX/3DNow! state before returning
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc