Blame libmp3lame/i386/fftsse.nas

Packit 47f805
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
Packit 47f805
Packit 47f805
; GOGO-no-coda
Packit 47f805
;	Copyright (C) 1999 shigeo
Packit 47f805
;	special thanks to Keiichi SAKAI
Packit 47f805
 
Packit 47f805
%include "nasm.h"
Packit 47f805
Packit 47f805
	globaldef fht_SSE
Packit 47f805
Packit 47f805
	segment_data
	align 16

; Constants used by fht_SSE. Lane comments in this file list the four
; packed floats from the highest lane down to lane 0, so the first dword
; emitted here is the rightmost lane in those comments.
;
; The three 16-byte constants are used as memory operands of xorps/subps
; and therefore have to stay 16-byte aligned; they directly follow the
; "align 16" above and each is exactly 16 bytes long.

; Sign-bit mask: xorps with this negates lanes 3 and 2 (Minus,Minus,Plus,Plus).
Q_MMPP	dd	0, 0, 0x80000000, 0x80000000
; Sign-bit mask: xorps with this negates lanes 3 and 1 (Minus,Plus,Minus,Plus).
Q_MPMP	dd	0, 0x80000000, 0, 0x80000000
; 1.0 in lanes 3 and 2, 0.0 in lanes 1 and 0.
D_1100	dd	0.0, 0.0, 1.0, 1.0

; Twiddle table: one (cos, sin) pair per outer pass of fht_SSE, which
; advances its table pointer by 8 bytes after each pass.
costab_fft:
	dd	9.238795325112867e-01, 3.826834323650898e-01	; cos, sin of pi/8
	dd	9.951847266721969e-01, 9.801714032956060e-02	; cos, sin of pi/32
	dd	9.996988186962042e-01, 2.454122852291229e-02	; cos, sin of pi/128
	dd	9.999811752836011e-01, 6.135884649154475e-03	; cos, sin of pi/512

; sqrt(2) as a single float, used by the x87 butterflies.
S_SQRT2	dd	1.414213562
Packit 47f805
Packit 47f805
	segment_code
Packit 47f805
Packit 47f805
PIC_OFFSETTABLE
Packit 47f805
Packit 47f805
;------------------------------------------------------------------------
;	by K. SAKAI
;	99/08/18	PIII 23k[clk]
;	99/08/19	reordered instructions: PIII 22k[clk]
;	99/08/20	ported the bit reversal from the old GOGO: PIII 17k[clk]
;	99/08/23	partially unrolled: PIII 14k[clk]
;	99/11/12	clean up
;
;void fht_SSE(float *fz, int n);
;
; Runs the butterfly passes of the transform in place over the float
; array at fz (presumably a fast Hartley transform, going by the name).
; The 2-wide butterflies use the x87 FPU, the rotation passes use SSE.
;
; ABI:   32-bit, both arguments on the stack.
; In:    fz = start of the data, n = length parameter. The end pointer is
;        fn = fz + n*8 bytes, i.e. 2*n floats are processed.
; Out:   no register result; the data at fz is rewritten in place.
; Saved: ebx, esi, edi, ebp (pushed on entry, popped on exit).
; Clobb: eax, ecx, edx, xmm0-xmm7, flags. The x87 stack is left balanced.
; Align: the data is only touched with movss/movlps/movhps and x87
;        loads/stores, so fz needs no 16-byte alignment. The constants
;        Q_MMPP, Q_MPMP and D_1100 are used as xorps/subps memory operands
;        and must be 16-byte aligned.
;
; Register roles inside the loops:
;   eax = byte offset of k1/2 elements, so [p+eax*2] = p[k1],
;         [p+eax*4] = p[k2], [p+edx*2] = p[k3], p+eax*8 = p+k4
;   edx = 3*eax
;   ecx = current (cos, sin) pair in costab_fft
;   ebp = PIC base for PIC_EBP_REL()
;   [esp] = fn (end of data)
;
; Lane comments list packed floats from the highest lane down to lane 0.
;
; NOTE(review): costab_fft holds four (cos, sin) pairs and ecx advances by
; one pair per outer pass, so a fifth outer pass would read past the
; table - confirm callers never pass an n large enough to need one.
	align 16
fht_SSE:
	push	ebx
	push	esi
	push	edi
	push	ebp

; Byte offset from esp to the first argument right after the four pushes
; above (4 saved registers + return address = 5 dwords).
%assign _P 4*5

	; second loop
	mov	eax,[esp+_P+0]	;eax=fz
	mov	ebp,[esp+_P+4]	;=n
	shl	ebp,3
	add	ebp,eax		; fn = fz + n*8 bytes, invariant until the function returns
	push	ebp		; keep fn at [esp]; the arguments are now at
				; [esp+_P+4] (fz) and [esp+_P+8] (n)

	call	get_pc.bp
	add	ebp, PIC_BASE()

	lea	ecx,[PIC_EBP_REL(costab_fft)]
	xor	eax,eax
	mov	al,8		; eax = byte offset of k1/2 elements (k1 = 4 on the
				; first pass); multiplied by 4 after every pass
.lp2:				; do{
	mov	esi,[esp+_P+4]	; esi=fi=fz
	lea	edx,[eax+eax*2]	; edx = 3*eax, so [p+edx*2] addresses p[k3]
	mov	ebx, esi

; Where at most 2-way parallelism can be expected, the FPU is faster.
	loopalign	16
.lp20:				; do{
;                       f0     = fi[0 ] + fi[k1];
;                       f2     = fi[k2] + fi[k3];
;                       f1     = fi[0 ] - fi[k1];
;                       f3     = fi[k2] - fi[k3];
;                       fi[0 ] = f0     + f2;
;                       fi[k1] = f1     + f3;
;                       fi[k2] = f0     - f2;
;                       fi[k3] = f1     - f3;
	lea	edi,[ebx+eax]	; edi=gi=fi+k1/2
	fld	dword [ebx]
	fadd	dword [ebx+eax*2]	; f0
	fld	dword [ebx+eax*4]
	fadd	dword [ebx+edx*2]	; f2

	fld	dword [ebx]
	fsub	dword [ebx+eax*2]	; f1
	fld	dword [ebx+eax*4]
	fsub	dword [ebx+edx*2]	; f3; x87 stack is now f3, f1, f2, f0

	fld	st1
	fadd	st0,st1
	fstp	dword [ebx+eax*2]	; fi[k1] = f1 + f3
	fsubp	st1,st0
	fstp	dword [ebx+edx*2]	; fi[k3] = f1 - f3

	fld	st1
	fadd	st0,st1
	fstp	dword [ebx]		; fi[0 ] = f0 + f2
	fsubp	st1,st0
	fstp	dword [ebx+eax*4]	; fi[k2] = f0 - f2

	lea	ebx,[ebx + eax*8]	; = fi += (k1 * 4);
;                       g0     = gi[0 ] + gi[k1];
;                       g2     = SQRT2  * gi[k2];
;                       g1     = gi[0 ] - gi[k1];
;                       g3     = SQRT2  * gi[k3];
;                       gi[0 ] = g0     + g2;
;                       gi[k2] = g0     - g2;
;                       gi[k1] = g1     + g3;
;                       gi[k3] = g1     - g3;
	fld	dword [edi]
	fadd	dword [edi+eax*2]	; g0
	fld	dword [PIC_EBP_REL(S_SQRT2)]
	fmul	dword [edi+eax*4]	; g2

	fld	dword [edi]
	fsub	dword [edi+eax*2]	; g1
	fld	dword [PIC_EBP_REL(S_SQRT2)]
	fmul	dword [edi+edx*2]	; g3; x87 stack is now g3, g1, g2, g0

	fld	st1
	fadd	st0,st1
	fstp	dword [edi+eax*2]	; gi[k1] = g1 + g3
	fsubp	st1,st0
	fstp	dword [edi+edx*2]	; gi[k3] = g1 - g3

	fld	st1
	fadd	st0,st1
	fstp	dword [edi]		; gi[0 ] = g0 + g2
	fsubp	st1,st0
	fstp	dword [edi+eax*4]	; gi[k2] = g0 - g2

	; Pointer comparison, hence the unsigned condition: a signed jl would
	; end the loop early for a buffer that straddles address 0x80000000.
	cmp	ebx,[esp]
	jb	near .lp20		; while (fi < fn);


;               i = 1; // first iteration of: for (i=1; i<k1/2; i++)
;                       c1 = 1.0*t_c - 0.0*t_s;
;                       s1 = 0.0*t_c + 1.0*t_s;
	movlps	xmm6,[ecx] ; = { --,  --,  s1, c1}
	movaps	xmm7,xmm6

	shufps	xmm6,xmm6,R4(0,1,1,0)	; = {+c1, +s1, +s1, +c1} -> needed
;                       c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
	shufps	xmm7,xmm7,R4(1,0,0,1)
	movss	xmm5,xmm7		; = { --,  --,  --, s1}
	xorps	xmm7,[PIC_EBP_REL(Q_MMPP)]	; = {-s1, -c1, +c1, +s1} -> needed

	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
	add	esi,4		; esi = fi = fz + i
	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
	subps	xmm5,[PIC_EBP_REL(D_1100)]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
	movaps	xmm4,xmm5
	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> needed

	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]		; = {--, c2, --, s2}
	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> needed

	loopalign	16
.lp21:				; do{
;                               a       = c2*fi[k1] + s2*gi[k1];
;                               b       = s2*fi[k1] - c2*gi[k1];
;                               c       = c2*fi[k3] + s2*gi[k3];
;                               d       = s2*fi[k3] - c2*gi[k3];
;                               f0      = fi[0 ]        + a;
;                               g0      = gi[0 ]        + b;
;                               f2      = fi[k1 * 2]    + c;
;                               g2      = gi[k1 * 2]    + d;
;                               f1      = fi[0 ]        - a;
;                               g1      = gi[0 ]        - b;
;                               f3      = fi[k1 * 2]    - c;
;                               g3      = gi[k1 * 2]    - d;
	lea	edi,[esi + eax*2 - 8]	; edi = gi = fz +k1-i

	movss	xmm0,[esi + eax*2]	; = fi[k1]
	movss	xmm2,[esi + edx*2]	; = fi[k3]
	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}
	movss	xmm1,[edi + eax*2]	; = gi[k1]
	movss	xmm3,[edi + edx*2]	; = gi[k3]
	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}
	movss	xmm2,[esi]		; = fi[0]
	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}
	movss	xmm3,[esi + eax*4]	; = fi[k2]
	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}
	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}
	movss	xmm3,[edi + eax*4]	; = gi[k2]
	addps	xmm0,xmm1		; = {d, c, b, a}
	movss	xmm1,[edi]		; = gi[0]
	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}
	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}
	movaps	xmm1,xmm2
	addps	xmm1,xmm0	; = {g2, f2, g0, f0}
	subps	xmm2,xmm0	; = {g3, f3, g1, f1}

;                               a       = c1*f2     + s1*g3;
;                               c       = s1*g2     + c1*f3;
;                               b       = s1*f2     - c1*g3;
;                               d       = c1*g2     - s1*f3;
;                               fi[0 ]  = f0        + a;
;                               gi[0 ]  = g0        + c;
;                               gi[k1]  = g1        + b;
;                               fi[k1]  = f1        + d;
;                               fi[k1 * 2]  = f0    - a;
;                               gi[k1 * 2]  = g0    - c;
;                               gi[k3]      = g1    - b;
;                               fi[k3]      = f1    - d;
	movaps	xmm3,xmm1
	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}
	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}
	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}
	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}
	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}
	addps	xmm1,xmm2	; = {d, b, c, a}
	movaps	xmm2,xmm3
	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}
	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
	; Scatter the eight results one float at a time: movss stores lane 0,
	; movhlps brings lanes 3:2 down, shufps 0x55 broadcasts lane 1.
	movhlps	xmm0,xmm3
	movss	[esi],xmm3
	shufps	xmm3,xmm3,0x55
	movss	[edi+eax*2],xmm0
	shufps	xmm0,xmm0,0x55
	movss	[edi],xmm3
	movss	[esi+eax*2],xmm0
	movhlps	xmm0,xmm2
	movss	[esi+eax*4],xmm2
	shufps	xmm2,xmm2,0x55
	movss	[edi+edx*2],xmm0
	shufps	xmm0,xmm0,0x55
	movss	[edi+eax*4],xmm2
	movss	[esi+edx*2],xmm0
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	; Pointer comparison: unsigned condition (see .lp20).
	cmp	esi,[esp]
	jb	near .lp21		; while (fi < fn);


; The do loop before unrolling is 43+4 instructions.

; The for loop (not the innermost loop) is unrolled from i=2 onwards.
; kx=   2,   8,  32,  128
; k4=  16,  64, 256, 1024
;       0, 6/2,30/2,126/2

	xor	ebx,ebx
	mov	bl, 4*2		; ebx = byte offset of i = 2
	cmp	ebx,eax		; i < k1/2 (both as byte offsets; signed is fine
				; here, these are small integers)
	jnl	near .F22
;               for (i=2; i<k1/2; i+=2){
	loopalign	16
.lp22:
; at here, xmm6 is {c3, s3, s3, c3}
;                       c1 = c3*t_c - s3*t_s;
;                       s1 = c3*t_s + s3*t_c;
	movlps	xmm0,[ecx]
	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
	xorps	xmm4,[PIC_EBP_REL(Q_MPMP)]	; = {--,    --,   -c3*ts, s3*ts}
	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}

;                       c3 = c1*t_c - s1*t_s;
;                       s3 = s1*t_c + c1*t_s;
	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
	movhlps	xmm3,xmm0
	xorps	xmm3,[PIC_EBP_REL(Q_MPMP)]
	subps	xmm0,xmm3	; = {--, --, s3, c3}

; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
	unpcklps	xmm6,xmm0	; xmm6 = {s3, s1, c3, c1}
	movaps	xmm7, xmm6
	shufps	xmm6,xmm6,R4(2,3,1,0)	; xmm6 = {s1, s3, c3, c1}
	addps	xmm7, xmm7		; {s3*2, s1*2,   --,   --}
	mov	edi,[esp+_P+4]		; = fz
	shufps	xmm7, xmm7, R4(2,3,3,2)	; {s1*2, s3*2, s3*2, s1*2}
	sub	edi,ebx			; edi = fz - i
	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
	lea	esi,[edi + ebx*2]	; esi = fi = fz + i
	subps	xmm7, [PIC_EBP_REL(D_1100)]		; {-c2, -c4, s4, s2}
	lea	edi,[edi + eax*2-4]	; edi = &gi[-1], with gi = fz + k1 - i

;                       fi = fz +i;
;                       gi = fz +k1-i;
;                       do{
.lp220:
; The do loop after unrolling is 51+4 instructions.
; Each 8-byte access at esi covers fi[0], fi[+1]; each 8-byte access at
; edi covers gi[-1], gi[0].
;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
;                               d       = s2*fi[k3  ] - c2*gi[k3  ];

	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]	; = { c2  c4  s4  s2}
	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
	movlps	xmm0,[esi+eax*2]
	movlps	xmm1,[edi+eax*2]
	movlps	xmm2,[esi+edx*2]
	movlps	xmm3,[edi+edx*2]
	shufps	xmm0,xmm0,0x14
	shufps	xmm1,xmm1,0x41
	shufps	xmm2,xmm2,0x14
	shufps	xmm3,xmm3,0x41
	mulps	xmm0,xmm4
	mulps	xmm1,xmm7
	mulps	xmm2,xmm4
	mulps	xmm3,xmm7
	addps	xmm0,xmm1	; xmm0 = {b, f, e, a}
	addps	xmm2,xmm3	; xmm2 = {d, h, g, c}
;17

;                               f0      = fi[0   ]    + a;
;                               f4      = fi[0 +1]    + e;
;                               g4      = gi[0 -1]    + f;
;                               g0      = gi[0   ]    + b;
;                               f1      = fi[0   ]    - a;
;                               f5      = fi[0 +1]    - e;
;                               g5      = gi[0 -1]    - f;
;                               g1      = gi[0   ]    - b;
;                               f2      = fi[k2  ]    + c;
;                               f6      = fi[k2+1]    + g;
;                               g6      = gi[k2-1]    + h;
;                               g2      = gi[k2  ]    + d;
;                               f3      = fi[k2  ]    - c;
;                               f7      = fi[k2+1]    - g;
;                               g7      = gi[k2-1]    - h;
;                               g3      = gi[k2  ]    - d;
	movlps	xmm1,[esi      ]
	movhps	xmm1,[edi      ]
	movaps	xmm4,xmm1
	subps	xmm1,xmm0	; xmm1 = {g1, g5, f5, f1}
	movlps	xmm3,[esi+eax*4]
	movhps	xmm3,[edi+eax*4]
	movaps	xmm5,xmm3
	subps	xmm3,xmm2	; xmm3 = {g3, g7, f7, f3}
	addps	xmm0,xmm4	; xmm0 = {g0, g4, f4, f0}
	addps	xmm2,xmm5	; xmm2 = {g2, g6, f6, f2}
;10

;                               a       = c1*f2     + s1*g3;	forward*forward + reversed*reversed
;                               e       = c3*f6     + s3*g7;
;                               g       = s3*g6     + c3*f7;
;                               c       = s1*g2     + c1*f3;
;                               d       = c1*g2     - s1*f3;	forward*reversed - reversed*forward
;                               h       = c3*g6     - s3*f7;
;                               f       = s3*f6     - c3*g7;
;                               b       = s1*f2     - c1*g3;

	movaps	xmm5,xmm6	; xmm6 = {s1, s3, c3, c1}
	shufps	xmm5,xmm5,0x1B	; = {c1, c3, s3, s1}
	movaps	xmm4,xmm2
	mulps	xmm4,xmm6
	shufps	xmm2,xmm2,0x1B	; xmm2 = {f2, f6, g6, g2}
	mulps	xmm2,xmm6
	mulps	xmm5,xmm3
	mulps	xmm3,xmm6
	shufps	xmm3,xmm3,0x1B
	addps	xmm4,xmm3	; = {c, g, e, a}
	subps	xmm2,xmm5	; = {b, f, h, d}
;10

;                               fi[0   ]  = f0        + a;
;                               fi[0 +1]  = f4        + e;
;                               gi[0 -1]  = g4        + g;
;                               gi[0   ]  = g0        + c;
;                               fi[k2  ]  = f0        - a;
;                               fi[k2+1]  = f4        - e;
;                               gi[k2-1]  = g4        - g;
;                               gi[k2  ]  = g0        - c;
;                               fi[k1  ]  = f1        + d;
;                               fi[k1+1]  = f5        + h;
;                               gi[k1-1]  = g5        + f;
;                               gi[k1  ]  = g1        + b;
;                               fi[k3  ]  = f1        - d;
;                               fi[k3+1]  = f5        - h;
;                               gi[k3-1]  = g5        - f;
;                               gi[k3  ]  = g1        - b;
	movaps	xmm3,xmm0
	subps	xmm0,xmm4
	movlps	[esi+eax*4],xmm0
	movhps	[edi+eax*4],xmm0
	addps	xmm4,xmm3
	movlps	[esi      ],xmm4
	movhps	[edi      ],xmm4

	movaps	xmm5,xmm1
	subps	xmm1,xmm2
	movlps	[esi+edx*2],xmm1
	movhps	[edi+edx*2],xmm1
	addps	xmm2,xmm5
	movlps	[esi+eax*2],xmm2
	movhps	[edi+eax*2],xmm2
; 14
;                               gi     += k4;
;                               fi     += k4;
	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	; Pointer comparison: unsigned condition (see .lp20).
	cmp	esi,[esp]
	jb	near .lp220		; while (fi < fn);
;                       } while (fi < fn);

	add	ebx,byte 2*4	; i += 2
	cmp	ebx,eax		; i < k1/2 (both as byte offsets)
	; shufps leaves the flags alone, so the jl below still tests the cmp.
	shufps	xmm6,xmm6,R4(1,2,2,1)	; (--,s3,c3,--) => {c3, s3, s3, c3}
	jl	near .lp22
;               }
.F22:
	shl	eax,2		; next pass: strides are 4x larger
	add	ecx, byte 8	; next (cos, sin) pair in costab_fft
	cmp	eax,[esp+_P+8]	; run another pass while eax <= n (integer compare)
	jle	near .lp2
	pop	ebp		; discard the saved fn
	pop	ebp		; restore the caller's ebp
	pop	edi
	pop	esi
	pop	ebx
	ret
Packit 47f805
Packit 47f805
	end