Blame mpn/x86/mul_basecase.asm

Packit 5c3484
dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
Packit 5c3484
dnl  in a third limb vector.
Packit 5c3484
Packit 5c3484
dnl  Copyright 1996-2002 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C     cycles/crossproduct
Packit 5c3484
C P5	  15
Packit 5c3484
C P6	   7.5
Packit 5c3484
C K6	  12.5
Packit 5c3484
C K7	   5.5
Packit 5c3484
C P4	  24
Packit 5c3484
Packit 5c3484
Packit 5c3484
C void mpn_mul_basecase (mp_ptr wp,
Packit 5c3484
C                        mp_srcptr xp, mp_size_t xsize,
Packit 5c3484
C                        mp_srcptr yp, mp_size_t ysize);
Packit 5c3484
C
Packit 5c3484
C This was written in haste since the Pentium optimized code that was used
C for all x86 machines was slow for the Pentium II.  This code would benefit
C from some cleanup.
Packit 5c3484
C
Packit 5c3484
C To shave off some percentage of the run-time, one should make 4 variants
C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
C part of the function, but since it is not very large, that would be
C acceptable.
Packit 5c3484
C
Packit 5c3484
C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
C unknown.
Packit 5c3484
Packit 5c3484
C Incoming arguments.  The offsets are 4 bytes apart and follow the order of
C the C prototype above: wp, xp, xsize, yp, ysize.
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

C Local stack slots used by the function:
C   VAR_MULTIPLIER  holds the y limb that the current outer pass multiplies by
C   VAR_COUNTER     holds the number of outer passes still to run
C VAR_STACK_SPACE is the byte count the function subtracts from esp on entry
C to make room for these two slots.
defframe(VAR_MULTIPLIER, -4)
defframe(VAR_COUNTER,    -8)
deflit(VAR_STACK_SPACE,  8)

	TEXT
	ALIGN(8)
Packit 5c3484
Packit 5c3484
PROLOGUE(mpn_mul_basecase)
C Schoolbook multiplication of two limb vectors:
C
C	{wp, xsize+ysize} = {xp, xsize} * {yp, ysize}
C
C Structure:
C   1. L(oopM)   wp[0 .. xsize] = xp[] * yp[0]		(plain store)
C   2. L(outer)  for each further y limb yp[i], i = 1 .. ysize-1:
C		   wp[i .. i+xsize-1] += xp[] * yp[i], carry out to wp[i+xsize]
C		 L(oop0) handles the first xsize mod 4 limbs one at a time,
C		 L(oopX) handles the remaining limbs four per iteration.
C
C Register use:
C   esi  cursor into xp		edi  cursor into wp
C   ebp  yp during the first pass; second carry limb inside L(oopX)
C   ebx  carry limb		ecx  inner loop counter
C   eax, edx  operand and result of mull
C
C esi, ebp, edi and ebx are saved on entry and restored before returning.
C The early exit at L(done) assumes that xsize = 1 implies ysize = 1, i.e.
C that the caller passes xsize >= ysize >= 1.
C The PARAM_YP argument slot is overwritten: it is advanced by one limb per
C outer pass.
deflit(`FRAME',0)

	C Reserve the local slots, then save the registers used below.
	subl	$VAR_STACK_SPACE,%esp
	pushl	%esi
	pushl	%ebp
	pushl	%edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

	movl	PARAM_XP,%esi
	movl	PARAM_WP,%edi
	movl	PARAM_YP,%ebp

	movl	(%esi),%eax		C load xp[0]
	mull	(%ebp)			C multiply by yp[0]
	movl	%eax,(%edi)		C store to wp[0]
	movl	PARAM_XSIZE,%ecx	C xsize
	decl	%ecx			C If xsize = 1, ysize = 1 too
	jz	L(done)

	C ebx is only needed from here on, so L(done) does not pop it.
	pushl	%ebx
FRAME_pushl()
	movl	%edx,%ebx		C carry limb = high half of xp[0]*yp[0]

	leal	4(%esi),%esi
	leal	4(%edi),%edi

	C First pass: wp[j] = low(xp[j]*yp[0]) + carry, for j = 1 .. xsize-1.
	C ecx = limbs still to process, always > 0 on entry.
L(oopM):
	movl	(%esi),%eax		C load next limb at xp[j]
	leal	4(%esi),%esi
	mull	(%ebp)
	addl	%ebx,%eax		C low half + carry limb
	movl	%edx,%ebx
	adcl	$0,%ebx			C new carry limb = high half + carry flag
	movl	%eax,(%edi)
	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oopM)

	movl	%ebx,(%edi)		C most significant limb of product
	addl	$4,%edi			C increment wp

	C Rewind by xsize limbs: esi is back at xp[0], edi is left at wp[1].
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	PARAM_YSIZE,%eax	C ysize
	decl	%eax
	jz	L(skip)
	movl	%eax,VAR_COUNTER	C outer passes to run = ysize-1

	C One pass per remaining y limb.  yp is reloaded from its stack slot
	C because L(oopX) uses ebp as a scratch carry register.
L(outer):
	movl	PARAM_YP,%ebp		C yp
	addl	$4,%ebp			C make ebp point to next v limb
	movl	%ebp,PARAM_YP
	movl	(%ebp),%eax		C copy y limb ...
	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
	movl	PARAM_XSIZE,%ecx

	xorl	%ebx,%ebx		C carry limb = 0
	andl	$3,%ecx			C ecx = xsize mod 4
	jz	L(end0)

	C Leading xsize mod 4 limbs, one per iteration.
L(oop0):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	leal	4(%esi),%esi
	addl	%ebx,%eax		C low half + carry limb
	movl	$0,%ebx			C movl, not xorl: the carry flag must survive
	adcl	%ebx,%edx		C high half + carry flag
	addl	%eax,(%edi)
	adcl	%edx,%ebx		C propagate carry into cylimb

	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oop0)

L(end0):
	movl	PARAM_XSIZE,%ecx
	shrl	$2,%ecx			C ecx = number of 4-limb groups
	jz	L(endX)

	C Remaining limbs, four per iteration.  ebx and ebp alternate as the
	C carry limb; each is cleared with movl so that the carry flag from
	C the preceding addl/adcl is still available to the following adcl.
	ALIGN(8)
L(oopX):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%eax,%ebx
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	4(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	movl	8(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebp,4(%edi)
	adcl	%eax,%ebx	C new lo + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	12(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,8(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	addl	%ebp,12(%edi)
	adcl	$0,%ebx		C propagate carry into cylimb

	leal	16(%esi),%esi
	leal	16(%edi),%edi
	decl	%ecx
	jnz	L(oopX)

L(endX):
	movl	%ebx,(%edi)		C store the final carry limb of this pass
	addl	$4,%edi

	C we incremented wp and xp in the loop above; compensate
	C (esi returns to xp[0], edi ends one limb above where this pass began)
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	VAR_COUNTER,%eax
	decl	%eax
	movl	%eax,VAR_COUNTER
	jnz	L(outer)

	C Normal exit: four saved registers, then the local slots.
L(skip):
	popl	%ebx
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C must match the subl in the prologue
	ret

	C Early exit for xsize = 1: edx still holds the high half of
	C xp[0]*yp[0], and ebx was never pushed on this path.
L(done):
	movl	%edx,4(%edi)	   C store to wp[1]
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C must match the subl in the prologue
	ret

EPILOGUE()