Blame mpn/x86/divrem_1.asm

Packit 5c3484
dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
Packit 5c3484
Packit 5c3484
dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C       cycles/limb
Packit 5c3484
C 486   approx 43 maybe
Packit 5c3484
C P5        44
Packit 5c3484
C P6        39
Packit 5c3484
C P6MMX     39
Packit 5c3484
C K6        22
Packit 5c3484
C K7        42
Packit 5c3484
C P4        58
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
Packit 5c3484
C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
Packit 5c3484
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
Packit 5c3484
C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
Packit 5c3484
C                          mp_limb_t carry);
Packit 5c3484
C
Packit 5c3484
C Divide src,size by divisor and store the quotient in dst+xsize,size.
Packit 5c3484
C Extend the division to fractional quotient limbs in dst,xsize.  Return the
Packit 5c3484
C remainder.  Either or both xsize and size can be 0.
Packit 5c3484
C
Packit 5c3484
C mpn_divrem_1c takes a carry parameter which is an initial high limb,
Packit 5c3484
C effectively one extra limb at the top of src,size.  Must have
Packit 5c3484
C carry < divisor, so that the first divl cannot overflow.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Essentially the code is the same as the division based part of
Packit 5c3484
C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
Packit 5c3484
C instruction even when gcc is not being used (when longlong.h only has the
Packit 5c3484
C rather slow generic C udiv_qrnnd()).
Packit 5c3484
C
Packit 5c3484
C A test is done to see if the high limb is less than the divisor, and if so
Packit 5c3484
C one less div is done.  A div is between 20 and 40 cycles on the various
Packit 5c3484
C x86s, so assuming high<divisor about half the time, then this test saves
Packit 5c3484
C half that amount.  The branch misprediction penalty on each chip is less
Packit 5c3484
C than half a div.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Notes for P5:
Packit 5c3484
C
Packit 5c3484
C It might be thought that moving the load down to pair with the store would
Packit 5c3484
C save 1 cycle, but that doesn't seem to happen in practice, and in any case
Packit 5c3484
C would be a mere 2.2% saving, so it's hardly worth bothering about.
Packit 5c3484
C
Packit 5c3484
C A mul-by-inverse might be a possibility for P5, as done in
Packit 5c3484
C mpn/x86/pentium/mod_1.asm.  The number of auxiliary instructions required
Packit 5c3484
C is a hindrance, but there could be a 10-15% speedup available.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Notes for K6:
Packit 5c3484
C
Packit 5c3484
C K6 has its own version of this code, using loop and paying attention to
Packit 5c3484
C cache line boundary crossings.  The target 20 c/l can be had with the
Packit 5c3484
C decl+jnz of the present code by pairing up the load and store in the
Packit 5c3484
C loops.  But it's considered easier not to introduce complexity just for
Packit 5c3484
C that, but instead let k6 have its own code.
Packit 5c3484
C
Packit 5c3484
Packit 5c3484
C Stack offsets of the arguments, in the order of the C prototypes above
C (dst, xsize, src, size, divisor, carry), 4 bytes apart starting at 4.
C PARAM_CARRY is only read by mpn_divrem_1c.
C NOTE(review): defframe is defined outside this file; presumably each name
C expands to its offset adjusted by the current FRAME value, which is why
C every pushl below is followed by FRAME_pushl() -- confirm in the m4 defs.
defframe(PARAM_CARRY,  24)
Packit 5c3484
defframe(PARAM_DIVISOR,20)
Packit 5c3484
defframe(PARAM_SIZE,   16)
Packit 5c3484
defframe(PARAM_SRC,    12)
Packit 5c3484
defframe(PARAM_XSIZE,  8)
Packit 5c3484
defframe(PARAM_DST,    4)
Packit 5c3484
Packit 5c3484
C Both entry points below are emitted into the text section, with the
C first one aligned by ALIGN(16).
	TEXT
Packit 5c3484
	ALIGN(16)
Packit 5c3484
Packit 5c3484
C mpn_divrem_1c entry point: division with an initial carry limb.
C
C Saves edi, esi, ebx and ebp, loads the arguments into the register layout
C expected by the loops in mpn_divrem_1 below (ecx=size, edi=src,
C esi=divisor, ebx=dst, ebp=xsize), puts the carry in edx as the starting
C remainder, and then jumps into that shared code.  It never returns from
C here; the shared L(done) code pops the four registers and returns.
PROLOGUE(mpn_divrem_1c)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	pushl	%edi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %edi
Packit 5c3484
	pushl	%esi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DIVISOR, %esi
Packit 5c3484
	pushl	%ebx		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %ebx
Packit 5c3484
	pushl	%ebp		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_XSIZE, %ebp
Packit 5c3484
	C Set ZF if size is zero.  The movl that follows does not change
	C the flags, so the jz after it still tests this result.
	orl	%ecx, %ecx
Packit 5c3484
Packit 5c3484
	C edx = carry, used as the high half of the dividend by the first
	C divl in either loop.
	movl	PARAM_CARRY, %edx
Packit 5c3484
	C size==0: no integer limbs, go straight to the fraction limbs
	C (ebx is still plain dst there, and L(fraction) reloads it anyway).
	jz	L(fraction)
Packit 5c3484
Packit 5c3484
	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
Packit 5c3484
	C Enter the integer loop at its top, so the high src limb is loaded
	C and divided with carry as the incoming remainder.
	jmp	L(integer_top)
Packit 5c3484
Packit 5c3484
EPILOGUE()
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mpn_divrem_1 entry point: division with no initial carry, so the running
C remainder in edx starts at zero.
C
C Three phases, shared with mpn_divrem_1c:
C   L(integer_top)   one divl per src limb, from the high limb down, with
C                    quotient limbs stored at dst+xsize
C   L(fraction_top)  xsize further divl steps with zero low limbs, with
C                    quotient limbs stored at dst
C   L(done)          restore registers and return the remainder in eax
C size==0 is handled separately at L(size_zero).
PROLOGUE(mpn_divrem_1)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	pushl	%edi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %edi
Packit 5c3484
	pushl	%esi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DIVISOR, %esi
Packit 5c3484
	C Set ZF if size is zero.
	orl	%ecx,%ecx
Packit 5c3484
Packit 5c3484
	C Taken with only edi and esi pushed; L(size_zero) pops just those.
	jz	L(size_zero)
Packit 5c3484
	pushl	%ebx		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	-4(%edi,%ecx,4), %eax	C src high limb
Packit 5c3484
	C Zero initial remainder; also the value stored as the high quotient
	C limb if the first div is skipped below.
	xorl	%edx, %edx
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %ebx
Packit 5c3484
	pushl	%ebp		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_XSIZE, %ebp
Packit 5c3484
	C Compare high limb with divisor.  The leal that follows does not
	C change the flags, so the jae after it still tests this compare.
	cmpl	%esi, %eax
Packit 5c3484
Packit 5c3484
	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
Packit 5c3484
	C high>=divisor: the high limb must be divided, and eax already
	C holds it, so enter the loop just past its load.
	jae	L(integer_entry)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C high<divisor, so high of dst is zero, and avoid one div
Packit 5c3484
Packit 5c3484
	C Store the zero high quotient limb (edx is still 0 here).
	movl	%edx, (%ebx,%ecx,4)
Packit 5c3484
	decl	%ecx
Packit 5c3484
Packit 5c3484
	C The high limb becomes the remainder going into the next limb.
	C movl leaves the flags from decl intact for the jz below.
	movl	%eax, %edx
Packit 5c3484
	C size was 1: no integer limbs left to divide.
	jz	L(fraction)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(integer_top):
Packit 5c3484
	C eax	scratch (quotient)
Packit 5c3484
	C ebx	dst+4*xsize-4
Packit 5c3484
	C ecx	counter
Packit 5c3484
	C edx	scratch (remainder)
Packit 5c3484
	C esi	divisor
Packit 5c3484
	C edi	src
Packit 5c3484
	C ebp	xsize
Packit 5c3484
Packit 5c3484
	C Next src limb, working from the high limb downwards.
	movl	-4(%edi,%ecx,4), %eax
Packit 5c3484
L(integer_entry):
Packit 5c3484
Packit 5c3484
	C edx:eax / divisor, leaving quotient in eax and remainder in edx.
	C The remainder is the high half of the next step's dividend.
	divl	%esi
Packit 5c3484
Packit 5c3484
	C Quotient limb to dst[xsize+ecx-1].
	movl	%eax, (%ebx,%ecx,4)
Packit 5c3484
	decl	%ecx
Packit 5c3484
	jnz	L(integer_top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(fraction):
Packit 5c3484
	C ecx is zero on every path reaching here, so this sets ecx=xsize
	C and sets ZF if there are no fraction limbs to produce.
	orl	%ebp, %ecx
Packit 5c3484
	jz	L(done)
Packit 5c3484
Packit 5c3484
	C Fraction limbs go at the bottom of dst, so reload plain dst.
	movl	PARAM_DST, %ebx
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(fraction_top):
Packit 5c3484
	C eax	scratch (quotient)
Packit 5c3484
	C ebx	dst
Packit 5c3484
	C ecx	counter
Packit 5c3484
	C edx	scratch (remainder)
Packit 5c3484
	C esi	divisor
Packit 5c3484
	C edi
Packit 5c3484
	C ebp
Packit 5c3484
Packit 5c3484
	C Low half of the dividend is a zero limb below the end of src.
	xorl	%eax, %eax
Packit 5c3484
Packit 5c3484
	divl	%esi
Packit 5c3484
Packit 5c3484
	C Quotient limb to dst[ecx-1].
	movl	%eax, -4(%ebx,%ecx,4)
Packit 5c3484
	decl	%ecx
Packit 5c3484
	jnz	L(fraction_top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done):
Packit 5c3484
	C Restore the four saved registers in reverse push order and return
	C the final remainder in eax.
	popl	%ebp
Packit 5c3484
	movl	%edx, %eax
Packit 5c3484
	popl	%ebx
Packit 5c3484
	popl	%esi
Packit 5c3484
	popl	%edi
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(size_zero):
Packit 5c3484
	C size==0 from mpn_divrem_1: no src limbs and no carry, so every
	C fraction quotient limb and the remainder are zero.  Only edi and
	C esi were pushed before the branch here, hence FRAME is 8.
deflit(`FRAME',8)
Packit 5c3484
	C ecx = xsize is the rep count for the fill below.
	movl	PARAM_XSIZE, %ecx
Packit 5c3484
	C eax = 0 is both the fill value and the return value.
	xorl	%eax, %eax
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %edi
Packit 5c3484
Packit 5c3484
	cld	C better safe than sorry, see mpn/x86/README
Packit 5c3484
Packit 5c3484
	C Store xsize zero limbs starting at dst.
	rep
Packit 5c3484
	stosl
Packit 5c3484
Packit 5c3484
	popl	%esi
Packit 5c3484
	popl	%edi
Packit 5c3484
	ret
Packit 5c3484
EPILOGUE()