Blame mpn/x86/k6/divrem_1.asm

Packit 5c3484
dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
Packit 5c3484
Packit 5c3484
dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C K6: 20 cycles/limb
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
Packit 5c3484
C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
Packit 5c3484
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
Packit 5c3484
C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
Packit 5c3484
C                          mp_limb_t carry);
Packit 5c3484
C
Packit 5c3484
C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
Packit 5c3484
C instead of decl+jnz, since it comes out 2 cycles/limb faster.
Packit 5c3484
C
Packit 5c3484
C A test is done to see if the high limb is less than the divisor, and if so
Packit 5c3484
C one less div is done.  A div is 20 cycles, so assuming this happens about
Packit 5c3484
C half the time, then this test saves half that amount.  The branch
Packit 5c3484
C misprediction penalty is less than that.
Packit 5c3484
C
Packit 5c3484
C Back-to-back div instructions run at 20 cycles, the same as the loop here,
Packit 5c3484
C so it seems there's nothing to gain by rearranging the loop.  Pairing the
Packit 5c3484
C mov and loop instructions was found to gain nothing.
Packit 5c3484
C
Packit 5c3484
C Enhancements:
Packit 5c3484
C
Packit 5c3484
C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
Packit 5c3484
C that algorithm has been found to suffer from the relatively poor carry
Packit 5c3484
C handling on K6 and too many auxiliary instructions.  The fractional part
Packit 5c3484
C however could be done at about 13 c/l, if it mattered enough.
Packit 5c3484
Packit 5c3484
C Stack argument offsets, listed from the last argument to the first.  They
C step by 4 in the parameter order of the prototypes above: dst, xsize, src,
C size, divisor, carry.  PARAM_CARRY is read only by mpn_divrem_1c.
defframe(PARAM_CARRY,  24)
Packit 5c3484
defframe(PARAM_DIVISOR,20)
Packit 5c3484
defframe(PARAM_SIZE,   16)
Packit 5c3484
defframe(PARAM_SRC,    12)
Packit 5c3484
defframe(PARAM_XSIZE,  8)
Packit 5c3484
defframe(PARAM_DST,    4)
Packit 5c3484
Packit 5c3484
	TEXT
Packit 5c3484
Packit 5c3484
	ALIGN(32)
Packit 5c3484
C mpn_divrem_1c: entry point taking an initial carry.  It loads the
C arguments, pushes edi, esi, ebx and ebp (the same four registers that are
C popped at the done label), puts the carry in edx as the starting
C remainder, and then joins the code of mpn_divrem_1 below: at integer_top
C normally, or at fraction when size is zero.
PROLOGUE(mpn_divrem_1c)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	C Argument loads are interleaved with the register saves.
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	pushl	%edi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %edi
Packit 5c3484
	pushl	%esi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DIVISOR, %esi
Packit 5c3484
	pushl	%ebx		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %ebx
Packit 5c3484
	pushl	%ebp		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_XSIZE, %ebp
Packit 5c3484
	C Set ZF from size.  The movl that follows leaves the flags alone, so
	C the jz below still tests size.
	orl	%ecx, %ecx		C size
Packit 5c3484
Packit 5c3484
	C The carry becomes the high half of the first dividend.
	movl	PARAM_CARRY, %edx
Packit 5c3484
	jz	L(fraction)		C if size==0
Packit 5c3484
Packit 5c3484
	C ebx = dst + 4*xsize - 4, as the integer loop expects.
	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
Packit 5c3484
	C No high limb test here, every limb is divided.
	jmp	L(integer_top)
Packit 5c3484
Packit 5c3484
EPILOGUE()
Packit 5c3484
Packit 5c3484
Packit 5c3484
	ALIGN(16)
Packit 5c3484
C mpn_divrem_1: divide the size limbs at src by the single limb divisor,
C then develop xsize further quotient limbs below the integer part (the
C fraction).  Fraction quotient limbs are stored at dst[0] to dst[xsize-1]
C and integer quotient limbs directly above them.  The final remainder is
C returned in eax.
C
C The integer_top, fraction and done code below is shared with
C mpn_divrem_1c, which jumps in with the same registers set up.
PROLOGUE(mpn_divrem_1)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	pushl	%edi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %edi
Packit 5c3484
	pushl	%esi		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_DIVISOR, %esi
Packit 5c3484
	orl	%ecx,%ecx		C size
Packit 5c3484
Packit 5c3484
	C size==0 is handled separately.  Only edi and esi have been pushed
	C at this point, which is what size_zero pops.
	jz	L(size_zero)
Packit 5c3484
	pushl	%ebx		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	-4(%edi,%ecx,4), %eax	C src high limb
Packit 5c3484
	C Initial remainder is zero.
	xorl	%edx, %edx
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %ebx
Packit 5c3484
	pushl	%ebp		FRAME_pushl()
Packit 5c3484
Packit 5c3484
	movl	PARAM_XSIZE, %ebp
Packit 5c3484
	C Compare the high limb with the divisor.  The leal that follows does
	C not change the flags, so the jae below acts on this comparison.
	cmpl	%esi, %eax
Packit 5c3484
Packit 5c3484
	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
Packit 5c3484
	C High limb not below the divisor (unsigned): divide it like any other
	C limb, entering the loop with eax already loaded and edx zero.
	jae	L(integer_entry)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C high limb below divisor: top quotient limb is zero, and one div is saved
Packit 5c3484
Packit 5c3484
	C edx is still zero here, so this stores a zero top quotient limb.
	movl	%edx, (%ebx,%ecx,4)
Packit 5c3484
	decl	%ecx
Packit 5c3484
Packit 5c3484
	C The high limb becomes the starting remainder.  movl keeps the flags
	C from the decl, so the jz tests whether any limbs are left.
	movl	%eax, %edx
Packit 5c3484
	jz	L(fraction)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(integer_top):
Packit 5c3484
	C eax	scratch (quotient)
Packit 5c3484
	C ebx	dst+4*xsize-4
Packit 5c3484
	C ecx	counter
Packit 5c3484
	C edx	scratch (remainder)
Packit 5c3484
	C esi	divisor
Packit 5c3484
	C edi	src
Packit 5c3484
	C ebp	xsize
Packit 5c3484
Packit 5c3484
	C Next source limb, working from the high end down: src[ecx-1].
	movl	-4(%edi,%ecx,4), %eax
Packit 5c3484
L(integer_entry):
Packit 5c3484
Packit 5c3484
	C Divide edx:eax by the divisor, quotient to eax, remainder to edx.
	C The remainder is carried in edx into the next iteration.
	divl	%esi
Packit 5c3484
Packit 5c3484
	C Quotient limb goes to dst[xsize+ecx-1].
	movl	%eax, (%ebx,%ecx,4)
Packit 5c3484
	C loop decrements ecx and branches while it is non-zero.
	loop	L(integer_top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(fraction):
Packit 5c3484
	C ecx is zero on every path reaching here, so this sets ecx to xsize
	C and sets ZF when there are no fraction limbs to produce.
	orl	%ebp, %ecx
Packit 5c3484
	jz	L(done)
Packit 5c3484
Packit 5c3484
	C Reload the unadjusted dst, since ebx was offset for the integer part.
	movl	PARAM_DST, %ebx
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(fraction_top):
Packit 5c3484
	C eax	scratch (quotient)
Packit 5c3484
	C ebx	dst
Packit 5c3484
	C ecx	counter
Packit 5c3484
	C edx	scratch (remainder)
Packit 5c3484
	C esi	divisor
Packit 5c3484
	C edi
Packit 5c3484
	C ebp
Packit 5c3484
Packit 5c3484
	C The low half of each fraction dividend is zero.
	xorl	%eax, %eax
Packit 5c3484
Packit 5c3484
	divl	%esi
Packit 5c3484
Packit 5c3484
	C Quotient limb goes to dst[ecx-1].
	movl	%eax, -4(%ebx,%ecx,4)
Packit 5c3484
	loop	L(fraction_top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done):
Packit 5c3484
	C Restore the four saved registers in reverse push order, and return
	C the final remainder in eax.
	popl	%ebp
Packit 5c3484
	movl	%edx, %eax
Packit 5c3484
	popl	%ebx
Packit 5c3484
	popl	%esi
Packit 5c3484
	popl	%edi
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(size_zero):
Packit 5c3484
	C Reached from mpn_divrem_1 with size==0 and only two registers
	C pushed, hence FRAME is 8.  Stores xsize zero limbs at dst and
	C returns zero.
deflit(`FRAME',8)
Packit 5c3484
	movl	PARAM_XSIZE, %ecx
Packit 5c3484
	C eax is both the fill value for stosl and the return value.
	xorl	%eax, %eax
Packit 5c3484
Packit 5c3484
	movl	PARAM_DST, %edi
Packit 5c3484
Packit 5c3484
	cld	C better safe than sorry, see mpn/x86/README
Packit 5c3484
Packit 5c3484
	C Store eax to successive limbs at edi, ecx times.
	rep
Packit 5c3484
	stosl
Packit 5c3484
Packit 5c3484
	popl	%esi
Packit 5c3484
	popl	%edi
Packit 5c3484
	ret
Packit 5c3484
EPILOGUE()