|
Packit |
5c3484 |
dnl AMD K6 mpn_divrem_1 -- mpn by limb division.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C K6: 20 cycles/limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
|
|
Packit |
5c3484 |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
|
|
Packit |
5c3484 |
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
|
|
Packit |
5c3484 |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
|
|
Packit |
5c3484 |
C mp_limb_t carry);
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
|
|
Packit |
5c3484 |
C instead of decl+jnz, since it comes out 2 cycles/limb faster.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C A test is done to see if the high limb is less than the divisor, and if so
|
|
Packit |
5c3484 |
C one less div is done. A div is 20 cycles, so assuming high<divisor about
|
|
Packit |
5c3484 |
C half the time, then this test saves half that amount. The branch
|
|
Packit |
5c3484 |
C misprediction penalty is less than that.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C Back-to-back div instructions run at 20 cycles, the same as the loop here,
|
|
Packit |
5c3484 |
C so it seems there's nothing to gain by rearranging the loop. Pairing the
|
|
Packit |
5c3484 |
C mov and loop instructions was found to gain nothing.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C Enhancements:
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
|
|
Packit |
5c3484 |
C that algorithm has been found to suffer from the relatively poor carry
|
|
Packit |
5c3484 |
C handling on K6 and too many auxiliary instructions. The fractional part
|
|
Packit |
5c3484 |
C however could be done at about 13 c/l, if it mattered enough.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Stack offsets of the arguments, in the order of the C prototypes: dst is
C the first argument and carry is the last.  PARAM_CARRY is read only by
C mpn_divrem_1c.
C NOTE(review): defframe comes from the included m4 definitions; presumably
C each name expands to an esp-relative operand adjusted by FRAME -- confirm.

defframe(PARAM_CARRY,  24)
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

	TEXT
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mpn_divrem_1c -- as mpn_divrem_1, but starting from a caller-supplied carry.
C
C Loads the arguments into the registers the shared loops expect, puts the
C carry in edx as the initial remainder, and then joins the code inside
C mpn_divrem_1 at L(integer_top), or at L(fraction) when size is zero.
C The four registers pushed here are restored by the pops at L(done).
C
C NOTE(review): divl raises a divide error if the quotient does not fit in
C one limb, so this presumably relies on carry < divisor -- confirm against
C the callers.

	ALIGN(32)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %edi
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_DST, %ebx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_XSIZE, %ebp
	orl	%ecx, %ecx		C size, sets ZF for the jz below

	movl	PARAM_CARRY, %edx	C initial remainder, mov keeps the flags
	jz	L(fraction)		C if size==0

	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
	jmp	L(integer_top)

EPILOGUE()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mpn_divrem_1 -- divide the size limbs at src by divisor.
C
C The size integer quotient limbs are stored at dst+4*xsize upwards, the
C xsize fraction quotient limbs are stored at dst upwards, and the final
C remainder is returned in eax.
C
C The labels L(integer_top) and L(fraction) are also entered from
C mpn_divrem_1c, which arrives with the same register layout and with all
C four of edi, esi, ebx and ebp pushed.

	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %edi
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	orl	%ecx,%ecx		C size

	jz	L(size_zero)		C only edi and esi are pushed so far
	pushl	%ebx		FRAME_pushl()

	movl	-4(%edi,%ecx,4), %eax	C src high limb
	xorl	%edx, %edx		C initial remainder is zero

	movl	PARAM_DST, %ebx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_XSIZE, %ebp
	cmpl	%esi, %eax		C high limb against divisor

	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
	jae	L(integer_entry)	C leal keeps the flags from the cmpl


	C high limb < divisor, so the top quotient limb is zero and one div
	C is avoided: the high limb simply becomes the starting remainder.

	movl	%edx, (%ebx,%ecx,4)	C store zero as the top quotient limb
	decl	%ecx

	movl	%eax, %edx		C mov keeps ZF from the decl
	jz	L(fraction)		C that was the only integer limb


L(integer_top):
	C eax	scratch (quotient)
	C ebx	dst+4*xsize-4
	C ecx	counter
	C edx	scratch (remainder)
	C esi	divisor
	C edi	src
	C ebp	xsize

	movl	-4(%edi,%ecx,4), %eax
L(integer_entry):

	divl	%esi

	movl	%eax, (%ebx,%ecx,4)
	loop	L(integer_top)


L(fraction):
	C ecx is zero on every path reaching here, so the orl both copies
	C xsize into the counter and tests it for zero.
	orl	%ebp, %ecx
	jz	L(done)

	movl	PARAM_DST, %ebx


L(fraction_top):
	C eax	scratch (quotient)
	C ebx	dst
	C ecx	counter
	C edx	scratch (remainder)
	C esi	divisor
	C edi
	C ebp

	xorl	%eax, %eax		C low limb of each fraction dividend is zero

	divl	%esi

	movl	%eax, -4(%ebx,%ecx,4)
	loop	L(fraction_top)


L(done):
	popl	%ebp
	movl	%edx, %eax		C return the remainder
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


L(size_zero):
	C size==0: store xsize zero limbs at dst and return zero.  Only two
	C registers have been pushed at this point, hence FRAME is 8.
deflit(`FRAME',8)
	movl	PARAM_XSIZE, %ecx
	xorl	%eax, %eax

	movl	PARAM_DST, %edi

	cld	C better safe than sorry, see mpn/x86/README

	rep
	stosl

	popl	%esi
	popl	%edi
	ret
EPILOGUE()
|