|
Packit |
5c3484 |
dnl AMD64 mpn_mod_1_1p
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/limb
|
|
Packit |
5c3484 |
C AMD K8,K9 6
|
|
Packit |
5c3484 |
C AMD K10 6
|
|
Packit |
5c3484 |
C Intel P4 26
|
|
Packit |
5c3484 |
C Intel core2 12.5
|
|
Packit |
5c3484 |
C Intel NHM 11.3
|
|
Packit |
5c3484 |
C Intel SBR 8.4 (slowdown, old code took 8.0)
|
|
Packit |
5c3484 |
C Intel atom 26
|
|
Packit |
5c3484 |
C VIA nano 13
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Register assignments used by mpn_mod_1_1p.
C
C B2mb and B2modb are loop constants: B2modb is loaded from pre[3] and
C B2mb is set to B2modb - b just before the main loop.
define(`B2mb', `%r10')
|
|
Packit |
5c3484 |
define(`B2modb', `%r11')
|
|
Packit |
5c3484 |
C ap and n are used in the registers listed for them under the prototype
C comment further down.
define(`ap', `%rdi')
|
|
Packit |
5c3484 |
define(`n', `%rsi')
|
|
Packit |
5c3484 |
C pre is copied out of %rcx at function entry, because %rcx doubles as r2
C and as the shift count.
define(`pre', `%r8')
|
|
Packit |
5c3484 |
C b is copied out of %rdx at function entry, because mul overwrites %rdx.
define(`b', `%rbx')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Running remainder: r0 is the low limb, r1 lives in %rax, and r2 is the
C carry bit held as a mask (0 or all ones, produced by sbb r2, r2).
define(`r0', `%rbp') C r1 kept in %rax
|
|
Packit |
5c3484 |
define(`r2', `%rcx') C kept negated. Also used as shift count
|
|
Packit |
5c3484 |
C t0 is a scratch register.
define(`t0', `%r9')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mp_limb_t
|
|
Packit |
5c3484 |
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
|
|
Packit |
5c3484 |
C %rdi %rsi %rdx %rcx
|
|
Packit |
5c3484 |
C The pre array contains bi, cnt, B1modb, B2modb
|
|
Packit |
5c3484 |
C Note: This implementation needs B1modb only when cnt > 0
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C The iteration is almost as follows,
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u (mod b)
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C where r2 is a single bit represented as a mask. But to make sure that the
|
|
Packit |
5c3484 |
C result fits in two limbs and a bit, carry from the addition
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C r_0 + r_2 B2modb
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C is handled specially. On carry, we subtract b to cancel the carry,
|
|
Packit |
5c3484 |
C and we use instead the value
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C r_0 + B2mb (mod B)
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C This addition can be issued early since it doesn't depend on r2, and it is
|
|
Packit |
5c3484 |
C the source of the cmov in the loop.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C NOTE(review): the macros below come from the included config.m4, which is
C not visible here.  Presumably ABI_SUPPORT declares the calling conventions
C this file handles (the code uses both IFSTD and IFDOS) -- confirm against
C the macro definitions.
ABI_SUPPORT(DOS64)
|
|
Packit |
5c3484 |
ABI_SUPPORT(STD64)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Start of the assembly output; the code goes into the text section.
ASM_START()
|
|
Packit |
5c3484 |
TEXT
|
|
Packit |
5c3484 |
C Presumably aligns the entry point of mpn_mod_1_1p to 16 bytes.
ALIGN(16)
|
|
Packit |
5c3484 |
C mpn_mod_1_1p -- fold the n-limb operand at ap down to a two-limb value
C using B2modb (and B1modb when cnt > 0), then divide that value by b with
C the precomputed inverse bi and return the remainder in %rax.
C
C The pre array is read at offsets 0 (bi), 8 (cnt), 16 (B1modb), 24 (B2modb).
C The n < 3 path loads ap[n-2], so n >= 2 is assumed.
C NOTE(review): the final remainder is shifted right by cnt, so b appears to
C be expected already shifted left by cnt -- confirm against the callers.
PROLOGUE(mpn_mod_1_1p)
|
|
Packit |
5c3484 |
FUNC_ENTRY(4)
|
|
Packit |
5c3484 |
C Save the callee-saved registers that hold r0 (%rbp) and b (%rbx).
push %rbp
|
|
Packit |
5c3484 |
push %rbx
|
|
Packit |
5c3484 |
C Move b and pre out of %rdx and %rcx: mul overwrites %rdx, and %rcx is
C reused as r2 and as the shift count.
mov %rdx, b
|
|
Packit |
5c3484 |
mov %rcx, pre
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C %rax (r1) = ap[n-1], the most significant limb.
mov -8(ap, n, 8), %rax
|
|
Packit |
5c3484 |
C n >= 3 (unsigned) takes the folding path at L(first).
cmp $3, n
|
|
Packit |
5c3484 |
jnc L(first)
|
|
Packit |
5c3484 |
C n < 3: r0 = ap[n-2]; r1:r0 is already the two-limb value to reduce.
mov -16(ap, n, 8), r0
|
|
Packit |
5c3484 |
jmp L(reduce_two)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(first):
|
|
Packit |
5c3484 |
C First iteration, no r2
|
|
Packit |
5c3484 |
C B2modb = pre[3].
mov 24(pre), B2modb
|
|
Packit |
5c3484 |
C %rdx:%rax = ap[n-1] * B2modb.
mul B2modb
|
|
Packit |
5c3484 |
C r0 = ap[n-3] + low product.
mov -24(ap, n, 8), r0
|
|
Packit |
5c3484 |
add %rax, r0
|
|
Packit |
5c3484 |
C r1 = ap[n-2] + high product + carry.
mov -16(ap, n, 8), %rax
|
|
Packit |
5c3484 |
adc %rdx, %rax
|
|
Packit |
5c3484 |
C r2 = 0 - carry: all ones if the addition carried out, else zero.
sbb r2, r2
|
|
Packit |
5c3484 |
C n -= 4, so that (ap, n, 8) addresses the next unread limb.  A borrow
C means n was 3 and no limbs are left for the loop.
sub $4, n
|
|
Packit |
5c3484 |
jc L(reduce_three)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C B2mb = B2modb - b, the addend used when r0 + r2 B2modb carries.
mov B2modb, B2mb
|
|
Packit |
5c3484 |
sub b, B2mb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |
C Main loop, one limb per iteration.  r2 becomes B2modb or 0.
L(top): and B2modb, r2
|
|
Packit |
5c3484 |
C t0 = r0 + B2mb (mod B); lea leaves the flags alone.
lea (B2mb, r0), t0
|
|
Packit |
5c3484 |
C %rdx:%rax = r1 * B2modb.
mul B2modb
|
|
Packit |
5c3484 |
C r2 = r0 + r2 B2modb; the carry flag records overflow.
add r0, r2
|
|
Packit |
5c3484 |
C Load the next limb; mov does not change the carry flag.
mov (ap, n, 8), r0
|
|
Packit |
5c3484 |
C On carry, use r0 + B2mb instead (see the file header).
cmovc t0, r2
|
|
Packit |
5c3484 |
C New r0 = limb + low product.
add %rax, r0
|
|
Packit |
5c3484 |
C New r1 = adjusted middle limb + high product + carry.
mov r2, %rax
|
|
Packit |
5c3484 |
adc %rdx, %rax
|
|
Packit |
5c3484 |
C New r2 = 0 - carry.
sbb r2, r2
|
|
Packit |
5c3484 |
C Step to the next lower limb; the borrow after index 0 ends the loop.
sub $1, n
|
|
Packit |
5c3484 |
jnc L(top)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(reduce_three):
|
|
Packit |
5c3484 |
C Eliminate r2
|
|
Packit |
5c3484 |
C r2 is 0 or all ones, so this subtracts b from r1 exactly when r2 is set.
and b, r2
|
|
Packit |
5c3484 |
sub r2, %rax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(reduce_two):
|
|
Packit |
5c3484 |
C %ecx = cnt (pre[1]); cnt == 0 takes the normalized path.
mov 8(pre), R32(%rcx)
|
|
Packit |
5c3484 |
test R32(%rcx), R32(%rcx)
|
|
Packit |
5c3484 |
jz L(normalized)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Unnormalized, use B1modb to reduce to size < B (b+1)
|
|
Packit |
5c3484 |
C %rdx:%rax = r1 * B1modb (pre[2]).
mulq 16(pre)
|
|
Packit |
5c3484 |
C r0 += low product; new r1 = high product + carry.
xor t0, t0
|
|
Packit |
5c3484 |
add %rax, r0
|
|
Packit |
5c3484 |
adc %rdx, t0
|
|
Packit |
5c3484 |
mov t0, %rax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Left-shift to normalize
|
|
Packit |
5c3484 |
C Both variants compute %rax = (%rax << cnt) | (r0 >> (64 - cnt)).  The
C SHLD_SLOW variant avoids shld, negating the count for the right shift
C and then restoring it.
ifdef(`SHLD_SLOW',`
|
|
Packit |
5c3484 |
shl R8(%rcx), %rax
|
|
Packit |
5c3484 |
mov r0, t0
|
|
Packit |
5c3484 |
neg R32(%rcx)
|
|
Packit |
5c3484 |
shr R8(%rcx), t0
|
|
Packit |
5c3484 |
or t0, %rax
|
|
Packit |
5c3484 |
neg R32(%rcx)
|
|
Packit |
5c3484 |
',`
|
|
Packit |
5c3484 |
shld R8(%rcx), r0, %rax
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
C r0 <<= cnt completes the two-limb left shift.
shl R8(%rcx), r0
|
|
Packit |
5c3484 |
jmp L(udiv)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(normalized):
|
|
Packit |
5c3484 |
C cnt == 0: if r1 >= b, replace r1 by r1 - b.
mov %rax, t0
|
|
Packit |
5c3484 |
sub b, t0
|
|
Packit |
5c3484 |
cmovnc t0, %rax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Divide r1:r0 by b using the inverse bi = pre[0]; only the remainder is
C kept.
L(udiv):
|
|
Packit |
5c3484 |
C t0 = r1 + 1.
lea 1(%rax), t0
|
|
Packit |
5c3484 |
C %rdx:%rax = r1 * bi.
mulq (pre)
|
|
Packit |
5c3484 |
C %rdx:%rax += (r1 + 1):r0; %rdx is the quotient estimate and %rax its
C low limb.
add r0, %rax
|
|
Packit |
5c3484 |
adc t0, %rdx
|
|
Packit |
5c3484 |
C r0 = r0 - quotient estimate * b (mod B), the candidate remainder.
imul b, %rdx
|
|
Packit |
5c3484 |
sub %rdx, r0
|
|
Packit |
5c3484 |
C If the candidate exceeds the low quotient limb, add b back; otherwise
C keep it.  lea does not disturb the flags set by cmp.
cmp r0, %rax
|
|
Packit |
5c3484 |
lea (b, r0), %rax
|
|
Packit |
5c3484 |
cmovnc r0, %rax
|
|
Packit |
5c3484 |
C One more subtraction of b is needed when the remainder is still >= b.
cmp b, %rax
|
|
Packit |
5c3484 |
jnc L(fix)
|
|
Packit |
5c3484 |
C Undo the normalization shift; %rcx still holds cnt (0 on the normalized
C path, so the shift is then a no-op).
L(ok): shr R8(%rcx), %rax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Restore the callee-saved registers in reverse push order and return.
pop %rbx
|
|
Packit |
5c3484 |
pop %rbp
|
|
Packit |
5c3484 |
FUNC_EXIT()
|
|
Packit |
5c3484 |
ret
|
|
Packit |
5c3484 |
C Out-of-line path for the rare case where the remainder is >= b.
L(fix): sub b, %rax
|
|
Packit |
5c3484 |
jmp L(ok)
|
|
Packit |
5c3484 |
EPILOGUE()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |
C mpn_mod_1_1p_cps -- precompute the four-limb table read by mpn_mod_1_1p.
C
C Registers as seen by the code after FUNC_ENTRY(2):
C   %rdi = pointer to the 4-limb output table (kept in %rbx)
C   %rsi = b, the divisor; it must be nonzero because bsr leaves its
C          destination undefined for a zero source
C
C Table layout written here:
C    0(table) = bi      value returned by mpn_invert_limb for b << cnt
C    8(table) = cnt     number of leading zero bits of b
C   16(table) = B1modb  written only when cnt > 0
C   24(table) = B2modb
C
C %rbx, %rbp and %r12 are callee-saved, so the table pointer, cnt and the
C shifted divisor survive the call to mpn_invert_limb.
PROLOGUE(mpn_mod_1_1p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx		C index of the highest set bit of b
	push	%rbx
	mov	%rdi, %rbx		C keep the table pointer across the call
	push	%r12
	xor	$63, R32(%rcx)		C cnt = 63 - bit index
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C save cnt; the 32-bit move zero-extends
	sal	R8(%rcx), %r12		C %r12 = b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
C The Microsoft x64 (DOS64) calling convention requires the caller to
C reserve 32 bytes of shadow space for every call.  32 is a multiple of
C 16, so the stack alignment checked by the ASSERT is not affected.
IFDOS(`	sub	$32, %rsp	')	C allocate shadow space
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')	C release shadow space
	neg	%r12			C %r12 = -(b << cnt) mod B
	mov	%r12, %r8		C second copy, consumed by B1modb below
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12		C %r12 = -(b << cnt) * bi mod B
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)	C reload cnt; %rcx is not preserved by the call
	test	R32(%rcx), R32(%rcx)
	jz	L(z)			C cnt == 0: B1modb is not needed, leave it unwritten

C %rdx = (1 << cnt) | (bi >> (64 - cnt)), the top cnt + 1 bits of B + bi.
	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx
')
	imul	%rdx, %r8		C %r8 = -(b << cnt) * %rdx mod B
	shr	R8(%rcx), %r8		C shift back down by cnt
	mov	%r8, 16(%rbx)		C store B1modb
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
|
|
Packit |
5c3484 |
ASM_END()
|