dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C		cycles/limb
C AMD K8,K9	 4
C AMD K10	 4
C Intel P4	19
C Intel core2	 8
C Intel NHM	 6.5
C Intel SBR	 4.5
C Intel atom	28
C VIA nano	 8
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ABI_SUPPORT(DOS64)
|
|
Packit |
5c3484 |
ABI_SUPPORT(STD64)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
|
|
Packit |
5c3484 |
TEXT
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |
C mpn_mod_1s_2p -- remainder of an n-limb number by a single limb, folding
C two limbs per step with precomputed multipliers.
C
C Arguments after FUNC_ENTRY(4):
C   rdi  ap   source limbs, least significant first
C   rsi  n    limb count, at least 1
C   rdx  b    divisor limb
C   rcx  cps  5-limb table: bi, cnt, B1modb, B2modb, B3modb
C Result in rax.
C
C NOTE(review): the remainder is shifted right by cnt just before returning,
C so b is presumably passed already shifted left by cnt -- confirm against
C the callers.
C NOTE(review): the final normalisation shifts by cnt and by 64-cnt, so it
C presumably relies on cnt being at least 1 -- confirm against the callers.
C
C Register roles:
C   r14      b
C   r13      cps
C   r10      cps[2], B1modb
C   rbx      cps[3], B2modb
C   rbp      cps[4], B3modb
C   r8:r9    two-limb accumulator, high:low
C   r12:r11  second two-limb accumulator used on alternate loop halves
C   rdx:rax  product that has been issued but not yet added in

PROLOGUE(mpn_mod_1s_2p)
	FUNC_ENTRY(4)
	push	%r14
	test	$1, R8(%rsi)		C parity of n, consumed by the jz below
	mov	%rdx, %r14		C r14 = b
	push	%r13
	mov	%rcx, %r13		C r13 = cps
	push	%r12
	push	%rbp
	push	%rbx
	mov	16(%rcx), %r10		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	jz	L(even)			C push and mov left the flags alone

C n is odd.  Handle n = 1 apart, else start from the top three limbs.
	dec	%rsi
	jz	L(single)
	mov	-8(%rdi,%rsi,8), %rax	C second limb from the top
	mul	%r10			C times B1modb
	mov	%rax, %r9
	mov	%rdx, %r8
	mov	(%rdi,%rsi,8), %rax	C top limb
	add	-16(%rdi,%rsi,8), %r9	C plus third limb from the top
	adc	$0, %r8
	mul	%rbx			C top limb times B2modb
	add	%rax, %r9
	adc	%rdx, %r8
	jmp	L(setup)

C n is even.  Start from the top two limbs.
L(even):
	mov	-8(%rdi,%rsi,8), %r8
	mov	-16(%rdi,%rsi,8), %r9

C r8:r9 holds the running value.  Enter the loop only if limbs remain.
L(setup):
	sub	$4, %rsi
	jb	L(wrap2)		C nothing left to fold
	lea	40(%rdi,%rsi,8), %rdi	C bias ap for the fixed loop offsets
	mov	-40(%rdi), %r11		C next limb pair, low limb
	mov	-32(%rdi), %rax		C next limb pair, high limb
	jmp	L(mid)

C Each half of the loop computes
C   new = lo + hi*B1modb + acc_lo*B2modb + acc_hi*B3modb
C into the other accumulator pair.  The last product is left in rdx:rax and
C added at the start of the following half.
	ALIGN(16)
L(loop):
	mov	-24(%rdi), %r9		C low limb of this pair
	add	%rax, %r11		C fold in the pending product
	mov	-16(%rdi), %rax		C high limb of this pair
	adc	%rdx, %r12
	mul	%r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%rdx, %r8
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	mov	%r12, %rax
	adc	%rdx, %r8
	mul	%rbp			C left pending
	sub	$2, %rsi
	jb	L(wrap1)
	mov	-40(%rdi), %r11		C low limb of the next pair
	add	%rax, %r9		C fold in the pending product
	mov	-32(%rdi), %rax		C high limb of the next pair
	adc	%rdx, %r8
L(mid):
	mul	%r10
	add	%rax, %r11
	mov	%r9, %rax
	mov	%rdx, %r12
	adc	$0, %r12
	mul	%rbx
	add	%rax, %r11
	lea	-32(%rdi), %rdi		C ap -= 4
	mov	%r8, %rax
	adc	%rdx, %r12
	mul	%rbp			C left pending
	sub	$2, %rsi
	jae	L(loop)

C Loop exits.  Bring the live accumulator into r8:r9 and add the product
C that is still pending.
L(wrap0):
	mov	%r11, %r9
	mov	%r12, %r8
L(wrap1):
	add	%rax, %r9
	adc	%rdx, %r8

C Fold two limbs into rdx:r8 = low + high*B1modb.
L(wrap2):
	mov	8(%r13), R32(%rdi)	C cnt
	mov	%r8, %rax
	mov	%r9, %r8
	mul	%r10
	add	%rax, %r8
	adc	$0, %rdx

C Shift rdx:r8 left by cnt, giving r9:r8, then divide by b using bi.
L(reduce):
	xor	R32(%rcx), R32(%rcx)
	mov	%r8, %r9
	sub	R32(%rdi), R32(%rcx)	C ecx = -cnt, so cl shifts by 64-cnt
	shr	R8(%rcx), %r9
	mov	R32(%rdi), R32(%rcx)	C ecx = cnt
	sal	R8(%rcx), %rdx
	or	%rdx, %r9		C high limb of the shifted value
	sal	R8(%rcx), %r8		C low limb of the shifted value
	mov	%r9, %rax
	mulq	(%r13)			C high limb times bi
	mov	%rax, %rsi
	inc	%r9
	add	%r8, %rsi
	adc	%r9, %rdx		C quotient estimate
	imul	%r14, %rdx
	sub	%rdx, %r8		C candidate remainder
	lea	(%r8,%r14), %rax
	cmp	%r8, %rsi
	cmovc	%rax, %r8		C first correction: add b back
	mov	%r8, %rax
	sub	%r14, %rax
	cmovc	%r8, %rax		C second correction: keep r8 if below b
	mov	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %rax		C undo the shift by cnt
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	FUNC_EXIT()
	ret

C n = 1.  Reduce the single limb with a zero high limb.
L(single):
	mov	(%rdi), %r8
	mov	8(%rcx), R32(%rdi)	C cnt, rcx still holds cps here
	xor	%rdx, %rdx
	jmp	L(reduce)
EPILOGUE()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |
C mpn_mod_1s_2p_cps -- fill in the 5-limb constant table read by
C mpn_mod_1s_2p.
C
C Arguments after FUNC_ENTRY(2):
C   rdi  cps  table of 5 limbs, written here
C   rsi  b    divisor limb, must be nonzero since bsr leaves its
C             destination undefined for a zero source
C
C Table layout as stored below:
C   cps[0]  bi      result of mpn_invert_limb for b << cnt
C   cps[1]  cnt     number of leading zero bits of b
C   cps[2]  B1modb
C   cps[3]  B2modb
C   cps[4]  B3modb
C
C Values held in callee-saved registers across the call:
C   rbx = cps,  rbp = cnt,  r12 = b << cnt
C
C NOTE(review): with SHLD_SLOW defined, the shift sequence equals the shld
C form only when cnt is at least 1 -- presumably callers never pass a b with
C its top bit set; confirm.

PROLOGUE(mpn_mod_1s_2p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx		C index of the highest set bit of b
	push	%rbx
	mov	%rdi, %rbx		C rbx = cps
	push	%r12
	xor	$63, R32(%rcx)		C 63 - index = leading zero count
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
C The Win64 ABI lets the callee write to the 32 bytes above its return
C address.  Without this reservation those bytes are the registers saved
C above.
IFDOS(`	sub	$32, %rsp	')	C reserve shadow space
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')	C release shadow space
	mov	%r12, %r8
	mov	%rax, %r11		C r11 = bi
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8			C r8 = -(b << cnt)
	mov	R32(%rbp), R32(%rcx)	C ecx = cnt
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi		C B1modb, still scaled by 2^cnt
	mul	%rsi			C bi times that value, for the next step

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi		C remove the 2^cnt scaling
	mov	%rsi, 16(%rbx)		C store B1modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi		C B2modb, still scaled by 2^cnt
	mov	%r11, %rax
	mul	%rsi			C bi times that value, for the next step

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi		C remove the 2^cnt scaling
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12		C B3modb, still scaled by 2^cnt

	shr	R8(%rcx), %r12		C remove the 2^cnt scaling
	mov	%r12, 32(%rbx)		C store B3modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
|