|
Packit |
5c3484 |
dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2003, 2004 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/limb
|
|
Packit |
5c3484 |
C EV4: 47
|
|
Packit |
5c3484 |
C EV5: 30
|
|
Packit |
5c3484 |
C EV6: 15
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
|
|
Packit |
5c3484 |
C mp_limb_t c)
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C This code follows the "alternate" code in mpn/generic/mode1o.c,
|
|
Packit |
5c3484 |
C eliminating cbit+climb from the dependent chain. This leaves,
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C ev4 ev5 ev6
|
|
Packit |
5c3484 |
C 1 3 1 subq y = x - h
|
|
Packit |
5c3484 |
C 23 13 7 mulq q = y * inverse
|
|
Packit |
5c3484 |
C 23 14 7 umulh h = high (q * d)
|
|
Packit |
5c3484 |
C -- -- --
|
|
Packit |
5c3484 |
C 47 30 15
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C In each case, the load latency, loop control, and extra carry bit handling
|
|
Packit |
5c3484 |
C hide under the multiply latencies. Those latencies are long enough that
|
|
Packit |
5c3484 |
C we don't need to worry about alignment or pairing to squeeze out
|
|
Packit |
5c3484 |
C performance.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C For the first limb, some of the loop code is broken out and scheduled back
|
|
Packit |
5c3484 |
C since it can be done earlier.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C - The first ldq src[0] is near the start of the routine, for maximum
|
|
Packit |
5c3484 |
C time from memory.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C - The subq y=x-climb can be done without waiting for the inverse.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C - The mulq y*inverse is replicated after the final subq for the inverse,
|
|
Packit |
5c3484 |
C instead of branching to the mulq in the main loop. On ev4 a branch
|
|
Packit |
5c3484 |
C there would cost cycles, but we can hide them under the mulq latency.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C For the last limb, high
|
|
Packit |
5c3484 |
C and addback is done, as per the main mpn/generic/mode1o.c code. This is a
|
|
Packit |
5c3484 |
C data-dependent branch, but we're waiting for umulh so any penalty should
|
|
Packit |
5c3484 |
C hide there. The multiplies saved would be worth the cost anyway.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C Enhancements:
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C For size==1, a plain division (done bitwise say) might be faster than
|
|
Packit |
5c3484 |
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
|
|
Packit |
5c3484 |
C ev5. A call to gcc __remqu might be a possibility.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
|
|
Packit |
5c3484 |
PROLOGUE(mpn_modexact_1c_odd,gp)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C r16 src
|
|
Packit |
5c3484 |
C r17 size
|
|
Packit |
5c3484 |
C r18 d
|
|
Packit |
5c3484 |
C r19 c
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
LEA(r0, binvert_limb_table)
|
|
Packit |
5c3484 |
srl r18, 1, r20 C d >> 1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
and r20, 127, r20 C idx = d>>1 & 0x7F
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addq r0, r20, r21 C table + idx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ifelse(bwx_available_p,1,
|
|
Packit |
5c3484 |
` ldbu r20, 0(r21) C table[idx], inverse 8 bits
|
|
Packit |
5c3484 |
',`
|
|
Packit |
5c3484 |
ldq_u r20, 0(r21) C table[idx] qword
|
|
Packit |
5c3484 |
extbl r20, r21, r20 C table[idx], inverse 8 bits
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mull r20, r20, r7 C i*i
|
|
Packit |
5c3484 |
addq r20, r20, r20 C 2*i
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ldq r2, 0(r16) C x = s = src[0]
|
|
Packit |
5c3484 |
lda r17, -1(r17) C size--
|
|
Packit |
5c3484 |
clr r0 C initial cbit=0
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mull r7, r18, r7 C i*i*d
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mull r20, r20, r7 C i*i
|
|
Packit |
5c3484 |
addq r20, r20, r20 C 2*i
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mull r7, r18, r7 C i*i*d
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mulq r20, r20, r7 C i*i
|
|
Packit |
5c3484 |
addq r20, r20, r20 C 2*i
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mulq r7, r18, r7 C i*i*d
|
|
Packit |
5c3484 |
subq r2, r19, r3 C y = x - climb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT(r7, C should have d*inv==1 mod 2^64
|
|
Packit |
5c3484 |
` mulq r18, r20, r7
|
|
Packit |
5c3484 |
cmpeq r7, 1, r7')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mulq r3, r20, r4 C first q = y * inv
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
beq r17, L(one) C if size==1
|
|
Packit |
5c3484 |
br L(entry)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(top):
|
|
Packit |
5c3484 |
C r0 cbit
|
|
Packit |
5c3484 |
C r16 src, incrementing
|
|
Packit |
5c3484 |
C r17 size, decrementing
|
|
Packit |
5c3484 |
C r18 d
|
|
Packit |
5c3484 |
C r19 climb
|
|
Packit |
5c3484 |
C r20 inv
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ldq r1, 0(r16) C s = src[i]
|
|
Packit |
5c3484 |
subq r1, r0, r2 C x = s - cbit
|
|
Packit |
5c3484 |
cmpult r1, r0, r0 C new cbit = s < cbit
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r2, r19, r3 C y = x - climb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mulq r3, r20, r4 C q = y * inv
|
|
Packit |
5c3484 |
L(entry):
|
|
Packit |
5c3484 |
cmpult r2, r19, r5 C cbit2 = x < climb
|
|
Packit |
5c3484 |
addq r5, r0, r0 C cbit += cbit2
|
|
Packit |
5c3484 |
lda r16, 8(r16) C src++
|
|
Packit |
5c3484 |
lda r17, -1(r17) C size--
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
umulh r4, r18, r19 C climb = q * d
|
|
Packit |
5c3484 |
bne r17, L(top) C while 2 or more limbs left
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C r0 cbit
|
|
Packit |
5c3484 |
C r18 d
|
|
Packit |
5c3484 |
C r19 climb
|
|
Packit |
5c3484 |
C r20 inv
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ldq r1, 0(r16) C s = src[size-1] high limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cmpult r1, r18, r2 C test high
|
|
Packit |
5c3484 |
bne r2, L(skip) C skip if so
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C can't skip a division, repeat loop code
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r1, r0, r2 C x = s - cbit
|
|
Packit |
5c3484 |
cmpult r1, r0, r0 C new cbit = s < cbit
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r2, r19, r3 C y = x - climb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mulq r3, r20, r4 C q = y * inv
|
|
Packit |
5c3484 |
L(one):
|
|
Packit |
5c3484 |
cmpult r2, r19, r5 C cbit2 = x < climb
|
|
Packit |
5c3484 |
addq r5, r0, r0 C cbit += cbit2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
umulh r4, r18, r19 C climb = q * d
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addq r19, r0, r0 C return climb + cbit
|
|
Packit |
5c3484 |
ret r31, (r26), 1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ALIGN(8)
|
|
Packit |
5c3484 |
L(skip):
|
|
Packit |
5c3484 |
C with high
|
|
Packit |
5c3484 |
C an addback of d if that underflows
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addq r19, r0, r19 C c = climb + cbit
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
subq r19, r1, r2 C c - s
|
|
Packit |
5c3484 |
cmpult r19, r1, r3 C c < s
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addq r2, r18, r0 C return c-s + divisor
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cmoveq r3, r2, r0 C return c-s if no underflow
|
|
Packit |
5c3484 |
ret r31, (r26), 1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
EPILOGUE()
|
|
Packit |
5c3484 |
ASM_END()
|