dnl  IA-64 mpn_mod_34lsub1

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1


C INPUT PARAMETERS
C up	address of the first (least significant) limb, advanced by every load
C n	limb count, counted down as limbs are loaded
define(`up', `r32')
define(`n', `r33')

C Some useful aliases for registers we use
C u0-u2	limbs most recently loaded from up
C a0-a2	accumulators from which the limbs are subtracted, one per limb
C	position modulo 3
C c0-c2	counters of the borrows produced by those subtractions
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')

C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
C with a more sophisticated implementation.  If we're really crazy, we could
C super-unroll, storing carries just in predicate registers, then copy them to
C a general register, and population count them from there.  That'd bring us
C close to 3 insn/limb, for nearly 0.5 c/l.

C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
C We therefore use a plain while-style loop:
C	add		n = -3, n
C	cmp.le		p9, p0 = 3, n
C (p9)	br.cond		.Loop
C Alternatively, we could table n/3 for, say, n < 256, and predicate the
C 16-cycle code.

C The summing-up code at the end was written quickly, and could surely be
C vastly improved.

ASM_START()

C mpn_mod_34lsub1 -- fold the n limbs at up into one residue modulo 2^48-1.
C
C In:	up (r32)  address of the least significant 64-bit limb
C	n  (r33)  limb count.  The code assumes n >= 1; n = 0 is not
C		  special-cased.
C Out:	r8	  a value congruent to the operand modulo 2^48-1.  It is NOT
C		  fully reduced and can exceed 2^48-1.
C Uses:	r8, r10, r11, r14-r22, r24-r31, p6-p11; up and n are modified.
C	Memory is only read (ld8), never written.
C NOTE(review): presumably the C prototype is
C	mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C -- confirm against the GMP internal headers.
C
C Method.  Since 2^48 == 1 (mod 2^48-1), we have 2^64 == 2^16, 2^128 == 2^32
C and 2^192 == 1, so limbs can be handled in groups of three.  Limb i is
C SUBTRACTED from accumulator a0, a1 or a2 according to i mod 3, and every
C borrow out of a0, a1, a2 is counted in c1, c2, c0 respectively (a borrow out
C of a2 has weight 2^192 == 1, hence it wraps around to c0).  Writing
C	A  = a0 + a1*2^64 + a2*2^128
C	Cy = c0 + c1*2^64 + c2*2^128
C the operand is congruent to Cy - A.  L(com) folds A and Cy into 48-bit
C fields and returns 16*(2^48-1) - fold(A) + fold(Cy); the constant is a
C multiple of the modulus, large enough to keep the difference non-negative.

PROLOGUE(mpn_mod_34lsub1)
	.prologue
C NOTE(review): this unwind annotation says ar.lc is saved in r2, yet nothing
C below writes r2, and the only ar.lc user is the disabled loop-count code.
C Kept unchanged; confirm whether it is still wanted.
	.save	ar.lc, r2
	.body
C 32-bit ABI only: widen the 32-bit pointer and zero-extend the 32-bit count.
ifdef(`HAVE_ABI_32',`
	addp4		up = 0, up		C			M I
	nop.m		0
	zxt4		n = n			C			I
	;;
')

C Disabled code (the test below compares 0 with 1, so nothing is emitted):
C it would put n/3 into ar.lc using a multiply by 0xAAAAAAAAAAAAAAAB, for use
C with the br.cloop that is commented out at the bottom of the main loop.
ifelse(0,1,`
	movl		r14 = 0xAAAAAAAAAAAAAAAB
	;;
	setf.sig	f6 = r14
	setf.sig	f7 = r33
	;;
	xmpy.hu		f6 = f6, f7
	;;
	getf.sig	r8 = f6
	;;
	shr.u		r8 = r8, 1		C Loop count
	;;
	mov.i		ar.lc = r8
')

C Limb 0 is needed on both paths, so load it before testing n.
	ld8	u0 = [up], 8
	cmp.ne	p9, p0 = 1, n
(p9)	br	L(gt1)
	;;
C n = 1: return (u0 >> 48) + (u0 mod 2^48).
	shr.u	r8 = u0, 48
	dep.z	r27 = u0, 0, 48
	;;
	add	r8 = r8, r27
	br.ret.sptk.many b0


C n > 1: clear the accumulators and borrow counters, load limb 1, and let n
C count the limbs not yet loaded (two are already held in u0 and u1).
L(gt1):
 {.mmi;	nop.m	0
	mov	a0 = 0
	add	n = -2, n
}{.mmi;	mov	c0 = 0
	mov	c1 = 0
	mov	c2 = 0
	;;
}{.mmi;	ld8	u1 = [up], 8
	mov	a1 = 0
	cmp.ltu	p6, p0 = r0, r0		C clear p6
}{.mmb;	cmp.gt	p9, p0 = 3, n		C fewer than 3 limbs left to load?
	mov	a2 = 0
(p9)	br.cond.dptk	L(end)			C then skip the loop
	;;
}
C Main loop, three limbs per iteration.  On entry u0 and u1 hold loaded but
C unprocessed limbs and p6 holds the a2 borrow of the previous iteration
C (cleared before the first one).  Each step loads one limb, adds the pending
C borrow flag to its counter, and subtracts one limb from its accumulator.
C The cmp.ltu and the sub of a step share an instruction group, so the compare
C still sees the accumulator value from before the subtraction.
	ALIGN(32)
L(top):
 {.mmi;	ld8	u2 = [up], 8
(p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
}{.mmb;	sub	a0 = a0, u0
	add	n = -3, n
	nop.b	0
	;;
}{.mmi;	ld8	u0 = [up], 8
(p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
}{.mmb;	sub	a1 = a1, u1
	cmp.le	p9, p0 = 3, n			C at least 3 limbs left to load?
	nop.b	0
	;;
}{.mmi;	ld8	u1 = [up], 8
(p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
}{.mmb;	sub	a2 = a2, u2
	nop.m	0
dnl	br.cloop.dptk	L(top)
(p9)	br.cond.dptk	L(top)
	;;
}
C Tail.  n is now the number of limbs still to be loaded: 0, 1 or 2 for any
C entry value n >= 2.  u0 and u1 are loaded but not yet processed, and p6 may
C still carry an uncounted a2 borrow.
L(end):
	cmp.eq	p10, p0 = 0, n
	cmp.eq	p11, p0 = 1, n
(p10)	br	L(0)

C One or two limbs left: load the next one and process u0.  With exactly one
C left (p11) continue at L(1); otherwise load the last limb as well and
C process u1, u2 and u0 in turn.
L(2):
 {.mmi;	ld8	u2 = [up], 8
(p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
}{.mmb;	sub	a0 = a0, u0
	nop.m	0
(p11)	br	L(1)
	;;
}	ld8	u0 = [up], 8
(p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
(p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
	sub	a2 = a2, u2
	;;
(p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
	sub	a0 = a0, u0
	;;
(p7)	add	c1 = 1, c1
	br	L(com)


C Exactly one limb was left: count the a0 borrow, then process u1 and u2.
L(1):
(p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
(p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
	sub	a2 = a2, u2
	;;
(p6)	add	c0 = 1, c0
	br	L(com)


C Nothing left to load: count the pending a2 borrow, then process u0 and u1.
L(0):
(p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
	sub	a0 = a0, u0
	;;
(p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
(p8)	add	c2 = 1, c2

C Fold.  The 192 bits of a2:a1:a0 are cut at the 48-bit field boundaries
C shown below and the pieces are added up in r24; c2:c1:c0 is cut the same
C way and added up in r10.
L(com):
C	|     a2    |     a1    |     a0    |
C	|        |        |        |        |
	shr.u	r24 = a0, 48		C 16 bits
	shr.u	r25 = a1, 32		C 32 bits
	shr.u	r26 = a2, 16		C 48 bits
	;;
	shr.u	r10 = c0, 48		C 16 bits, always zero
	shr.u	r11 = c1, 32		C 32 bits
	shr.u	r30 = c2, 16		C 48 bits
	;;
	dep.z	r27 = a0, 0, 48		C 48 bits
	dep.z	r28 = a1, 16, 32	C 48 bits
	dep.z	r29 = a2, 32, 16	C 48 bits
	dep.z	r31 = c0, 0, 48		C 48 bits
	dep.z	r14 = c1, 16, 32	C 48 bits
	dep.z	r15 = c2, 32, 16	C 48 bits
	;;
 {.mmi;	add	r24 = r24, r25
	add	r26 = r26, r27
	add	r28 = r28, r29
}{.mmi;	add	r10 = r10, r11
	add	r30 = r30, r31
	add	r14 = r14, r15
	;;
}
	movl	r8 = 0xffffffffffff0	C 16 * (2^48-1), a multiple of the modulus
	add	r24 = r24, r26
	add	r10 = r10, r30
	;;
	add	r24 = r24, r28		C r24 = folded accumulators
	add	r10 = r10, r14		C r10 = folded borrow counts
	;;
	sub	r8 = r8, r24		C the limbs were subtracted, so negate
	;;
	add	r8 = r8, r10
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()