dnl  PowerPC-64 mpn_mod_1s_4p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970          9
C POWER5                 9
C POWER6                13
C POWER7                3.5

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.

C INPUT PARAMETERS
C
C Symbolic names for the four argument registers of mpn_mod_1s_4p.  The
C mpn_mod_1s_4p_cps routine further down does not use these aliases; it
C refers to r3 and r4 directly.
define(`ap',  `r3')
define(`n',   `r4')
define(`d',   `r5')
define(`cps', `r6')

ASM_START()

C mpn_invert_limb is called from mpn_mod_1s_4p_cps.
EXTERN_FUNC(mpn_invert_limb)

C mpn_mod_1s_4p
C
C Folds an n-limb operand four limbs per iteration into a two-limb
C accumulator using the precomputed words in the cps table, then reduces
C that accumulator to a single limb.
C
C In:
C   ap  (r3)  address of the operand, 8 bytes per limb
C   n   (r4)  limb count.  Must be at least 1: with n = 0 the b00 entry
C             would read below ap and the count register would wrap.
C   d   (r5)  divisor used by the final reduction.
C             NOTE(review): presumably already shifted left by the count
C             stored at cps+8 -- confirm against the caller.
C   cps (r6)  table of seven 64-bit words as written by mpn_mod_1s_4p_cps:
C               +0   inverse used for the final quotient estimate
C               +8   shift count (only the low 32 bits are read here)
C               +16  multiplier for the limb one position up (r26)
C               +24  multiplier for the limb two positions up (r25)
C               +32  multiplier for the limb three positions up (r24)
C               +40  B4modb, multiplier for the low accumulator limb (r30)
C               +48  B5modb, multiplier for the high accumulator limb (r23)
C             The words at +16, +24 and +32 are presumably B1modb, B2modb
C             and B3modb by the same naming -- confirm.
C Out:
C   r3        reduced value shifted right by the count from cps+8
C
C r23-r31 are saved at negative offsets from r1 and restored before return;
C r1 itself is never adjusted and no call is made.  n (r4) is overwritten
C inside the loop once the iteration count has been moved to ctr.
PROLOGUE(mpn_mod_1s_4p)
	std	r23, -72(r1)
	ld	r23, 48(cps)
	std	r24, -64(r1)
	std	r25, -56(r1)
	ld	r24, 32(cps)
	ld	r25, 24(cps)
	std	r26, -48(r1)
	std	r27, -40(r1)
	ld	r26, 16(cps)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	ld	r30, 40(cps)

	rldicl.	r0, n, 0,62		C r0 = n mod 4, cr0 set from the result
	sldi	r31, n, 3
	add	ap, ap, r31		C make ap point at end of operand

C Dispatch on n mod 4 so that the limbs left for the loop are a multiple
C of four.  Each entry leaves the accumulator in r9 (high) and r0 (low).
	cmpdi	cr7, r0, 2
	beq	cr0, L(b00)
	blt	cr7, L(b01)
	beq	cr7, L(b10)

C n mod 4 = 3: fold the top three limbs.
L(b11):	ld	r11, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -24(ap)
	mulhdu	r27, r11, r26
	mulld	r8, r11, r26
	mulhdu	r11, r9, r25
	mulld	r9, r9, r25
	addc	r31, r8, r0
	addze	r10, r27
	addc	r0, r9, r31
	adde	r9, r11, r10
	addi	ap, ap, -40
	b	L(6)

C n mod 4 = 0: fold the top four limbs.
	ALIGN(16)
L(b00):	ld	r11, -24(ap)
	ld	r10, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -32(ap)
	mulld	r8, r11, r26
	mulhdu	r7, r10, r25
	mulhdu	r27, r11, r26
	mulhdu	r11, r9, r24
	mulld	r10, r10, r25
	mulld	r9, r9, r24
	addc	r31, r8, r0
	addze	r0, r27
	addc	r8, r31, r10
	adde	r10, r0, r7
	addc	r0, r9, r8
	adde	r9, r11, r10
	addi	ap, ap, -48
	b	L(6)

C n mod 4 = 1: the accumulator is the top limb with a zero high half.
	ALIGN(16)
L(b01):	li	r9, 0
	ld	r0, -8(ap)
	addi	ap, ap, -24
	b	L(6)

C n mod 4 = 2: the accumulator is the top two limbs; falls through to L(6).
	ALIGN(16)
L(b10):	ld	r9, -8(ap)
	ld	r0, -16(ap)
	addi	ap, ap, -32

C ctr = (n + 3) / 4.  bdz decrements it first, so the loop body runs one
C time fewer than that, and not at all when the entry code above already
C consumed every limb.
	ALIGN(16)
L(6):	addi	r10, n, 3
	srdi	r7, r10, 2
	mtctr	r7
	bdz	L(end)

C Main loop.  Each pass loads the next four lower limbs, at -16, -8, 0 and
C 8 bytes from ap, and forms
C   limb[-16] + limb[-8]*r26 + limb[0]*r25 + limb[8]*r24 + rl*r30 + rh*r23
C as the new two-limb accumulator r9:r0, where rl is the old r0 and rh the
C old r9.
	ALIGN(16)
L(top):	ld	r31, -16(ap)
	ld	r10, -8(ap)
	ld	r11, 8(ap)
	ld	r12, 0(ap)
	mulld	r29, r0, r30		C rl * B4modb
	mulhdu	r0, r0, r30		C rl * B4modb
	mulhdu	r27, r10, r26
	mulld	r10, r10, r26
	mulhdu	r7, r9, r23		C rh * B5modb
	mulld	r9, r9, r23		C rh * B5modb
	mulhdu	r28, r11, r24
	mulld	r11, r11, r24
	mulhdu	r4, r12, r25		C r4 is n; its value is no longer needed
	mulld	r12, r12, r25
	addc	r8, r10, r31
	addze	r10, r27
	addi	ap, ap, -32
	addc	r27, r8, r12
	adde	r12, r10, r4
	addc	r11, r27, r11
	adde	r31, r12, r28
	addc	r12, r11, r29
	adde	r4, r31, r0
	addc	r0, r9, r12
	adde	r9, r7, r4
	bdnz	L(top)

C Final reduction of r9:r0.  The shift count is stored as a 64-bit word at
C cps+8; lwz fetches its low 32 bits, which sit at +8 on little-endian and
C at +12 on big-endian.
L(end):
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
`	lwz	r3, 8(cps)',
`	lwz	r3, 12(cps)')
	mulld	r10, r9, r26		C fold rh once more: r9:r11 = rl + rh*r26
	mulhdu	r9, r9, r26
	addc	r11, r0, r10
	addze	r9, r9
	ld	r10, 0(cps)		C r10 = inverse from cps+0
	subfic	r8, r3, 64
	sld	r9, r9, r3		C shift r9:r11 left by the count
	srd	r8, r11, r8
	sld	r11, r11, r3
	or	r9, r8, r9
	mulld	r0, r9, r10		C quotient estimate from high limb * inverse
	mulhdu	r10, r9, r10
	addi	r9, r9, 1
	addc	r8, r0, r11
	adde	r0, r10, r9
	mulld	r0, r0, d
	subf	r0, r0, r11		C r0 = low limb - estimate * d
	cmpld	cr7, r8, r0
	bge	cr7, L(9)
	add	r0, r0, d		C first correction: add d back
L(9):	cmpld	cr7, r0, d
	bge-	cr7, L(16)		C second correction, hinted as not taken
L(10):	srd	r3, r0, r3		C undo the shift; result in r3
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

L(16):	subf	r0, d, r0		C r0 -= d
	b	L(10)
EPILOGUE()

C mpn_mod_1s_4p_cps
C
C Fills the seven-word table that mpn_mod_1s_4p reads through its cps
C argument.
C
C In:
C   r3  address of the table to fill, kept in r29 across the call
C   r4  divisor
C Table written, as 64-bit words:
C   +0   value returned by mpn_invert_limb for the left-shifted divisor
C   +8   cnt, the leading zero count of the divisor
C   +16, +24, +32, +40, +48
C        five values produced in sequence below, each shifted right by cnt
C        before being stored
C
C Not a leaf: the link register is saved at 16(r1) of the incoming frame,
C r29-r31 at negative offsets from the incoming r1, and a 144-byte frame is
C pushed around the call.
C NOTE(review): the toc argument to PROLOGUE presumably sets up the TOC for
C the CALL below -- confirm in asm-defs.m4.
PROLOGUE(mpn_mod_1s_4p_cps,toc)
	mflr	r0
	std	r29, -24(r1)
	std	r30, -16(r1)
	mr	r29, r3			C r29 = table address
	std	r0, 16(r1)
	std	r31, -8(r1)
	stdu	r1, -144(r1)
	cntlzd	r31, r4			C r31 = cnt
	sld	r30, r4, r31		C r30 = divisor << cnt
	mr	r3, r30
	CALL(	mpn_invert_limb)	C r3 = inverse, used as bi below

C First value: r10 = -r30 * ((1 << cnt) | (bi >> (64 - cnt))).
C NOTE(review): for cnt = 0 this relies on srd by 64 yielding zero --
C confirm against the Power ISA shift description.
	subfic	r9, r31, 64
	li	r10, 1
	sld	r10, r10, r31
	srd	r9, r3, r9
	neg	r0, r30
	or	r10, r10, r9
	mulld	r10, r10, r0

C Each of the next four steps takes the previous value v and computes
C   t = not(high(v * bi) + v) * r30
C then adds r30 once more when low(v * bi) is below t as unsigned values.
C The first step writes the same thing as not(high) - v.
	mulhdu	r11, r10, r3
	nor	r11, r11, r11
	subf	r11, r10, r11
	mulld	r11, r11, r30
	mulld	r0, r10, r3
	cmpld	cr7, r0, r11
	bge	cr7, L(18)
	add	r11, r11, r30
L(18):	mulhdu	r9, r11, r3
	add	r9, r11, r9
	nor	r9, r9, r9
	mulld	r9, r9, r30
	mulld	r0, r11, r3
	cmpld	cr7, r0, r9
	bge	cr7, L(19)
	add	r9, r9, r30
L(19):	mulhdu	r0, r9, r3
	add	r0, r9, r0
	nor	r0, r0, r0
	mulld	r0, r0, r30
	mulld	r8, r9, r3
	cmpld	cr7, r8, r0
	bge	cr7, L(20)
	add	r0, r0, r30
L(20):	mulhdu	r8, r0, r3
	add	r8, r0, r8
	nor	r8, r8, r8
	mulld	r8, r8, r30
	mulld	r7, r0, r3
	cmpld	cr7, r7, r8
	bge	cr7, L(21)
	add	r8, r8, r30

C Shift the five values right by cnt and store the table, interleaved with
C popping the frame and restoring lr and r29-r31.
L(21):	srd	r0, r0, r31
	addi	r1, r1, 144		C pop the frame; saves are addressed from here
	srd	r8, r8, r31
	srd	r10, r10, r31
	srd	r11, r11, r31
	std	r0, 40(r29)
	std	r31, 8(r29)		C cnt
	srd	r9, r9, r31
	ld	r0, 16(r1)		C saved link register
	ld	r30, -16(r1)
	std	r8, 48(r29)
	std	r3, 0(r29)		C bi
	mtlr	r0
	ld	r31, -8(r1)
	std	r10, 16(r29)
	std	r11, 24(r29)
	std	r9, 32(r29)
	ld	r29, -24(r1)		C restored last; r29 was the store base
	blr
EPILOGUE()