|
Packit |
5c3484 |
dnl Alpha mpn_bdiv_dbm1c.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2008 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/limb
|
|
Packit |
5c3484 |
C EV4: 42
|
|
Packit |
5c3484 |
C EV5: 18
|
|
Packit |
5c3484 |
C EV6: 3
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
|
|
Packit |
5c3484 |
C * Try less unrolling, 2-way should give the same performance.
|
|
Packit |
5c3484 |
C * Optimize feed-in and wind-down code, for speed, and perhaps further for
|
|
Packit |
5c3484 |
C code size.
|
|
Packit |
5c3484 |
C * This runs optimally given the algorithm: r8 is on a 3-operation recurrence
|
|
Packit |
5c3484 |
C path. We have not tried very hard to find a better algorithm. Perhaps
|
|
Packit |
5c3484 |
C it would be a good task for the GNU superoptimizer.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C INPUT PARAMETERS
|
|
Packit |
5c3484 |
define(`rp', `r16') C base register of every stq in the routine
|
|
Packit |
5c3484 |
define(`up', `r17') C base register of every ldq in the routine
|
|
Packit |
5c3484 |
define(`n', `r18') C limb count; tested mod 4 and counted down by 4
|
|
Packit |
5c3484 |
define(`bd', `r19') C multiplier operand of every mulq and umulh
|
|
Packit |
5c3484 |
define(`cy', `r20') C initial running word; the routine copies r20 into r8 on entry (was wrongly aliased to r19, same as bd)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
|
|
Packit |
5c3484 |
C mpn_bdiv_dbm1c
C
C Registers on entry, as used by the code below:
C   r16  base address for the stq stores, one 8-byte limb per step
C   r17  base address for the ldq loads
C   r18  limb count
C   r19  multiplier applied to every loaded limb
C   r20  initial value of the running word, which is then kept in r8
C Result: r0 receives the final running word (last subq before ret).
C
C Per loaded limb u, in ascending address order:
C   lo = mulq(u, r19), hi = umulh(u, r19)
C   borrow = (r8 < lo), unsigned;  r8 -= lo;  store r8;  r8 -= hi + borrow
C
C The work is unrolled four ways and multiplies are started several steps
C before their products are consumed.  The dispatch below selects a feed-in
C sequence from count mod 4; each feed-in either enters the main loop part
C way through or jumps directly into the wind-down code.
PROLOGUE(mpn_bdiv_dbm1c)
|
|
Packit |
5c3484 |
mov r20, r8 C r8 = running word, seeded from r20
|
|
Packit |
5c3484 |
C Load the first limb, then dispatch on count mod 4.
|
Packit |
5c3484 |
ldq r24, 0(r17) C r24 = first limb
|
|
Packit |
5c3484 |
and r18, 3, r28 C r28 = count mod 4
|
|
Packit |
5c3484 |
lda r18, -4(r18) C r18 = count - 4
|
|
Packit |
5c3484 |
beq r28, L(b0) C count mod 4 == 0
|
|
Packit |
5c3484 |
cmpeq r28, 1, r21
|
|
Packit |
5c3484 |
bne r21, L(b1) C count mod 4 == 1
|
|
Packit |
5c3484 |
cmpeq r28, 2, r21
|
|
Packit |
5c3484 |
bne r21, L(b2) C count mod 4 == 2; otherwise fall through with count mod 4 == 3
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Feed-in for count mod 4 == 3; reached by falling through the dispatch.
L(b3): ldq r2, 8(r17) C second limb
|
|
Packit |
5c3484 |
ldq r3, 16(r17) C third limb
|
|
Packit |
5c3484 |
bgt r18, L(gt3) C taken when count - 4 > 0
|
|
Packit |
5c3484 |
C Not taken: for a positive count this means count == 3, so every limb is already loaded.
|
Packit |
5c3484 |
mulq r24, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r21 C U1
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
lda r16, -32(r16) C bias r16 so the 32/40/48(r16) stores at cj3, cj2, cj1 land on store limbs 0, 1, 2
|
|
Packit |
5c3484 |
br L(cj3)
|
|
Packit |
5c3484 |
C Longer input: load four more limbs, start the first three products, then join the loop or the wind-down.
|
Packit |
5c3484 |
L(gt3): ldq r0, 24(r17)
|
|
Packit |
5c3484 |
mulq r24, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r21 C U1
|
|
Packit |
5c3484 |
ldq r1, 32(r17)
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
ldq r2, 40(r17) C r2 is reloaded only after both of its products were issued
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
ldq r3, 48(r17)
|
|
Packit |
5c3484 |
lda r18, -4(r18) C r18 = count - 8
|
|
Packit |
5c3484 |
lda r17, 56(r17) C seven limbs have been loaded so far
|
|
Packit |
5c3484 |
mulq r0, r19, r4 C U1
|
|
Packit |
5c3484 |
bgt r18, L(L3) C taken when count - 8 > 0: enter the main loop at its first step
|
|
Packit |
5c3484 |
C Otherwise go straight to the wind-down.
|
Packit |
5c3484 |
br L(cj7)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Feed-in for count mod 4 == 2.
L(b2): ldq r3, 8(r17) C second limb
|
|
Packit |
5c3484 |
bgt r18, L(gt2) C taken when count - 4 > 0
|
|
Packit |
5c3484 |
C Not taken: for a positive count this means count == 2.
|
Packit |
5c3484 |
mulq r24, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r22 C U1
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
lda r16, -40(r16) C bias r16 so the 40/48(r16) stores at cj2, cj1 land on store limbs 0, 1
|
|
Packit |
5c3484 |
br L(cj2)
|
|
Packit |
5c3484 |
C Longer input: load four more limbs and start the leading products.
|
Packit |
5c3484 |
L(gt2): ldq r0, 16(r17)
|
|
Packit |
5c3484 |
ldq r1, 24(r17)
|
|
Packit |
5c3484 |
mulq r24, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r22 C U1
|
|
Packit |
5c3484 |
ldq r2, 32(r17)
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
ldq r3, 40(r17) C r3 is reloaded only after both of its products were issued
|
|
Packit |
5c3484 |
lda r18, -4(r18) C r18 = count - 8
|
|
Packit |
5c3484 |
lda r17, 48(r17) C six limbs have been loaded so far
|
|
Packit |
5c3484 |
mulq r0, r19, r4 C U1
|
|
Packit |
5c3484 |
umulh r0, r19, r20 C U1
|
|
Packit |
5c3484 |
lda r16, -8(r16) C bias r16 so the 8(r16) store after L2 or cj6 lands on store limb 0
|
|
Packit |
5c3484 |
bgt r18, L(gt6) C taken when count - 8 > 0
|
|
Packit |
5c3484 |
C No loop pass needed: finish in the wind-down, entering at cj6.
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
br L(cj6)
|
|
Packit |
5c3484 |
C Enter the main loop at its second step after loading the next limb.
|
Packit |
5c3484 |
L(gt6): ldq r0, 0(r17)
|
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
br L(L2)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Feed-in for count mod 4 == 1; r24 already holds the first limb.
L(b1): bgt r18, L(gt1) C taken when count - 4 > 0
|
|
Packit |
5c3484 |
C Not taken: for a positive count this means count == 1.
|
Packit |
5c3484 |
mulq r24, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r23 C U1
|
|
Packit |
5c3484 |
lda r16, -48(r16) C bias r16 so the 48(r16) store at cj1 lands on store limb 0
|
|
Packit |
5c3484 |
br L(cj1)
|
|
Packit |
5c3484 |
C Longer input: load four more limbs and start the leading products.
|
Packit |
5c3484 |
L(gt1): ldq r0, 8(r17)
|
|
Packit |
5c3484 |
ldq r1, 16(r17)
|
|
Packit |
5c3484 |
ldq r2, 24(r17)
|
|
Packit |
5c3484 |
mulq r24, r19, r7 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r23 C U1
|
|
Packit |
5c3484 |
ldq r3, 32(r17)
|
|
Packit |
5c3484 |
lda r18, -4(r18) C r18 = count - 8
|
|
Packit |
5c3484 |
lda r17, 40(r17) C five limbs have been loaded so far
|
|
Packit |
5c3484 |
mulq r0, r19, r4 C U1
|
|
Packit |
5c3484 |
umulh r0, r19, r20 C U1
|
|
Packit |
5c3484 |
lda r16, -16(r16) C bias r16 so the 16(r16) store after L1 or cj5 lands on store limb 0
|
|
Packit |
5c3484 |
bgt r18, L(gt5) C taken when count - 8 > 0
|
|
Packit |
5c3484 |
C No loop pass needed: finish in the wind-down, entering at cj5.
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
br L(cj5)
|
|
Packit |
5c3484 |
C Enter the main loop at its third step after loading the next two limbs.
|
Packit |
5c3484 |
L(gt5): ldq r0, 0(r17)
|
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
ldq r1, 8(r17) C r1 is reloaded only after both of its products were issued
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
br L(L1)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Feed-in for count mod 4 == 0.
L(b0): ldq r1, 8(r17) C second limb
|
|
Packit |
5c3484 |
ldq r2, 16(r17) C third limb
|
|
Packit |
5c3484 |
ldq r3, 24(r17) C fourth limb
|
|
Packit |
5c3484 |
lda r17, 32(r17) C four limbs have been loaded so far
|
|
Packit |
5c3484 |
lda r16, -24(r16) C bias r16 so the 24(r16) store after L0 or cj4 lands on store limb 0
|
|
Packit |
5c3484 |
mulq r24, r19, r4 C U1
|
|
Packit |
5c3484 |
umulh r24, r19, r20 C U1
|
|
Packit |
5c3484 |
bgt r18, L(gt4) C taken when count - 4 > 0
|
|
Packit |
5c3484 |
C Not taken: for a positive count this means count == 4; finish in the wind-down at cj4.
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
br L(cj4)
|
|
Packit |
5c3484 |
C Enter the main loop at its fourth step after loading the next three limbs.
|
Packit |
5c3484 |
L(gt4): ldq r0, 0(r17)
|
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
ldq r1, 8(r17) C r1 is reloaded only after both of its products were issued
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
ldq r2, 16(r17) C r2 is reloaded only after both of its products were issued
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
br L(L0)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C *** MAIN LOOP START ***
C Four steps per pass.  Each step completes the pending update of r8 from the
C previous step, consumes one low product (r5, r6, r7, r4 in turn) and the
C matching high product (r21, r22, r23, r20), stores one word, reloads one of
C r0-r3 and starts the multiplies for a later step.  L3, L2, L1 and L0 are the
C entry points used by the feed-in code; they skip the leading mulq and subq
C of their step because the feed-in has already done the equivalent work.
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |
L(top): mulq r0, r19, r4 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8 C finish previous step: r8 -= hi + borrow
|
|
Packit |
5c3484 |
L(L3): umulh r0, r19, r20 C U1
|
|
Packit |
5c3484 |
cmpult r8, r5, r28 C r28 = borrow out of r8 - lo
|
|
Packit |
5c3484 |
ldq r0, 0(r17) C fetch a limb for a later step
|
|
Packit |
5c3484 |
subq r8, r5, r8 C r8 -= lo
|
|
Packit |
5c3484 |
addq r21, r28, r28 C r28 = hi + borrow, subtracted at the start of the next step
|
|
Packit |
5c3484 |
stq r8, 0(r16) C store the word for this step
|
|
Packit |
5c3484 |
C Second step: consumes r6 and r22, stores at 8(r16).
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(L2): umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
cmpult r8, r6, r28
|
|
Packit |
5c3484 |
ldq r1, 8(r17)
|
|
Packit |
5c3484 |
subq r8, r6, r8
|
|
Packit |
5c3484 |
addq r22, r28, r28
|
|
Packit |
5c3484 |
stq r8, 8(r16)
|
|
Packit |
5c3484 |
C Third step: consumes r7 and r23, stores at 16(r16).
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(L1): umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
cmpult r8, r7, r28
|
|
Packit |
5c3484 |
ldq r2, 16(r17)
|
|
Packit |
5c3484 |
subq r8, r7, r8
|
|
Packit |
5c3484 |
addq r23, r28, r28
|
|
Packit |
5c3484 |
stq r8, 16(r16)
|
|
Packit |
5c3484 |
C Fourth step: consumes r4 and r20, stores at 24(r16).
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(L0): umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
cmpult r8, r4, r28
|
|
Packit |
5c3484 |
ldq r3, 24(r17)
|
|
Packit |
5c3484 |
subq r8, r4, r8
|
|
Packit |
5c3484 |
addq r20, r28, r28
|
|
Packit |
5c3484 |
stq r8, 24(r16)
|
|
Packit |
5c3484 |
C Advance the counter and both pointers by four limbs; repeat while r18 > 0.
|
Packit |
5c3484 |
lda r18, -4(r18)
|
|
Packit |
5c3484 |
lda r17, 32(r17)
|
|
Packit |
5c3484 |
lda r16, 32(r16)
|
|
Packit |
5c3484 |
bgt r18, L(top)
|
|
Packit |
5c3484 |
C *** MAIN LOOP END ***
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Wind-down.  No further loads are issued.  The steps ending at the stores to
C 0, 8, 16 and 24(r16) still issue multiplies for the limbs held in r0-r3;
C the steps at cj3, cj2 and cj1 only subtract and store.  The cj labels are
C also the targets of the feed-in code when no loop pass is needed.
mulq r0, r19, r4 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8 C finish previous step: r8 -= hi + borrow
|
|
Packit |
5c3484 |
L(cj7): umulh r0, r19, r20 C U1
|
|
Packit |
5c3484 |
cmpult r8, r5, r28 C r28 = borrow out of r8 - lo
|
|
Packit |
5c3484 |
subq r8, r5, r8 C r8 -= lo
|
|
Packit |
5c3484 |
addq r21, r28, r28 C r28 = hi + borrow
|
|
Packit |
5c3484 |
stq r8, 0(r16)
|
|
Packit |
5c3484 |
mulq r1, r19, r5 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj6): umulh r1, r19, r21 C U1
|
|
Packit |
5c3484 |
cmpult r8, r6, r28
|
|
Packit |
5c3484 |
subq r8, r6, r8
|
|
Packit |
5c3484 |
addq r22, r28, r28
|
|
Packit |
5c3484 |
stq r8, 8(r16)
|
|
Packit |
5c3484 |
mulq r2, r19, r6 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj5): umulh r2, r19, r22 C U1
|
|
Packit |
5c3484 |
cmpult r8, r7, r28
|
|
Packit |
5c3484 |
subq r8, r7, r8
|
|
Packit |
5c3484 |
addq r23, r28, r28
|
|
Packit |
5c3484 |
stq r8, 16(r16)
|
|
Packit |
5c3484 |
mulq r3, r19, r7 C U1
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj4): umulh r3, r19, r23 C U1
|
|
Packit |
5c3484 |
cmpult r8, r4, r28
|
|
Packit |
5c3484 |
subq r8, r4, r8
|
|
Packit |
5c3484 |
addq r20, r28, r28
|
|
Packit |
5c3484 |
stq r8, 24(r16)
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj3): cmpult r8, r5, r28 C from here on all products are already available
|
|
Packit |
5c3484 |
subq r8, r5, r8
|
|
Packit |
5c3484 |
addq r21, r28, r28
|
|
Packit |
5c3484 |
stq r8, 32(r16)
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj2): cmpult r8, r6, r28
|
|
Packit |
5c3484 |
subq r8, r6, r8
|
|
Packit |
5c3484 |
addq r22, r28, r28
|
|
Packit |
5c3484 |
stq r8, 40(r16)
|
|
Packit |
5c3484 |
subq r8, r28, r8
|
|
Packit |
5c3484 |
L(cj1): cmpult r8, r7, r28
|
|
Packit |
5c3484 |
subq r8, r7, r8
|
|
Packit |
5c3484 |
addq r23, r28, r28
|
|
Packit |
5c3484 |
stq r8, 48(r16) C last stored word
|
|
Packit |
5c3484 |
subq r8, r28, r0 C r0 = final running word, left in r0 for the caller
|
|
Packit |
5c3484 |
ret r31, (r26), 1 C return through the address in r26
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
EPILOGUE()
|
|
Packit |
5c3484 |
ASM_END()
|