|
Packit |
5c3484 |
dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2010, 2013 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C norm frac
|
|
Packit |
5c3484 |
C itanium 1
|
|
Packit |
5c3484 |
C itanium 2 29 29
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
|
|
Packit |
5c3484 |
C * Inline and interleave limb inversion code with loop setup code.
|
|
Packit |
5c3484 |
C * We should use explicit bundling in much of the code, since it typically
|
|
Packit |
5c3484 |
C cuts some cycles with the GNU assembler.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
// NOTE(review): the assembler start macro is invoked here and a second time
// right before the function prologue; presumably harmless -- confirm.
ASM_START()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C HP's assembler requires these declarations for importing mpn_invert_limb
|
|
Packit |
5c3484 |
.global mpn_invert_limb
|
|
Packit |
5c3484 |
.type mpn_invert_limb,@function
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C INPUT PARAMETERS
|
|
Packit |
5c3484 |
C qp = r32
|
|
Packit |
5c3484 |
C fn = r33
|
|
Packit |
5c3484 |
C np = r34
|
|
Packit |
5c3484 |
C nn = r35
|
|
Packit |
5c3484 |
C dp = r36
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
// Alternate name for f15, the register the setup code loads with the
// constant 1.  The instructions visible in this file name f15 directly.
define(`f0x1', `f15')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
|
|
Packit |
5c3484 |
// Divide the dividend at np (nn limbs, followed by fn zero fraction limbs)
// by the normalized two-limb divisor at dp.
// Quotient limbs are stored through qp (r32), the two remainder limbs are
// stored back through np (r34), and r8 returns 0 or 1 (kept in r40).
// Register roles established by this entry code:
//   r36 = d0, r39 = d1       divisor limbs dp[0] and dp[1]
//   r37, r38                 low and high limb of the running remainder
//   r40                      return value, initially 0
//   r41, r42, r43, r45       saved b0, ar.pfs, r1 and ar.lc
//   r46                      outgoing argument for mpn_invert_limb
PROLOGUE(mpn_divrem_2)
|
|
Packit |
5c3484 |
.prologue
|
|
Packit |
5c3484 |
// With the 32-bit ABI the three pointer arguments go through addp4 and the
// two limb counts are zero-extended from 32 bits.
ifdef(`HAVE_ABI_32',
|
|
Packit |
5c3484 |
` addp4 r32 = 0, r32 C M I
|
|
Packit |
5c3484 |
addp4 r34 = 0, r34 C M I
|
|
Packit |
5c3484 |
zxt4 r35 = r35 C I
|
|
Packit |
5c3484 |
addp4 r36 = 0, r36 C M I
|
|
Packit |
5c3484 |
nop.m 0
|
|
Packit |
5c3484 |
zxt4 r33 = r33 C I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
// Register frame: 5 inputs r32-r36, 9 locals r37-r45, 1 output r46.
.save ar.pfs, r42
|
|
Packit |
5c3484 |
alloc r42 = ar.pfs, 5, 9, 1, 0
|
|
Packit |
5c3484 |
// r34 = np + 8*nn, one limb past the most significant dividend limb.
shladd r34 = r35, 3, r34
|
|
Packit |
5c3484 |
adds r14 = 8, r36
|
|
Packit |
5c3484 |
mov r43 = r1
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// r15, r14 and r34 are set to address np[nn-1], np[nn-2] and np[nn-3].
adds r15 = -8, r34
|
|
Packit |
5c3484 |
ld8 r39 = [r14]
|
|
Packit |
5c3484 |
.save ar.lc, r45
|
|
Packit |
5c3484 |
mov r45 = ar.lc
|
|
Packit |
5c3484 |
adds r14 = -16, r34
|
|
Packit |
5c3484 |
mov r40 = r0
|
|
Packit |
5c3484 |
adds r34 = -24, r34
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
ld8 r38 = [r15]
|
|
Packit |
5c3484 |
.save rp, r41
|
|
Packit |
5c3484 |
mov r41 = b0
|
|
Packit |
5c3484 |
.body
|
|
Packit |
5c3484 |
ld8 r36 = [r36]
|
|
Packit |
5c3484 |
ld8 r37 = [r14]
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// d1 > n2: the top two dividend limbs are already below the divisor.
cmp.gtu p6, p7 = r39, r38
|
|
Packit |
5c3484 |
(p6) br.cond.dptk .L8
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// Here d1 <= n2.  p7 ends up set when d1 < n2, or when d0 <= n1 (the
// predicated parallel compare below forces p6 = 0 and p7 = 1 in that case).
// So p7 means the top two dividend limbs are >= the divisor.
cmp.leu p8, p9 = r36, r37
|
|
Packit |
5c3484 |
cmp.geu p6, p7 = r39, r38
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) cmp4.ne.and.orcm p6, p7 = 0, r0
|
|
Packit |
5c3484 |
(p7) br.cond.dptk .L51
|
|
Packit |
5c3484 |
// Loop setup.  r35 = fn + nn - 3 becomes both the loop count register value
// and the offset of the first quotient limb; the counted loop stores one
// quotient limb per pass and runs r35 + 1 times.  When r35 is negative no
// quotient limb is produced and control goes straight to the exit code.
.L8:
|
|
Packit |
5c3484 |
add r14 = r33, r35 // un + fn
|
|
Packit |
5c3484 |
mov r46 = r39 // argument to mpn_invert_limb
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
adds r35 = -3, r14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
cmp.gt p12, p0 = r0, r35
|
|
Packit |
5c3484 |
(p12) br.cond.dpnt L(end)
|
|
Packit |
5c3484 |
br.call.sptk.many b0 = mpn_invert_limb
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// r8 holds the value returned by mpn_invert_limb for d1.  r1 is restored
// from the copy saved in r43 before the call.
setf.sig f11 = r8 // di (non-final)
|
|
Packit |
5c3484 |
setf.sig f34 = r39 // d1
|
|
Packit |
5c3484 |
setf.sig f33 = r36 // d0
|
|
Packit |
5c3484 |
mov r1 = r43
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
mov r17 = 1
|
|
Packit |
5c3484 |
setf.sig f9 = r38 // n2
|
|
Packit |
5c3484 |
xma.l f6 = f11, f34, f0 // t0 = LO(di * d1)
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
setf.sig f10 = r37 // n1
|
|
Packit |
5c3484 |
setf.sig f15 = r17 // 1
|
|
Packit |
5c3484 |
xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0)
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
getf.sig r17 = f6
|
|
Packit |
5c3484 |
getf.sig r16 = f8
|
|
Packit |
5c3484 |
mov ar.lc = r35
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
sub r18 = r0, r39 // -d1
|
|
Packit |
5c3484 |
add r14 = r17, r36
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// r16 = t0 + d0 + s0 (mod 2^64).  r19 ends up as the number of carries out
// of those two additions minus one: -1, 0 or 1.
setf.sig f14 = r18 // -d1
|
|
Packit |
5c3484 |
cmp.leu p8, p9 = r17, r14
|
|
Packit |
5c3484 |
add r16 = r14, r16
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p9) adds r19 = 0, r0
|
|
Packit |
5c3484 |
(p8) adds r19 = -1, r0
|
|
Packit |
5c3484 |
cmp.gtu p6, p7 = r14, r16
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p6) adds r19 = 1, r19
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// Final adjustment of di (r8).  The m4 conditional below compares 1 with 1,
// so only its first branch is ever assembled: four unrolled rounds.  A round
// takes effect only while r19 is still non-negative; it decrements di,
// subtracts d1 from r16 and moves the borrow of that subtraction into r19.
// The second branch appears to be a loop form of the same adjustment and is
// never selected.
ifelse(1,1,`
|
|
Packit |
5c3484 |
cmp.gt p7, p6 = r0, r19
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p6) adds r8 = -1, r8 // di--
|
|
Packit |
5c3484 |
(p6) sub r14 = r16, r39 // t0 -= d1
|
|
Packit |
5c3484 |
(p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p6) cmp.gt p9, p8 = 1, r19
|
|
Packit |
5c3484 |
(p7) cmp.gt p9, p8 = 0, r19
|
|
Packit |
5c3484 |
(p6) adds r19 = -1, r19 // t1 -= cy
|
|
Packit |
5c3484 |
mov r16 = r14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) adds r8 = -1, r8 // di--
|
|
Packit |
5c3484 |
(p8) sub r14 = r16, r39 // t0 -= d1
|
|
Packit |
5c3484 |
(p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) cmp.gt p7, p6 = 1, r19
|
|
Packit |
5c3484 |
(p9) cmp.gt p7, p6 = 0, r19
|
|
Packit |
5c3484 |
(p8) adds r19 = -1, r19 // t1 -= cy
|
|
Packit |
5c3484 |
mov r16 = r14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p6) adds r8 = -1, r8 // di--
|
|
Packit |
5c3484 |
(p6) sub r14 = r16, r39 // t0 -= d1
|
|
Packit |
5c3484 |
(p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p6) cmp.gt p9, p8 = 1, r19
|
|
Packit |
5c3484 |
(p7) cmp.gt p9, p8 = 0, r19
|
|
Packit |
5c3484 |
(p6) adds r19 = -1, r19 // t1 -= cy
|
|
Packit |
5c3484 |
mov r16 = r14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) adds r8 = -1, r8 // di--
|
|
Packit |
5c3484 |
(p8) sub r14 = r16, r39 // t0 -= d1
|
|
Packit |
5c3484 |
(p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) adds r19 = -1, r19 // t1 -= cy
|
|
Packit |
5c3484 |
mov r16 = r14
|
|
Packit |
5c3484 |
',`
|
|
Packit |
5c3484 |
cmp.gt p8, p9 = r0, r19
|
|
Packit |
5c3484 |
(p8) br.cond.dpnt .L46
|
|
Packit |
5c3484 |
.L52:
|
|
Packit |
5c3484 |
cmp.leu p6, p7 = r39, r16
|
|
Packit |
5c3484 |
sub r14 = r16, r39
|
|
Packit |
5c3484 |
adds r8 = -1, r8
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p7) adds r19 = -1, r19
|
|
Packit |
5c3484 |
mov r16 = r14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p7) cmp.gt p8, p9 = r0, r19
|
|
Packit |
5c3484 |
(p9) br.cond.dptk .L52
|
|
Packit |
5c3484 |
.L46:
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
// f32 = adjusted di used by the main loop.  r32 = qp + 8*r35 addresses the
// most significant quotient limb; the loop stores downwards from there.
setf.sig f32 = r8 // di
|
|
Packit |
5c3484 |
shladd r32 = r35, 3, r32
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
// Main loop: one quotient limb per pass, most significant first.
// Live on entry to each pass: f9 = n2 and f10 = n1 (high and low limb of the
// running remainder), f32 = di, f33 = d0, f14 = -d1, f15 = 1, r36 = d0,
// r39 = d1.  The trailing numbers in the existing comments look like cycle
// positions -- TODO confirm.
ALIGN(16)
|
|
Packit |
5c3484 |
L(top): nop 0
|
|
Packit |
5c3484 |
nop 0
|
|
Packit |
5c3484 |
// Next low limb n0 (r37): zero while r35 is below fn (fraction limbs),
// otherwise loaded from [r34], which then steps down by one limb.
cmp.gt p8, p9 = r33, r35
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) mov r37 = r0
|
|
Packit |
5c3484 |
(p9) ld8 r37 = [r34], -8
|
|
Packit |
5c3484 |
xma.hu f8 = f9, f32, f10 // 0,29
|
|
Packit |
5c3484 |
xma.l f12 = f9, f32, f10 // 0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// q0 = low word of n2*di + n1.  f13 = high word of that sum plus n2 is the
// quotient limb candidate q.
getf.sig r20 = f12 // q0 4
|
|
Packit |
5c3484 |
xma.l f13 = f15, f8, f9 // q += n2 4
|
|
Packit |
5c3484 |
sub r8 = -1, r36 // bitnot d0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// f7 = n1 - d1*q (low word); f9:f6 = d0*q + d0.
getf.sig r18 = f13 // 8
|
|
Packit |
5c3484 |
xma.l f7 = f14, f13, f10 // 8
|
|
Packit |
5c3484 |
xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8
|
|
Packit |
5c3484 |
xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
getf.sig r38 = f7 // n1 12
|
|
Packit |
5c3484 |
getf.sig r16 = f6 // 13
|
|
Packit |
5c3484 |
getf.sig r19 = f9 // 14
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
sub r38 = r38, r39 // n1 -= d1 17
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// p11 = borrow out of n0 - t0, p10 = no borrow.
cmp.ne p9, p0 = r0, r0 // clear p9
|
|
Packit |
5c3484 |
cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
sub r37 = r37, r16 // n0 -= t0 19
|
|
Packit |
5c3484 |
(p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19
|
|
Packit |
5c3484 |
(p10) sub r38 = r38, r19 // n1 -= t1 19
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// p6 (q0 > r38): add 1 to the quotient limb.  p7 (r38 >= q0): add the
// divisor back to r38:r37 instead; p9 is the carry out of the low limb
// addition, detected as n0 > bitnot d0.
cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p7) cmp.ltu p9, p0 = r8, r37 // 21
|
|
Packit |
5c3484 |
(p6) add r18 = 1, r18 //
|
|
Packit |
5c3484 |
(p7) add r37 = r37, r36 // 21
|
|
Packit |
5c3484 |
(p7) add r38 = r38, r39 // 21
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
setf.sig f10 = r37 // n1 22
|
|
Packit |
5c3484 |
(p9) add r38 = 1, r38 // 22
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// Unlikely case r38 >= d1: take the out-of-line fix-up path, which comes
// back to the store below.
setf.sig f9 = r38 // n2 23
|
|
Packit |
5c3484 |
cmp.gtu p6, p7 = r39, r38 // 23
|
|
Packit |
5c3484 |
(p7) br.cond.spnt L(fix)
|
|
Packit |
5c3484 |
// Store the quotient limb through r32, step r32 down one limb, decrement
// the position counter r35 and loop on ar.lc.
L(bck): st8 [r32] = r18, -8
|
|
Packit |
5c3484 |
adds r35 = -1, r35
|
|
Packit |
5c3484 |
br.cloop.sptk.few L(top)
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
// Exit code: store the remainder limbs r37 (low) at r34 + 8 and r38 (high)
// at r34 + 16, restore b0, ar.pfs and ar.lc from r41, r42 and r45, and
// return r40 in r8.  r34 started at np[nn-3] and steps down one limb for
// every dividend limb the loop loads.
L(end): add r14 = 8, r34
|
|
Packit |
5c3484 |
add r15 = 16, r34
|
|
Packit |
5c3484 |
mov b0 = r41
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
st8 [r14] = r37
|
|
Packit |
5c3484 |
st8 [r15] = r38
|
|
Packit |
5c3484 |
mov ar.pfs = r42
|
|
Packit |
5c3484 |
mov r8 = r40
|
|
Packit |
5c3484 |
mov ar.lc = r45
|
|
Packit |
5c3484 |
br.ret.sptk.many b0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
// Reached from the entry code when the top two dividend limbs are >= the
// divisor: subtract the divisor once from r38:r37 and make the return value
// 1, then rejoin the setup code at .L8.  p8 and p9 still hold the result of
// the entry comparison of d0 with n1; p9 (d0 > n1) supplies the borrow into
// the high limb.
.L51:
|
|
Packit |
5c3484 |
.pred.rel "mutex", p8, p9
|
|
Packit |
5c3484 |
sub r37 = r37, r36
|
|
Packit |
5c3484 |
(p9) sub r38 = r38, r39, 1
|
|
Packit |
5c3484 |
(p8) sub r38 = r38, r39
|
|
Packit |
5c3484 |
adds r40 = 1, r0
|
|
Packit |
5c3484 |
br .L8
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
// Out-of-line fix-up for the main loop, entered when r38 >= d1.
// p6 survives only when r38 == d1 and r37 < d0, meaning the remainder is
// still below the divisor; in that case go back without changes.
// Otherwise subtract the divisor once more from r38:r37 (p9 = d0 > r37
// supplies the borrow), add 1 to the quotient limb r18, refresh the
// floating-point copies f9 and f10, and go back to the quotient store.
L(fix): cmp.geu p6, p7 = r39, r38
|
|
Packit |
5c3484 |
cmp.leu p8, p9 = r36, r37
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
(p8) cmp4.ne.and.orcm p6, p7 = 0, r0
|
|
Packit |
5c3484 |
(p6) br.cond.dptk L(bck)
|
|
Packit |
5c3484 |
sub r37 = r37, r36
|
|
Packit |
5c3484 |
(p9) sub r38 = r38, r39, 1
|
|
Packit |
5c3484 |
(p8) sub r38 = r38, r39
|
|
Packit |
5c3484 |
adds r18 = 1, r18
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
setf.sig f9 = r38 // n2
|
|
Packit |
5c3484 |
setf.sig f10 = r37 // n1
|
|
Packit |
5c3484 |
br L(bck)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
EPILOGUE()
|
|
Packit |
5c3484 |
ASM_END()
|