|
Packit |
5c3484 |
dnl IA-64 mpn_sqr_diag_addlsh1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Contributed to the GNU project by Torbjorn Granlund.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2010, 2011 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/limb
|
|
Packit |
5c3484 |
C Itanium: ?
|
|
Packit |
5c3484 |
C Itanium 2: 2 Unrolling could bring it to 1.5 + epsilon
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
|
|
Packit |
5c3484 |
C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
|
|
Packit |
5c3484 |
C few cycles better since we can mitigate the many I0 instructions.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
|
Packit |
5c3484 |
C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
|
|
Packit |
5c3484 |
C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C We should keep in mind that this code takes linear time in an O(n^2) context
|
|
Packit |
5c3484 |
C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
|
|
Packit |
5c3484 |
C around 60. Keeping overhead down for smallish operands (< 10) is more
|
|
Packit |
5c3484 |
C important than optimal cycle counts.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
|
|
Packit |
5c3484 |
C * Make sure we don't depend on uninitialised r-registers, f-registers, or
|
|
Packit |
5c3484 |
C   p-registers.
|
|
Packit |
5c3484 |
C * Optimise by doing first two loop iterations in function header.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C INPUT PARAMETERS
|
|
Packit |
5c3484 |
C The four arguments arrive in r32-r35.  The three pointers are copied to
C r14, r15 and r31 at function entry, since r32 upwards is declared rotating
C by the alloc in the function and r32/r36 are overwritten by the loop loads.
define(`rp_param', `r32') define(`rp', `r14') C size: 2n
|
|
Packit |
5c3484 |
define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
|
|
Packit |
5c3484 |
define(`up_param', `r34') define(`up', `r31') C size: n
|
|
Packit |
5c3484 |
define(`n', `r35')
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C ABI64 and ABI32 each take one argument.  When HAVE_ABI_32 is defined ABI32
C expands to its argument and ABI64 to nothing; otherwise the roles are
C swapped.  This lets one source line carry both ABI variants.
ifdef(`HAVE_ABI_32',`
|
|
Packit |
5c3484 |
define(`ABI64', `')
|
|
Packit |
5c3484 |
define(`ABI32', `$1')
|
|
Packit |
5c3484 |
',`
|
|
Packit |
5c3484 |
define(`ABI64', `$1')
|
|
Packit |
5c3484 |
define(`ABI32', `')
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C mpn_sqr_diag_addlsh1 (rp, tp, up, n)
C
C Arguments arrive in r32-r35 (rp_param, tp_param, up_param, n).  The loop
C squares limbs loaded from up (xma.l/xma.hu), builds one-bit-left-shifted
C limbs out of consecutive tp limbs (shrp ..., 63), combines them with carry
C propagation and stores the results through rp, two limbs per trip.
C NOTE(review): presumably rp = diagonal squares of up + 2*tp, as the name
C suggests -- confirm the exact contract against the generic C code.
C
C ar.pfs, ar.lc, ar.ec and pr are saved in r2, r3, r9 and r10 on entry and
C restored before the return.
ASM_START()
|
|
Packit |
5c3484 |
PROLOGUE(mpn_sqr_diag_addlsh1)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
.prologue
|
|
Packit |
5c3484 |
.save ar.pfs, r2
|
|
Packit |
5c3484 |
.save ar.lc, r3
|
|
Packit |
5c3484 |
.body
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Frame: 4 inputs, 24 locals, 0 outputs, 24 rotating registers.  Old ar.pfs
C goes to r2 and the caller's ar.lc to r3.  Under the 32-bit ABI n is
C zero-extended from 32 bits.
{.mii; alloc r2 = ar.pfs, 4,24,0,24 C M
|
|
Packit |
5c3484 |
mov r3 = ar.lc C I0
|
|
Packit |
5c3484 |
ABI64(` nop 4711 ')
|
|
Packit |
5c3484 |
ABI32(` zxt4 n = n ')
|
|
Packit |
5c3484 |
C Copy the three pointers out of the argument registers into r15, r31 and
C r14.  The 32-bit ABI uses addp4 to form the 64-bit addresses.
}{.mmi; ABI64(` mov tp = tp_param ') C M I
|
|
Packit |
5c3484 |
ABI32(` addp4 tp = 0, tp_param') C M I
|
|
Packit |
5c3484 |
ABI64(` mov up = up_param ') C M I
|
|
Packit |
5c3484 |
ABI32(` addp4 up = 0, up_param') C M I
|
|
Packit |
5c3484 |
ABI64(` mov rp = rp_param ') C M I
|
|
Packit |
5c3484 |
ABI32(` addp4 rp = 0, rp_param') C M I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Preload the first tp limb into r36, the register the first stage-p16 load
C of the loop writes.  r20 = n - 2 becomes the loop count; ar.ec is saved
C in r9.
}{.mmi; ld8 r36 = [tp], 8 C M
|
|
Packit |
5c3484 |
add r20 = -2, n C M I
|
|
Packit |
5c3484 |
mov r9 = ar.ec C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Preload the second tp limb into r32 and set the epilogue count to 7.
C r16, r44 and r33 are zeroed.  NOTE(review): these look like seed values
C for the pipeline (zero addend for the first low product, zero bit shifted
C into the first tp limb) -- confirm.
}{.mmi; ld8 r32 = [tp], 8 C M
|
|
Packit |
5c3484 |
mov r16 = 0 C M I
|
|
Packit |
5c3484 |
mov ar.ec = 7 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
}{.mmi; nop 4711
|
|
Packit |
5c3484 |
mov r44 = 0 C M I
|
|
Packit |
5c3484 |
mov ar.lc = r20 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Save all predicates in r10, then load the rotating predicates: 0x30000
C has only bits 16 and 17 set, so p16 and p17 start true and the rest of
C p16-p63 start false.
}{.mii; mov r33 = 0
|
|
Packit |
5c3484 |
mov r10 = pr C I0
|
|
Packit |
5c3484 |
mov pr.rot = 0x30000 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C NOTE(review): br.cexit is the counted-loop exit form; it appears to apply
C one count/rotation step before the body is first entered, which would
C match the preloads above -- confirm against the ISA manual.
} br.cexit.spnt.few.clr L(end)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
dnl *** MAIN LOOP START ***
|
|
Packit |
5c3484 |
ALIGN(32)
|
|
Packit |
5c3484 |
C Each trip is three instruction groups.  Stage predicates p16-p24 gate the
C pipeline stages.  A value written to a rotating register is seen under the
C next higher register number after each br.ctop.
L(top):
|
|
Packit |
5c3484 |
C Group 1: load an up limb into the FP file (p18); form the low and high
C halves of f35*f35 + f42 (p20); move r16 into the FP file; add r49 to r41
C giving r50 (p23).  When p41 is set, p50 records whether r44 is all ones,
C i.e. whether the increment of r44 in group 2 will wrap.
{.mfi; (p18) ldf8 f33 = [up], 8 C M
|
|
Packit |
5c3484 |
(p20) xma.l f36 = f35, f35, f42 C F
|
|
Packit |
5c3484 |
(p41) cmpequc p50, p0 = -1, r44 C M I
|
|
Packit |
5c3484 |
}{.mfi; setfsig f40 = r16 C M23
|
|
Packit |
5c3484 |
(p20) xma.hu f38 = f35, f35, f42 C F
|
|
Packit |
5c3484 |
(p23) add r50 = r41, r49 C M I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Group 2: load a tp limb (p16); p40 = carry out of the add that produced
C r50 (p23); shrp by 63 yields (r38 << 1) | (r35 >> 63) (p19); fetch the
C high product half (p21); store r51 (p24); add 1 to r44 when p41 is set.
}{.mmi; (p16) ld8 r36 = [tp], 8 C M
|
|
Packit |
5c3484 |
(p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
|
|
Packit |
5c3484 |
(p19) shrp r45 = r38, r35, 63 C non-critical I0
|
|
Packit |
5c3484 |
}{.mmi; (p21) getfsig r39 = f39 C hi M2
|
|
Packit |
5c3484 |
(p24) st8 [rp] = r51, 8 C hi M23
|
|
Packit |
5c3484 |
(p41) add r44 = 1, r44 C M I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Group 3: load the next tp limb (p16); when p50 is set, also set p40 if
C r50 is all ones, then add 1 to r50; shrp by 63 yields
C (r33 << 1) | (r37 >> 63) in r16 (p17); fetch the low product half (p21);
C store r44 (p23).
}{.mmi; (p16) ld8 r32 = [tp], 8 C M
|
|
Packit |
5c3484 |
(p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
|
|
Packit |
5c3484 |
(p17) shrp r16 = r33, r37, 63 C critical I0
|
|
Packit |
5c3484 |
}{.mmi; (p21) getfsig r42 = f37 C lo M2
|
|
Packit |
5c3484 |
(p23) st8 [rp] = r44, 8 C lo M23
|
|
Packit |
5c3484 |
(p50) add r50 = 1, r50 C M I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
} br.ctop.sptk.few.clr L(top) C B
|
|
Packit |
5c3484 |
dnl *** MAIN LOOP END ***
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Wind-down: three more limbs are stored (r51, r44, r50), then the saved
C ar.lc, ar.ec, pr and ar.pfs are restored.
L(end):
|
|
Packit |
5c3484 |
C Add 1 to r44 when p41 is set.  r48 = top bit of r39.
C NOTE(review): r39 is presumably the last tp limb after rotation -- confirm.
{.mmi; nop 4711
|
|
Packit |
5c3484 |
(p41) add r44 = 1, r44 C M I
|
|
Packit |
5c3484 |
shr.u r48 = r39, 63 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Store r51.  Under p41, p6 records whether the increment wrapped r44 to
C zero.  r50 = r41 + r48.
}{.mmi; st8 [rp] = r51, 8 C M23
|
|
Packit |
5c3484 |
(p41) cmpequc p6, p0 = 0, r44 C M I
|
|
Packit |
5c3484 |
add r50 = r41, r48 C M I
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Store r44, add 1 to r50 when p6 is set, restore ar.lc.
}{.mmi; st8 [rp] = r44, 8 C M23
|
|
Packit |
5c3484 |
(p6) add r50 = 1, r50 C M I
|
|
Packit |
5c3484 |
mov ar.lc = r3 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Store the final limb (no post-increment), restore ar.ec and the predicates.
}{.mii; st8 [rp] = r50 C M23
|
|
Packit |
5c3484 |
mov ar.ec = r9 C I0
|
|
Packit |
5c3484 |
mov pr = r10 C I0
|
|
Packit |
5c3484 |
;;
|
|
Packit |
5c3484 |
C Restore ar.pfs and return through b0.
}{.mib; nop 4711
|
|
Packit |
5c3484 |
mov ar.pfs = r2 C I0
|
|
Packit |
5c3484 |
br.ret.sptk.many b0 C B
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
EPILOGUE()
|