dnl  ARM Neon mpn_lshift and mpn_rshift.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Build configuration.  Presumably this is what provides the m4 macros used
dnl  below (C, L, PROLOGUE, EPILOGUE, ...), since none of them is defined in
dnl  this file.
include(`../config.m4')

C            cycles/limb     cycles/limb     cycles/limb      good
C              aligned        unaligned       best seen     for cpu?
C StrongARM      -               -
C XScale         -               -
C Cortex-A7      ?               ?
C Cortex-A8      ?               ?
C Cortex-A9      3               3                              Y
C Cortex-A15     1.5             1.5                            Y


C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should be possible to read also using 64-bit alignment, by manipulating
C the shift count for unaligned operands.  Not done, since it does not seem to
C matter for A9 or A15.
C
C This will not work in big-endian mode.

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the
C    feed-in code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.

C INPUT PARAMETERS
C Symbolic names for the four argument registers r0-r3 as used by the code
C below: results are stored through rp, source limbs are loaded through ap,
C n counts limbs and cnt is the shift count.
define(`rp',  `r0')
define(`ap',  `r1')
define(`n',   `r2')
define(`cnt', `r3')

C Per-operation helper macros.  Which set is defined depends on whether
C OPERATION_lshift or OPERATION_rshift is defined when this file is processed.
C
C   IFLSH   expands to its argument for lshift, to nothing for rshift
C   IFRSH   expands to its argument for rshift, to nothing for lshift
C   X       index of the 32-bit lane of d18 that is returned in r0
C   Y       index of the 32-bit lane used by the single-limb Neon loads and
C           stores (first limb when rp is unaligned, last limb for odd counts)
C   func    name given to PROLOGUE
ifdef(`OPERATION_lshift',`
        define(`IFLSH', `$1')
        define(`IFRSH', `')
        define(`X',`0')
        define(`Y',`1')
        define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
        define(`IFLSH', `')
        define(`IFRSH', `$1')
        define(`X',`1')
        define(`Y',`0')
        define(`func',`mpn_rshift')
')

C This single source file is built once per operation; MULFUNC_PROLOGUE lists
C the two functions it can provide.  (The macro itself is defined outside this
C file; presumably it is what the build system scans for.)
MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)

ASM_START()
        TEXT
        ALIGN(64)
C func is mpn_lshift or mpn_rshift, depending on which OPERATION_ symbol is
C defined.
C
C In:   rp  (r0)  address the result limbs are stored to
C       ap  (r1)  address the source limbs are loaded from
C       n   (r2)  number of 32-bit limbs; n <= 4 takes the scalar path
C       cnt (r3)  shift count in bits.  The code forms 32 - cnt and 64 - cnt
C                 from it, so presumably 1 <= cnt <= 31; this is not checked.
C Out:  r0        the bits shifted out of the operand: the most significant
C                 limb shifted right by 32 - cnt for lshift, the least
C                 significant limb shifted left by 32 - cnt for rshift
C Uses: r3, r12, d0-d7, d16-d19 and the flags.  r4, r6, r7 and r8 are pushed
C       and popped around the scalar code.
C
C lshift walks the operand from its most significant limb downwards, rshift
C from limb 0 upwards.
PROLOGUE(func)
C For lshift, point rp and ap just past the last of the n limbs (4 bytes each).
IFLSH(` mov     r12, n, lsl #2  ')
IFLSH(` add     rp, rp, r12     ')
IFLSH(` add     ap, ap, r12     ')

        cmp     n, #4                   C SIMD code n limit
        ble     L(base)                 C n <= 4 is handled by the scalar code

C d6 = shift count applied to every 64-bit word, d7 = the complementary count
C that extracts the bits crossing into the neighbouring word.  r12 becomes the
C post-index step for all 64-bit loads and stores.
ifdef(`OPERATION_lshift',`
        vdup.32  d6, r3                 C left shift count is positive
        sub     r3, r3, #64             C right shift count is negative
        vdup.32  d7, r3
        mov     r12, #-8')              C lshift pointer update offset
ifdef(`OPERATION_rshift',`
        rsb     r3, r3, #0              C right shift count is negative
        vdup.32  d6, r3
        add     r3, r3, #64             C left shift count is positive
        vdup.32  d7, r3
        mov     r12, #8')               C rshift pointer update offset

IFLSH(` sub     ap, ap, #8      ')
        vld1.32  {d19}, [ap], r12       C load initial 2 limbs
        vshl.u64 d18, d19, d7           C retval

C If rp is not 64-bit aligned, produce one limb on its own so that all
C following 64-bit stores are aligned.  ap is stepped back by one limb so the
C reload below overlaps the pair already loaded.
        tst     rp, #4                  C is rp 64-bit aligned already?
        beq     L(rp_aligned)           C yes, skip
IFLSH(` add     ap, ap, #4      ')      C move back ap pointer
IFRSH(` sub     ap, ap, #4      ')      C move back ap pointer
        vshl.u64 d4, d19, d6
        sub     n, n, #1                C first limb handled
IFLSH(` sub     rp, rp, #4      ')
        vst1.32  {d4[Y]}, [rp]IFRSH(!)  C store first limb, rp gets aligned
        vld1.32  {d19}, [ap], r12       C load ap[1] and ap[2]

L(rp_aligned):
IFLSH(` sub     rp, rp, #8      ')
        subs    n, n, #6
        blt     L(two_or_three_more)    C fewer than 6 limbs left, wind down
        tst     n, #2                   C pick the loop entry point
        beq     L(2)

C Feed-in, entering the loop at L(mid).
L(1):   vld1.32  {d17}, [ap], r12
        vshl.u64 d5, d19, d6
        vld1.32  {d16}, [ap], r12
        vshl.u64 d0, d17, d7
        vshl.u64 d4, d17, d6
        sub     n, n, #2
        b       L(mid)

C Feed-in, entering the loop at L(top) unless fewer than 4 limbs remain.
L(2):   vld1.32  {d16}, [ap], r12
        vshl.u64 d4, d19, d6
        vld1.32  {d17}, [ap], r12
        vshl.u64 d1, d16, d7
        vshl.u64 d5, d16, d6
        subs    n, n, #4
        blt     L(end)

C Main loop: two 64-bit loads and two 64-bit aligned stores (4 limbs) per
C iteration.  Each stored word is the OR of a word shifted by d6 and its
C neighbour shifted by d7.
L(top): vld1.32  {d16}, [ap], r12
        vorr    d2, d4, d1
        vshl.u64 d0, d17, d7
        vshl.u64 d4, d17, d6
        vst1.32  {d2}, [rp:64], r12
L(mid): vld1.32  {d17}, [ap], r12
        vorr    d3, d5, d0
        vshl.u64 d1, d16, d7
        vshl.u64 d5, d16, d6
        vst1.32  {d3}, [rp:64], r12
        subs    n, n, #4
        bge     L(top)

C Wind-down: bit 0 of n tells whether a single odd limb is still to be loaded.
L(end): tst     n, #1
        beq     L(evn)

        vorr    d2, d4, d1
        vst1.32  {d2}, [rp:64], r12
        b       L(cj1)

L(evn): vorr    d2, d4, d1
        vshl.u64 d0, d17, d7
        vshl.u64 d16, d17, d6
        vst1.32  {d2}, [rp:64], r12
        vorr    d2, d5, d0
        b       L(cj2)

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
        tst     n, #1
        beq     L(l2)

L(l3):  vshl.u64 d5, d19, d6
        vld1.32  {d17}, [ap], r12
C Common tail for an odd number of remaining limbs.  d16 is cleared and then
C only its lane Y is loaded, so the other lane contributes zero bits.
L(cj1): veor    d16, d16, d16
IFLSH(` add     ap, ap, #4      ')
        vld1.32  {d16[Y]}, [ap], r12
        vshl.u64 d0, d17, d7
        vshl.u64 d4, d17, d6
        vorr    d3, d5, d0
        vshl.u64 d1, d16, d7
        vshl.u64 d5, d16, d6
        vst1.32  {d3}, [rp:64], r12
        vorr    d2, d4, d1
        vst1.32  {d2}, [rp:64], r12
IFLSH(` add     rp, rp, #4      ')
        vst1.32  {d5[Y]}, [rp]          C final single limb
        vmov.32  r0, d18[X]             C return value computed at entry
        bx      lr

L(l2):  vld1.32  {d16}, [ap], r12
        vshl.u64 d4, d19, d6
        vshl.u64 d1, d16, d7
        vshl.u64 d16, d16, d6
        vorr    d2, d4, d1
C Common tail for an even number of remaining limbs.
L(cj2): vst1.32  {d2}, [rp:64], r12
        vst1.32  {d16}, [rp]
        vmov.32  r0, d18[X]             C return value computed at entry
        bx      lr


C Scalar code for n <= 4.  r12 is free here (its Neon use as pointer step is
C not reached on this path) and holds tnc = 32 - cnt.
define(`tnc', `r12')
L(base):
        push    {r4, r6, r7, r8}
ifdef(`OPERATION_lshift',`
        ldr     r4, [ap, #-4]!          C most significant limb, kept for retval
        rsb     tnc, cnt, #32           C tnc = 32 - cnt

        mov     r7, r4, lsl cnt
        tst     n, #1
        beq     L(ev)                   C n even

L(od):  subs    n, n, #2
        bcc     L(ed1)                  C n = 1
        ldr     r8, [ap, #-4]!
        b       L(md)                   C n = 3

L(ev):  ldr     r6, [ap, #-4]!
        subs    n, n, #2
        beq     L(ed)                   C n = 2
                                        C n = 4
L(tp):  ldr     r8, [ap, #-4]!
        orr     r7, r7, r6, lsr tnc
        str     r7, [rp, #-4]!
        mov     r7, r6, lsl cnt
L(md):  ldr     r6, [ap, #-4]!
        orr     r7, r7, r8, lsr tnc
        str     r7, [rp, #-4]!
        mov     r7, r8, lsl cnt

L(ed):  orr     r7, r7, r6, lsr tnc
        str     r7, [rp, #-4]!
        mov     r7, r6, lsl cnt
L(ed1): str     r7, [rp, #-4]
        mov     r0, r4, lsr tnc         C bits shifted out at the top
')
ifdef(`OPERATION_rshift',`
        ldr     r4, [ap]                C least significant limb, kept for retval
        rsb     tnc, cnt, #32           C tnc = 32 - cnt

        mov     r7, r4, lsr cnt
        tst     n, #1
        beq     L(ev)                   C n even

L(od):  subs    n, n, #2
        bcc     L(ed1)                  C n = 1
        ldr     r8, [ap, #4]!
        b       L(md)                   C n = 3

L(ev):  ldr     r6, [ap, #4]!
        subs    n, n, #2
        beq     L(ed)                   C n = 2
                                        C n = 4

L(tp):  ldr     r8, [ap, #4]!
        orr     r7, r7, r6, lsl tnc
        str     r7, [rp], #4
        mov     r7, r6, lsr cnt
L(md):  ldr     r6, [ap, #4]!
        orr     r7, r7, r8, lsl tnc
        str     r7, [rp], #4
        mov     r7, r8, lsr cnt

L(ed):  orr     r7, r7, r6, lsl tnc
        str     r7, [rp], #4
        mov     r7, r6, lsr cnt
L(ed1): str     r7, [rp], #4
        mov     r0, r4, lsl tnc         C bits shifted out at the bottom
')
        pop     {r4, r6, r7, r8}
        bx      r14
EPILOGUE()