|
Packit |
5c3484 |
dnl ARM64 Neon mpn_popcount -- mpn bit population count.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2013, 2014 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/limb
|
|
Packit |
5c3484 |
C Cortex-A53 ?
|
|
Packit |
5c3484 |
C Cortex-A57 ?
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
|
|
Packit |
5c3484 |
C * Consider greater unrolling.
|
|
Packit |
5c3484 |
C * Arrange to align the pointer, if that helps performance. Use the same
|
|
Packit |
5c3484 |
C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
|
|
Packit |
5c3484 |
C valgrind!)
|
|
Packit |
5c3484 |
C * Explore if explicit align directives, e.g., "[ptr:128]" help.
|
|
Packit |
5c3484 |
C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Move the m4 comment delimiter away from the hash sign, so that macros used
C inside immediate operands (maxsize, chunksize) are still expanded.
changecom(@&*$)

C INPUT PARAMETERS
define(`ap',	x0)
define(`n',	x1)

C mpn_popcount returns in x0 the number of set bits in the n limbs at ap.
C
C Register use:
C   v0-v3   limb data
C   v6,v7   per-byte bit counts produced by cnt
C   v4,v5   16 16-bit summation counters (8 in each register)
C   x11     maxsize
C   x4,x5,x7,x8,x9,x10   used only by the count > maxsize code, see below
C
C We sum into 16 16-bit counters in v4,v5, and at the end we add these into
C 8 16-bit counters, none of which may exceed 2^16-1.  A full 128-bit vector
C (2 limbs) adds at most 16 to each counter, i.e., at most 8 per limb, which
C by itself would allow 0x1fff limbs.  However, an odd leading limb is loaded
C alone into the low half of a vector (the high half is zeroed by the load)
C and adds up to 16 to counters 0-3 on its own.  The worst case for those
C counters is thus 8*(n + (n mod 2)).  For n = 0x1fff that is exactly 2^16,
C which would wrap to 0, so the largest safe count is 0x1ffe limbs, giving
C 8*0x1ffe = 65520.  We use a chunksize close to that, but which is a multiple
C of 8 and thereby allows the huge count code to jump deep into the code (at
C L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)

C Entry point for n <= maxsize; also called for the final piece of a huge
C operand.  The low three bits of n are peeled off first (1, 2, then 4 limbs)
C so that the main loop can work on 8 limbs per iteration.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)				C nothing beyond these 4 limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C no limbs left, v5 is still zero

C Entry point used by the huge count code, with n = limbs to process - 8 and
C v4,v5 cleared by the caller.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Loads, cnt and uadalp of different
C limb groups are interleaved.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C cannot wrap since n <= maxsize
						C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C The code above returns its result in x0 and uses x1 as scratch, i.e., it
C overwrites both ap and n.  The pointer to the next block is therefore kept
C in ap2 and the remaining count in x7.  The return address is kept in x8
C while bl overwrites x30.
define(`ap2', x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C loop while more than maxsize limbs remain
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8
	ret
EPILOGUE()
|