|
Packit |
5c3484 |
dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2006, 2010 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C                   cycles/limb
C 7400,7410 (G4):       ?
C 744x,745x (G4+):      1.125
C 970 (G5):             2.25
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
C  * Rewrite the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Limb geometry: bytes per limb, and limbs per one and per two 16-byte vector
C registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')

C Incoming arguments.
define(`ap',	`r3')
define(`n',	`r4')

C Vector registers that hold constants for the whole function.
define(`rtab',	`v10')
define(`cnt4',	`v11')

C LIMB32(x) emits x only when limbs are 32 bits, LIMB64(x) only otherwise.
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C Each pass of the inner loop adds at most 32 to every 32-bit word of v12 and
C of v13 (vsum4ubs over four byte counts of at most 8 each), and the words of
C the two registers are finally added with vadduwm, i.e. modulo 2^32.  A run
C of 2^28 64-bit limbs (2^34 bits) of all ones would therefore bring each word
C sum to exactly 2^32, which wraps to zero.  For large operands we work in
C chunks small enough that every word sum stays below 2^32.
C
C Both values below are the upper 16 bits of the limb count, as they are used
C with lis and addis: the chunk is LIMBS_PER_CHUNK * 2^16 = 2^27 limbs (2^33
C bits).  LIMBS_CHUNK_THRES is one more than LIMBS_PER_CHUNK, which keeps the
C part left over after a full chunk nonzero.
define(`LIMBS_PER_CHUNK',   0x800)
define(`LIMBS_CHUNK_THRES', 0x801)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()

C mpn_popcount -- count the set bits in the n limbs starting at ap, using VMX.
C
C In:	ap (r3)   pointer to the least significant limb
C	n  (r4)   number of limbs
C Out:	r3        number of set bits
C
C Register use:
C	r0        chunk threshold (64-bit limbs only)
C	r6, r7    scratch: mask table offsets, loop count, partial sums
C	r8        limb offset of ap within its vector, then the grand total
C	r9        limbs left after the current chunk (64-bit limbs only)
C	r10       caller VRSAVE, restored before returning
C	r11       address of cnsts
C	r12       constant 16, the index of the second vector of each pair
C	v0, v1    data vectors
C	v2-v9     temporaries
C	rtab      nibble popcount table, used through vperm
C	cnt4      per-byte shift count 4
C	v12, v13  per-word running sums
C
C Every data byte is counted as rtab[low nibble] + rtab[high nibble]; vsum4ubs
C folds the byte counts into the four 32-bit words of v12 and v13.  The 16
C bytes just below the stack pointer are used as scratch for the final sum.

PROLOGUE(mpn_popcount,toc)
	mfspr	r10, 256		C save caller VRSAVE
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	lvx	rtab, 0, r11

LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

C Load the vector that holds the first limb and clear the limbs below ap
	lvx	v0, 0, ap
	addi	r7, r11, 80		C masks for low end of number
	rlwinm	r6, ap, 2,26,29		C 4 * (ap mod 16) selects the mask
	lvx	v8, r7, r6
	vand	v0, v0, v8

LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down ap

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

C At most one vector: v0 is also the last vector, v1 stays zero
	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)

C At most two vectors: v0 becomes the full one, the next is the last
	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)

C For 64-bit machines, handle huge n that would overflow the word sums
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C set the loop count
	b	L(ent)			C v0 is already loaded and masked

	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4		C high nibbles of v0
	vsrb	v9, v1, cnt4		C high nibbles of v1
	vperm	v2, rtab, rtab, v0	C counts of low nibbles
	vperm	v3, rtab, rtab, v8	C counts of high nibbles
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3		C counts per byte
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12		C accumulate per 32-bit word
	vsum4ubs v13, v7, v13
	bdnz	L(top)

C Tail: the limbs that do not fill a whole pair of vectors
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
L(lsum):
	vor	v1, v0, v0		C v1 = full vector
	lvx	v0, r12, ap		C v0 = last vector
L(sum):
C Clear the limbs of v0 that lie beyond the end of the operand
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16		C masks for high end of number
	lvx	v8, r7, r6
	vand	v0, v0, v8
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13		C combine the two running sums
	li	r7, -16			C FIXME: do all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

C Add the four word sums to the grand total
	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

	ALIGN(16)
L(ret):	mr	r3, r8
	mtspr	256, r10		C restore caller VRSAVE
	blr
EPILOGUE()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
DEF_OBJECT(cnsts,16)
C Nine 16-byte vectors, one per line.
C
C Offset 0: counts for vperm, the number of set bits in each nibble value
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Offsets 16, 32, 48, 64: masks for high end of number, keeping the first
C 16, 4, 8 and 12 bytes of a vector
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Offsets 80, 96, 112, 128: masks for low end of number, clearing the first
C 0, 4, 8 and 12 bytes of a vector
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()
|