|
Packit |
5c3484 |
dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
|
|
Packit |
5c3484 |
dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
|
|
Packit |
5c3484 |
dnl logical operations.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2006 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C and,ior,andn,nior,xor iorn,xnor nand
|
|
Packit |
5c3484 |
C cycles/limb cycles/limb cycles/limb
|
|
Packit |
5c3484 |
C 7400,7410 (G4): 1.39 ? ?
|
|
Packit |
5c3484 |
C 744x,745x (G4+): 1.14 1.39 1.39
|
|
Packit |
5c3484 |
C 970: 1.7 2.0 2.0
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C STATUS
|
|
Packit |
5c3484 |
C * Works for all sizes and alignment for 32-bit limbs.
|
|
Packit |
5c3484 |
C * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
|
|
Packit |
5c3484 |
C * Current performance makes this pointless for 970
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C TODO
|
|
Packit |
5c3484 |
C * Might want to make variants when just one of the source operands needs
|
|
Packit |
5c3484 |
C vperm, and when neither needs it. The latter runs 50% faster on 7400.
|
|
Packit |
5c3484 |
C * Idea: If the source operands are equally aligned, we could do the logops
|
|
Packit |
5c3484 |
C first, then vperm before storing! That means we never need more than one
|
|
Packit |
5c3484 |
C vperm, ever!
|
|
Packit |
5c3484 |
C * Perhaps align `rp' after initial alignment loop?
|
|
Packit |
5c3484 |
C * Instead of having scalar code in the beginning and end, consider using
|
|
Packit |
5c3484 |
C read-modify-write vector code.
|
|
Packit |
5c3484 |
C * Software pipeline? Hopefully not too important, this is hairy enough
|
|
Packit |
5c3484 |
C already.
|
|
Packit |
5c3484 |
C * At least be more clever about operand loading, i.e., load v operands before
|
|
Packit |
5c3484 |
C u operands, since v operands are sometimes negated.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Limb geometry: bytes per limb, and the number of limbs held by one and by
C two 16-byte vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
|
|
Packit |
5c3484 |
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
|
|
Packit |
5c3484 |
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Optional complement hooks for the vector code: vnegb is applied to the v
C operand before logop, vnega to the result after logop.  Both expand to
C nothing unless the selected operation redefines them.
define(`vnegb', `') C default neg-before to null
|
|
Packit |
5c3484 |
define(`vnega', `') C default neg-after to null
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Per-operation configuration, chosen by whichever OPERATION_xxx symbol is
C defined.  Each entry sets func (the entry point name), logopS (the scalar
C instruction used for single limbs) and logop (the vector instruction).
C The vector unit is given no direct nand, orc or eqv here, so nand_n
C complements the result afterwards via vnega, while iorn_n and xnor_n
C complement the v operand beforehand via vnegb; the complement is vnor x,x.
ifdef(`OPERATION_and_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_and_n')
|
|
Packit |
5c3484 |
define(`logopS',`and $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vand $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_andn_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_andn_n')
|
|
Packit |
5c3484 |
define(`logopS',`andc $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vandc $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_nand_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_nand_n')
|
|
Packit |
5c3484 |
define(`logopS',`nand $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vand $1,$2,$3')
|
|
Packit |
5c3484 |
define(`vnega', `vnor $1,$2,$2')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_ior_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_ior_n')
|
|
Packit |
5c3484 |
define(`logopS',`or $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vor $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_iorn_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_iorn_n')
|
|
Packit |
5c3484 |
define(`logopS',`orc $1,$2,$3')
|
|
Packit |
5c3484 |
define(`vnegb', `vnor $1,$2,$2')
|
|
Packit |
5c3484 |
define(`logop', `vor $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_nior_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_nior_n')
|
|
Packit |
5c3484 |
define(`logopS',`nor $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vnor $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_xor_n',
|
|
Packit |
5c3484 |
` define(`func', `mpn_xor_n')
|
|
Packit |
5c3484 |
define(`logopS',`xor $1,$2,$3')
|
|
Packit |
5c3484 |
define(`logop', `vxor $1,$2,$3')')
|
|
Packit |
5c3484 |
ifdef(`OPERATION_xnor_n',
|
|
Packit |
5c3484 |
` define(`func',`mpn_xnor_n')
|
|
Packit |
5c3484 |
define(`logopS',`eqv $1,$2,$3')
|
|
Packit |
5c3484 |
define(`vnegb', `vnor $1,$2,$2')
|
|
Packit |
5c3484 |
define(`logop', `vxor $1,$2,$3')')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Limb-size selectors: LIMB32 emits its argument only when GMP_LIMB_BITS is
C 32 and LIMB64 emits its argument only otherwise; the inactive one expands
C to nothing.  This lets the 32-bit and 64-bit variants share one source.
ifelse(GMP_LIMB_BITS,`32',`
|
|
Packit |
5c3484 |
define(`LIMB32',` $1')
|
|
Packit |
5c3484 |
define(`LIMB64',`')
|
|
Packit |
5c3484 |
',`
|
|
Packit |
5c3484 |
define(`LIMB32',`')
|
|
Packit |
5c3484 |
define(`LIMB64',` $1')
|
|
Packit |
5c3484 |
')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C INPUT PARAMETERS
|
|
Packit |
5c3484 |
define(`rp', `r3') C destination limb pointer
|
|
Packit |
5c3484 |
define(`up', `r4') C first source limb pointer
|
|
Packit |
5c3484 |
define(`vp', `r5') C second source limb pointer
|
|
Packit |
5c3484 |
define(`n', `r6') C number of limbs
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Permute control vectors: set by lvsl from up and vp respectively, then
C passed to every vperm that realigns the loaded source blocks.
define(`us', `v8')
|
|
Packit |
5c3484 |
define(`vs', `v9')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
|
|
Packit |
5c3484 |
C func (rp, up, vp, n): combine the n-limb operands at up and vp with the
C selected logical operation and store the n result limbs at rp.
PROLOGUE(func)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Operands of at least 8 limbs (32-bit) or 4 limbs (64-bit) take the vector
C path at L(big); anything smaller is handled by the scalar loop below.
LIMB32(`cmpwi cr0, n, 8 ')
|
|
Packit |
5c3484 |
LIMB64(`cmpdi cr0, n, 4 ')
|
|
Packit |
5c3484 |
bge L(big)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Scalar path: CTR counts limbs.
mtctr n
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C 32-bit only: the first limb is done with plain offset-0 accesses so that
C the loop can use the update forms lwzu/stwu, which step each pointer by 4
C before the access.  bdz exits early when n was 1.
LIMB32(`lwz r8, 0(up) ')
|
|
Packit |
5c3484 |
LIMB32(`lwz r9, 0(vp) ')
|
|
Packit |
5c3484 |
LIMB32(`logopS( r0, r8, r9) ')
|
|
Packit |
5c3484 |
LIMB32(`stw r0, 0(rp) ')
|
|
Packit |
5c3484 |
LIMB32(`bdz L(endS) ')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C One limb per iteration.  The 64-bit variant accesses offset 0 and then
C advances each pointer explicitly by GMP_LIMB_BYTES.
L(topS):
|
|
Packit |
5c3484 |
LIMB32(`lwzu r8, 4(up) ')
|
|
Packit |
5c3484 |
LIMB64(`ld r8, 0(up) ')
|
|
Packit |
5c3484 |
LIMB64(`addi up, up, GMP_LIMB_BYTES ')
|
|
Packit |
5c3484 |
LIMB32(`lwzu r9, 4(vp) ')
|
|
Packit |
5c3484 |
LIMB64(`ld r9, 0(vp) ')
|
|
Packit |
5c3484 |
LIMB64(`addi vp, vp, GMP_LIMB_BYTES ')
|
|
Packit |
5c3484 |
logopS( r0, r8, r9)
|
|
Packit |
5c3484 |
LIMB32(`stwu r0, 4(rp) ')
|
|
Packit |
5c3484 |
LIMB64(`std r0, 0(rp) ')
|
|
Packit |
5c3484 |
LIMB64(`addi rp, rp, GMP_LIMB_BYTES ')
|
|
Packit |
5c3484 |
bdnz L(topS)
|
|
Packit |
5c3484 |
C The scalar path never modified VRSAVE, so it returns without restoring it.
L(endS):
|
|
Packit |
5c3484 |
blr
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Vector path.  SPR 256 is VRSAVE: keep the incoming value in r12 (it is
C written back at L(ret)) and mark the vector registers as in use.
L(big): mfspr r12, 256
|
|
Packit |
5c3484 |
oris r0, r12, 0xfffc C Set VRSAVE bit 0-13 FIXME
|
|
Packit |
5c3484 |
mtspr 256, r0
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C First loop until the destination is 16-byte aligned. This will execute 0 or 1
|
|
Packit |
5c3484 |
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C r0 = limb index of rp within its 16-byte block; zero means already aligned.
LIMB32(`rlwinm. r0, rp, 30,30,31') C (rp >> 2) mod 4
|
|
Packit |
5c3484 |
LIMB64(`rlwinm. r0, rp, 29,31,31') C (rp >> 3) mod 2
|
|
Packit |
5c3484 |
beq L(aligned)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C r7 = limbs left in the current 16-byte block of rp; they are done with
C scalar code here and removed from n up front.
subfic r7, r0, LIMBS_PER_VR
|
|
Packit |
5c3484 |
LIMB32(`li r10, 0 ') C r10 = byte offset of the store relative to rp
|
|
Packit |
5c3484 |
subf n, r7, n
|
|
Packit |
5c3484 |
C up and vp advance per limb; rp stays fixed (32-bit stores index it by r10).
C The 64-bit variant has no loop-back branch: r7 is 1 there, a single pass.
L(top0):
|
|
Packit |
5c3484 |
LIMB32(`lwz r8, 0(up) ')
|
|
Packit |
5c3484 |
LIMB64(`ld r8, 0(up) ')
|
|
Packit |
5c3484 |
addi up, up, GMP_LIMB_BYTES
|
|
Packit |
5c3484 |
LIMB32(`lwz r9, 0(vp) ')
|
|
Packit |
5c3484 |
LIMB64(`ld r9, 0(vp) ')
|
|
Packit |
5c3484 |
addi vp, vp, GMP_LIMB_BYTES
|
|
Packit |
5c3484 |
LIMB32(`addic. r7, r7, -1 ')
|
|
Packit |
5c3484 |
logopS( r0, r8, r9)
|
|
Packit |
5c3484 |
LIMB32(`stwx r0, r10, rp ')
|
|
Packit |
5c3484 |
LIMB64(`std r0, 0(rp) ')
|
|
Packit |
5c3484 |
LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
|
|
Packit |
5c3484 |
LIMB32(`bne L(top0) ')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C rp keeps its misaligned low bits; the vector stores that follow rely on
C stvx ignoring the low 4 address bits, so rp+16 selects the next aligned
C 16-byte block, which starts right after the limbs written above.
addi rp, rp, 16 C update rp, but preserve its alignment
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Destination is now (effectively) 16-byte aligned.  Set up the vector loop.
L(aligned):
|
|
Packit |
5c3484 |
LIMB64(`srdi r7, n, 1 ') C loop count corresponding to n
|
|
Packit |
5c3484 |
LIMB32(`srwi r7, n, 2 ') C loop count corresponding to n
|
|
Packit |
5c3484 |
mtctr r7 C CTR = number of whole 16-byte vectors in n
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
li r10, 16 C r10 = byte offset of the second vector of a pair
|
|
Packit |
5c3484 |
C Permute controls derived from the low address bits of up and vp.
lvsl us, 0, up
|
|
Packit |
5c3484 |
lvsl vs, 0, vp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C v2, v3 = the 16-byte blocks containing the first limbs at up and vp.
lvx v2, 0, up
|
|
Packit |
5c3484 |
lvx v3, 0, vp
|
|
Packit |
5c3484 |
bdnz L(gt1) C taken when more than one whole vector is to be done
|
|
Packit |
5c3484 |
C Exactly one whole vector: load the following source blocks, realign each
C operand with vperm, apply the operation and store the result.
lvx v0, r10, up
|
|
Packit |
5c3484 |
lvx v1, r10, vp
|
|
Packit |
5c3484 |
vperm v4, v2, v0, us
|
|
Packit |
5c3484 |
vperm v5, v3, v1, vs
|
|
Packit |
5c3484 |
vnegb( v5, v5)
|
|
Packit |
5c3484 |
logop( v6, v4, v5)
|
|
Packit |
5c3484 |
vnega( v6, v6)
|
|
Packit |
5c3484 |
stvx v6, 0, rp
|
|
Packit |
5c3484 |
addi up, up, 16
|
|
Packit |
5c3484 |
addi vp, vp, 16
|
|
Packit |
5c3484 |
C L(tail) rounds rp up to a 16-byte boundary; bumping it by 4 first makes
C that rounding land just past the vector stored above.
addi rp, rp, 4
|
|
Packit |
5c3484 |
b L(tail)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Step up and vp to the second source block; v2 and v3 already hold the first.
L(gt1): addi up, up, 16
|
|
Packit |
5c3484 |
addi vp, vp, 16
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Main loop, two result vectors per pass.  v0/v1 and v2/v3 alternate as the
C older and newer source blocks fed to vperm, so each block is loaded once.
C CTR is decremented once per vector (bdz in the middle, bdnz at the
C bottom); either exit leaves exactly one whole vector still to be done.
L(top): lvx v0, 0, up
|
|
Packit |
5c3484 |
lvx v1, 0, vp
|
|
Packit |
5c3484 |
vperm v4, v2, v0, us
|
|
Packit |
5c3484 |
vperm v5, v3, v1, vs
|
|
Packit |
5c3484 |
vnegb( v5, v5)
|
|
Packit |
5c3484 |
logop( v6, v4, v5)
|
|
Packit |
5c3484 |
vnega( v6, v6)
|
|
Packit |
5c3484 |
stvx v6, 0, rp
|
|
Packit |
5c3484 |
bdz L(end)
|
|
Packit |
5c3484 |
lvx v2, r10, up
|
|
Packit |
5c3484 |
lvx v3, r10, vp
|
|
Packit |
5c3484 |
vperm v4, v0, v2, us
|
|
Packit |
5c3484 |
vperm v5, v1, v3, vs
|
|
Packit |
5c3484 |
vnegb( v5, v5)
|
|
Packit |
5c3484 |
logop( v6, v4, v5)
|
|
Packit |
5c3484 |
vnega( v6, v6)
|
|
Packit |
5c3484 |
stvx v6, r10, rp
|
|
Packit |
5c3484 |
addi up, up, 32
|
|
Packit |
5c3484 |
addi vp, vp, 32
|
|
Packit |
5c3484 |
addi rp, rp, 32
|
|
Packit |
5c3484 |
bdnz L(top)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Fell out at the bottom of the loop: the final vector starts in v2/v3.
C The following source block is loaded only when the pointer is not 16-byte
C aligned; for an aligned pointer a zero vector stands in for it.
C NOTE(review): presumably this avoids reading a block beyond the end of the
C operand when the data already sits entirely in v2/v3 -- confirm.
andi. r0, up, 15
|
|
Packit |
5c3484 |
vxor v0, v0, v0
|
|
Packit |
5c3484 |
beq 1f
|
|
Packit |
5c3484 |
lvx v0, 0, up
|
|
Packit |
5c3484 |
1: andi. r0, vp, 15
|
|
Packit |
5c3484 |
vxor v1, v1, v1
|
|
Packit |
5c3484 |
beq 1f
|
|
Packit |
5c3484 |
lvx v1, 0, vp
|
|
Packit |
5c3484 |
1: vperm v4, v2, v0, us
|
|
Packit |
5c3484 |
vperm v5, v3, v1, vs
|
|
Packit |
5c3484 |
vnegb( v5, v5)
|
|
Packit |
5c3484 |
logop( v6, v4, v5)
|
|
Packit |
5c3484 |
vnega( v6, v6)
|
|
Packit |
5c3484 |
stvx v6, 0, rp
|
|
Packit |
5c3484 |
C up and vp already address the first leftover limb.  L(tail) rounds rp up
C to a 16-byte boundary, so bumping it by 4 lands just past this store.
addi rp, rp, 4
|
|
Packit |
5c3484 |
b L(tail)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Reached from the bdz in the middle of the main loop: one whole vector is
C left and its data starts in v0/v1.  The following source block (offset r10,
C i.e. 16) is loaded only when the pointer is not 16-byte aligned; for an
C aligned pointer a zero vector stands in for it.
C NOTE(review): presumably this avoids reading a block beyond the end of the
C operand when the data already sits entirely in v0/v1 -- confirm.
L(end): andi. r0, up, 15
|
|
Packit |
5c3484 |
vxor v2, v2, v2
|
|
Packit |
5c3484 |
beq 1f
|
|
Packit |
5c3484 |
lvx v2, r10, up
|
|
Packit |
5c3484 |
1: andi. r0, vp, 15
|
|
Packit |
5c3484 |
vxor v3, v3, v3
|
|
Packit |
5c3484 |
beq 1f
|
|
Packit |
5c3484 |
lvx v3, r10, vp
|
|
Packit |
5c3484 |
1: vperm v4, v0, v2, us
|
|
Packit |
5c3484 |
vperm v5, v1, v3, vs
|
|
Packit |
5c3484 |
vnegb( v5, v5)
|
|
Packit |
5c3484 |
logop( v6, v4, v5)
|
|
Packit |
5c3484 |
vnega( v6, v6)
|
|
Packit |
5c3484 |
stvx v6, r10, rp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Advance the sources past the vector just done.  The store above went to
C offset 16 from rp, so rp is bumped by 16+4; the round-up to a 16-byte
C boundary at L(tail) then lands just past that store.
addi up, up, 16
|
|
Packit |
5c3484 |
addi vp, vp, 16
|
|
Packit |
5c3484 |
addi rp, rp, 20
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Scalar cleanup of the limbs that did not fill a whole vector.
L(tail):
|
|
Packit |
5c3484 |
LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
|
|
Packit |
5c3484 |
LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
|
|
Packit |
5c3484 |
beq L(ret)
|
|
Packit |
5c3484 |
C Round rp up to the next 16-byte boundary: add 15, then clear the low 4
C bits.  Every path into L(tail) has bumped rp so that this yields the
C address just past the last vector store.
addi rp, rp, 15
|
|
Packit |
5c3484 |
LIMB32(`rlwinm rp, rp, 0,0,27 ')
|
|
Packit |
5c3484 |
LIMB64(`rldicr rp, rp, 0,59 ')
|
|
Packit |
5c3484 |
li r10, 0 C r10 = byte offset into up, vp and rp
|
|
Packit |
5c3484 |
C One limb per pass.  The 64-bit variant has at most one leftover limb, so
C it has no loop-back branch and never advances r10.
L(top2):
|
|
Packit |
5c3484 |
LIMB32(`lwzx r8, r10, up ')
|
|
Packit |
5c3484 |
LIMB64(`ldx r8, r10, up ')
|
|
Packit |
5c3484 |
LIMB32(`lwzx r9, r10, vp ')
|
|
Packit |
5c3484 |
LIMB64(`ldx r9, r10, vp ')
|
|
Packit |
5c3484 |
LIMB32(`addic. r7, r7, -1 ')
|
|
Packit |
5c3484 |
logopS( r0, r8, r9)
|
|
Packit |
5c3484 |
LIMB32(`stwx r0, r10, rp ')
|
|
Packit |
5c3484 |
LIMB64(`std r0, 0(rp) ')
|
|
Packit |
5c3484 |
LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
|
|
Packit |
5c3484 |
LIMB32(`bne L(top2) ')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Write back the VRSAVE value that was saved in r12 at L(big), then return.
L(ret): mtspr 256, r12
|
|
Packit |
5c3484 |
blr
|
|
Packit |
5c3484 |
EPILOGUE()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C This works for 64-bit PowerPC, since a limb ptr can only be aligned
|
|
Packit |
5c3484 |
C in 2 relevant ways, which means we can always find a pair of aligned
|
|
Packit |
5c3484 |
C pointers of rp, up, and vp.
|
|
Packit |
5c3484 |
C process words until rp is 16-byte aligned
|
|
Packit |
5c3484 |
C if (((up | vp) & 15) == 0)
|
|
Packit |
5c3484 |
C process with VMX without any vperm
|
|
Packit |
5c3484 |
C else if ((up & 15) != 0 && (vp & 15) != 0)
|
|
Packit |
5c3484 |
C process with VMX using vperm on store data
|
|
Packit |
5c3484 |
C else if ((up & 15) != 0)
|
|
Packit |
5c3484 |
C process with VMX using vperm on up data
|
|
Packit |
5c3484 |
C else
|
|
Packit |
5c3484 |
C process with VMX using vperm on vp data
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C rlwinm. r0, up, 0,28,31
|
|
Packit |
5c3484 |
C rlwinm r0, vp, 0,28,31
|
|
Packit |
5c3484 |
C cmpwi cr7, r0, 0
|
|
Packit |
5c3484 |
C cror cr6, cr0, cr7
|
|
Packit |
5c3484 |
C crand cr0, cr0, cr7
|