Blame mpn/powerpc32/vmx/logops_n.asm

Packit 5c3484
dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
Packit 5c3484
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
Packit 5c3484
dnl  logical operations.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2006 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C               and,ior,andn,nior,xor    iorn,xnor         nand
Packit 5c3484
C                   cycles/limb         cycles/limb    cycles/limb
Packit 5c3484
C 7400,7410 (G4):       1.39                 ?              ?
Packit 5c3484
C 744x,745x (G4+):      1.14                1.39           1.39
Packit 5c3484
C 970:                  1.7                 2.0            2.0
Packit 5c3484
Packit 5c3484
C STATUS
Packit 5c3484
C  * Works for all sizes and alignments for 32-bit limbs.
Packit 5c3484
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
Packit 5c3484
C  * Current performance makes this pointless for 970
Packit 5c3484
Packit 5c3484
C TODO
Packit 5c3484
C  * Might want to make variants when just one of the source operands needs
Packit 5c3484
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
Packit 5c3484
C  * Idea: If the source operands are equally aligned, we could do the logops
Packit 5c3484
C    first, then vperm before storing!  That means we never need more than one
Packit 5c3484
C    vperm, ever!
Packit 5c3484
C  * Perhaps align `rp' after initial alignment loop?
Packit 5c3484
C  * Instead of having scalar code in the beginning and end, consider using
Packit 5c3484
C    read-modify-write vector code.
Packit 5c3484
C  * Software pipeline?  Hopefully not too important, this is hairy enough
Packit 5c3484
C    already.
Packit 5c3484
C  * At least be more clever about operand loading, i.e., load v operands before
Packit 5c3484
C    u operands, since v operands are sometimes negated.
Packit 5c3484
Packit 5c3484
C Derived sizes: bytes per limb, and the number of limbs held by one and by two
C 16-byte vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
Packit 5c3484
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
Packit 5c3484
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
Packit 5c3484
Packit 5c3484
C vnegb and vnega are optional vector complement steps applied before and
C after logop.  Both are empty unless the selected operation redefines them.
define(`vnegb', `')		C default neg-before to null
Packit 5c3484
define(`vnega', `')		C default neg-after to null
Packit 5c3484
Packit 5c3484
C Per-operation macros, chosen by whichever OPERATION_xxx symbol is defined:
C   func    name of the function being assembled
C   logopS  scalar instruction used by the limb-at-a-time loops
C   logop   vector instruction used by the 16-byte loops
C   vnegb   optional complement of the vp vector before logop
C   vnega   optional complement of the result vector after logop
C nand is built as vand followed by vnega.  iorn and xnor complement the vp
C vector with vnegb and then use vor and vxor respectively.
ifdef(`OPERATION_and_n',
Packit 5c3484
`	define(`func',	`mpn_and_n')
Packit 5c3484
	define(`logopS',`and	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vand	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_andn_n',
Packit 5c3484
`	define(`func',	`mpn_andn_n')
Packit 5c3484
	define(`logopS',`andc	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vandc	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_nand_n',
Packit 5c3484
`	define(`func',	`mpn_nand_n')
Packit 5c3484
	define(`logopS',`nand	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vand	$1,$2,$3')
Packit 5c3484
	define(`vnega',	`vnor	$1,$2,$2')')
Packit 5c3484
ifdef(`OPERATION_ior_n',
Packit 5c3484
`	define(`func',	`mpn_ior_n')
Packit 5c3484
	define(`logopS',`or	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vor	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_iorn_n',
Packit 5c3484
`	define(`func',	`mpn_iorn_n')
Packit 5c3484
	define(`logopS',`orc	$1,$2,$3')
Packit 5c3484
	define(`vnegb',	`vnor	$1,$2,$2')
Packit 5c3484
	define(`logop',	`vor	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_nior_n',
Packit 5c3484
`	define(`func',	`mpn_nior_n')
Packit 5c3484
	define(`logopS',`nor	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vnor	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_xor_n',
Packit 5c3484
`	define(`func',	`mpn_xor_n')
Packit 5c3484
	define(`logopS',`xor	$1,$2,$3')
Packit 5c3484
	define(`logop',	`vxor	$1,$2,$3')')
Packit 5c3484
ifdef(`OPERATION_xnor_n',
Packit 5c3484
`	define(`func',`mpn_xnor_n')
Packit 5c3484
	define(`logopS',`eqv	$1,$2,$3')
Packit 5c3484
	define(`vnegb',	`vnor	$1,$2,$2')
Packit 5c3484
	define(`logop',	`vxor	$1,$2,$3')')
Packit 5c3484
Packit 5c3484
C Limb-size selectors.  When GMP_LIMB_BITS is 32, LIMB32 emits its argument
C (preceded by a tab) and LIMB64 expands to nothing; otherwise the roles are
C swapped.  This lets the 32-bit and 64-bit variants of an instruction sit
C side by side in the code.
ifelse(GMP_LIMB_BITS,`32',`
Packit 5c3484
	define(`LIMB32',`	$1')
Packit 5c3484
	define(`LIMB64',`')
Packit 5c3484
',`
Packit 5c3484
	define(`LIMB32',`')
Packit 5c3484
	define(`LIMB64',`	$1')
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS
Packit 5c3484
define(`rp',	`r3')		C destination limb pointer (stores go here)
Packit 5c3484
define(`up',	`r4')		C first source limb pointer
Packit 5c3484
define(`vp',	`r5')		C second source limb pointer
Packit 5c3484
define(`n',	`r6')		C limb count
Packit 5c3484
Packit 5c3484
C Vector registers that hold the permute controls which lvsl produces from the
C addresses in up and vp; they are the last operand of every vperm.
define(`us',	`v8')
Packit 5c3484
define(`vs',	`v9')
Packit 5c3484
Packit 5c3484
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
C func (rp, up, vp, n)
C
C Store into the n limbs at rp the result of applying the selected logical
C operation, limb by limb, to the n-limb operands at up and vp.
C
C Operands shorter than 8 limbs (32-bit limbs) or 4 limbs (64-bit limbs) are
C handled by a scalar loop.  Longer ones go through three phases: scalar limbs
C until rp is 16-byte aligned, whole 16-byte vectors, then scalar limbs for
C whatever is left.
C
C Besides the four argument registers this code writes r0, r7-r10, r12, ctr,
C cr0, v0-v6, v8 and v9.  VRSAVE is modified on the vector path and restored
C before returning.
PROLOGUE(func)
Packit 5c3484
Packit 5c3484
C Size dispatch: go to the vector code when there are enough limbs.
LIMB32(`cmpwi	cr0, n, 8	')
Packit 5c3484
LIMB64(`cmpdi	cr0, n, 4	')
Packit 5c3484
	bge	L(big)
Packit 5c3484
Packit 5c3484
C Small operands: plain scalar loop with ctr = n.  A limb is processed before
C ctr is first tested, so n = 0 is not handled here.
	mtctr	n
Packit 5c3484
Packit 5c3484
C 32-bit limbs only: do the limb at offset 0 here, because the loop below uses
C update-form loads and stores that step each pointer by 4 before the access.
LIMB32(`lwz	r8, 0(up)	')
Packit 5c3484
LIMB32(`lwz	r9, 0(vp)	')
Packit 5c3484
LIMB32(`logopS(	r0, r8, r9)	')
Packit 5c3484
LIMB32(`stw	r0, 0(rp)	')
Packit 5c3484
LIMB32(`bdz	L(endS)		')
Packit 5c3484
Packit 5c3484
L(topS):
Packit 5c3484
LIMB32(`lwzu	r8, 4(up)	')
Packit 5c3484
LIMB64(`ld	r8, 0(up)	')
Packit 5c3484
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
Packit 5c3484
LIMB32(`lwzu	r9, 4(vp)	')
Packit 5c3484
LIMB64(`ld	r9, 0(vp)	')
Packit 5c3484
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
Packit 5c3484
	logopS(	r0, r8, r9)
Packit 5c3484
LIMB32(`stwu	r0, 4(rp)	')
Packit 5c3484
LIMB64(`std	r0, 0(rp)	')
Packit 5c3484
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
Packit 5c3484
	bdnz	L(topS)
Packit 5c3484
L(endS):
Packit 5c3484
	blr
Packit 5c3484
Packit 5c3484
C Large operands.  The incoming VRSAVE (SPR 256) is kept in r12 and written
C back at L(ret).
L(big):	mfspr	r12, 256
Packit 5c3484
	oris	r0, r12, 0xfffc		C Set VRSAVE bit 0-13 FIXME
Packit 5c3484
	mtspr	256, r0
Packit 5c3484
Packit 5c3484
C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
Packit 5c3484
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
Packit 5c3484
Packit 5c3484
C r0 = limb index of rp within its 16-byte block; zero means already aligned.
LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
Packit 5c3484
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
Packit 5c3484
	beq	L(aligned)
Packit 5c3484
Packit 5c3484
C r7 = limbs needed to reach the next 16-byte boundary; they are taken off n
C up front.  With 32-bit limbs r10 is the store offset and r7 counts the loop
C down.  With 64-bit limbs r7 is always 1, so the body runs exactly once.
	subfic	r7, r0, LIMBS_PER_VR
Packit 5c3484
LIMB32(`li	r10, 0		')
Packit 5c3484
	subf	n, r7, n
Packit 5c3484
L(top0):
Packit 5c3484
LIMB32(`lwz	r8, 0(up)	')
Packit 5c3484
LIMB64(`ld	r8, 0(up)	')
Packit 5c3484
	addi	up, up, GMP_LIMB_BYTES
Packit 5c3484
LIMB32(`lwz	r9, 0(vp)	')
Packit 5c3484
LIMB64(`ld	r9, 0(vp)	')
Packit 5c3484
	addi	vp, vp, GMP_LIMB_BYTES
Packit 5c3484
LIMB32(`addic.	r7, r7, -1	')
Packit 5c3484
	logopS(	r0, r8, r9)
Packit 5c3484
LIMB32(`stwx	r0, r10, rp	')
Packit 5c3484
LIMB64(`std	r0, 0(rp)	')
Packit 5c3484
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
Packit 5c3484
LIMB32(`bne	L(top0)		')
Packit 5c3484
Packit 5c3484
C rp keeps its low address bits from here on.  NOTE(review): the vector stores
C below rely on stvx ignoring the low four bits of the effective address, so
C that they land on the 16-byte boundary just reached; see the AltiVec
C documentation for stvx.
	addi	rp, rp, 16		C update rp, but preserve its alignment
Packit 5c3484
Packit 5c3484
C Vector phase.  r7 = number of whole 16-byte vectors in the remaining n.  It
C is at least 1, given the size test at entry and the at most LIMBS_PER_VR-1
C limbs consumed by the alignment loop.
L(aligned):
Packit 5c3484
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
Packit 5c3484
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
Packit 5c3484
	mtctr	r7			C ctr = that vector count
Packit 5c3484
Packit 5c3484
C r10 = 16 is the byte offset of the following quadword.  us/vs receive the
C permute controls for up and vp; v2/v3 receive the first quadword of each.
	li	r10, 16
Packit 5c3484
	lvsl	us, 0, up
Packit 5c3484
	lvsl	vs, 0, vp
Packit 5c3484
Packit 5c3484
	lvx	v2, 0, up
Packit 5c3484
	lvx	v3, 0, vp
Packit 5c3484
	bdnz	L(gt1)
C Exactly one vector to do.  This point is only reached when the alignment
C loop ran (an rp that was aligned on entry implies at least two vectors), so
C n mod LIMBS_PER_VR is non-zero and the quadwords loaded at offset 16 each
C contain at least one operand limb.
Packit 5c3484
	lvx	v0, r10, up
Packit 5c3484
	lvx	v1, r10, vp
Packit 5c3484
	vperm	v4, v2, v0, us
Packit 5c3484
	vperm	v5, v3, v1, vs
Packit 5c3484
	vnegb(	v5, v5)
Packit 5c3484
	logop(	v6, v4, v5)
Packit 5c3484
	vnega(	v6, v6)
Packit 5c3484
	stvx	v6, 0, rp
Packit 5c3484
	addi	up, up, 16
Packit 5c3484
	addi	vp, vp, 16
Packit 5c3484
C Leave rp 4 bytes beyond the value used for the store; L(tail) rounds it up
C to the 16-byte boundary that follows the vector just stored.
	addi	rp, rp, 4
Packit 5c3484
	b	L(tail)
Packit 5c3484
Packit 5c3484
C More than one vector: step past the quadwords already held in v2/v3.
L(gt1):	addi	up, up, 16
Packit 5c3484
	addi	vp, vp, 16
Packit 5c3484
Packit 5c3484
C Main loop, two vectors per pass.  The newly loaded quadwords alternate
C between v0/v1 and v2/v3, so each vperm pairs the previous quadword with the
C new one.  ctr is tested after each half: leaving after the first half goes
C to L(end), leaving after the second half falls through.
L(top):	lvx	v0, 0, up
Packit 5c3484
	lvx	v1, 0, vp
Packit 5c3484
	vperm	v4, v2, v0, us
Packit 5c3484
	vperm	v5, v3, v1, vs
Packit 5c3484
	vnegb(	v5, v5)
Packit 5c3484
	logop(	v6, v4, v5)
Packit 5c3484
	vnega(	v6, v6)
Packit 5c3484
	stvx	v6, 0, rp
Packit 5c3484
	bdz	L(end)
Packit 5c3484
	lvx	v2, r10, up
Packit 5c3484
	lvx	v3, r10, vp
Packit 5c3484
	vperm	v4, v0, v2, us
Packit 5c3484
	vperm	v5, v1, v3, vs
Packit 5c3484
	vnegb(	v5, v5)
Packit 5c3484
	logop(	v6, v4, v5)
Packit 5c3484
	vnega(	v6, v6)
Packit 5c3484
	stvx	v6, r10, rp
Packit 5c3484
	addi	up, up, 32
Packit 5c3484
	addi	vp, vp, 32
Packit 5c3484
	addi	rp, rp, 32
Packit 5c3484
	bdnz	L(top)
Packit 5c3484
Packit 5c3484
C ctr ran out after the second half: one vector is left and v2/v3 hold the
C previous quadwords.  The following quadword of a source is only loaded when
C that pointer is not 16-byte aligned; otherwise a zero vector stands in.
C NOTE(review): presumably this avoids reading beyond the end of an operand
C when vperm needs nothing from the second quadword -- confirm.
	andi.	r0, up, 15
Packit 5c3484
	vxor	v0, v0, v0
Packit 5c3484
	beq	1f
Packit 5c3484
	lvx	v0, 0, up
Packit 5c3484
1:	andi.	r0, vp, 15
Packit 5c3484
	vxor	v1, v1, v1
Packit 5c3484
	beq	1f
Packit 5c3484
	lvx	v1, 0, vp
Packit 5c3484
1:	vperm	v4, v2, v0, us
Packit 5c3484
	vperm	v5, v3, v1, vs
Packit 5c3484
	vnegb(	v5, v5)
Packit 5c3484
	logop(	v6, v4, v5)
Packit 5c3484
	vnega(	v6, v6)
Packit 5c3484
	stvx	v6, 0, rp
Packit 5c3484
C up and vp need no adjustment here; rp gets the same +4 as in the
C single-vector case so that L(tail) can round it up.
	addi	rp, rp, 4
Packit 5c3484
	b	L(tail)
Packit 5c3484
Packit 5c3484
C ctr ran out after the first half: one vector is left and v0/v1 hold the
C previous quadwords.  The following quadwords are at offset 16 (r10) and are
C loaded under the same alignment test as above.
L(end):	andi.	r0, up, 15
Packit 5c3484
	vxor	v2, v2, v2
Packit 5c3484
	beq	1f
Packit 5c3484
	lvx	v2, r10, up
Packit 5c3484
1:	andi.	r0, vp, 15
Packit 5c3484
	vxor	v3, v3, v3
Packit 5c3484
	beq	1f
Packit 5c3484
	lvx	v3, r10, vp
Packit 5c3484
1:	vperm	v4, v0, v2, us
Packit 5c3484
	vperm	v5, v1, v3, vs
Packit 5c3484
	vnegb(	v5, v5)
Packit 5c3484
	logop(	v6, v4, v5)
Packit 5c3484
	vnega(	v6, v6)
Packit 5c3484
	stvx	v6, r10, rp
Packit 5c3484
Packit 5c3484
C Step the sources past the last vector.  rp advances by 16 for the store at
C offset r10 plus the 4 bytes that L(tail) rounds up from.
	addi	up, up, 16
Packit 5c3484
	addi	vp, vp, 16
Packit 5c3484
	addi	rp, rp, 20
Packit 5c3484
Packit 5c3484
C Scalar tail: r7 = limbs left over after the vectors; none means done.
L(tail):
Packit 5c3484
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
Packit 5c3484
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
Packit 5c3484
	beq	L(ret)
Packit 5c3484
C rp = (rp + 15) with the low four bits cleared, i.e. the address following
C the last vector store.
	addi	rp, rp, 15
Packit 5c3484
LIMB32(`rlwinm	rp, rp, 0,0,27	')
Packit 5c3484
LIMB64(`rldicr	rp, rp, 0,59	')
Packit 5c3484
	li	r10, 0
Packit 5c3484
C With 32-bit limbs r10 indexes all three pointers and r7 counts down.  With
C 64-bit limbs at most one limb is left, so the body runs once with r10 = 0.
L(top2):
Packit 5c3484
LIMB32(`lwzx	r8, r10, up	')
Packit 5c3484
LIMB64(`ldx	r8, r10, up	')
Packit 5c3484
LIMB32(`lwzx	r9, r10, vp	')
Packit 5c3484
LIMB64(`ldx	r9, r10, vp	')
Packit 5c3484
LIMB32(`addic.	r7, r7, -1	')
Packit 5c3484
	logopS(	r0, r8, r9)
Packit 5c3484
LIMB32(`stwx	r0, r10, rp	')
Packit 5c3484
LIMB64(`std	r0, 0(rp)	')
Packit 5c3484
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
Packit 5c3484
LIMB32(`bne	L(top2)		')
Packit 5c3484
Packit 5c3484
C Put back the VRSAVE value saved in r12 at L(big) and return.
L(ret):	mtspr	256, r12
Packit 5c3484
	blr
Packit 5c3484
EPILOGUE()
Packit 5c3484
Packit 5c3484
C This works for 64-bit PowerPC, since a limb ptr can only be aligned
Packit 5c3484
C in 2 relevant ways, which means we can always find a pair of aligned
Packit 5c3484
C pointers of rp, up, and vp.
Packit 5c3484
C process words until rp is 16-byte aligned
Packit 5c3484
C if (((up | vp) & 15) == 0)
Packit 5c3484
C   process with VMX without any vperm
Packit 5c3484
C else if ((up & 15) != 0 && (vp & 15) != 0)
Packit 5c3484
C   process with VMX using vperm on store data
Packit 5c3484
C else if ((up & 15) != 0)
Packit 5c3484
C   process with VMX using vperm on up data
Packit 5c3484
C else
Packit 5c3484
C   process with VMX using vperm on vp data
Packit 5c3484
C
Packit 5c3484
C	rlwinm.	r0, up, 0,28,31
Packit 5c3484
C	rlwinm	r0, vp, 0,28,31
Packit 5c3484
C	cmpwi	cr7, r0, 0
Packit 5c3484
C	cror	cr6, cr0, cr7
Packit 5c3484
C	crand	cr0, cr0, cr7