Blame mpn/alpha/ev5/diveby3.asm

Packit 5c3484
dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C      cycles/limb
Packit 5c3484
C EV4:    22
Packit 5c3484
C EV5:    11.5
Packit 5c3484
C EV6:     6.3		Note that mpn_bdiv_dbm1c is faster
Packit 5c3484
Packit 5c3484
C TODO
Packit 5c3484
C  * Remove the unops, they benefit just ev6, which no longer uses this file.
Packit 5c3484
C  * Try prefetch for destination, using lds.
Packit 5c3484
C  * Improve feed-in code, by moving initial mulq earlier; make initial load
Packit 5c3484
C    to u0/u0 to save some copying.
Packit 5c3484
C  * Combine u0 and u2, u1 and u3.
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS
Packit 5c3484
C rp  destination pointer: every result limb is stored through it
define(`rp',	`r16')
Packit 5c3484
C up  source pointer: every input limb is loaded through it
define(`up',	`r17')
Packit 5c3484
C n   limb count: n mod 4 picks the feed-in path, then n serves as loop counter
define(`n',	`r18')
Packit 5c3484
C cy  incoming carry: compared against 1 and 2 to seed the first correction
C     term, then reused as the running carry between limbs
define(`cy',	`r19')
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
Packit 5c3484
C Constant table, read into three registers at function entry.
DATASTART(L(LC),8)
Packit 5c3484
C Multiplier applied to every source limb: 3 * 0xAAAAAAAAAAAAAAAB = 1 mod 2^64
	.quad	0xAAAAAAAAAAAAAAAB
Packit 5c3484
C Equals (2^64-1)/3.  Used as the lower threshold for the carry tests and as
C the unit from which the per-carry correction term l0 is built.
	.quad	0x5555555555555555
Packit 5c3484
C Equals 2*(2^64-1)/3.  Used as the upper threshold for the carry tests and as
C the initial l0 when the incoming carry is 2.
	.quad	0xAAAAAAAAAAAAAAAA
Packit 5c3484
DATAEND()
Packit 5c3484
Packit 5c3484
C Registers holding the three constants of the L(LC) table
define(`xAAAAAAAAAAAAAAAB',	`r20')
Packit 5c3484
define(`x5555555555555555',	`r21')
Packit 5c3484
define(`xAAAAAAAAAAAAAAAA',	`r22')
Packit 5c3484
C u0-u3  source limbs in flight, used in rotation by the four unrolled steps
C        (r0 is also the scratch base for the constant loads and receives the
C        final carry)
define(`u0',	`r0')	define(`u1',	`r1')
Packit 5c3484
define(`u2',	`r2')	define(`u3',	`r3')
Packit 5c3484
C l0  correction term added to each product;  x  result limb that gets stored
C     (r8 is also used under its plain name for n mod 4 during feed-in)
define(`l0',	`r25')	define(`x',	`r8')
Packit 5c3484
C q0, q1  mulq products, used alternately
C         (r4 and r5 also hold the feed-in comparison results)
define(`q0',	`r4')	define(`q1',	`r5')
Packit 5c3484
C p6, p7  0/1 results of comparing x against the two thresholds
define(`p6',	`r6')	define(`p7',	`r7')
Packit 5c3484
C t0, t1  negated p6, p7, then masked with 0x5555555555555555
define(`t0',	`r23')	define(`t1',	`r24')
Packit 5c3484
C cymask  negated cy;  r28 also carries the first source limb at entry
define(`cymask',`r28')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mpn_divexact_by3c (rp, up, n, cy)
C
C Walks the n limbs at up from the lowest address upwards, stores one result
C limb per source limb at rp, and leaves the final carry in r0.
C
C Per limb u, with running carry cy and l0 = cy * 0x5555555555555555 mod 2^64:
C	x  = u * 0xAAAAAAAAAAAAAAAB + l0	(stored as the result limb)
C	cy = (u < cy) + (x > 0x5555555555555555) + (x > 0xAAAAAAAAAAAAAAAA)
C Because 3 * 0xAAAAAAAAAAAAAAAB = 1 mod 2^64 and 0x5555555555555555 is the
C negation of that multiplier mod 2^64, adding l0 is the same as subtracting
C cy from u before multiplying.
C
C The loop is unrolled four ways, rotating u0-u3 and q0/q1.  Each step also
C starts the mulq for the following limb and loads the limb after that one.
C
C n >= 1 is assumed: the first limb is loaded unconditionally.
PROLOGUE(mpn_divexact_by3c,gp)
Packit 5c3484
C r28 doubles as cymask; every feed-in path copies this limb out of it before
C the first write to cymask.
Packit 5c3484
	ldq	r28, 0(up)			C load first limb early
Packit 5c3484
Packit 5c3484
C Put magic constants in registers
C   r0 is only a scratch base address here; it is reused as u0 afterwards
Packit 5c3484
	lda	r0, L(LC)
Packit 5c3484
	ldq	xAAAAAAAAAAAAAAAB, 0(r0)
Packit 5c3484
	ldq	x5555555555555555, 8(r0)
Packit 5c3484
	ldq	xAAAAAAAAAAAAAAAA, 16(r0)
Packit 5c3484
Packit 5c3484
C Compute initial l0 value
C   cy == 1 gives 0x5555555555555555, cy == 2 gives 0xAAAAAAAAAAAAAAAA,
C   any other cy gives 0
Packit 5c3484
	cmpeq	cy, 1, p6
Packit 5c3484
	cmpeq	cy, 2, p7
Packit 5c3484
	negq	p6, p6
Packit 5c3484
	and	p6, x5555555555555555, l0
Packit 5c3484
	cmovne	p7, xAAAAAAAAAAAAAAAA, l0
Packit 5c3484
Packit 5c3484
C Feed-in depending on (n mod 4)
C   n is biased by -3 here and drops by 4 per loop pass; the loop continues
C   while n stays non-negative.  Remainder 3 falls through to Lb11.
Packit 5c3484
	and	n, 3, r8
Packit 5c3484
	lda	n, -3(n)
Packit 5c3484
	cmpeq	r8, 1, r4
Packit 5c3484
	cmpeq	r8, 2, r5
Packit 5c3484
	bne	r4, $Lb01
Packit 5c3484
	bne	r5, $Lb10
Packit 5c3484
	beq	r8, $Lb00
Packit 5c3484
C n mod 4 == 3: first limb goes to u2, second to u3, enter the loop at L11.
C   up and rp are pulled back 24 bytes so that 24(rp) at L11 is the first
C   result limb and 40(up) is the third source limb.
Packit 5c3484
$Lb11:	ldq	u3, 8(up)
Packit 5c3484
	lda	up, -24(up)
Packit 5c3484
	lda	rp, -24(rp)
Packit 5c3484
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
Packit 5c3484
	mov	r28, u2
Packit 5c3484
	br	r31, $L11
Packit 5c3484
C n mod 4 == 0: first limb goes to u1, second to u2, enter the loop at L00.
C   up and rp are pulled back 16 bytes to match the offsets used at L00.
Packit 5c3484
$Lb00:	ldq	u2, 8(up)
Packit 5c3484
	lda	up, -16(up)
Packit 5c3484
	lda	rp, -16(rp)
Packit 5c3484
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
Packit 5c3484
	mov	r28, u1
Packit 5c3484
	br	r31, $L00
Packit 5c3484
C n mod 4 == 1: first limb goes to u0.  rp is pulled back 8 bytes because both
C   Lcj1 and L01 store at 8(rp).  If the biased n is negative there is only one
C   limb, so go straight to the final step; otherwise load the second limb
C   into u1, pull up back 8 bytes and enter the loop at L01.
Packit 5c3484
$Lb01:	lda	rp, -8(rp)
Packit 5c3484
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
Packit 5c3484
	mov	r28, u0
Packit 5c3484
	blt	n, $Lcj1
Packit 5c3484
	ldq	u1, 8(up)
Packit 5c3484
	lda	up, -8(up)
Packit 5c3484
	br	r31, $L01
Packit 5c3484
C n mod 4 == 2: first limb goes to u3, second to u0, no pointer bias needed.
C   If the biased n is negative there are exactly two limbs, handled by the
C   wind-down code; otherwise fall into the top of the loop.
Packit 5c3484
$Lb10:	ldq	u0, 8(up)
Packit 5c3484
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
Packit 5c3484
	mov	r28, u3
Packit 5c3484
	blt	n, $Lend
Packit 5c3484
Packit 5c3484
	ALIGN(16)
Packit 5c3484
$Ltop:
C Step for limb u3, whose product is already in q1.  It also starts the mulq
C for u0 and loads u1.  The result goes to 0(rp).
C NOTE(review): the C 0 .. C 5 lines presumably number the issue cycles within
C a step, and the trailing L0/L1/U0/U1 tags the intended pipe -- confirm.
Packit 5c3484
C 0
C   cy = (u3 < cy); x = q1 + l0 is the result limb
Packit 5c3484
	cmpult	u3, cy, cy			C L0
Packit 5c3484
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
Packit 5c3484
	ldq	u1, 16(up)			C L1
Packit 5c3484
	addq	q1, l0, x			C U0
Packit 5c3484
C 1
C   cymask = -cy; p6 = (x > 0x5555555555555555)
Packit 5c3484
	negq	cy, cymask			C L0
Packit 5c3484
	unop					C U1
Packit 5c3484
	unop					C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
C 2
C   p7 = (x > 0xAAAAAAAAAAAAAAAA); t0 = -p6
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0				C L0
Packit 5c3484
C 3
C   t1 = -p7; cy += p6; l0 and t0 become masked 0x5555555555555555 terms
Packit 5c3484
	negq	p7, t1				C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
C   cy += p7; l0 accumulates the masked terms
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
C   l0 = new cy * 0x5555555555555555 mod 2^64; store the result limb
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, 0(rp)			C L1
Packit 5c3484
	unop
Packit 5c3484
$L01:
C Same step for limb u0 (product q0): starts the mulq for u1, loads u2,
C stores at 8(rp).
Packit 5c3484
C 0
Packit 5c3484
	cmpult	u0, cy, cy			C L0
Packit 5c3484
	mulq	u1, xAAAAAAAAAAAAAAAB, q1	C U1
Packit 5c3484
	ldq	u2, 24(up)			C L1
Packit 5c3484
	addq	q0, l0, x			C U0
Packit 5c3484
C 1
Packit 5c3484
	negq	cy, cymask			C L0
Packit 5c3484
	unop					C U1
Packit 5c3484
	unop					C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
C 2
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0				C L0
Packit 5c3484
C 3
Packit 5c3484
	negq	p7, t1				C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, 8(rp)			C L1
Packit 5c3484
	unop
Packit 5c3484
$L00:
C Same step for limb u1 (product q1): starts the mulq for u2, loads u3,
C stores at 16(rp).
Packit 5c3484
C 0
Packit 5c3484
	cmpult	u1, cy, cy			C L0
Packit 5c3484
	mulq	u2, xAAAAAAAAAAAAAAAB, q0	C U1
Packit 5c3484
	ldq	u3, 32(up)			C L1
Packit 5c3484
	addq	q1, l0, x			C U0
Packit 5c3484
C 1
Packit 5c3484
	negq	cy, cymask			C L0
Packit 5c3484
	unop					C U1
Packit 5c3484
	unop					C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
C 2
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0				C L0
Packit 5c3484
C 3
Packit 5c3484
	negq	p7, t1				C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, 16(rp)			C L1
Packit 5c3484
	unop
Packit 5c3484
$L11:
C Same step for limb u2 (product q0): starts the mulq for u3, loads u0,
C stores at 24(rp).  The loop counter update is folded into a spare slot, and
C the pointer advance, prefetch and loop branch follow the store.
Packit 5c3484
C 0
Packit 5c3484
	cmpult	u2, cy, cy			C L0
Packit 5c3484
	mulq	u3, xAAAAAAAAAAAAAAAB, q1	C U1
Packit 5c3484
	ldq	u0, 40(up)			C L1
Packit 5c3484
	addq	q0, l0, x			C U0
Packit 5c3484
C 1
Packit 5c3484
	negq	cy, cymask			C L0
Packit 5c3484
	unop					C U1
Packit 5c3484
	unop					C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
C 2
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	lda	n, -4(n)			C L1 bookkeeping
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0				C L0
Packit 5c3484
C 3
Packit 5c3484
	negq	p7, t1				C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, 24(rp)			C L1
Packit 5c3484
	lda	up, 32(up)
Packit 5c3484
C
C   The load below targets r31, so its result is discarded.
Packit 5c3484
	ldl	r31, 256(up)			C prefetch
Packit 5c3484
	unop
Packit 5c3484
	lda	rp, 32(rp)
Packit 5c3484
	bge	n, $Ltop			C U1
Packit 5c3484
C *** MAIN LOOP END ***
Packit 5c3484
$Lend:
C Wind-down, second to last limb (u3, product q1).  Same as the Ltop step
C except that no further limb is loaded; the last mulq (for u0) is started.
Packit 5c3484
Packit 5c3484
	cmpult	u3, cy, cy			C L0
Packit 5c3484
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
Packit 5c3484
	unop
Packit 5c3484
	addq	q1, l0, x			C U0
Packit 5c3484
C 1
Packit 5c3484
	negq	cy, cymask			C L0
Packit 5c3484
	unop					C U1
Packit 5c3484
	unop					C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
C 2
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0				C L0
Packit 5c3484
C 3
Packit 5c3484
	negq	p7, t1				C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, 0(rp)			C L1
Packit 5c3484
	unop
Packit 5c3484
$Lcj1:
C Last limb (u0, product q0).  No next multiply and no new l0 are needed.
C The three carry contributions are summed straight into r0, which is left
C there for the caller, and the result limb is stored at 8(rp).
Packit 5c3484
	cmpult	u0, cy, cy			C L0
Packit 5c3484
	addq	q0, l0, x			C U0
Packit 5c3484
	cmpult	x5555555555555555, x, p6	C U0
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	addq	p7, cy, r0
Packit 5c3484
	stq	x, 8(rp)			C L1
Packit 5c3484
Packit 5c3484
	ret	r31,(r26),1
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()
Packit 5c3484
Packit 5c3484
C This is useful for playing with various schedules.
Packit 5c3484
C Expand as: one(0)one(1)one(2)one(3)
Packit 5c3484
C one(k), for k = 0..3, expands to the k-th unrolled main-loop step without
C the loop bookkeeping (n decrement, pointer advance, prefetch, branch).
C Registers are written as a dollar sign followed by a number from eval:
C   (k+3) mod 4		limb being finished (compared against cy)
C   k			limb whose mulq is started, product to 4 + k mod 2
C   (k+1) mod 4		limb loaded from offset 8k+16 of up
C   4 + (k+1) mod 2	product consumed by this step
C The result limb is stored at offset 8k of rp.
C Numbers 0-3 and 4-5 line up with r0-r3 (u0-u3) and r4-r5 (q0, q1) as
C defined earlier in this file, presumably through dollar-N register naming
C -- confirm.
C NOTE(review): this define sits after ASM_END and is not expanded anywhere in
C the visible file.
define(`one',`
Packit 5c3484
C 0
Packit 5c3484
	cmpult	`$'eval(($1+3)%4), cy, cy		C L0
Packit 5c3484
	mulq	`$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
Packit 5c3484
	ldq	`$'eval(($1+1)%4), eval($1*8+16)(up)	C L1
Packit 5c3484
	addq	`$'eval(4+($1+1)%2), l0, x		C U0
Packit 5c3484
C 1
Packit 5c3484
	negq	cy, cymask				C L0
Packit 5c3484
	unop						C U1
Packit 5c3484
	unop						C L1
Packit 5c3484
	cmpult	x5555555555555555, x, p6		C U0
Packit 5c3484
C 2
Packit 5c3484
	cmpult	xAAAAAAAAAAAAAAAA, x, p7		C U1
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
	negq	p6, t0					C L0
Packit 5c3484
C 3
Packit 5c3484
	negq	p7, t1					C L0
Packit 5c3484
	and	cymask, x5555555555555555, l0		C U1
Packit 5c3484
	addq	p6, cy, cy
Packit 5c3484
	and	t0, x5555555555555555, t0
Packit 5c3484
C 4
Packit 5c3484
	and	t1, x5555555555555555, t1
Packit 5c3484
	addq	p7, cy, cy
Packit 5c3484
	unop
Packit 5c3484
	addq	t0, l0, l0
Packit 5c3484
C 5
Packit 5c3484
	addq	t1, l0, l0
Packit 5c3484
	unop
Packit 5c3484
	stq	x, eval($1*8)(rp)			C L1
Packit 5c3484
	unop
Packit 5c3484
')