Blame mpn/alpha/mode1o.asm

Packit 5c3484
dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder
Packit 5c3484
Packit 5c3484
dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C      cycles/limb
Packit 5c3484
C EV4:    47
Packit 5c3484
C EV5:    30
Packit 5c3484
C EV6:    15
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
Packit 5c3484
C                                mp_limb_t c)
Packit 5c3484
C
Packit 5c3484
C This code follows the "alternate" code in mpn/generic/mode1o.c,
Packit 5c3484
C eliminating cbit+climb from the dependent chain.  This leaves,
Packit 5c3484
C
Packit 5c3484
C        ev4   ev5   ev6
Packit 5c3484
C         1     3     1    subq   y = x - h
Packit 5c3484
C        23    13     7    mulq   q = y * inverse
Packit 5c3484
C        23    14     7    umulh  h = high (q * d)
Packit 5c3484
C        --    --    --
Packit 5c3484
C        47    30    15
Packit 5c3484
C
Packit 5c3484
C In each case, the load latency, loop control, and extra carry bit handling
Packit 5c3484
C hide under the multiply latencies.  Those latencies are long enough that
Packit 5c3484
C we don't need to worry about alignment or pairing to squeeze out
Packit 5c3484
C performance.
Packit 5c3484
C
Packit 5c3484
C For the first limb, some of the loop code is broken out and scheduled back
Packit 5c3484
C since it can be done earlier.
Packit 5c3484
C
Packit 5c3484
C   - The first ldq src[0] is near the start of the routine, for maximum
Packit 5c3484
C     time from memory.
Packit 5c3484
C
Packit 5c3484
C   - The subq y=x-climb can be done without waiting for the inverse.
Packit 5c3484
C
Packit 5c3484
C   - The mulq y*inverse is replicated after the final subq for the inverse,
Packit 5c3484
C     instead of branching to the mulq in the main loop.  On ev4 a branch
Packit 5c3484
C     there would cost cycles, but we can hide them under the mulq latency.
Packit 5c3484
C
Packit 5c3484
C For the last limb, high
Packit 5c3484
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
Packit 5c3484
C data-dependent branch, but we're waiting for umulh so any penalty should
Packit 5c3484
C hide there.  The multiplies saved would be worth the cost anyway.
Packit 5c3484
C
Packit 5c3484
C Enhancements:
Packit 5c3484
C
Packit 5c3484
C For size==1, a plain division (done bitwise say) might be faster than
Packit 5c3484
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
Packit 5c3484
C ev5.  A call to gcc __remqu might be a possibility.
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
PROLOGUE(mpn_modexact_1c_odd,gp)
Packit 5c3484
Packit 5c3484
	C r16	src
Packit 5c3484
	C r17	size
Packit 5c3484
	C r18	d
Packit 5c3484
	C r19	c
Packit 5c3484
Packit 5c3484
	LEA(r0, binvert_limb_table)
Packit 5c3484
	srl	r18, 1, r20		C d >> 1
Packit 5c3484
Packit 5c3484
	and	r20, 127, r20		C idx = d>>1 & 0x7F
Packit 5c3484
Packit 5c3484
	addq	r0, r20, r21		C table + idx
Packit 5c3484
Packit 5c3484
ifelse(bwx_available_p,1,
Packit 5c3484
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
Packit 5c3484
',`
Packit 5c3484
	ldq_u	r20, 0(r21)		C table[idx] qword
Packit 5c3484
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
	mull	r20, r20, r7		C i*i
Packit 5c3484
	addq	r20, r20, r20		C 2*i
Packit 5c3484
Packit 5c3484
	ldq	r2, 0(r16)		C x = s = src[0]
Packit 5c3484
	lda	r17, -1(r17)		C size--
Packit 5c3484
	clr	r0			C initial cbit=0
Packit 5c3484
Packit 5c3484
	mull	r7, r18, r7		C i*i*d
Packit 5c3484
Packit 5c3484
	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits
Packit 5c3484
Packit 5c3484
	mull	r20, r20, r7		C i*i
Packit 5c3484
	addq	r20, r20, r20		C 2*i
Packit 5c3484
Packit 5c3484
	mull	r7, r18, r7		C i*i*d
Packit 5c3484
Packit 5c3484
	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits
Packit 5c3484
Packit 5c3484
	mulq	r20, r20, r7		C i*i
Packit 5c3484
	addq	r20, r20, r20		C 2*i
Packit 5c3484
Packit 5c3484
	mulq	r7, r18, r7		C i*i*d
Packit 5c3484
	subq	r2, r19, r3		C y = x - climb
Packit 5c3484
Packit 5c3484
	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits
Packit 5c3484
Packit 5c3484
ASSERT(r7, C should have d*inv==1 mod 2^64
Packit 5c3484
`	mulq	r18, r20, r7
Packit 5c3484
	cmpeq	r7, 1, r7')
Packit 5c3484
Packit 5c3484
	mulq	r3, r20, r4		C first q = y * inv
Packit 5c3484
Packit 5c3484
	beq	r17, L(one)		C if size==1
Packit 5c3484
	br	L(entry)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(top):
Packit 5c3484
	C r0	cbit
Packit 5c3484
	C r16	src, incrementing
Packit 5c3484
	C r17	size, decrementing
Packit 5c3484
	C r18	d
Packit 5c3484
	C r19	climb
Packit 5c3484
	C r20	inv
Packit 5c3484
Packit 5c3484
	ldq	r1, 0(r16)		C s = src[i]
Packit 5c3484
	subq	r1, r0, r2		C x = s - cbit
Packit 5c3484
	cmpult	r1, r0, r0		C new cbit = s < cbit
Packit 5c3484
Packit 5c3484
	subq	r2, r19, r3		C y = x - climb
Packit 5c3484
Packit 5c3484
	mulq	r3, r20, r4		C q = y * inv
Packit 5c3484
L(entry):
Packit 5c3484
	cmpult	r2, r19, r5		C cbit2 = x < climb
Packit 5c3484
	addq	r5, r0, r0		C cbit += cbit2
Packit 5c3484
	lda	r16, 8(r16)		C src++
Packit 5c3484
	lda	r17, -1(r17)		C size--
Packit 5c3484
Packit 5c3484
	umulh	r4, r18, r19		C climb = q * d
Packit 5c3484
	bne	r17, L(top)		C while 2 or more limbs left
Packit 5c3484
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C r0	cbit
Packit 5c3484
	C r18	d
Packit 5c3484
	C r19	climb
Packit 5c3484
	C r20	inv
Packit 5c3484
Packit 5c3484
	ldq	r1, 0(r16)		C s = src[size-1] high limb
Packit 5c3484
Packit 5c3484
	cmpult	r1, r18, r2		C test high
Packit 5c3484
	bne	r2, L(skip)		C skip if so
Packit 5c3484
Packit 5c3484
	C can't skip a division, repeat loop code
Packit 5c3484
Packit 5c3484
	subq	r1, r0, r2		C x = s - cbit
Packit 5c3484
	cmpult	r1, r0, r0		C new cbit = s < cbit
Packit 5c3484
Packit 5c3484
	subq	r2, r19, r3		C y = x - climb
Packit 5c3484
Packit 5c3484
	mulq	r3, r20, r4		C q = y * inv
Packit 5c3484
L(one):
Packit 5c3484
	cmpult	r2, r19, r5		C cbit2 = x < climb
Packit 5c3484
	addq	r5, r0, r0		C cbit += cbit2
Packit 5c3484
Packit 5c3484
	umulh	r4, r18, r19		C climb = q * d
Packit 5c3484
Packit 5c3484
	addq	r19, r0, r0		C return climb + cbit
Packit 5c3484
	ret	r31, (r26), 1
Packit 5c3484
Packit 5c3484
Packit 5c3484
	ALIGN(8)
Packit 5c3484
L(skip):
Packit 5c3484
	C with high
Packit 5c3484
	C an addback of d if that underflows
Packit 5c3484
Packit 5c3484
	addq	r19, r0, r19		C c = climb + cbit
Packit 5c3484
Packit 5c3484
	subq	r19, r1, r2		C c - s
Packit 5c3484
	cmpult	r19, r1, r3		C c < s
Packit 5c3484
Packit 5c3484
	addq	r2, r18, r0		C return c-s + divisor
Packit 5c3484
Packit 5c3484
	cmoveq	r3, r2, r0		C return c-s if no underflow
Packit 5c3484
	ret	r31, (r26), 1
Packit 5c3484
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()