Blame mpn/ia64/mode1o.asm

Packit 5c3484
dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
Packit 5c3484
Packit 5c3484
dnl  Contributed to the GNU project by Kevin Ryde.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2003-2005 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C            cycles/limb
Packit 5c3484
C Itanium:      15
Packit 5c3484
C Itanium 2:     8
Packit 5c3484
Packit 5c3484
Packit 5c3484
dnl  Usage: ABI32(`code')
Packit 5c3484
dnl
Packit 5c3484
dnl  Emit the given code only under HAVE_ABI_32.
Packit 5c3484
dnl
Packit 5c3484
define(ABI32,
Packit 5c3484
m4_assert_onearg()
Packit 5c3484
`ifdef(`HAVE_ABI_32',`$1')')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
Packit 5c3484
C                                mp_limb_t divisor, mp_limb_t carry);
Packit 5c3484
C
Packit 5c3484
C The modexact algorithm is usually conceived as a dependent chain
Packit 5c3484
C
Packit 5c3484
C	l = src[i] - c
Packit 5c3484
C	q = low(l * inverse)
Packit 5c3484
C	c = high(q*divisor) + (src[i]
Packit 5c3484
C
Packit 5c3484
C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
Packit 5c3484
C separately (off the dependent chain) and using
Packit 5c3484
C
Packit 5c3484
C	q = low(c * inverse + si)
Packit 5c3484
C	c = high(q*divisor + c)
Packit 5c3484
C
Packit 5c3484
C This means the dependent chain is simply xma.l followed by xma.hu, for a
Packit 5c3484
C total 8 cycles/limb on itanium-2.
Packit 5c3484
C
Packit 5c3484
C The reason xma.hu works for the new c is that the low of q*divisor is
Packit 5c3484
C src[i]-c (being the whole purpose of the q generated, and it can be
Packit 5c3484
C verified algebraically).  If there was an underflow from src[i]-c, then
Packit 5c3484
C there will be an overflow from (src-c)+c, thereby adding 1 to the new c
Packit 5c3484
C the same as the borrow bit (src[i]
Packit 5c3484
C
Packit 5c3484
C Incidentally, fcmp is not an option for treating src[i]-c, since it
Packit 5c3484
C apparently traps to the kernel for unnormalized operands like those used
Packit 5c3484
C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
Packit 5c3484
C cycles.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C First Limb:
Packit 5c3484
C
Packit 5c3484
C The first limb uses q = (src[0]-c) * inverse shown in the first style.
Packit 5c3484
C This lets us get the first q as soon as the inverse is ready, without
Packit 5c3484
C going through si=s*inverse.  Basically at the start we have c and can use
Packit 5c3484
C it while waiting for the inverse, whereas for the second and subsequent
Packit 5c3484
C limbs it's the other way around, ie. we have the inverse and are waiting
Packit 5c3484
C for c.
Packit 5c3484
C
Packit 5c3484
C At .Lentry the first two instructions in the loop have been done already.
Packit 5c3484
C The load of f11=src[1] at the start (predicated on size>=2), and the
Packit 5c3484
C calculation of q by the initial different scheme.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Entry Sequence:
Packit 5c3484
C
Packit 5c3484
C In the entry sequence, the critical path is the calculation of the
Packit 5c3484
C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
Packit 5c3484
C established nice and early so the br.cloop's should predict perfectly.
Packit 5c3484
C And the load for the low limbs src[0] and src[1] can be initiated long
Packit 5c3484
C ahead of where they're needed.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Inverse Calculation:
Packit 5c3484
C
Packit 5c3484
C The initial 8-bit inverse is calculated using a table lookup.  If it hits
Packit 5c3484
C L1 (which is likely if we're called several times) then it should take a
Packit 5c3484
C total 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
Packit 5c3484
C the best approach, on balance.  It could be done bitwise, but that would
Packit 5c3484
C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
Packit 5c3484
C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
Packit 5c3484
C but that would be about 11 cycles.
Packit 5c3484
C
Packit 5c3484
C The table is not the same as binvert_limb_table, instead it's 256 bytes,
Packit 5c3484
C designed to be indexed by the low byte of the divisor.  The divisor is
Packit 5c3484
C always odd, so the relevant data is every second byte in the table.  The
Packit 5c3484
C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
Packit 5c3484
C cycle because it must go down I0, and we're using the first I0 slot to get
Packit 5c3484
C ip.  The extra 128 bytes of padding should be insignificant compared to
Packit 5c3484
C typical ia64 code bloat.
Packit 5c3484
C
Packit 5c3484
C Having the table in .text allows us to use IP-relative addressing,
Packit 5c3484
C avoiding a fetch from ltoff.  .rodata is apparently not suitable for use
Packit 5c3484
C IP-relative, it gets a linker relocation overflow on GNU/Linux.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Load Scheduling:
Packit 5c3484
C
Packit 5c3484
C In the main loop, the data loads are scheduled for an L2 hit, which means
Packit 5c3484
C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
Packit 5c3484
C any case that scheduling is achieved simply by doing the load (and xmpy.l
Packit 5c3484
C for "si") in the immediately preceding iteration.
Packit 5c3484
C
Packit 5c3484
C The main loop requires size >= 2, and we handle size==1 by an initial
Packit 5c3484
C br.cloop to enter the loop only if size>1.  Since ar.lc is established
Packit 5c3484
C early, this should predict perfectly.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Not done:
Packit 5c3484
C
Packit 5c3484
C Consideration was given to using a plain "(src[0]-c) % divisor" for
Packit 5c3484
C size==1, but cycle counting suggests about 50 for the sort of approach
Packit 5c3484
C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
Packit 5c3484
C L1 hits for their respective fetching.)
Packit 5c3484
C
Packit 5c3484
C Consideration was given to a test for high
Packit 5c3484
C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
Packit 5c3484
C Branching on high
Packit 5c3484
C more than the loop iteration saved, and the condition is of course data
Packit 5c3484
C dependent.  So the theory would be to shorten the loop count if
Packit 5c3484
C high
Packit 5c3484
C a gain of 6 when high
Packit 5c3484
C
Packit 5c3484
C Whether such a tradeoff is a win on average depends on assumptions about
Packit 5c3484
C how many bits in the high and the divisor.  If both are uniformly
Packit 5c3484
C distributed then high
Packit 5c3484
C divisors (less chance of high
Packit 5c3484
C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
Packit 5c3484
C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
Packit 5c3484
C for the moment, it's felt the gain is not really enough to be worth the
Packit 5c3484
C trouble.
Packit 5c3484
C
Packit 5c3484
C
Packit 5c3484
C Enhancement:
Packit 5c3484
C
Packit 5c3484
C Process two source limbs per iteration using a two-limb inverse and a
Packit 5c3484
C sequence like
Packit 5c3484
C
Packit 5c3484
C	ql  = low (c * il + sil)	quotient low limb
Packit 5c3484
C	qlc = high(c * il + sil)
Packit 5c3484
C	qh1 = low (c * ih + sih)	quotient high, partial
Packit 5c3484
C
Packit 5c3484
C	cl = high (ql * d + c)		carry out of low
Packit 5c3484
C	qh = low (qlc * 1 + qh1)	quotient high limb
Packit 5c3484
C
Packit 5c3484
C	new c = high (qh * d + cl)	carry out of high
Packit 5c3484
C
Packit 5c3484
C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
Packit 5c3484
C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
Packit 5c3484
C chain with 4 multiplies.  The bigger inverse would take extra time to
Packit 5c3484
C calculate, but a one limb iteration to handle an odd size could be done as
Packit 5c3484
C soon as 64-bits of inverse were ready.
Packit 5c3484
C
Packit 5c3484
C Perhaps this could even extend to a 3 limb inverse, which might promise 17
Packit 5c3484
C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
Packit 5c3484
C
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
	.explicit
Packit 5c3484
Packit 5c3484
	.text
Packit 5c3484
	.align	32
Packit 5c3484
.Ltable:
Packit 5c3484
data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
Packit 5c3484
data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
Packit 5c3484
data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
Packit 5c3484
data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
Packit 5c3484
data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
Packit 5c3484
data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
Packit 5c3484
data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
Packit 5c3484
data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
Packit 5c3484
data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
Packit 5c3484
data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
Packit 5c3484
data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
Packit 5c3484
data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
Packit 5c3484
data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
Packit 5c3484
data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
Packit 5c3484
data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
Packit 5c3484
data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
Packit 5c3484
Packit 5c3484
Packit 5c3484
PROLOGUE(mpn_modexact_1c_odd)
Packit 5c3484
Packit 5c3484
	C r32	src
Packit 5c3484
	C r33	size
Packit 5c3484
	C r34	divisor
Packit 5c3484
	C r35	carry
Packit 5c3484
Packit 5c3484
	.prologue
Packit 5c3484
.Lhere:
Packit 5c3484
{ .mmi;	add	r33 = -1, r33		C M0  size-1
Packit 5c3484
	mov	r14 = 2			C M1  2
Packit 5c3484
	mov	r15 = ip		C I0  .Lhere
Packit 5c3484
}{.mmi;	setf.sig f6 = r34		C M2  divisor
Packit 5c3484
	setf.sig f9 = r35		C M3  carry
Packit 5c3484
	zxt1	r3 = r34		C I1  divisor low byte
Packit 5c3484
}	;;
Packit 5c3484
Packit 5c3484
{ .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
Packit 5c3484
	sub	r16 = 0, r34		C M1  -divisor
Packit 5c3484
	.save	ar.lc, r2
Packit 5c3484
	mov	r2 = ar.lc		C I0
Packit 5c3484
}{.mmi;	.body
Packit 5c3484
	setf.sig f13 = r14		C M2  2 in significand
Packit 5c3484
	mov	r17 = -1		C M3  -1
Packit 5c3484
ABI32(`	zxt4	r33 = r33')		C I1  size extend
Packit 5c3484
}	;;
Packit 5c3484
Packit 5c3484
{ .mmi;	add	r3 = r3, r15		C M0  table entry address
Packit 5c3484
ABI32(` addp4	r32 = 0, r32')		C M1  src extend
Packit 5c3484
	mov	ar.lc = r33		C I0  size-1 loop count
Packit 5c3484
}{.mmi;	setf.sig f12 = r16		C M2  -divisor
Packit 5c3484
	setf.sig f8 = r17		C M3  -1
Packit 5c3484
}	;;
Packit 5c3484
Packit 5c3484
{ .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
Packit 5c3484
	ldf8	f10 = [r32], 8		C M1  src[0]
Packit 5c3484
	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
Packit 5c3484
}	;;
Packit 5c3484
Packit 5c3484
	C Wait for table load.
Packit 5c3484
	C Hope for an L1 hit of 1 cycles to ALU, but could be more.
Packit 5c3484
	setf.sig f7 = r3		C M2  inverse, 8 bits
Packit 5c3484
(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	C 5 cycles
Packit 5c3484
Packit 5c3484
	C f6	divisor
Packit 5c3484
	C f7	inverse, being calculated
Packit 5c3484
	C f8	-1, will be -inverse
Packit 5c3484
	C f9	carry
Packit 5c3484
	C f10	src[0]
Packit 5c3484
	C f11	src[1]
Packit 5c3484
	C f12	-divisor
Packit 5c3484
	C f13	2
Packit 5c3484
	C f14	scratch
Packit 5c3484
Packit 5c3484
	xmpy.l	f14 = f13, f7		C 2*i
Packit 5c3484
	xmpy.l	f7 = f7, f7		C i*i
Packit 5c3484
	;;
Packit 5c3484
	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	xmpy.l	f14 = f13, f7		C 2*i
Packit 5c3484
	xmpy.l	f7 = f7, f7		C i*i
Packit 5c3484
	;;
Packit 5c3484
	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	xmpy.l	f14 = f13, f7		C 2*i
Packit 5c3484
	xmpy.l	f7 = f7, f7		C i*i
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
Packit 5c3484
	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
Packit 5c3484
	;;
Packit 5c3484
ASSERT(p6, `
Packit 5c3484
	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
Packit 5c3484
	getf.sig r31 = f15 ;;
Packit 5c3484
	cmp.eq	p6,p0 = 1, r31	C should == 1
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
	xmpy.l	f10 = f10, f7		C q = sc * inverse
Packit 5c3484
	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
Packit 5c3484
	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	C size==1, finish up now
Packit 5c3484
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
Packit 5c3484
	mov	ar.lc = r2		C I0
Packit 5c3484
	;;
Packit 5c3484
	getf.sig r8 = f9		C M2  return c
Packit 5c3484
	br.ret.sptk.many b0
Packit 5c3484
Packit 5c3484
Packit 5c3484
Packit 5c3484
.Ltop:
Packit 5c3484
	C r2	saved ar.lc
Packit 5c3484
	C f6	divisor
Packit 5c3484
	C f7	inverse
Packit 5c3484
	C f8	-inverse
Packit 5c3484
	C f9	carry
Packit 5c3484
	C f10	src[i] * inverse
Packit 5c3484
	C f11	scratch src[i+1]
Packit 5c3484
Packit 5c3484
	add	r16 = 160, r32
Packit 5c3484
	ldf8	f11 = [r32], 8		C src[i+1]
Packit 5c3484
	;;
Packit 5c3484
	C 2 cycles
Packit 5c3484
Packit 5c3484
	lfetch	[r16]
Packit 5c3484
	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
Packit 5c3484
	;;
Packit 5c3484
	C 3 cycles
Packit 5c3484
Packit 5c3484
.Lentry:
Packit 5c3484
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
Packit 5c3484
	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
Packit 5c3484
	br.cloop.sptk.few.clr .Ltop
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
Packit 5c3484
Packit 5c3484
	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
Packit 5c3484
	mov	ar.lc = r2		C I0
Packit 5c3484
	;;
Packit 5c3484
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
Packit 5c3484
	;;
Packit 5c3484
	getf.sig r8 = f9		C M2  return c
Packit 5c3484
	br.ret.sptk.many b0
Packit 5c3484
Packit 5c3484
EPILOGUE()