Blame mpn/alpha/ev67/gcd_1.asm

Packit 5c3484
dnl  Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C ev67: 3.4 cycles/bitpair for 1x1 part
Packit 5c3484
Packit 5c3484
Packit 5c3484
C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
Packit 5c3484
C
Packit 5c3484
C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
Packit 5c3484
C strip trailing zeros from abs(x-y) to maintain x and y both odd.
Packit 5c3484
C
Packit 5c3484
C The trailing zeros are calculated from just x-y, since in twos-complement
Packit 5c3484
C there's the same number of trailing zeros on d or -d.  This means the cttz
Packit 5c3484
C runs in parallel with abs(x-y).
Packit 5c3484
C
Packit 5c3484
C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
Packit 5c3484
C operands with this algorithm gives the measured 3.4 cycles/bitpair.
Packit 5c3484
C
Packit 5c3484
C The slottings shown are for SVR4 style systems, Unicos differs in the
Packit 5c3484
C initial gp setup and the LEA.
Packit 5c3484
C
Packit 5c3484
C Enhancement:
Packit 5c3484
C
Packit 5c3484
C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
Packit 5c3484
C it to a bsr, but probably only in a static binary.  Plain "jsr foo" gives
Packit 5c3484
C the right object code for relaxation, and ought to be available
Packit 5c3484
C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
Packit 5c3484
C the usual case of running in a shared library.
Packit 5c3484
C
Packit 5c3484
C bsr could perhaps be used explicitly anyway.  We should be able to assume
Packit 5c3484
C modexact is in the same module as us (ie. shared library or mainline).
Packit 5c3484
C Would there be any worries about the size of the displacement?  Could
Packit 5c3484
C always put modexact and gcd_1 in the same .o to be certain.
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
C mpn_gcd_1 entry point.  Outline of the code below:
C
C   1. Load x = xp[0], count the trailing zero bits of x and of y, make y
C      odd, and keep min(x_twos, y_twos) in r10 as the common twos.
C   2. Unless size == 1, call mpn_modexact_1c_odd with r16 and r17 still
C      holding xp and size, r18 = odd y and r19 = 0; its r0 result replaces
C      x and the trailing zero count of x is taken again.
C   3. If x is zero at this point, go straight to L(done).  Otherwise make
C      x odd and run the 1x1 loop at L(top) until x - y == 0.
C   4. L(done) returns y << common_twos in r0.
C
C Register use:
C   r9	odd y, held across the call, copied to r1 afterwards
C   r10	y twos, then common twos, copied to r2 afterwards
C   r6	x twos
C   r5	nonzero when size == 1
C   r0, r1, r2, r4, r7, r8, r16 are the only registers touched in the loop
C
C Stack frame, 32 bytes: 0(r30) = r26, 8(r30) = r9, 16(r30) = r10.
C 24(r30) is not referenced in this routine.
PROLOGUE(mpn_gcd_1, gp)
Packit 5c3484
Packit 5c3484
	C r16	xp
Packit 5c3484
	C r17	size
Packit 5c3484
	C r18	y
Packit 5c3484
Packit 5c3484
	C ldah				C l
Packit 5c3484
	C lda				C u
Packit 5c3484
Packit 5c3484
	ldq	r0, 0(r16)		C L   x = xp[0]
Packit 5c3484
	lda	r30, -32(r30)		C u   alloc stack
Packit 5c3484
Packit 5c3484
	LEA(  r27, mpn_modexact_1c_odd)	C L   modexact addr, ldq (gp)
Packit 5c3484
	stq	r10, 16(r30)		C L   save r10
Packit 5c3484
	cttz	r18, r10		C U0  y twos
Packit 5c3484
	cmpeq	r17, 1, r5		C u   test size==1
Packit 5c3484
Packit 5c3484
	stq	r9, 8(r30)		C L   save r9
Packit 5c3484
	clr	r19			C u   zero c for modexact
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	cttz	r0, r6			C U0  x twos
Packit 5c3484
	stq	r26, 0(r30)		C L   save ra
Packit 5c3484
Packit 5c3484
	srl	r18, r10, r18		C U   y odd
Packit 5c3484
Packit 5c3484
	mov	r18, r9			C l   hold y across call
Packit 5c3484
Packit 5c3484
	cmpult	r6, r10, r2		C u   test x_twos < y_twos
Packit 5c3484
Packit 5c3484
	cmovne	r2, r6, r10		C l   common_twos = min(x_twos,y_twos)
Packit 5c3484
	C The common twos are fixed here, from the original xp[0] and y,
	C before x is replaced by the modexact result below.
	bne	r5, L(one)		C U   no modexact if size==1
Packit 5c3484
	jsr	r26, (r27), mpn_modexact_1c_odd   C L0
Packit 5c3484
Packit 5c3484
	C Back from the call: r0 is the new x.  The gp is set up again from
	C r26, and only then is the saved r26 reloaded from the frame.
	LDGP(	r29, 0(r26))		C u,l ldah,lda
Packit 5c3484
	cttz	r0, r6			C U0  new x twos
Packit 5c3484
	ldq	r26, 0(r30)		C L   restore ra
Packit 5c3484
Packit 5c3484
L(one):
Packit 5c3484
	C Both paths join here with x in r0 and x twos in r6.  y and the
	C common twos move into r1 and r2 so that r9 and r10 can be restored
	C and the frame released before the loop.
	mov	r9, r1			C u   y
Packit 5c3484
	ldq	r9, 8(r30)		C L   restore r9
Packit 5c3484
	mov	r10, r2			C u   common twos
Packit 5c3484
	ldq	r10, 16(r30)		C L   restore r10
Packit 5c3484
Packit 5c3484
	lda	r30, 32(r30)		C l   free stack
Packit 5c3484
	beq	r0, L(done)		C U   return y if x%y==0
Packit 5c3484
Packit 5c3484
	srl	r0, r6, r0		C U   x odd
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	ALIGN(16)
Packit 5c3484
L(top):
Packit 5c3484
	C r0	x
Packit 5c3484
	C r1	y
Packit 5c3484
	C r2	common twos, for use at end
Packit 5c3484
	C
	C Each pass replaces x,y by abs(x-y),min(x,y) and strips the
	C trailing zeros from the new x.  Both x-y and y-x are formed, and
	C r16 selects between them, so there is no branch on the comparison.
Packit 5c3484
	subq	r0, r1, r7		C l0  d = x - y
Packit 5c3484
	cmpult	r0, r1, r16		C u0  r16 = (x < y)
Packit 5c3484
Packit 5c3484
	subq	r1, r0, r4		C l0  new_x = y - x
Packit 5c3484
	cttz	r7, r8			C U0  d twos
Packit 5c3484
Packit 5c3484
	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
Packit 5c3484
	cmovne	r16, r0, r1		C u0  y = x if x<y
Packit 5c3484
	unop				C l   \ force cmoveq into l0
Packit 5c3484
	unop				C u   /
Packit 5c3484
Packit 5c3484
	C				C cmoveq2 L0, cmovne2 U0
Packit 5c3484
Packit 5c3484
	C On the last pass d is zero, and the r0 written by the srl below is
	C not used: L(done) forms the result from r1 and r2 only.
	srl	r4, r8, r0		C U0  x = new_x >> twos
Packit 5c3484
	bne	r7, L(top)		C U1  stop when d==0
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done):
Packit 5c3484
	C Reached from the loop exit, or directly when x was zero after the
	C setup above.  r1 is the odd y, r2 the common twos.
	sll	r1, r2, r0		C U0  return y << common_twos
Packit 5c3484
	ret	r31, (r26), 1		C L0
Packit 5c3484
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()