Blame mpn/arm64/gcd_1.asm

Packit 5c3484
dnl  ARM64 mpn_gcd_1.
Packit 5c3484
Packit 5c3484
dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjorn
Packit 5c3484
dnl  Granlund.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
changecom(@&*$)
Packit 5c3484
Packit 5c3484
C	     cycles/bit (approx)
Packit 5c3484
C Cortex-A53	 ?
Packit 5c3484
C Cortex-A57	 ?
Packit 5c3484
Packit 5c3484
C TODO
Packit 5c3484
C  * Optimise inner-loop better.
Packit 5c3484
C  * Push saving/restoring of callee-saved regs into call code
Packit 5c3484
Packit 5c3484
C Threshold of when to call bmod when U is one limb.  Should be about
Packit 5c3484
C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
Packit 5c3484
C The value is used as a right-shift count: in the single-limb case the code
C compares v0 against u0 >> BMOD_THRES_LOG2 and skips the bmod call when v0
C is the larger of the two.
define(`BMOD_THRES_LOG2', 7)
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS
Packit 5c3484
C up  address of the limbs of U; the low limb is loaded from [up]
define(`up',    `x0')
Packit 5c3484
C n   limb count of U; n equal to 1 selects the single-limb path
define(`n',     `x1')
Packit 5c3484
C v0  the single limb V
define(`v0',    `x2')
Packit 5c3484
Packit 5c3484
C Multi-limb U: below this limb count the code calls mpn_modexact_1c_odd,
C at or above it mpn_mod_1.  The value 30 is only a fallback, used when
C nothing has defined the symbol before this point.
ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
Packit 5c3484
  `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
	TEXT
Packit 5c3484
	ALIGN(16)
Packit 5c3484
C mpn_gcd_1 -- GCD of the n-limb number U at up and the single limb v0.
C
C Outline:
C   1. Record the number of trailing zero bits shared by u0 and v0, and
C      strip all trailing zeros from v0.
C   2. Reduce U to a single limb, either by using u0 directly (n equal to 1
C      and v0 not much smaller than u0) or by calling mpn_modexact_1c_odd
C      or mpn_mod_1.
C   3. Run a subtract-and-shift loop on the two limbs.
C   4. Shift the surviving odd value left by the count from step 1.
C
C In:    x0 = up, x1 = n, x2 = v0
C Out:   x0 = result limb
C Regs:  x19  odd operand y of the loop (starts as v0 without trailing zeros)
C        x20  shared trailing zero count, applied again at L(end)
C        x3   the other operand x;  x12  trailing zero count of x
C        x1, x4  scratch inside the loop
C x19 and x20 are callee-saved registers, chosen so that their values
C survive the bl instructions; they are saved on entry and restored on exit
C together with x29 and x30 in a 32-byte stack frame.
C NOTE(review): the code appears to assume v0 is nonzero and n is at least 1;
C confirm against the documented interface.
PROLOGUE(mpn_gcd_1)
Packit 5c3484
	stp	x29, x30, [sp,#-32]!
Packit 5c3484
	ldr	x3, [up]		C U low limb
Packit 5c3484
	stp     x19, x20, [sp,#16]
Packit 5c3484
Packit 5c3484
C x20 = ctz(u0 | v0).  There is no count-trailing-zeros instruction in use
C here, so the bits are reversed with rbit and counted from the top with clz.
	orr	x3, x3, v0
Packit 5c3484
	rbit	x4, x3
Packit 5c3484
	clz	x20, x4			C min(ctz(u0),ctz(v0))
Packit 5c3484
Packit 5c3484
C Strip the trailing zeros from v0.
	rbit	x12, v0
Packit 5c3484
	clz	x12, x12
Packit 5c3484
	lsr	v0, v0, x12
Packit 5c3484
Packit 5c3484
C Keep the shifted v0 in x19 as well; x2 itself is handed to the callee below.
	mov	x19, v0
Packit 5c3484
Packit 5c3484
	cmp	n, #1
Packit 5c3484
	b.ne	L(nby1)
Packit 5c3484
Packit 5c3484
C Both U and V are single limbs, reduce with bmod if u0 >> v0.
Packit 5c3484
C x3 was overwritten by the orr above, so u0 is loaded again.  When
C v0 > (u0 >> BMOD_THRES_LOG2) the call is skipped and u0 is used as is.
	ldr	x3, [up]
Packit 5c3484
	cmp	v0, x3, lsr #BMOD_THRES_LOG2
Packit 5c3484
	b.hi	L(red1)
Packit 5c3484
Packit 5c3484
C Call mpn_modexact_1c_odd with x0 = up, x1 = n, x2 = shifted v0, x3 = 0.
L(bmod):mov	x3, #0			C carry argument
Packit 5c3484
	bl	mpn_modexact_1c_odd
Packit 5c3484
	b	L(red0)
Packit 5c3484
Packit 5c3484
C U has more than one limb: pick the reduction routine by limb count.
L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
Packit 5c3484
	b.lo	L(bmod)
Packit 5c3484
Packit 5c3484
C Call mpn_mod_1 with x0 = up, x1 = n, x2 = shifted v0.
	bl	mpn_mod_1
Packit 5c3484
Packit 5c3484
C L(red0): the value returned in x0 becomes x.  L(red1) is entered directly
C with x3 = u0 when the reduction call was skipped.
C rbit and clz leave the flags alone, so the b.ne below still tests the cmp:
C a zero x goes straight to L(end) with x19 as the odd part of the result,
C otherwise the loop is entered at L(mid) with x12 = ctz(x).
L(red0):mov	x3, x0
Packit 5c3484
L(red1):cmp	x3, #0
Packit 5c3484
	rbit	x12, x3
Packit 5c3484
	clz	x12, x12
Packit 5c3484
	b.ne	L(mid)
Packit 5c3484
	b	L(end)
Packit 5c3484
Packit 5c3484
C Main loop.  On every pass x4 = x3 >> x12 is x with its trailing zeros
C removed and x1 = x19 - x4.  The loop is left when that difference is zero.
C The ifelse condition is the constant 1,1, so the first variant below is
C the one that gets assembled.
	ALIGN(8)
Packit 5c3484
L(top):
Packit 5c3484
ifelse(1,1,`
Packit 5c3484
C This shorter variant makes full use of armv8 insns
Packit 5c3484
C The flags here come from the subs at the bottom of the loop.  Carry set
C means no borrow, that is x19 >= x4.
C   x3  = x1 if carry set, else -x1      (absolute difference)
C   x19 = x4 if carry set, else x19      (smaller of the two operands)
	csneg	x3, x1, x1, cs		C if x-y < 0
Packit 5c3484
	csel	x19, x4, x19, cs	C use x,y-x
Packit 5c3484
L(mid):	lsr	x4, x3, x12		C
Packit 5c3484
	subs	x1, x19, x4		C
Packit 5c3484
',`
Packit 5c3484
C This variant is akin to the 32-bit v6t2 code
Packit 5c3484
C Not assembled with the ifelse condition as written above.
	csel	x3, x1, x3, cs		C if x-y < 0
Packit 5c3484
	csel	x19, x0, x19, cs	C use x,y-x
Packit 5c3484
L(mid):	lsr	x3, x3, x12		C
Packit 5c3484
	mov	x0, x3			C
Packit 5c3484
	subs	x1, x19, x3		C
Packit 5c3484
	sub	x3, x3, x19		C
Packit 5c3484
')
Packit 5c3484
C x12 = ctz(x1), the shift count for the next pass.  rbit and clz do not
C touch the flags, so b.ne tests the result of the subs: loop while the
C difference is nonzero.
	rbit	x12, x1
Packit 5c3484
	clz	x12, x12		C
Packit 5c3484
	b.ne	L(top)			C
Packit 5c3484
Packit 5c3484
C Put back the trailing zero bits recorded in x20, then restore the saved
C registers and pop the 32-byte frame.
L(end):	lsl	x0, x19, x20
Packit 5c3484
	ldp     x19, x20, [sp,#16]
Packit 5c3484
	ldp	x29, x30, [sp],#32
Packit 5c3484
	ret
Packit 5c3484
EPILOGUE()