dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb
C                norm    frac
C 7410          ~36.5   ~36.5
C 744x, 745x     29      29

C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d  = r7

C TODO
C  * Decrease register usage.
C  * Make sure mul operands are optimal for early-out.
C  * Check that things work well for a shared library build.
C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
C    least vastly improve the current __udiv_qrnnd_c based code.

ASM_START()

C NOTE(review): per the standard GMP mpn_divrem_2 contract (confirm against
C gmp-impl.h), this divides {up, un} plus fn implicit zero fraction limbs by
C the normalized 2-limb divisor {d, 2}, storing quotient limbs at qp, leaving
C the 2-limb remainder at up[0..1], and returning (in r3) the most significant
C quotient limb, which this code computes as 0 or 1 up front.
PROLOGUE(mpn_divrem_2)
C Allocate a 32-byte frame and save callee-saved r28..r31 (stmw stores the
C range r28-r31 at offset 8).
	stwu	r1, -32(r1)
	slwi	r0, r6, 2		C r0 = un * 4 (byte size of {up, un})
	add	r5, r5, r0		C r5 = up_param + un (one past the end)
	stmw	r28, 8(r1)
	addi	r29, r5, -8		C up = up_param + un - 2
	lwz	r10, 4(r7)		C r10 = d1, high divisor limb
	lwz	r12, 4(r29)		C r12 = n1 = up[un-1], high dividend limb
	addi	r8, r3, -12		C r8 = qp - 3 limbs (biased quotient ptr)
	lwz	r7, 0(r7)		C r7 = d0, low divisor limb
	cmplw	cr7, r12, r10
	lwz	r28, 0(r29)		C r28 = n0 = up[un-2]
C Most significant quotient limb: 1 iff {n1,n0} >= {d1,d0}; in that case
C subtract the divisor once so the remainder part is reduced below d.
	blt-	cr7, L(2)
	bgt+	cr7, L(4)
	cmplw	cr7, r28, r7
	blt-	cr7, L(2)
L(4):	subfc	r28, r7, r28		C {n1,n0} -= {d1,d0}
	subfe	r12, r10, r12
	li	r3, 1			C msl = 1 (function return value)
	b	L(6)
L(2):	li	r3, 0			C msl = 0
L(6):	add	r0, r4, r6		C r0 = fn + un = total quotient limbs + 2
	addic.	r30, r0, -2		C r30 = loop count; also sets CA for later
	ble-	cr0, L(ret)		C no inner-loop quotient limbs to produce

	slwi	r9, r0, 2
	add	r8, r8, r9		C rp += un + fn
	mtctr	r30			C CTR drives the bdnz main loop

C Compute di from d1
C Approximate the inverse limb di = (B^2 - 1 - d1*B - d1?) / d1 two 16-bit
C quotient digits at a time, using divwu on the high halves plus multiply-back
C corrections -- the __udiv_qrnnd_c scheme mentioned in the TODO above.
C NOTE(review): exact invariant of di inferred from the GMP reciprocal
C convention; confirm against invert_limb / udiv_qrnnd documentation.
	srwi	r11, r10, 16		C r11 = high 16 bits of d1
	nor	r0, r10, r10		C r0 = ~d1 = B - 1 - d1
	divwu	r31, r0, r11		C q1 estimate (high quotient digit)
	rlwinm	r5, r10, 0, 16, 31	C r5 = low 16 bits of d1
	mullw	r9, r11, r31
	mullw	r6, r5, r31
	subf	r0, r9, r0		C partial remainder of the q1 step
	slwi	r0, r0, 16
	ori	r0, r0, 65535		C shift in 16 more dividend one-bits
	cmplw	cr7, r0, r6
	bge-	cr7, L(9)
C q1 estimate was too large: correct by at most 2.
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r6, r0
	addi	r31, r31, -1		C q1--
	crorc	28, 28, 25		C combine overflow/compare conditions
	bc+	12, 28, L(9)
	addi	r31, r31, -1		C q1--
	add	r0, r0, r10
L(9):	subf	r0, r6, r0		C remainder after the q1 digit
C Second 16-bit quotient digit, same estimate-and-correct pattern.
	divwu	r6, r0, r11		C q0 estimate (low quotient digit)
	mullw	r9, r11, r6
	mullw	r11, r5, r6
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535
	cmplw	cr7, r0, r11
	bge-	cr7, L(13)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r11, r0
	addi	r6, r6, -1		C q0--
	crorc	28, 28, 25
	bc+	12, 28, L(13)
C	add	r0, r0, r10		C final remainder
	addi	r6, r6, -1		C q0--
L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient

C Adjust di by including d0
C Decrement di while di*d + d0 would overflow past B^2, so the 3/2 estimates
C in the main loop stay within the correctable range.
	mullw	r9, r10, r6		C t0 = LO(di * d1)
	addc	r11, r9, r7		C t0 += d0
	subfe	r0, r1, r1		C r0 = CA - 1 (0 or -1 borrow mask;
					C r1 used only as a dummy same-reg operand)
	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
	addc	r9, r11, r9
	addze.	r0, r0
	blt	cr0, L(17)
L(18):	subfc	r9, r10, r9		C while the sum carried: back off by d1
	addi	r6, r6, -1		C di--
	addme.	r0, r0
	bge+	cr0, L(18)
L(17):

C Register roles in the main loop:
C   r3 = msl (return value)   r4 = fn          r6 = di (inverse limb)
C   r7 = d0                   r8 = qp (biased) r10 = d1
C   r12 = n2 (high partial)   r28 = n1/n0      r29 = up (moving pointer)
C   r30 = remaining count     r31/r0/r5/r9/r11 = scratch
C Each iteration performs one 3/2-limb division step: estimate quotient limb
C q from n2 and di, multiply back and subtract {d1,d0}*q, then apply at most
C one add-back (L(24) test) and at most one extra subtract (L(fix)).
L(loop):
	mullw	r0, r12, r6		C q0 = LO(n2 * di)
	cmpw	cr7, r30, r4		C still inside the integer part?
	addc	r31, r0, r28		C q0 += n1
	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
	adde	r12, r9, r12		C q += n2
	addi	r30, r30, -1
	mullw	r0, r10, r12		C d1 * q
	li	r9, 0			C fraction limbs use an implicit 0
	subf	r0, r0, r28		C n1 -= d1 * q
	addi	r5, r12, 1		C candidate quotient limb = q + 1
	ble-	cr7, L(23)
	lwzu	r9, -4(r29)		C integer part: fetch next dividend limb
L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
	subfc	r28, r7, r9		C n0 -= d0
	subfe	r0, r10, r0		C n1 -= d1
	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
	subfc	r28, r11, r28		C n0 -= t0
	subfe	r12, r12, r0		C n1 -= t1
	cmplw	cr7, r12, r31
	blt+	cr7, L(24)
C Estimate was one too large: add the divisor back and use q instead of q+1.
	addc	r28, r28, r7
	adde	r12, r12, r10
	addi	r5, r5, -1
L(24):	cmplw	cr7, r12, r10
	bge-	cr7, L(fix)		C remainder may still be >= d: fix up
L(bck):	stw	r5, 0(r8)		C store quotient limb, high to low
	addi	r8, r8, -4
	bdnz	L(loop)

L(ret):	stw	r28, 0(r29)		C store 2-limb remainder at up[0..1]
	stw	r12, 4(r29)
	lmw	r28, 8(r1)		C restore r28..r31, pop frame, return msl
	addi	r1, r1, 32
	blr

C Rare path: remainder {n1,n0} >= {d1,d0}; subtract once more and bump the
C quotient limb, then rejoin the store at L(bck).
L(fix):	cmplw	cr6, r28, r7
	bgt+	cr7, L(28)
	blt-	cr6, L(bck)
L(28):	subfc	r28, r7, r28
	subfe	r12, r10, r12
	addi	r5, r5, 1
	b	L(bck)
EPILOGUE()