Blame sysdeps/alpha/divq.S

Packit 6c4009
/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include "div_libc.h"
Packit 6c4009
Packit 6c4009
Packit 6c4009
/* 64-bit signed long divide.  These are not normal C functions.  Argument
Packit 6c4009
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
Packit 6c4009
   be clobbered.
Packit 6c4009
Packit 6c4009
   Theory of operation here is that we can use the FPU divider for virtually
Packit 6c4009
   all operands that we see: all dividend values between -2**53 and 2**53-1
Packit 6c4009
   can be computed directly.  Note that divisor values need not be checked
Packit 6c4009
   against that range because the rounded fp value will be close enough such
Packit 6c4009
   that the quotient is < 1, which will properly be truncated to zero when we
Packit 6c4009
   convert back to integer.
Packit 6c4009
Packit 6c4009
   When the dividend is outside the range for which we can compute exact
Packit 6c4009
   results, we use the fp quotient as an estimate from which we begin refining
Packit 6c4009
   an exact integral value.  This reduces the number of iterations in the
Packit 6c4009
   shift-and-subtract loop significantly.
Packit 6c4009
Packit 6c4009
   The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
Packit 6c4009
   for cvttq/c even without /sui being set.  It will not, however, properly
Packit 6c4009
   raise the exception, so we don't have to worry about FPCR_INED being clear
Packit 6c4009
   and so dying by SIGFPE.  */
Packit 6c4009
Packit 6c4009
	/* __divq: entry point and fast path.  The register names X, Y, RV,
	   RA and AT, the FRAME size and the helper macros used below are not
	   defined in this file; presumably they come from div_libc.h.  */
	.text
Packit 6c4009
	.align	4
Packit 6c4009
	.globl	__divq
Packit 6c4009
	.type	__divq, @funcnoplt
Packit 6c4009
	.usepv	__divq, no
Packit 6c4009
Packit 6c4009
	cfi_startproc
Packit 6c4009
	cfi_return_column (RA)
Packit 6c4009
__divq:
Packit 6c4009
	/* Allocate the FRAME-byte save area that every path below uses.  */
	lda	sp, -FRAME(sp)
Packit 6c4009
	cfi_def_cfa_offset (FRAME)
Packit 6c4009
	CALL_MCOUNT
Packit 6c4009
Packit 6c4009
	/* Get the fp divide insn issued as quickly as possible.  After
Packit 6c4009
	   that's done, we have at least 22 cycles until its results are
Packit 6c4009
	   ready -- all the time in the world to figure out how we're
Packit 6c4009
	   going to use the results.  */
Packit 6c4009
	stt	$f0, 0(sp)
Packit 6c4009
	excb
Packit 6c4009
	/* A zero divisor leaves for DIVBYZERO with the frame allocated and
	   only $f0 saved.  The DIVBYZERO code is not visible in this file
	   section; presumably DO_DIVBYZERO at the end of the file emits it.  */
	beq	Y, DIVBYZERO
Packit 6c4009
Packit 6c4009
	/* Save the remaining fp registers used here: $f1 holds the divisor
	   and $f3 holds the saved FPCR.  */
	stt	$f1, 8(sp)
Packit 6c4009
	stt	$f3, 48(sp)
Packit 6c4009
	cfi_rel_offset ($f0, 0)
Packit 6c4009
	cfi_rel_offset ($f1, 8)
Packit 6c4009
	cfi_rel_offset ($f3, 48)
Packit 6c4009
	/* Read the FPCR into $f3; it is written back with mt_fpcr after the
	   cvttq/c on both the fast and the slow path.  */
	mf_fpcr	$f3
Packit 6c4009
Packit 6c4009
	/* Move X into $f0 and Y into $f1, convert both to double and start
	   the divide.  NOTE(review): the 16 and 24 arguments look like stack
	   scratch offsets for the integer-to-fp transfer -- confirm against
	   the _ITOFT2 definition.  */
	_ITOFT2	X, $f0, 16, Y, $f1, 24
Packit 6c4009
	cvtqt	$f0, $f0
Packit 6c4009
	cvtqt	$f1, $f1
Packit 6c4009
	divt/c	$f0, $f1, $f0
Packit 6c4009
Packit 6c4009
	/* Check to see if X fit in the double as an exact value.  */
Packit 6c4009
	/* The sll/sra pair sign-extends the low 53 bits of X, so the cmpeq
	   yields 1 exactly when X is representable as a 53-bit signed value.
	   The divisor in $f1 is not read again, so the caller's $f1 is
	   reloaded in between.  */
	sll	X, (64-53), AT
Packit 6c4009
	ldt	$f1, 8(sp)
Packit 6c4009
	sra	AT, (64-53), AT
Packit 6c4009
	cmpeq	X, AT, AT
Packit 6c4009
	beq	AT, $x_big
Packit 6c4009
Packit 6c4009
	/* If we get here, we're expecting exact results from the division.
Packit 6c4009
	   Do nothing else besides convert and clean up.  */
Packit 6c4009
	cvttq/c	$f0, $f0
Packit 6c4009
	excb
Packit 6c4009
	/* Put back the FPCR value captured on entry.  */
	mt_fpcr	$f3
Packit 6c4009
	_FTOIT	$f0, RV, 16
Packit 6c4009
Packit 6c4009
	ldt	$f0, 0(sp)
Packit 6c4009
	ldt	$f3, 48(sp)
Packit 6c4009
	/* $f1 was already reloaded before the branch to $x_big, so its
	   restore is recorded before cfi_remember_state.  The state that
	   $x_big brings back with cfi_restore_state therefore still has
	   $f0 and $f3 saved but not $f1.  */
	cfi_restore ($f1)
Packit 6c4009
	cfi_remember_state
Packit 6c4009
	cfi_restore ($f0)
Packit 6c4009
	cfi_restore ($f3)
Packit 6c4009
	cfi_def_cfa_offset (0)
Packit 6c4009
	lda	sp, FRAME(sp)
Packit 6c4009
	ret	$31, (RA), 1
Packit 6c4009
Packit 6c4009
	/* Slow path: X did not pass the 53-bit exactness check, so the fp
	   quotient is only an estimate that has to be corrected with integer
	   arithmetic.  */
	.align	4
Packit 6c4009
	cfi_restore_state
Packit 6c4009
$x_big:
Packit 6c4009
	/* If we get here, X is large enough that we don't expect exact
Packit 6c4009
	   results, and neither X nor Y got mis-translated for the fp
Packit 6c4009
	   division.  Our task is to take the fp result, figure out how
Packit 6c4009
	   far it's off from the correct result and compute a fixup.  */
Packit 6c4009
	/* Save the integer temporaries used by the fixup.  t3 and t4 are
	   saved further down, in the slots at 0(sp) and 8(sp).  */
	stq	t0, 16(sp)
Packit 6c4009
	stq	t1, 24(sp)
Packit 6c4009
	stq	t2, 32(sp)
Packit 6c4009
	stq	t5, 40(sp)
Packit 6c4009
	cfi_rel_offset (t0, 16)
Packit 6c4009
	cfi_rel_offset (t1, 24)
Packit 6c4009
	cfi_rel_offset (t2, 32)
Packit 6c4009
	cfi_rel_offset (t5, 40)
Packit 6c4009
Packit 6c4009
	/* Register roles for the fixup code in the rest of this file.  */
#define Q	RV		/* quotient */
Packit 6c4009
#define R	t0		/* remainder */
Packit 6c4009
#define SY	t1		/* scaled Y */
Packit 6c4009
#define S	t2		/* scalar */
Packit 6c4009
#define QY	t3		/* Q*Y */
Packit 6c4009
Packit 6c4009
	/* The fixup code below can only handle unsigned values.  */
Packit 6c4009
	/* X|Y is negative when at least one operand is negative.  t5 is
	   cleared first ($31 reads as zero), so that on the all-non-negative
	   path the later "bne t5, $fix_sign_out" falls through.  */
	or	X, Y, AT
Packit 6c4009
	mov	$31, t5
Packit 6c4009
	blt	AT, $fix_sign_in
Packit 6c4009
$fix_sign_in_ret1:
Packit 6c4009
	/* Truncate the fp estimate to an integer and move it into Q.  */
	cvttq/c	$f0, $f0
Packit 6c4009
Packit 6c4009
	_FTOIT	$f0, Q, 8
Packit 6c4009
	.align	3
Packit 6c4009
$fix_sign_in_ret2:
Packit 6c4009
	/* $f0 is finished with: reload the caller's value and reuse its
	   stack slot to save t3 (QY).  */
	ldt	$f0, 0(sp)
Packit 6c4009
	stq	t3, 0(sp)
Packit 6c4009
	cfi_restore ($f0)
Packit 6c4009
	cfi_rel_offset (t3, 0)
Packit 6c4009
Packit 6c4009
	/* QY = Q * Y.  In between, save t4 in the slot that held $f1 and
	   write the entry FPCR value back.  */
	mulq	Q, Y, QY
Packit 6c4009
	excb
Packit 6c4009
	stq	t4, 8(sp)
Packit 6c4009
	mt_fpcr	$f3
Packit 6c4009
	cfi_rel_offset (t4, 8)
Packit 6c4009
Packit 6c4009
	/* R = Q*Y - X.  A positive value means the estimate is too large.
	   SY and S are preset to Y and 1 for the loop at $q_high.  */
	subq	QY, X, R
Packit 6c4009
	mov	Y, SY
Packit 6c4009
	mov	1, S
Packit 6c4009
	bgt	R, $q_high
Packit 6c4009
Packit 6c4009
$q_high_ret:
Packit 6c4009
	/* R = X - Q*Y.  A positive value means there is remainder left over
	   that $q_low folds into Q.  SY and S are reset to Y and 1.  */
	subq	X, QY, R
Packit 6c4009
	mov	Y, SY
Packit 6c4009
	mov	1, S
Packit 6c4009
	bgt	R, $q_low
Packit 6c4009
Packit 6c4009
$q_low_ret:
Packit 6c4009
	/* Q is final for the unsigned problem.  Restore the temporaries; t5
	   is reloaded only after the sign fixup has read it, and
	   $fix_sign_out uses t4 as scratch before t4 is reloaded.  */
	ldq	t0, 16(sp)
Packit 6c4009
	ldq	t1, 24(sp)
Packit 6c4009
	ldq	t2, 32(sp)
Packit 6c4009
	bne	t5, $fix_sign_out
Packit 6c4009
Packit 6c4009
$fix_sign_out_ret:
Packit 6c4009
	ldq	t3, 0(sp)
Packit 6c4009
	ldq	t4, 8(sp)
Packit 6c4009
	ldq	t5, 40(sp)
Packit 6c4009
	ldt	$f3, 48(sp)
Packit 6c4009
	lda	sp, FRAME(sp)
Packit 6c4009
	/* Remember the in-frame unwind state; the out-of-line code after
	   the ret brings it back with cfi_restore_state.  */
	cfi_remember_state
Packit 6c4009
	cfi_restore (t0)
Packit 6c4009
	cfi_restore (t1)
Packit 6c4009
	cfi_restore (t2)
Packit 6c4009
	cfi_restore (t3)
Packit 6c4009
	cfi_restore (t4)
Packit 6c4009
	cfi_restore (t5)
Packit 6c4009
	cfi_restore ($f3)
Packit 6c4009
	cfi_def_cfa_offset (0)
Packit 6c4009
	ret	$31, (RA), 1
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
	cfi_restore_state
Packit 6c4009
	/* The quotient that we computed was too large.  We need to reduce
Packit 6c4009
	   it by S such that Y*S >= R.  Obviously the closer we get to the
Packit 6c4009
	   correct value the better, but overshooting high is ok, as we'll
Packit 6c4009
	   fix that up later.  */
Packit 6c4009
	/* On entry at $q_high: R = Q*Y - X (positive), SY = Y, S = 1.
	   The loop at 0: doubles SY and S together while SY is below R
	   (unsigned compare).  */
0:
Packit 6c4009
	addq	SY, SY, SY
Packit 6c4009
	addq	S, S, S
Packit 6c4009
$q_high:
Packit 6c4009
	cmpult	SY, R, AT
Packit 6c4009
	bne	AT, 0b
Packit 6c4009
Packit 6c4009
	/* Lower Q by S and Q*Y by SY (the same multiple of Y), then go back
	   to $q_high_ret, which recomputes the remainder from X and QY.  */
	subq	Q, S, Q
Packit 6c4009
	unop
Packit 6c4009
	subq	QY, SY, QY
Packit 6c4009
	br	$q_high_ret
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
	/* The quotient that we computed was too small.  Divide the current
Packit 6c4009
	   remainder (R) by Y and add that to the existing quotient (Q).
Packit 6c4009
	   The expectation, of course, is that R is much smaller than X.  */
Packit 6c4009
	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
Packit 6c4009
	   already have a copy of Y in SY and the value 1 in S.  */
Packit 6c4009
	/* On entry at $q_low: R = X - Q*Y (positive), SY = Y, S = 1.  */
0:
Packit 6c4009
	addq	SY, SY, SY
Packit 6c4009
	addq	S, S, S
Packit 6c4009
$q_low:
Packit 6c4009
	cmpult	SY, R, AT
Packit 6c4009
	bne	AT, 0b
Packit 6c4009
Packit 6c4009
	/* Shift-down and subtract loop.  Each iteration compares our scaled
Packit 6c4009
	   Y (SY) with the remainder (R); if SY <= R then subtract SY from R
Packit 6c4009
	   and add its scalar (S) to the quotient (Q).  */
Packit 6c4009
	/* t3 and t4 hold the candidate Q+S and R-SY; the cmovne pair commits
	   both only when SY <= R.  S is halved before the commit, but t3 was
	   formed from the old S.  The loop ends when S has shifted down to
	   zero.  QY (t3) is overwritten here; the code at $q_low_ret does not
	   read it before reloading t3.  */
2:	addq	Q, S, t3
Packit 6c4009
	srl	S, 1, S
Packit 6c4009
	cmpule	SY, R, AT
Packit 6c4009
	subq	R, SY, t4
Packit 6c4009
Packit 6c4009
	cmovne	AT, t3, Q
Packit 6c4009
	cmovne	AT, t4, R
Packit 6c4009
	srl	SY, 1, SY
Packit 6c4009
	bne	S, 2b
Packit 6c4009
Packit 6c4009
	br	$q_low_ret
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
$fix_sign_in:
Packit 6c4009
	/* If we got here, then X|Y is negative.  Need to adjust everything
Packit 6c4009
	   such that we're doing unsigned division in the fixup loop.  */
Packit 6c4009
	/* T5 records the changes we had to make:
Packit 6c4009
		bit 0:	set if result should be negative.
Packit 6c4009
		bit 2:	set if X was negated.
Packit 6c4009
		bit 3:	set if Y was negated.
Packit 6c4009
	*/
Packit 6c4009
	/* Bit 0 is the sign of X^Y.  AT = (X < 0), and t0 = -X is the
	   candidate replacement for X.  */
	xor	X, Y, AT
Packit 6c4009
	cmplt	AT, 0, t5
Packit 6c4009
	cmplt	X, 0, AT
Packit 6c4009
	negq	X, t0
Packit 6c4009
Packit 6c4009
	/* t5 += 4*AT records bit 2; X becomes -X only if it was negative.
	   Then the same test and candidate negation are set up for Y.  */
	s4addq	AT, t5, t5
Packit 6c4009
	cmovne	AT, t0, X
Packit 6c4009
	cmplt	Y, 0, AT
Packit 6c4009
	negq	Y, t0
Packit 6c4009
Packit 6c4009
	/* t5 += 8*AT records bit 3; Y becomes -Y only if it was negative.
	   If bit 0 is clear the signs matched, so the fp estimate (computed
	   from the original signed operands) is already non-negative and
	   the common conversion at $fix_sign_in_ret1 is used.  */
	s8addq	AT, t5, t5
Packit 6c4009
	cmovne	AT, t0, Y
Packit 6c4009
	unop
Packit 6c4009
	blbc	t5, $fix_sign_in_ret1
Packit 6c4009
Packit 6c4009
	/* Signs differed: the fp estimate is for the signed quotient, so
	   convert it here and negate it so that Q matches the now
	   non-negative X and Y, then rejoin after the common conversion.  */
	cvttq/c	$f0, $f0
Packit 6c4009
	_FTOIT	$f0, Q, 8
Packit 6c4009
	.align	3
Packit 6c4009
	negq	Q, Q
Packit 6c4009
	br	$fix_sign_in_ret2
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
$fix_sign_out:
Packit 6c4009
	/* Now we get to undo what we did above.  */
Packit 6c4009
	/* ??? Is this really faster than just increasing the size of
Packit 6c4009
	   the stack frame and storing X and Y in memory?  */
Packit 6c4009
	/* t4 is free to use as scratch here: it was saved at 8(sp) and is
	   reloaded at $fix_sign_out_ret.  Bit 3 of t5 set: Y was negated on
	   the way in, so negate it back.  */
	and	t5, 8, AT
Packit 6c4009
	negq	Y, t4
Packit 6c4009
	cmovne	AT, t4, Y
Packit 6c4009
Packit 6c4009
	/* Bit 2 of t5 set: X was negated on the way in, so negate it back.  */
	and	t5, 4, AT
Packit 6c4009
	negq	X, t4
Packit 6c4009
	cmovne	AT, t4, X
Packit 6c4009
Packit 6c4009
	/* Bit 0 of t5 set: the result should be negative, so negate RV.  */
	negq	RV, t4
Packit 6c4009
	cmovlbs	t5, t4, RV
Packit 6c4009
Packit 6c4009
	br	$fix_sign_out_ret
Packit 6c4009
Packit 6c4009
	cfi_endproc
Packit 6c4009
	.size	__divq, .-__divq
Packit 6c4009
Packit 6c4009
	/* NOTE(review): presumably expands to the DIVBYZERO code that the
	   "beq Y, DIVBYZERO" at function entry branches to -- confirm in
	   div_libc.h.  */
	DO_DIVBYZERO