Blame mpn/generic/div_qr_2.c

Packit 5c3484
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
Packit 5c3484
   quotient.  The divisor is two limbs.
Packit 5c3484
Packit 5c3484
   Contributed to the GNU project by Torbjorn Granlund and Niels Möller
Packit 5c3484
Packit 5c3484
   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Packit 5c3484
Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
/* Limb-count threshold compared against nn in mpn_div_qr_2: the
   mpn_div_qr_2n_pi2 path is taken only when nn is not BELOW_THRESHOLD of
   this value.  The default is the largest limb value, so that path is
   presumably never selected unless a tuned value is supplied.  */
#ifndef DIV_QR_2_PI2_THRESHOLD
Packit 5c3484
/* Disabled unless explicitly tuned. */
Packit 5c3484
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* When non-zero, invert_4by2 verifies the inverse it computed with
   ASSERT_ALWAYS checks.  */
#ifndef SANITY_CHECK
Packit 5c3484
#define SANITY_CHECK 0
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* Define some longlong.h-style macros, but for wider operations.
Packit 5c3484
   * add_sssaaaa is like longlong.h's add_ssaaaa, but additionally propagates
Packit 5c3484
     the carry-out into a third sum operand.
Packit 5c3484
   * add_csaac accepts two addends and a carry in, and generates a sum
Packit 5c3484
     and a carry out.  A little like a "full adder".
Packit 5c3484
*/
Packit 5c3484
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
Packit 5c3484
Packit 5c3484
/* 32-bit x86 versions.
   add_sssaaaa: s0 and s1 are tied to a0 and a1; "add" adds b0 into s0,
   "adc" adds b1 plus the carry into s1, and "adc $0" adds the final carry
   into s2, which is tied to its own incoming value.  */
#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
Packit 5c3484
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
Packit 5c3484
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
Packit 5c3484
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
Packit 5c3484
	   : "0"  ((USItype)(s2)),					\
Packit 5c3484
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
Packit 5c3484
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
Packit 5c3484
/* add_csaac: "bt $0" copies bit 0 of ci into the carry flag, the first
   "adc" forms s = a + b + carry (s is tied to a), and the second "adc"
   doubles co, which is preloaded with zero, plus the carry, so co ends up
   holding just the carry-out.  */
#define add_csaac(co, s, a, b, ci)					\
Packit 5c3484
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0"		\
Packit 5c3484
	   : "=r" (co), "=r" (s)					\
Packit 5c3484
	   : "rm"  ((USItype)(ci)), "0" (CNST_LIMB(0)),			\
Packit 5c3484
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* 64-bit x86 versions.  Same instruction sequences as the 32-bit variants,
   but using the 64-bit register names (%q operand modifier) and UDItype
   operands.
   add_sssaaaa: (s1,s0) = (a1,a0) + (b1,b0), with the carry-out added into
   s2.  */
#if defined (__amd64__) && W_TYPE_SIZE == 64
Packit 5c3484
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
Packit 5c3484
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
Packit 5c3484
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
Packit 5c3484
	   : "0"  ((UDItype)(s2)),					\
Packit 5c3484
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
Packit 5c3484
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
Packit 5c3484
/* add_csaac: bit 0 of ci is moved into the carry flag by "bt $0"; then
   s = a + b + carry, and co (preloaded with zero) receives the carry-out.  */
#define add_csaac(co, s, a, b, ci)					\
Packit 5c3484
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0"		\
Packit 5c3484
	   : "=r" (co), "=r" (s)					\
Packit 5c3484
	   : "rm"  ((UDItype)(ci)), "0" (CNST_LIMB(0)),			\
Packit 5c3484
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.

   add_sssaaaa: (s1,s0) = (a1,a0) + (b1,b0), with the carry-out added into
   s2.  The low limbs are added with a carry-setting add, the high limbs
   with adde, and addze adds the remaining carry to the incoming s2.

   The incoming value of s2 is input operand %3.  Operand %0 is a write-only
   ("=r") output, so its register holds no defined value on entry and must
   not be used as the addze source; the compiler is free to place %0 and %3
   in different registers.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif
Packit 5c3484
Packit 5c3484
#endif /* __GNUC__ */
Packit 5c3484
Packit 5c3484
/* Portable C fallback, used when no assembly version was defined above.
   Computes (s1,s0) = (a1,a0) + (b1,b0) and adds the carry out of that
   two-limb sum into s2 (s2 is updated in place, not overwritten).

   a0, a1, b0 and b1 are all read into temporaries before s0 or s1 is
   stored, so s1/s0 may be the same lvalues as a1/a0, which is how
   udiv_qr_4by2 uses it.  Arguments are evaluated more than once.  */
#ifndef add_sssaaaa
Packit 5c3484
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
Packit 5c3484
  do {									\
Packit 5c3484
    UWtype __s0, __s1, __c0, __c1;					\
Packit 5c3484
    __s0 = (a0) + (b0);							\
Packit 5c3484
    __s1 = (a1) + (b1);							\
Packit 5c3484
    __c0 = __s0 < (a0);							\
Packit 5c3484
    __c1 = __s1 < (a1);							\
Packit 5c3484
    (s0) = __s0;							\
Packit 5c3484
    __s1 = __s1 + __c0;							\
Packit 5c3484
    (s1) = __s1;							\
Packit 5c3484
    (s2) += __c1 + (__s1 < __c0);					\
Packit 5c3484
  } while (0)
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* Portable C fallback, used when no assembly version was defined above.
   Computes s = a + b + ci and stores the carry-out in co.  The carry-out
   is the sum of the carry from a + b and the carry from adding ci.

   ci is read again after s has been stored, so s must not be the same
   lvalue as ci; co may be the same lvalue as ci, since ci is read before co
   is assigned.  ci is presumably expected to be 0 or 1 -- the x86 assembly
   versions only look at its lowest bit.  */
#ifndef add_csaac
Packit 5c3484
#define add_csaac(co, s, a, b, ci)					\
Packit 5c3484
  do {									\
Packit 5c3484
    UWtype __s, __c;							\
Packit 5c3484
    __s = (a) + (b);							\
Packit 5c3484
    __c = __s < (a);							\
Packit 5c3484
    __s = __s + (ci);							\
Packit 5c3484
    (s) = __s;								\
Packit 5c3484
    (co) = __c + (__s < (ci));						\
Packit 5c3484
  } while (0)
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* Divide the four-limb number <n3,n2,n1,n0> by the two-limb divisor
   <d1,d0>, using the two-limb inverse <di1,di0> (as computed by
   invert_4by2).  The two quotient limbs are stored in (q1,q0) and the
   two-limb remainder in (r1,r0).

   Outline: a candidate quotient is formed from the product
   <n3,n2> * <di1,di0> plus the numerator, a candidate remainder is formed
   from the low limbs of n - q * d, and up to two corrections (one
   branch-free via _mask, one unlikely conditional) fix both up.

   Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
Packit 5c3484
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
Packit 5c3484
  do {									\
Packit 5c3484
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
Packit 5c3484
    mp_limb_t _t1, _t0;							\
Packit 5c3484
    mp_limb_t _c, _mask;						\
Packit 5c3484
									\
Packit 5c3484
    umul_ppmm (_q3,_q2a, n3, di1);					\
Packit 5c3484
    umul_ppmm (_q2,_q1, n2, di1);					\
Packit 5c3484
    umul_ppmm (_q2c,_q1c, n3, di0);					\
Packit 5c3484
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c);			\
Packit 5c3484
    umul_ppmm (_q1d,_q0, n2, di0);					\
Packit 5c3484
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d);			\
Packit 5c3484
									\
Packit 5c3484
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1));		\
Packit 5c3484
									\
Packit 5c3484
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + B^2; <r1,r0> is <n3,n2> + 1 */	\
Packit 5c3484
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0));				\
Packit 5c3484
    add_csaac (_c, _q1, _q1, n1, _c);					\
Packit 5c3484
    add_csaac (_c, _q2, _q2, r0, _c);					\
Packit 5c3484
    _q3 = _q3 + r1 + _c;						\
Packit 5c3484
									\
Packit 5c3484
    umul_ppmm (_t1,_t0, _q2, d0);					\
Packit 5c3484
    _t1 += _q2 * d1 + _q3 * d0;						\
Packit 5c3484
									\
Packit 5c3484
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
Packit 5c3484
									\
Packit 5c3484
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
Packit 5c3484
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
Packit 5c3484
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
Packit 5c3484
									\
Packit 5c3484
    if (UNLIKELY (r1 >= d1))						\
Packit 5c3484
      {									\
Packit 5c3484
	if (r1 > d1 || r0 >= d0)					\
Packit 5c3484
	  {								\
Packit 5c3484
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
Packit 5c3484
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
Packit 5c3484
	  }								\
Packit 5c3484
      }									\
Packit 5c3484
    (q1) = _q3;								\
Packit 5c3484
    (q0) = _q2;								\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
/* Compute the two-limb ("4/2") inverse of the divisor <d1, d0> and store
   it in di[0] (low limb, v0) and di[1] (high limb, v1).  Following the
   derivation in the body, <1, v1, v0> = (B^4 - 1) / <d1, d0>, where B is
   the limb base.

   The only caller in this file passes a d1 with its high bit set;
   presumably invert_limb and udiv_qr_3by2 rely on that -- confirm against
   gmp-impl.h.  When SANITY_CHECK is non-zero the result is verified by
   multiplying it back with the divisor.  */
static void
Packit 5c3484
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
Packit 5c3484
  /* Start from the inverse of d1 alone; the steps below correct v1 for
     the contribution of d0.  */
  invert_limb (v1, d1);
Packit 5c3484
  p1 = d1 * v1;
Packit 5c3484
  /* <1, v1> * d1 = <B-1, p1> */
Packit 5c3484
  p1 += d0;
Packit 5c3484
  if (p1 < d0)
Packit 5c3484
    {
Packit 5c3484
      /* Adding d0 wrapped around: decrement v1 once, and a second time
	 (via mask) when p1 >= d1, subtracting d1 from p1 each time.  */
      v1--;
Packit 5c3484
      mask = -(mp_limb_t) (p1 >= d1);
Packit 5c3484
      p1 -= d1;
Packit 5c3484
      v1 += mask;
Packit 5c3484
      p1 -= mask & d1;
Packit 5c3484
    }
Packit 5c3484
  /* <1, v1> * d1 + d0 = <B-1, p1> */
Packit 5c3484
  umul_ppmm (t1, p0, d0, v1);
Packit 5c3484
  p1 += t1;
Packit 5c3484
  if (p1 < t1)
Packit 5c3484
    {
Packit 5c3484
      /* Adding the high limb of d0 * v1 wrapped around: v1 is one or two
	 too large, so subtract the divisor from <p1, p0> accordingly.  */
      if (UNLIKELY (p1 >= d1))
Packit 5c3484
	{
Packit 5c3484
	  if (p1 > d1 || p0 >= d0)
Packit 5c3484
	    {
Packit 5c3484
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
Packit 5c3484
	      v1--;
Packit 5c3484
	    }
Packit 5c3484
	}
Packit 5c3484
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
Packit 5c3484
      v1--;
Packit 5c3484
    }
Packit 5c3484
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
Packit 5c3484
   * with <p1, p0> + <d1, d0> >= B^2.
Packit 5c3484
   *
Packit 5c3484
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
Packit 5c3484
   * partial remainder after <1, v1> is
Packit 5c3484
   *
Packit 5c3484
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
Packit 5c3484
   *                              = <~p1, ~p0, B-1>
Packit 5c3484
   */
Packit 5c3484
  /* The low inverse limb v0 is the quotient of that partial remainder; the
     remainder <t1, t0> of this division is not used afterwards.  */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
Packit 5c3484
  di[0] = v0;
Packit 5c3484
  di[1] = v1;
Packit 5c3484
Packit 5c3484
#if SANITY_CHECK
Packit 5c3484
  /* Self-check: <v1, v0> * d + B^2 * d, i.e. <1, v1, v0> * d, must have
     both of its top limbs equal to B-1 without overflowing, and adding d
     once more to the low two limbs must carry.  */
  {
Packit 5c3484
    mp_limb_t tp[4];
Packit 5c3484
    mp_limb_t dp[2];
Packit 5c3484
    dp[0] = d0;
Packit 5c3484
    dp[1] = d1;
Packit 5c3484
    mpn_mul_n (tp, dp, di, 2);
Packit 5c3484
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
Packit 5c3484
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
Packit 5c3484
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
Packit 5c3484
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
Packit 5c3484
  }
Packit 5c3484
#endif
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
Packit 5c3484
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t qh;
Packit 5c3484
  mp_size_t i;
Packit 5c3484
  mp_limb_t r1, r0;
Packit 5c3484
Packit 5c3484
  ASSERT (nn >= 2);
Packit 5c3484
  ASSERT (d1 & GMP_NUMB_HIGHBIT);
Packit 5c3484
Packit 5c3484
  r1 = np[nn-1];
Packit 5c3484
  r0 = np[nn-2];
Packit 5c3484
Packit 5c3484
  qh = 0;
Packit 5c3484
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
Packit 5c3484
    {
Packit 5c3484
#if GMP_NAIL_BITS == 0
Packit 5c3484
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
Packit 5c3484
#else
Packit 5c3484
      r0 = r0 - d0;
Packit 5c3484
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
Packit 5c3484
      r0 &= GMP_NUMB_MASK;
Packit 5c3484
#endif
Packit 5c3484
      qh = 1;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  for (i = nn - 2; i >= 2; i -= 2)
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t n1, n0, q1, q0;
Packit 5c3484
      n1 = np[i-1];
Packit 5c3484
      n0 = np[i-2];
Packit 5c3484
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
Packit 5c3484
      qp[i-1] = q1;
Packit 5c3484
      qp[i-2] = q0;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  if (i > 0)
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t q;
Packit 5c3484
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
Packit 5c3484
      qp[0] = q;
Packit 5c3484
    }
Packit 5c3484
  rp[1] = r1;
Packit 5c3484
  rp[0] = r0;
Packit 5c3484
Packit 5c3484
  return qh;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
Packit 5c3484
/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
Packit 5c3484
   significant quotient limbs at qp and the 2 long remainder at np.
Packit 5c3484
   Return the most significant limb of the quotient.
Packit 5c3484
Packit 5c3484
   Preconditions:
Packit 5c3484
   1. qp must either not overlap with the input operands at all, or
Packit 5c3484
      qp >= np + 2 must hold true.  (This means that it's possible to put
Packit 5c3484
      the quotient in the high part of {np,nn}, right above the remainder.
Packit 5c3484
   2. nn >= 2.  */
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
Packit 5c3484
	      mp_srcptr dp)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t d1;
Packit 5c3484
  mp_limb_t d0;
Packit 5c3484
  gmp_pi1_t dinv;
Packit 5c3484
Packit 5c3484
  ASSERT (nn >= 2);
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
Packit 5c3484
  ASSERT_MPN (np, nn);
Packit 5c3484
  ASSERT_MPN (dp, 2);
Packit 5c3484
Packit 5c3484
  d1 = dp[1]; d0 = dp[0];
Packit 5c3484
Packit 5c3484
  ASSERT (d1 > 0);
Packit 5c3484
Packit 5c3484
  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
Packit 5c3484
    {
Packit 5c3484
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
Packit 5c3484
	{
Packit 5c3484
	  gmp_pi1_t dinv;
Packit 5c3484
	  invert_pi1 (dinv, d1, d0);
Packit 5c3484
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  mp_limb_t di[2];
Packit 5c3484
	  invert_4by2 (di, d1, d0);
Packit 5c3484
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      int shift;
Packit 5c3484
      count_leading_zeros (shift, d1);
Packit 5c3484
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
Packit 5c3484
      d0 <<= shift;
Packit 5c3484
      invert_pi1 (dinv, d1, d0);
Packit 5c3484
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
Packit 5c3484
    }
Packit 5c3484
}