Blame mpn/generic/div_qr_1n_pi2.c

Packit 5c3484
/* mpn_div_qr_1n_pi2.
Packit 5c3484
Packit 5c3484
   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
Packit 5c3484
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2013 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
*/
Packit 5c3484
/* Inline-assembly implementations of add_sssaaaa, used only with
   GCC-compatible compilers (not the Intel compiler) and only when NO_ASM is
   not defined.

   add_sssaaaa (s2, s1, s0, a1, a0, b1, b0) adds the two-word values {a1,a0}
   and {b1,b0}, stores the two-word sum in {s1,s0}, and adds the carry out of
   the high word into s2.  s2 is therefore both an input and an output.  */
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
/* 32-bit x86: add the low words, adc the high words, then adc $0 to fold the
   final carry into s2.  The matching constraints "0", "1" and "2" preload
   the output registers with s2, a1 and a0.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
/* 64-bit x86: same add/adc/adc chain as above on 64-bit registers.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.

   The incoming s2 is input operand %3, so the final addze must read %3.
   Operand %0 is declared output-only ("=r"); reading it in the template
   would pick up whatever happens to be in the register chosen for it.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */
Packit 5c3484
Packit 5c3484
/* Portable C fallback, used when no assembly version of add_sssaaaa was
   defined above.

   Adds the two-word values {a1,a0} and {b1,b0}, stores the two-word sum in
   {s1,s0}, and adds the carry out of the high word into s2 (s2 is read and
   updated).  All sums are formed in temporaries before any output is
   written, so outputs may name the same variables as inputs.  Note that a0
   and a1 are each evaluated twice, so arguments must be free of side
   effects.  */
#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);		/* carry out of the low word */		\
    __c1 = __s1 < (a1);		/* carry out of a1 + b1 */		\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    /* The high word can carry either from a1 + b1 or from adding	\
       the low carry, never from both.  */				\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
Packit 5c3484
Packit 5c3484
struct precomp_div_1_pi2
Packit 5c3484
{
Packit 5c3484
  mp_limb_t dip[2];
Packit 5c3484
  mp_limb_t d;
Packit 5c3484
  int norm_cnt;
Packit 5c3484
};
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_div_qr_1n_pi2 (mp_ptr qp,
Packit 5c3484
		   mp_srcptr up, mp_size_t un,
Packit 5c3484
		   struct precomp_div_1_pi2 *pd)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t most_significant_q_limb;
Packit 5c3484
  mp_size_t i;
Packit 5c3484
  mp_limb_t r, u2, u1, u0;
Packit 5c3484
  mp_limb_t d0, di1, di0;
Packit 5c3484
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
Packit 5c3484
  mp_limb_t cnd;
Packit 5c3484
Packit 5c3484
  ASSERT (un >= 2);
Packit 5c3484
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
Packit 5c3484
  ASSERT_MPN (up, un);
Packit 5c3484
Packit 5c3484
#define q3 q3a
Packit 5c3484
#define q2 q2b
Packit 5c3484
#define q1 q1b
Packit 5c3484
Packit 5c3484
  up += un - 3;
Packit 5c3484
  r = up[2];
Packit 5c3484
  d0 = pd->d;
Packit 5c3484
Packit 5c3484
  most_significant_q_limb = (r >= d0);
Packit 5c3484
  r -= d0 & -most_significant_q_limb;
Packit 5c3484
Packit 5c3484
  qp += un - 3;
Packit 5c3484
  qp[2] = most_significant_q_limb;
Packit 5c3484
Packit 5c3484
  di1 = pd->dip[1];
Packit 5c3484
  di0 = pd->dip[0];
Packit 5c3484
Packit 5c3484
  for (i = un - 3; i >= 0; i -= 2)
Packit 5c3484
    {
Packit 5c3484
      u2 = r;
Packit 5c3484
      u1 = up[1];
Packit 5c3484
      u0 = up[0];
Packit 5c3484
Packit 5c3484
      /* Dividend in {r,u1,u0} */
Packit 5c3484
Packit 5c3484
      umul_ppmm (q1d,q0d, u1, di0);
Packit 5c3484
      umul_ppmm (q2b,q1b, u1, di1);
Packit 5c3484
      q2b++;				/* cannot spill */
Packit 5c3484
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
Packit 5c3484
Packit 5c3484
      umul_ppmm (q2c,q1c, u2,  di0);
Packit 5c3484
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
Packit 5c3484
      umul_ppmm (q3a,q2a, u2, di1);
Packit 5c3484
Packit 5c3484
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
Packit 5c3484
Packit 5c3484
      q3 += r;
Packit 5c3484
Packit 5c3484
      r = u0 - q2 * d0;
Packit 5c3484
Packit 5c3484
      cnd = (r >= q1);
Packit 5c3484
      r += d0 & -cnd;
Packit 5c3484
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);
Packit 5c3484
Packit 5c3484
      if (UNLIKELY (r >= d0))
Packit 5c3484
	{
Packit 5c3484
	  r -= d0;
Packit 5c3484
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
Packit 5c3484
	}
Packit 5c3484
Packit 5c3484
      qp[0] = q2;
Packit 5c3484
      qp[1] = q3;
Packit 5c3484
Packit 5c3484
      up -= 2;
Packit 5c3484
      qp -= 2;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  if ((un & 1) == 0)
Packit 5c3484
    {
Packit 5c3484
      u2 = r;
Packit 5c3484
      u1 = up[1];
Packit 5c3484
Packit 5c3484
      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
Packit 5c3484
      qp[1] = q3;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  return r;
Packit 5c3484
Packit 5c3484
#undef q3
Packit 5c3484
#undef q2
Packit 5c3484
#undef q1
Packit 5c3484
}