Blame mpn/generic/diveby3.c

Packit 5c3484
/* mpn_divexact_by3c -- mpn exact division by 3.
Packit 5c3484
Packit 5c3484
Copyright 2000-2003, 2008 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
#if DIVEXACT_BY3_METHOD == 0
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t r;
Packit 5c3484
  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
Packit 5c3484
Packit 5c3484
  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
Packit 5c3484
     We want to return C.  We compute the remainder mod 4 and notice that the
Packit 5c3484
     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
Packit 5c3484
  return r & 3;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
#if DIVEXACT_BY3_METHOD == 1
Packit 5c3484
Packit 5c3484
/* The algorithm here is basically the same as mpn_divexact_1, as described
Packit 5c3484
   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
Packit 5c3484
   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
Packit 5c3484
   high(divisor*q) can be determined with two comparisons instead of a
Packit 5c3484
   multiply.
Packit 5c3484
Packit 5c3484
   The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
Packit 5c3484
   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
Packit 5c3484
   2.95.2 at least).
Packit 5c3484
Packit 5c3484
   It will be noted that the new c is formed by adding three values each 0
Packit 5c3484
   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
Packit 5c3484
   causes a borrow, that leaves a limb value of either 0xFF...FF or
Packit 5c3484
   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
Packit 5c3484
   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
Packit 5c3484
   respectively, hence a total of no more than 2.
Packit 5c3484
Packit 5c3484
   Alternatives:
Packit 5c3484
Packit 5c3484
   This implementation has each multiply on the dependent chain, due to
Packit 5c3484
   "l=s-c".  See below for alternative code which avoids that.  */
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t  l, q, s;
Packit 5c3484
  mp_size_t  i;
Packit 5c3484
Packit 5c3484
  ASSERT (un >= 1);
Packit 5c3484
  ASSERT (c == 0 || c == 1 || c == 2);
Packit 5c3484
  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
Packit 5c3484
Packit 5c3484
  i = 0;
Packit 5c3484
  do
Packit 5c3484
    {
Packit 5c3484
      s = up[i];
Packit 5c3484
      SUBC_LIMB (c, l, s, c);
Packit 5c3484
Packit 5c3484
      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
Packit 5c3484
      rp[i] = q;
Packit 5c3484
Packit 5c3484
      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
Packit 5c3484
      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
Packit 5c3484
    }
Packit 5c3484
  while (++i < un);
Packit 5c3484
Packit 5c3484
  ASSERT (c == 0 || c == 1 || c == 2);
Packit 5c3484
  return c;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
#if DIVEXACT_BY3_METHOD == 2
Packit 5c3484
Packit 5c3484
/* The following alternative code re-arranges the quotient calculation from
Packit 5c3484
   (src[i]-c)*inverse to instead
Packit 5c3484
Packit 5c3484
       q = src[i]*inverse - c*inverse
Packit 5c3484
Packit 5c3484
   thereby allowing src[i]*inverse to be scheduled back as far as desired,
Packit 5c3484
   making full use of multiplier throughput and leaving just some carry
Packit 5c3484
   handing on the dependent chain.
Packit 5c3484
Packit 5c3484
   The carry handling consists of determining the c for the next iteration.
Packit 5c3484
   This is the same as described above, namely look for any borrow from
Packit 5c3484
   src[i]-c, and at the high of 3*q.
Packit 5c3484
Packit 5c3484
   high(3*q) is done with two comparisons as above (in c2 and c3).  The
Packit 5c3484
   borrow from src[i]-c is incorporated into those by noting that if there's
Packit 5c3484
   a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
Packit 5c3484
   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
Packit 5c3484
   enough to make high(3*q) come out 1 bigger, as required.
Packit 5c3484
Packit 5c3484
   l = -c*inverse is calculated at the same time as c, since for most chips
Packit 5c3484
   it can be more conveniently derived from separate c1/c2/c3 values than
Packit 5c3484
   from a combined c equal to 0, 1 or 2.
Packit 5c3484
Packit 5c3484
   The net effect is that with good pipelining this loop should be able to
Packit 5c3484
   run at perhaps 4 cycles/limb, depending on available execute resources
Packit 5c3484
   etc.
Packit 5c3484
Packit 5c3484
   Usage:
Packit 5c3484
Packit 5c3484
   This code is not used by default, since we really can't rely on the
Packit 5c3484
   compiler generating a good software pipeline, nor on such an approach
Packit 5c3484
   even being worthwhile on all CPUs.
Packit 5c3484
Packit 5c3484
   Itanium is one chip where this algorithm helps though, see
Packit 5c3484
   mpn/ia64/diveby3.asm.  */
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t  s, sm, cl, q, qx, c2, c3;
Packit 5c3484
  mp_size_t  i;
Packit 5c3484
Packit 5c3484
  ASSERT (un >= 1);
Packit 5c3484
  ASSERT (cy == 0 || cy == 1 || cy == 2);
Packit 5c3484
  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
Packit 5c3484
Packit 5c3484
  cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
Packit 5c3484
Packit 5c3484
  for (i = 0; i < un; i++)
Packit 5c3484
    {
Packit 5c3484
      s = up[i];
Packit 5c3484
      sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
Packit 5c3484
Packit 5c3484
      q = (cl + sm) & GMP_NUMB_MASK;
Packit 5c3484
      rp[i] = q;
Packit 5c3484
      qx = q + (s < cy);
Packit 5c3484
Packit 5c3484
      c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
Packit 5c3484
      c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
Packit 5c3484
Packit 5c3484
      cy = c2 + c3;
Packit 5c3484
      cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  return cy;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
#endif