Blame mpn/sparc64/mode1o.c

Packit 5c3484
/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.
Packit 5c3484
Packit 5c3484
   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
Packit 5c3484
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
Packit 5c3484
   FUTURE GNU MP RELEASES.
Packit 5c3484
Packit 5c3484
Copyright 2000-2003 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
#include "mpn/sparc64/sparc64.h"
Packit 5c3484
Packit 5c3484
Packit 5c3484
/*                 64-bit divisor   32-bit divisor
Packit 5c3484
                    cycles/limb      cycles/limb
Packit 5c3484
                     (approx)         (approx)
Packit 5c3484
   Ultrasparc 2i:       ?                ?
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
Packit 5c3484
/* This implementation reduces the number of multiplies done, knowing that
Packit 5c3484
   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.
Packit 5c3484
Packit 5c3484
   The key idea is to use the fact that the low limb of q*d equals l, this
Packit 5c3484
   being the whole purpose of the q calculated.  It means there's no need to
Packit 5c3484
   calculate the lowest 32x32->64 part of the q*d, instead it can be
Packit 5c3484
   inferred from l and the other three 32x32->64 parts.  See sparc64.h for
Packit 5c3484
   details.
Packit 5c3484
Packit 5c3484
   When d is 32-bits, the same applies, but in this case there's only one
Packit 5c3484
   other 32x32->64 part (ie. HIGH(q)*d).
Packit 5c3484
Packit 5c3484
   The net effect is that for 64-bit divisor each limb is 4 mulx, or for
Packit 5c3484
   32-bit divisor each is 2 mulx.
Packit 5c3484
Packit 5c3484
   Enhancements:
Packit 5c3484
Packit 5c3484
   No doubt this could be done in assembler, if that helped the scheduling,
Packit 5c3484
   or perhaps guaranteed good code irrespective of the compiler.
Packit 5c3484
Packit 5c3484
   Alternatives:
Packit 5c3484
Packit 5c3484
   It might be possibly to use floating point.  The loop is dominated by
Packit 5c3484
   multiply latency, so not sure if floats would improve that.  One
Packit 5c3484
   possibility would be to take two limbs at a time, with a 128 bit inverse,
Packit 5c3484
   if there's enough registers, which could effectively use float throughput
Packit 5c3484
   to reduce total latency across two limbs.  */
Packit 5c3484
Packit 5c3484
#define ASSERT_RETVAL(r)                \
Packit 5c3484
  ASSERT (orig_c < d ? r < d : r <= d)
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t  c = orig_c;
Packit 5c3484
  mp_limb_t  s, l, q, h, inverse;
Packit 5c3484
Packit 5c3484
  ASSERT (size >= 1);
Packit 5c3484
  ASSERT (d & 1);
Packit 5c3484
  ASSERT_MPN (src, size);
Packit 5c3484
  ASSERT_LIMB (d);
Packit 5c3484
  ASSERT_LIMB (c);
Packit 5c3484
Packit 5c3484
  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
Packit 5c3484
  if (size == 1)
Packit 5c3484
    {
Packit 5c3484
      s = src[0];
Packit 5c3484
      if (s > c)
Packit 5c3484
	{
Packit 5c3484
	  l = s-c;
Packit 5c3484
	  h = l % d;
Packit 5c3484
	  if (h != 0)
Packit 5c3484
	    h = d - h;
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  l = c-s;
Packit 5c3484
	  h = l % d;
Packit 5c3484
	}
Packit 5c3484
      return h;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  binvert_limb (inverse, d);
Packit 5c3484
Packit 5c3484
  if (d <= 0xFFFFFFFF)
Packit 5c3484
    {
Packit 5c3484
      s = *src++;
Packit 5c3484
      size--;
Packit 5c3484
      do
Packit 5c3484
        {
Packit 5c3484
          SUBC_LIMB (c, l, s, c);
Packit 5c3484
          s = *src++;
Packit 5c3484
          q = l * inverse;
Packit 5c3484
          umul_ppmm_half_lowequal (h, q, d, l);
Packit 5c3484
          c += h;
Packit 5c3484
          size--;
Packit 5c3484
        }
Packit 5c3484
      while (size != 0);
Packit 5c3484
Packit 5c3484
      if (s <= d)
Packit 5c3484
        {
Packit 5c3484
          /* With high s <= d the final step can be a subtract and addback.
Packit 5c3484
             If c==0 then the addback will restore to l>=0.  If c==d then
Packit 5c3484
             will get l==d if s==0, but that's ok per the function
Packit 5c3484
             definition.  */
Packit 5c3484
Packit 5c3484
          l = c - s;
Packit 5c3484
          l += (l > c ? d : 0);
Packit 5c3484
Packit 5c3484
          ASSERT_RETVAL (l);
Packit 5c3484
          return l;
Packit 5c3484
        }
Packit 5c3484
      else
Packit 5c3484
        {
Packit 5c3484
          /* Can't skip a divide, just do the loop code once more. */
Packit 5c3484
          SUBC_LIMB (c, l, s, c);
Packit 5c3484
          q = l * inverse;
Packit 5c3484
          umul_ppmm_half_lowequal (h, q, d, l);
Packit 5c3484
          c += h;
Packit 5c3484
Packit 5c3484
          ASSERT_RETVAL (c);
Packit 5c3484
          return c;
Packit 5c3484
        }
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t  dl = LOW32 (d);
Packit 5c3484
      mp_limb_t  dh = HIGH32 (d);
Packit 5c3484
      long i;
Packit 5c3484
Packit 5c3484
      s = *src++;
Packit 5c3484
      size--;
Packit 5c3484
      do
Packit 5c3484
        {
Packit 5c3484
          SUBC_LIMB (c, l, s, c);
Packit 5c3484
          s = *src++;
Packit 5c3484
          q = l * inverse;
Packit 5c3484
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
Packit 5c3484
          c += h;
Packit 5c3484
          size--;
Packit 5c3484
        }
Packit 5c3484
      while (size != 0);
Packit 5c3484
Packit 5c3484
      if (s <= d)
Packit 5c3484
        {
Packit 5c3484
          /* With high s <= d the final step can be a subtract and addback.
Packit 5c3484
             If c==0 then the addback will restore to l>=0.  If c==d then
Packit 5c3484
             will get l==d if s==0, but that's ok per the function
Packit 5c3484
             definition.  */
Packit 5c3484
Packit 5c3484
          l = c - s;
Packit 5c3484
          l += (l > c ? d : 0);
Packit 5c3484
Packit 5c3484
          ASSERT_RETVAL (l);
Packit 5c3484
          return l;
Packit 5c3484
        }
Packit 5c3484
      else
Packit 5c3484
        {
Packit 5c3484
          /* Can't skip a divide, just do the loop code once more. */
Packit 5c3484
          SUBC_LIMB (c, l, s, c);
Packit 5c3484
          q = l * inverse;
Packit 5c3484
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
Packit 5c3484
          c += h;
Packit 5c3484
Packit 5c3484
          ASSERT_RETVAL (c);
Packit 5c3484
          return c;
Packit 5c3484
        }
Packit 5c3484
    }
Packit 5c3484
}