Blame mpn/cray/ieee/submul_1.c

Packit 5c3484
/* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and
   subtract the result from a second limb vector.

Copyright 2000-2002 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
/* This code runs at just under 9 cycles/limb on a T90.  That is not perfect,
   mainly due to vector register shortage in the main loop.  Assembly code
   should bring it down to perhaps 7 cycles/limb.  */
Packit 5c3484
Packit 5c3484
#include <intrinsics.h>
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t cy[n];
Packit 5c3484
  mp_limb_t a, b, r, s0, s1, c0, c1;
Packit 5c3484
  mp_size_t i;
Packit 5c3484
  int more_carries;
Packit 5c3484
Packit 5c3484
  if (up == rp)
Packit 5c3484
    {
Packit 5c3484
      /* The algorithm used below cannot handle overlap.  Handle it here by
Packit 5c3484
	 making a temporary copy of the source vector, then call ourselves.  */
Packit 5c3484
      mp_limb_t xp[n];
Packit 5c3484
      MPN_COPY (xp, up, n);
Packit 5c3484
      return mpn_submul_1 (rp, xp, n, vl);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  a = up[0] * vl;
Packit 5c3484
  r = rp[0];
Packit 5c3484
  s0 = r - a;
Packit 5c3484
  rp[0] = s0;
Packit 5c3484
  c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63;
Packit 5c3484
  cy[0] = c1;
Packit 5c3484
Packit 5c3484
  /* Main multiply loop.  Generate a raw accumulated output product in rp[]
Packit 5c3484
     and a carry vector in cy[].  */
Packit 5c3484
#pragma _CRI ivdep
Packit 5c3484
  for (i = 1; i < n; i++)
Packit 5c3484
    {
Packit 5c3484
      a = up[i] * vl;
Packit 5c3484
      b = _int_mult_upper (up[i - 1], vl);
Packit 5c3484
      s0 = a + b;
Packit 5c3484
      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
Packit 5c3484
      r = rp[i];
Packit 5c3484
      s1 = r - s0;
Packit 5c3484
      rp[i] = s1;
Packit 5c3484
      c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63;
Packit 5c3484
      cy[i] = c0 + c1;
Packit 5c3484
    }
Packit 5c3484
  /* Carry subtract loop.  Subtract the carry vector cy[] from the raw result
Packit 5c3484
     rp[] and store the new result back to rp[].  */
Packit 5c3484
  more_carries = 0;
Packit 5c3484
#pragma _CRI ivdep
Packit 5c3484
  for (i = 1; i < n; i++)
Packit 5c3484
    {
Packit 5c3484
      r = rp[i];
Packit 5c3484
      c0 = cy[i - 1];
Packit 5c3484
      s0 = r - c0;
Packit 5c3484
      rp[i] = s0;
Packit 5c3484
      c0 = (s0 & ~r) >> 63;
Packit 5c3484
      more_carries += c0;
Packit 5c3484
    }
Packit 5c3484
  /* If that second loop generated carry, handle that in scalar loop.  */
Packit 5c3484
  if (more_carries)
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t cyrec = 0;
Packit 5c3484
      /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or
Packit 5c3484
	 rp[k] == ~1 and cy[k-1] == 2.
Packit 5c3484
	 These are where we got a recurrency carry.  */
Packit 5c3484
      for (i = 1; i < n; i++)
Packit 5c3484
	{
Packit 5c3484
	  r = rp[i];
Packit 5c3484
	  c0 = ~r < cy[i - 1];
Packit 5c3484
	  s0 = r - cyrec;
Packit 5c3484
	  rp[i] = s0;
Packit 5c3484
	  c1 = (s0 & ~r) >> 63;
Packit 5c3484
	  cyrec = c0 | c1;
Packit 5c3484
	}
Packit 5c3484
      return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
Packit 5c3484
}