Blame mpn/generic/matrix22_mul.c

Packit 5c3484
/* matrix22_mul.c.
Packit 5c3484
Packit 5c3484
   Contributed by Niels Möller and Marco Bodrato.
Packit 5c3484
Packit 5c3484
   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
#define MUL(rp, ap, an, bp, bn) do {		\
Packit 5c3484
  if (an >= bn)					\
Packit 5c3484
    mpn_mul (rp, ap, an, bp, bn);		\
Packit 5c3484
  else						\
Packit 5c3484
    mpn_mul (rp, bp, bn, ap, an);		\
Packit 5c3484
} while (0)
Packit 5c3484
Packit 5c3484
/* Inputs are unsigned. */
Packit 5c3484
static int
Packit 5c3484
abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
Packit 5c3484
{
Packit 5c3484
  int c;
Packit 5c3484
  MPN_CMP (c, ap, bp, n);
Packit 5c3484
  if (c >= 0)
Packit 5c3484
    {
Packit 5c3484
      mpn_sub_n (rp, ap, bp, n);
Packit 5c3484
      return 0;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      mpn_sub_n (rp, bp, ap, n);
Packit 5c3484
      return 1;
Packit 5c3484
    }
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static int
Packit 5c3484
add_signed_n (mp_ptr rp,
Packit 5c3484
	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
Packit 5c3484
{
Packit 5c3484
  if (as != bs)
Packit 5c3484
    return as ^ abs_sub_n (rp, ap, bp, n);
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
Packit 5c3484
      return as;
Packit 5c3484
    }
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
mp_size_t
Packit 5c3484
mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
Packit 5c3484
{
Packit 5c3484
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
Packit 5c3484
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
Packit 5c3484
    return 3*rn + 2*mn;
Packit 5c3484
  else
Packit 5c3484
    return 3*(rn + mn) + 5;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/* Algorithm:
Packit 5c3484
Packit 5c3484
    / s0 \   /  1  0  0  0 \ / r0 \
Packit 5c3484
    | s1 |   |  0  1  0  1 | | r1 |
Packit 5c3484
    | s2 |   |  0  0 -1  1 | | r2 |
Packit 5c3484
    | s3 | = |  0  1 -1  1 | \ r3 /
Packit 5c3484
    | s4 |   | -1  1 -1  1 |
Packit 5c3484
    | s5 |   |  0  1  0  0 |
Packit 5c3484
    \ s6 /   \  0  0  1  0 /
Packit 5c3484
Packit 5c3484
    / t0 \   /  1  0  0  0 \ / m0 \
Packit 5c3484
    | t1 |   |  0  1  0  1 | | m1 |
Packit 5c3484
    | t2 |   |  0  0 -1  1 | | m2 |
Packit 5c3484
    | t3 | = |  0  1 -1  1 | \ m3 /
Packit 5c3484
    | t4 |   | -1  1 -1  1 |
Packit 5c3484
    | t5 |   |  0  1  0  0 |
Packit 5c3484
    \ t6 /   \  0  0  1  0 /
Packit 5c3484
Packit 5c3484
  Note: the two matrices above are the same, but s_i and t_i are used
Packit 5c3484
  in the same product, only for i<4, see "A Strassen-like Matrix
Packit 5c3484
  Multiplication suited for squaring and higher power computation" by
Packit 5c3484
  M. Bodrato, in Proceedings of ISSAC 2010.
Packit 5c3484
Packit 5c3484
    / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
Packit 5c3484
    | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
Packit 5c3484
    | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
Packit 5c3484
    \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
Packit 5c3484
				    | s4*t5 |
Packit 5c3484
				    | s5*t6 |
Packit 5c3484
				    \ s6*t4 /
Packit 5c3484
Packit 5c3484
  The scheduling uses two temporaries U0 and U1 to store products, and
Packit 5c3484
  two, S0 and T0, to store combinations of entries of the two
Packit 5c3484
  operands.
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
Packit 5c3484
 *
Packit 5c3484
 * Resulting elements are of size up to rn + mn + 1.
Packit 5c3484
 *
Packit 5c3484
 * Temporary storage: 3 rn + 3 mn + 5. */
Packit 5c3484
void
Packit 5c3484
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
Packit 5c3484
			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
Packit 5c3484
			   mp_ptr tp)
Packit 5c3484
{
Packit 5c3484
  mp_ptr s0, t0, u0, u1;
Packit 5c3484
  int r1s, r3s, s0s, t0s, u1s;
Packit 5c3484
  s0 = tp; tp += rn + 1;
Packit 5c3484
  t0 = tp; tp += mn + 1;
Packit 5c3484
  u0 = tp; tp += rn + mn + 1;
Packit 5c3484
  u1 = tp; /* rn + mn + 2 */
Packit 5c3484
Packit 5c3484
  MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
Packit 5c3484
  r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
Packit 5c3484
  if (r3s)
Packit 5c3484
    {
Packit 5c3484
      r1s = abs_sub_n (r1, r1, r3, rn);
Packit 5c3484
      r1[rn] = 0;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      r1[rn] = mpn_add_n (r1, r1, r3, rn);
Packit 5c3484
      r1s = 0;				/* r1 - r2 + r3  */
Packit 5c3484
    }
Packit 5c3484
  if (r1s)
Packit 5c3484
    {
Packit 5c3484
      s0[rn] = mpn_add_n (s0, r1, r0, rn);
Packit 5c3484
      s0s = 0;
Packit 5c3484
    }
Packit 5c3484
  else if (r1[rn] != 0)
Packit 5c3484
    {
Packit 5c3484
      s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
Packit 5c3484
      s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
Packit 5c3484
					/* Reverse sign! */
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      s0s = abs_sub_n (s0, r0, r1, rn);
Packit 5c3484
      s0[rn] = 0;
Packit 5c3484
    }
Packit 5c3484
  MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
Packit 5c3484
  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
Packit 5c3484
  ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
Packit 5c3484
Packit 5c3484
  t0s = abs_sub_n (t0, m3, m2, mn);
Packit 5c3484
  u1s = r3s^t0s^1;			/* Reverse sign! */
Packit 5c3484
  MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
Packit 5c3484
  u1[rn+mn] = 0;
Packit 5c3484
  if (t0s)
Packit 5c3484
    {
Packit 5c3484
      t0s = abs_sub_n (t0, m1, t0, mn);
Packit 5c3484
      t0[mn] = 0;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      t0[mn] = mpn_add_n (t0, t0, m1, mn);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
Packit 5c3484
     at r3. I'd expect that for matrices of random size, the high
Packit 5c3484
     words t0[mn] and r1[rn] are non-zero with a pretty small
Packit 5c3484
     probability. If that can be confirmed this should be done as an
Packit 5c3484
     unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
Packit 5c3484
     add_n. */
Packit 5c3484
  if (t0[mn] != 0)
Packit 5c3484
    {
Packit 5c3484
      MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
Packit 5c3484
      ASSERT (r1[rn] < 2);
Packit 5c3484
      if (r1[rn] != 0)
Packit 5c3484
	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      MUL (r3, r1, rn + 1, t0, mn);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  ASSERT (r3[rn+mn] < 4);
Packit 5c3484
Packit 5c3484
  u0[rn+mn] = 0;
Packit 5c3484
  if (r1s^t0s)
Packit 5c3484
    {
Packit 5c3484
      r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
Packit 5c3484
      r3s = 0;				/* u3 + u5 */
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  if (t0s)
Packit 5c3484
    {
Packit 5c3484
      t0[mn] = mpn_add_n (t0, t0, m0, mn);
Packit 5c3484
    }
Packit 5c3484
  else if (t0[mn] != 0)
Packit 5c3484
    {
Packit 5c3484
      t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      t0s = abs_sub_n (t0, t0, m0, mn);
Packit 5c3484
    }
Packit 5c3484
  MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
Packit 5c3484
  ASSERT (u0[rn+mn] < 2);
Packit 5c3484
  if (r1s)
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      r1[rn] += mpn_add_n (r1, r1, r2, rn);
Packit 5c3484
    }
Packit 5c3484
  rn++;
Packit 5c3484
  t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
Packit 5c3484
					/* u3 + u5 + u6 */
Packit 5c3484
  ASSERT (r2[rn+mn-1] < 4);
Packit 5c3484
  r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
Packit 5c3484
					/* -u2 + u3 + u5  */
Packit 5c3484
  ASSERT (r3[rn+mn-1] < 3);
Packit 5c3484
  MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
Packit 5c3484
  ASSERT (u0[rn+mn-1] < 2);
Packit 5c3484
  t0[mn] = mpn_add_n (t0, m3, m1, mn);
Packit 5c3484
  MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
Packit 5c3484
  mn += rn;
Packit 5c3484
  ASSERT (u1[mn-1] < 4);
Packit 5c3484
  ASSERT (u1[mn] == 0);
Packit 5c3484
  ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
Packit 5c3484
					/* -u2 + u3 - u4 + u5  */
Packit 5c3484
  ASSERT (r1[mn-1] < 2);
Packit 5c3484
  if (r3s)
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
Packit 5c3484
					/* u1 + u2 - u3 - u5  */
Packit 5c3484
    }
Packit 5c3484
  ASSERT (r3[mn-1] < 2);
Packit 5c3484
  if (t0s)
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
Packit 5c3484
					/* u1 - u3 - u5 - u6  */
Packit 5c3484
    }
Packit 5c3484
  ASSERT (r2[mn-1] < 2);
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
void
Packit 5c3484
mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
Packit 5c3484
		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
Packit 5c3484
		  mp_ptr tp)
Packit 5c3484
{
Packit 5c3484
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
Packit 5c3484
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
Packit 5c3484
    {
Packit 5c3484
      mp_ptr p0, p1;
Packit 5c3484
      unsigned i;
Packit 5c3484
Packit 5c3484
      /* Temporary storage: 3 rn + 2 mn */
Packit 5c3484
      p0 = tp + rn;
Packit 5c3484
      p1 = p0 + rn + mn;
Packit 5c3484
Packit 5c3484
      for (i = 0; i < 2; i++)
Packit 5c3484
	{
Packit 5c3484
	  MPN_COPY (tp, r0, rn);
Packit 5c3484
Packit 5c3484
	  if (rn >= mn)
Packit 5c3484
	    {
Packit 5c3484
	      mpn_mul (p0, r0, rn, m0, mn);
Packit 5c3484
	      mpn_mul (p1, r1, rn, m3, mn);
Packit 5c3484
	      mpn_mul (r0, r1, rn, m2, mn);
Packit 5c3484
	      mpn_mul (r1, tp, rn, m1, mn);
Packit 5c3484
	    }
Packit 5c3484
	  else
Packit 5c3484
	    {
Packit 5c3484
	      mpn_mul (p0, m0, mn, r0, rn);
Packit 5c3484
	      mpn_mul (p1, m3, mn, r1, rn);
Packit 5c3484
	      mpn_mul (r0, m2, mn, r1, rn);
Packit 5c3484
	      mpn_mul (r1, m1, mn, tp, rn);
Packit 5c3484
	    }
Packit 5c3484
	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
Packit 5c3484
	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
Packit 5c3484
Packit 5c3484
	  r0 = r2; r1 = r3;
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
Packit 5c3484
			       m0, m1, m2, m3, mn, tp);
Packit 5c3484
}