Blame mpn/generic/toom32_mul.c

Packit 5c3484
/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
Packit 5c3484
   times as large as bn.  Or more accurately, bn < an < 3bn.
Packit 5c3484
Packit 5c3484
   Contributed to the GNU project by Torbjorn Granlund.
Packit 5c3484
   Improvements by Marco Bodrato and Niels Möller.
Packit 5c3484
Packit 5c3484
   The idea of applying toom to unbalanced multiplication is due to Marco
Packit 5c3484
   Bodrato and Alberto Zanoni.
Packit 5c3484
Packit 5c3484
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2006-2010 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
/* Evaluate in: -1, 0, +1, +inf
Packit 5c3484
Packit 5c3484
  <-s-><--n--><--n-->
Packit 5c3484
   ___ ______ ______
Packit 5c3484
  |a2_|___a1_|___a0_|
Packit 5c3484
	|_b1_|___b0_|
Packit 5c3484
	<-t--><--n-->
Packit 5c3484
Packit 5c3484
  v0  =  a0         * b0      #   A(0)*B(0)
Packit 5c3484
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
Packit 5c3484
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
Packit 5c3484
  vinf=          a2 *     b1  # A(inf)*B(inf)
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
/* Recursive multiplication used for the balanced point products: expands to
   a single call mpn_mul_n (p, a, b, n).  The `ws' argument is a placeholder
   for scratch space of a future recursive multiplier; it is not used by the
   current expansion.  The do/while (0) wrapper makes the macro behave as one
   statement, so it is safe in unbraced if/else bodies.

   The continuation lines must stay contiguous: any line inserted after a
   trailing backslash becomes part of (and terminates) the macro body.  */
#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
Packit 5c3484
Packit 5c3484
/* mpn_toom32_mul -- store the product of {ap,an} and {bp,bn} at pp, using
   evaluation at the points 0, +1, -1 and infinity.

   pp       product area of an + bn limbs; during the computation it also
	    holds the intermediate values ap1, bp1, am1, bm1 and vm1.
   ap, an   first operand, split as a0, a1 (n limbs each) and a2 (s limbs).
   bp, bn   second operand, split as b0 (n limbs) and b1 (t limbs).
   scratch  work area; 2*n + 1 limbs of it are used here (for v1).

   The sizes must satisfy bn + 2 <= an and an + 6 <= 3*bn (asserted below),
   which ensures s + t >= n.  */
void
Packit 5c3484
mpn_toom32_mul (mp_ptr pp,
Packit 5c3484
		mp_srcptr ap, mp_size_t an,
Packit 5c3484
		mp_srcptr bp, mp_size_t bn,
Packit 5c3484
		mp_ptr scratch)
Packit 5c3484
{
Packit 5c3484
  mp_size_t n, s, t;
Packit 5c3484
  int vm1_neg;
Packit 5c3484
  mp_limb_t cy;
Packit 5c3484
  mp_limb_signed_t hi;
Packit 5c3484
  mp_limb_t ap1_hi, bp1_hi;
Packit 5c3484
Packit 5c3484
#define a0  ap
Packit 5c3484
#define a1  (ap + n)
Packit 5c3484
#define a2  (ap + 2 * n)
Packit 5c3484
#define b0  bp
Packit 5c3484
#define b1  (bp + n)
Packit 5c3484
Packit 5c3484
  /* Required, to ensure that s + t >= n. */
Packit 5c3484
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
Packit 5c3484
Packit 5c3484
  /* Block size n = max (ceil (an/3), ceil (bn/2)); the condition picks
     whichever of the two quotients is the larger one. */
  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
Packit 5c3484
Packit 5c3484
  /* s and t are the sizes of the most significant pieces a2 and b1. */
  s = an - 2 * n;
Packit 5c3484
  t = bn - n;
Packit 5c3484
Packit 5c3484
  ASSERT (0 < s && s <= n);
Packit 5c3484
  ASSERT (0 < t && t <= n);
Packit 5c3484
  ASSERT (s + t >= n);
Packit 5c3484
Packit 5c3484
  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
Packit 5c3484
#define ap1 (pp)		/* n, most significant limb in ap1_hi */
Packit 5c3484
#define bp1 (pp + n)		/* n, most significant bit in bp1_hi */
Packit 5c3484
#define am1 (pp + 2*n)		/* n, most significant bit in hi */
Packit 5c3484
#define bm1 (pp + 3*n)		/* n */
Packit 5c3484
#define v1 (scratch)		/* 2n + 1 */
Packit 5c3484
#define vm1 (pp)		/* 2n + 1 */
Packit 5c3484
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */
Packit 5c3484
Packit 5c3484
  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
Packit 5c3484
Packit 5c3484
  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
Packit 5c3484
Packit 5c3484
  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2.  The sum a0 + a2 is
     formed first (in ap1); if it is smaller than a1, am1 receives the
     absolute value a1 - (a0 + a2) and the sign is recorded in vm1_neg. */
Packit 5c3484
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
Packit 5c3484
#if HAVE_NATIVE_mpn_add_n_sub_n
Packit 5c3484
  /* In this variant the code takes bit 1 of the mpn_add_n_sub_n result as
     the carry of the addition and bit 0 as the borrow of the subtraction. */
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
Packit 5c3484
    {
Packit 5c3484
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
Packit 5c3484
      hi = 0;
Packit 5c3484
      vm1_neg = 1;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
Packit 5c3484
      hi = ap1_hi - (cy & 1);
Packit 5c3484
      ap1_hi += (cy >> 1);
Packit 5c3484
      vm1_neg = 0;
Packit 5c3484
    }
Packit 5c3484
#else
Packit 5c3484
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
Packit 5c3484
      hi = 0;
Packit 5c3484
      vm1_neg = 1;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
Packit 5c3484
      vm1_neg = 0;
Packit 5c3484
    }
Packit 5c3484
  /* am1 is done; now complete ap1 = (a0 + a2) + a1 in place. */
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1.  bm1 is stored as an absolute
     value; when b0 < b1 the sign flag vm1_neg is toggled instead. */
Packit 5c3484
  if (t == n)
Packit 5c3484
    {
Packit 5c3484
#if HAVE_NATIVE_mpn_add_n_sub_n
Packit 5c3484
      if (mpn_cmp (b0, b1, n) < 0)
Packit 5c3484
	{
Packit 5c3484
	  cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
Packit 5c3484
	  vm1_neg ^= 1;
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
Packit 5c3484
	}
Packit 5c3484
      bp1_hi = cy >> 1;
Packit 5c3484
#else
Packit 5c3484
      bp1_hi = mpn_add_n (bp1, b0, b1, n);
Packit 5c3484
Packit 5c3484
      if (mpn_cmp (b0, b1, n) < 0)
Packit 5c3484
	{
Packit 5c3484
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
Packit 5c3484
	  vm1_neg ^= 1;
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
Packit 5c3484
	}
Packit 5c3484
#endif
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
Packit 5c3484
      bp1_hi = mpn_add (bp1, b0, n, b1, t);
Packit 5c3484
Packit 5c3484
      /* b1 has only t < n limbs, so b0 < b1 requires the top n - t limbs
	 of b0 to be zero. */
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
Packit 5c3484
	{
Packit 5c3484
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
Packit 5c3484
	  MPN_ZERO (bm1 + t, n - t);
Packit 5c3484
	  vm1_neg ^= 1;
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* v1 = A(1) * B(1).  The low n limbs of each factor are multiplied
     first; the contributions of the high parts ap1_hi (0, 1 or 2) and
     bp1_hi (0 or 1) are then added at position n, with the resulting top
     limb collected in cy. */
  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
Packit 5c3484
  if (ap1_hi == 1)
Packit 5c3484
    {
Packit 5c3484
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
Packit 5c3484
    }
Packit 5c3484
  else if (ap1_hi == 2)
Packit 5c3484
    {
Packit 5c3484
#if HAVE_NATIVE_mpn_addlsh1_n
Packit 5c3484
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
Packit 5c3484
#else
Packit 5c3484
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
Packit 5c3484
#endif
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    cy = 0;
Packit 5c3484
  if (bp1_hi != 0)
Packit 5c3484
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
Packit 5c3484
  v1[2 * n] = cy;
Packit 5c3484
Packit 5c3484
  /* vm1 = |A(-1) * B(-1)|, overwriting ap1 and bp1 at pp.  Here hi is the
     high limb of am1; bm1 has no high limb, so only bm1 * hi needs to be
     added at position n. */
  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
Packit 5c3484
  if (hi)
Packit 5c3484
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
Packit 5c3484
Packit 5c3484
  vm1[2*n] = hi;
Packit 5c3484
Packit 5c3484
  /* v1 <-- (v1 + vm1) / 2 = x0 + x2.  vm1 holds an absolute value, so it
     is subtracted rather than added when vm1_neg is set. */
Packit 5c3484
  if (vm1_neg)
Packit 5c3484
    {
Packit 5c3484
#if HAVE_NATIVE_mpn_rsh1sub_n
Packit 5c3484
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
Packit 5c3484
#else
Packit 5c3484
      mpn_sub_n (v1, v1, vm1, 2*n+1);
Packit 5c3484
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
Packit 5c3484
#endif
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
#if HAVE_NATIVE_mpn_rsh1add_n
Packit 5c3484
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
Packit 5c3484
#else
Packit 5c3484
      mpn_add_n (v1, v1, vm1, 2*n+1);
Packit 5c3484
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
Packit 5c3484
#endif
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
Packit 5c3484
Packit 5c3484
     y = x1 + x3 + (x0 + x2) * B
Packit 5c3484
       = (x0 + x2) * B + (x0 + x2) - vm1.
Packit 5c3484
Packit 5c3484
     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
Packit 5c3484
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
Packit 5c3484
     (already in place, except for carry propagation).
Packit 5c3484
Packit 5c3484
     We thus add
Packit 5c3484
Packit 5c3484
   B^3  B^2   B    1
Packit 5c3484
    |    |    |    |
Packit 5c3484
   +-----+----+
Packit 5c3484
 + |  x0 + x2 |
Packit 5c3484
   +----+-----+----+
Packit 5c3484
 +      |  x0 + x2 |
Packit 5c3484
	+----------+
Packit 5c3484
 -      |  vm1     |
Packit 5c3484
 --+----++----+----+-
Packit 5c3484
   | y2  | y1 | y0 |
Packit 5c3484
   +-----+----+----+
Packit 5c3484
Packit 5c3484
  Since we store y0 at the same location as the low half of x0 + x2, we
Packit 5c3484
  need to do the middle sum first. */
Packit 5c3484
Packit 5c3484
  /* Save the top limb of vm1: vm1[2*n] is the same location as pp[2*n],
     which the next statement overwrites with the low limb of y1. */
  hi = vm1[2*n];
Packit 5c3484
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
Packit 5c3484
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
Packit 5c3484
Packit 5c3484
  /* FIXME: Can we get rid of this second vm1_neg conditional by
Packit 5c3484
     swapping the location of +1 and -1 values? */
Packit 5c3484
  /* Subtract the signed value of vm1: add its magnitude when the sign flag
     is set, subtract it otherwise. */
  if (vm1_neg)
Packit 5c3484
    {
Packit 5c3484
      cy = mpn_add_n (v1, v1, vm1, n);
Packit 5c3484
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
Packit 5c3484
      MPN_INCR_U (v1 + n, n+1, hi);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      cy = mpn_sub_n (v1, v1, vm1, n);
Packit 5c3484
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
Packit 5c3484
      MPN_DECR_U (v1 + n, n+1, hi);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* v0 = a0 * b0 goes to the low part of pp; vm1, which lived there, is
     not referenced again after this point. */
  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
Packit 5c3484
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
Packit 5c3484
  /* The longer of the two pieces is passed as the first operand. */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
Packit 5c3484
  else        mpn_mul (pp+3*n, b1, t, a2, s);
Packit 5c3484
Packit 5c3484
  /* Remaining interpolation.
Packit 5c3484
Packit 5c3484
     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
Packit 5c3484
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
Packit 5c3484
     = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B
Packit 5c3484
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
Packit 5c3484
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
Packit 5c3484
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
Packit 5c3484
Packit 5c3484
	  B^4       B^3       B^2        B         1
Packit 5c3484
 |         |         |         |         |         |
Packit 5c3484
   +-------+                   +---------+---------+
Packit 5c3484
   |  Hx3  |                   | Hx0-Lx3 |    Lx0  |
Packit 5c3484
   +------+----------+---------+---------+---------+
Packit 5c3484
	  |    y2    |  y1     |   y0    |
Packit 5c3484
	  ++---------+---------+---------+
Packit 5c3484
	  -| Hx0-Lx3 | - Lx0   |
Packit 5c3484
	   +---------+---------+
Packit 5c3484
		      | - Hx3  |
Packit 5c3484
		      +--------+
Packit 5c3484
Packit 5c3484
    We must take into account the carry from Hx0 - Lx3.
Packit 5c3484
  */
Packit 5c3484
Packit 5c3484
  /* pp + n <-- Hx0 - Lx3; cy is the borrow of that difference. */
  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
Packit 5c3484
  /* hi accumulates the signed carry into position 4n, starting from the
     top limb of y (kept at scratch[2*n]). */
  hi = scratch[2*n] + cy;
Packit 5c3484
Packit 5c3484
  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
Packit 5c3484
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
Packit 5c3484
Packit 5c3484
  /* Add y0 (at scratch) at position n, propagating through 3n limbs. */
  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
Packit 5c3484
Packit 5c3484
  /* FIXME: Is support for s + t == n needed? */
Packit 5c3484
  if (LIKELY (s + t > n))
Packit 5c3484
    {
Packit 5c3484
      /* Subtract Hx3 at position 2n, then apply the signed carry hi to
	 the top s + t - n limbs of the product. */
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
Packit 5c3484
Packit 5c3484
      if (hi < 0)
Packit 5c3484
	MPN_DECR_U (pp + 4*n, s+t-n, -hi);
Packit 5c3484
      else
Packit 5c3484
	MPN_INCR_U (pp + 4*n, s+t-n, hi);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    ASSERT (hi == 0);
Packit 5c3484
}