Blame mpn/generic/toom44_mul.c

Packit 5c3484
/* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
Packit 5c3484
   size.  Or more accurately, bn <= an < (4/3)bn.
Packit 5c3484
Packit 5c3484
   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
Packit 5c3484
Packit 5c3484
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2006-2008, 2013 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
Packit 5c3484
Packit 5c3484
  <-s--><--n--><--n--><--n-->
Packit 5c3484
   ____ ______ ______ ______
Packit 5c3484
  |_a3_|___a2_|___a1_|___a0_|
Packit 5c3484
   |b3_|___b2_|___b1_|___b0_|
Packit 5c3484
   <-t-><--n--><--n--><--n-->
Packit 5c3484
Packit 5c3484
  v0  =   a0             *  b0              #    A(0)*B(0)
Packit 5c3484
  v1  = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) #    A(1)*B(1)      ah  <= 3   bh  <= 3
Packit 5c3484
  vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) #   A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
Packit 5c3484
  v2  = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) #    A(2)*B(2)      ah  <= 14  bh  <= 14
Packit 5c3484
  vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) #   A(-2)*B(-2)    |ah| <= 9  |bh| <= 9
Packit 5c3484
  vh  = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)    ah  <= 14  bh  <= 14
Packit 5c3484
  vinf=               a3 *          b3      #  A(inf)*B(inf)
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
/* Compile-time switches consulted by TOOM44_MUL_N_REC to decide which
   multiplication routines may be chosen for the recursive point products.

   In a tuning build (TUNE_PROGRAM_BUILD nonzero) every candidate is kept
   enabled.  Otherwise each switch is a constant expression that compares
   the configured thresholds, so that a branch which can never be taken
   for the piece sizes occurring here can be discarded by the compiler.
   NOTE(review): the factor 4 presumably reflects that the recursive
   operands are about a quarter of the size of the toom44 operands --
   confirm against the threshold definitions.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom22   1
#define MAYBE_mul_toom44   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom22						\
  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
#define MAYBE_mul_toom44						\
  (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
#endif
Packit 5c3484
Packit 5c3484
/* TOOM44_MUL_N_REC (p, a, b, n, ws) -- balanced recursive multiplication.

   Multiplies the n-limb operands at a and b and stores the product at p,
   passing ws as scratch space to the toom routines (the basecase takes
   no scratch).  The routine is selected by comparing n against the
   MUL_TOOMxx_THRESHOLD values; the MAYBE_mul_* constants let branches
   that cannot apply be removed at compile time.  Selection order:

     mpn_mul_basecase   if n is below MUL_TOOM22_THRESHOLD
     mpn_toom22_mul     if n is below MUL_TOOM33_THRESHOLD
     mpn_toom33_mul     if n is below MUL_TOOM44_THRESHOLD, or toom44 is
			excluded by MAYBE_mul_toom44
     mpn_toom44_mul     otherwise

   The argument n is expanded several times, so it must be free of side
   effects.  Wrapped in do { } while (0) so that it is usable as a single
   statement.  */
#define TOOM44_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (MAYBE_mul_toom22						\
	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else if (! MAYBE_mul_toom44						\
	     || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))		\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom44_mul (p, a, n, b, n, ws);				\
  } while (0)
Packit 5c3484
Packit 5c3484
/* Use of scratch space. In the product area, we store
Packit 5c3484
Packit 5c3484
      ___________________
Packit 5c3484
     |vinf|____|_v1_|_v0_|
Packit 5c3484
      s+t  2n-1 2n+1  2n
Packit 5c3484
Packit 5c3484
   The other recursive products, vm1, v2, vm2, vh are stored in the
Packit 5c3484
   scratch area. When computing them, we use the product area for
Packit 5c3484
   intermediate values.
Packit 5c3484
Packit 5c3484
   Next, we compute v1. We can store the intermediate factors at v0
Packit 5c3484
   and at pp + 4n + 2 (just above v1).
Packit 5c3484
Packit 5c3484
   Finally, for v0 and vinf, factors are parts of the input operands,
Packit 5c3484
   and we need scratch space only for the recursive multiplication.
Packit 5c3484
Packit 5c3484
   In all, if S(an) is the scratch need, the needed space is bounded by
Packit 5c3484
Packit 5c3484
     S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
Packit 5c3484
Packit 5c3484
   which should give S(n) = 8 n/3 + c log(n) for some constant c.
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
void
Packit 5c3484
mpn_toom44_mul (mp_ptr pp,
Packit 5c3484
		mp_srcptr ap, mp_size_t an,
Packit 5c3484
		mp_srcptr bp, mp_size_t bn,
Packit 5c3484
		mp_ptr scratch)
Packit 5c3484
{
Packit 5c3484
  mp_size_t n, s, t;
Packit 5c3484
  mp_limb_t cy;
Packit 5c3484
  enum toom7_flags flags;
Packit 5c3484
Packit 5c3484
#define a0  ap
Packit 5c3484
#define a1  (ap + n)
Packit 5c3484
#define a2  (ap + 2*n)
Packit 5c3484
#define a3  (ap + 3*n)
Packit 5c3484
#define b0  bp
Packit 5c3484
#define b1  (bp + n)
Packit 5c3484
#define b2  (bp + 2*n)
Packit 5c3484
#define b3  (bp + 3*n)
Packit 5c3484
Packit 5c3484
  ASSERT (an >= bn);
Packit 5c3484
Packit 5c3484
  n = (an + 3) >> 2;
Packit 5c3484
Packit 5c3484
  s = an - 3 * n;
Packit 5c3484
  t = bn - 3 * n;
Packit 5c3484
Packit 5c3484
  ASSERT (0 < s && s <= n);
Packit 5c3484
  ASSERT (0 < t && t <= n);
Packit 5c3484
  ASSERT (s >= t);
Packit 5c3484
Packit 5c3484
  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
Packit 5c3484
   * following limb, so these must be computed in order, and we need a
Packit 5c3484
   * one limb gap to tp. */
Packit 5c3484
#define v0    pp				/* 2n */
Packit 5c3484
#define v1    (pp + 2 * n)			/* 2n+1 */
Packit 5c3484
#define vinf  (pp + 6 * n)			/* s+t */
Packit 5c3484
#define v2    scratch				/* 2n+1 */
Packit 5c3484
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
Packit 5c3484
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
Packit 5c3484
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
Packit 5c3484
#define tp (scratch + 8*n + 5)
Packit 5c3484
Packit 5c3484
  /* apx and bpx must not overlap with v1 */
Packit 5c3484
#define apx   pp				/* n+1 */
Packit 5c3484
#define amx   (pp + n + 1)			/* n+1 */
Packit 5c3484
#define bmx   (pp + 2*n + 2)			/* n+1 */
Packit 5c3484
#define bpx   (pp + 4*n + 2)			/* n+1 */
Packit 5c3484
Packit 5c3484
  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
Packit 5c3484
     gives roughly 32 n/3 + log term. */
Packit 5c3484
Packit 5c3484
  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
Packit 5c3484
  flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));
Packit 5c3484
Packit 5c3484
  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
Packit 5c3484
  flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp)));
Packit 5c3484
Packit 5c3484
  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
Packit 5c3484
  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */
Packit 5c3484
Packit 5c3484
  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
Packit 5c3484
#if HAVE_NATIVE_mpn_addlsh1_n
Packit 5c3484
  cy = mpn_addlsh1_n (apx, a1, a0, n);
Packit 5c3484
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
Packit 5c3484
  if (s < n)
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t cy2;
Packit 5c3484
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
Packit 5c3484
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
Packit 5c3484
      MPN_INCR_U (apx + s, n+1-s, cy2);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
Packit 5c3484
#else
Packit 5c3484
  cy = mpn_lshift (apx, a0, n, 1);
Packit 5c3484
  cy += mpn_add_n (apx, apx, a1, n);
Packit 5c3484
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
Packit 5c3484
  cy += mpn_add_n (apx, apx, a2, n);
Packit 5c3484
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
Packit 5c3484
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
Packit 5c3484
#if HAVE_NATIVE_mpn_addlsh1_n
Packit 5c3484
  cy = mpn_addlsh1_n (bpx, b1, b0, n);
Packit 5c3484
  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
Packit 5c3484
  if (t < n)
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t cy2;
Packit 5c3484
      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
Packit 5c3484
      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
Packit 5c3484
      MPN_INCR_U (bpx + t, n+1-t, cy2);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
Packit 5c3484
#else
Packit 5c3484
  cy = mpn_lshift (bpx, b0, n, 1);
Packit 5c3484
  cy += mpn_add_n (bpx, bpx, b1, n);
Packit 5c3484
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
Packit 5c3484
  cy += mpn_add_n (bpx, bpx, b2, n);
Packit 5c3484
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
Packit 5c3484
  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
  ASSERT (apx[n] < 15);
Packit 5c3484
  ASSERT (bpx[n] < 15);
Packit 5c3484
Packit 5c3484
  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */
Packit 5c3484
Packit 5c3484
  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
Packit 5c3484
  flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp)));
Packit 5c3484
Packit 5c3484
  /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3.  */
Packit 5c3484
  flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp)));
Packit 5c3484
Packit 5c3484
  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
Packit 5c3484
  /* Clobbers amx, bmx. */
Packit 5c3484
  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */
Packit 5c3484
Packit 5c3484
  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
Packit 5c3484
  if (s > t)
Packit 5c3484
    mpn_mul (vinf, a3, s, b3, t);
Packit 5c3484
  else
Packit 5c3484
    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */
Packit 5c3484
Packit 5c3484
  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
Packit 5c3484
}