Blame mpn/generic/mullo_n.c

Packit 5c3484
/* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
Packit 5c3484
   of their products.
Packit 5c3484
Packit 5c3484
   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
Packit 5c3484
Packit 5c3484
   THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
Packit 5c3484
   FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
Packit 5c3484
   THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
Packit 5c3484
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
Packit 5c3484
#define MAYBE_range_basecase 1
Packit 5c3484
#define MAYBE_range_toom22   1
Packit 5c3484
#else
Packit 5c3484
#define MAYBE_range_basecase                                           \
Packit 5c3484
  ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
Packit 5c3484
#define MAYBE_range_toom22                                             \
Packit 5c3484
  ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/*  THINK: The DC strategy uses different constants in different Toom's
Packit 5c3484
	 ranges. Something smoother?
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
/*
Packit 5c3484
  Compute the least significant half of the product {xy,n}*{yp,n}, or
Packit 5c3484
  formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
Packit 5c3484
Packit 5c3484
  Above the given threshold, the Divide and Conquer strategy is used.
Packit 5c3484
  The operands are split in two, and a full product plus two mullo
Packit 5c3484
  are used to obtain the final result. The more natural strategy is to
Packit 5c3484
  split in two halves, but this is far from optimal when a
Packit 5c3484
  sub-quadratic multiplication is used.
Packit 5c3484
Packit 5c3484
  Mulders suggests an unbalanced split in favour of the full product,
Packit 5c3484
  split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
Packit 5c3484
Packit 5c3484
  To compute the value of a, we assume that the cost of mullo for a
Packit 5c3484
  given size ML(n) is a fraction of the cost of a full product with
Packit 5c3484
  same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
Packit 5c3484
  then we can write:
Packit 5c3484
Packit 5c3484
  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
Packit 5c3484
Packit 5c3484
  Given a value for e, want to minimise the value of k, i.e. the
Packit 5c3484
  function k=(1-a)^e/(1-2*a^e).
Packit 5c3484
Packit 5c3484
  With e=2, the exponent for schoolbook multiplication, the minimum is
Packit 5c3484
  given by the values a=1-a=1/2.
Packit 5c3484
Packit 5c3484
  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
Packit 5c3484
  Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
Packit 5c3484
Packit 5c3484
  Other possible approximations follow:
Packit 5c3484
  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
Packit 5c3484
  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
Packit 5c3484
  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
Packit 5c3484
  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
Packit 5c3484
Packit 5c3484
  The values above where obtained with the following trivial commands
Packit 5c3484
  in the gp-pari shell:
Packit 5c3484
Packit 5c3484
fun(e,a)=(1-a)^e/(1-2*a^e)
Packit 5c3484
mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)
Packit 5c3484
contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
Packit 5c3484
contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
Packit 5c3484
contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
Packit 5c3484
contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
Packit 5c3484
contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
Packit 5c3484
Packit 5c3484
  ,
Packit 5c3484
  |\
Packit 5c3484
  | \
Packit 5c3484
  +----,
Packit 5c3484
  |    |
Packit 5c3484
  |    |
Packit 5c3484
  |    |\
Packit 5c3484
  |    | \
Packit 5c3484
  +----+--`
Packit 5c3484
  ^ n2 ^n1^
Packit 5c3484
Packit 5c3484
  For an actual implementation, the assumption that M(n)=n^e is
Packit 5c3484
  incorrect, as a consequence also the assumption that ML(n)=k*M(n)
Packit 5c3484
  with a constant k is wrong.
Packit 5c3484
Packit 5c3484
  But theory suggest us two things:
Packit 5c3484
  - the best the multiplication product is (lower e), the more k
Packit 5c3484
    approaches 1, and a approaches 0.
Packit 5c3484
Packit 5c3484
  - A value for a smaller than optimal is probably less bad than a
Packit 5c3484
    bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
Packit 5c3484
    value, and k(a)=0.808_ the mul/mullo speed ratio. We get
Packit 5c3484
    k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
static mp_size_t
Packit 5c3484
mpn_mullo_n_itch (mp_size_t n)
Packit 5c3484
{
Packit 5c3484
  return 2*n;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/*
Packit 5c3484
    mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
Packit 5c3484
    It accepts tp == rp.
Packit 5c3484
*/
Packit 5c3484
static void
Packit 5c3484
mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
Packit 5c3484
{
Packit 5c3484
  mp_size_t n2, n1;
Packit 5c3484
  ASSERT (n >= 2);
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
Packit 5c3484
  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
Packit 5c3484
Packit 5c3484
  /* Divide-and-conquer */
Packit 5c3484
Packit 5c3484
  /* We need fractional approximation of the value 0 < a <= 1/2
Packit 5c3484
     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
Packit 5c3484
  */
Packit 5c3484
  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
Packit 5c3484
    n1 = n >> 1;
Packit 5c3484
  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
Packit 5c3484
    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
Packit 5c3484
  else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
Packit 5c3484
    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
Packit 5c3484
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
Packit 5c3484
    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
Packit 5c3484
  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
Packit 5c3484
  else
Packit 5c3484
    n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
Packit 5c3484
Packit 5c3484
  n2 = n - n1;
Packit 5c3484
Packit 5c3484
  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
Packit 5c3484
	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
Packit 5c3484
Packit 5c3484
  /* x0 * y0 */
Packit 5c3484
  mpn_mul_n (tp, xp, yp, n2);
Packit 5c3484
  MPN_COPY (rp, tp, n2);
Packit 5c3484
Packit 5c3484
  /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
Packit 5c3484
  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
Packit 5c3484
    mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
Packit 5c3484
  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
Packit 5c3484
    mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
Packit 5c3484
  else
Packit 5c3484
    mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
Packit 5c3484
  mpn_add_n (rp + n2, tp + n2, tp + n, n1);
Packit 5c3484
Packit 5c3484
  /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
Packit 5c3484
  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
Packit 5c3484
    mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
Packit 5c3484
  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
Packit 5c3484
    mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
Packit 5c3484
  else
Packit 5c3484
    mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
Packit 5c3484
  mpn_add_n (rp + n2, rp + n2, tp + n, n1);
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
Packit 5c3484
#define MUL_BASECASE_ALLOC \
Packit 5c3484
 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
Packit 5c3484
Packit 5c3484
/* FIXME: This function should accept a temporary area; dc_mullow_n
Packit 5c3484
   accepts a pointer tp, and handle the case tp == rp, do the same here.
Packit 5c3484
   Maybe recombine the two functions.
Packit 5c3484
   THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
Packit 5c3484
	  (typically thanks to mpn_addmul_2) should we unconditionally use
Packit 5c3484
	  mpn_mul_n?
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
void
Packit 5c3484
mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
Packit 5c3484
{
Packit 5c3484
  ASSERT (n >= 1);
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
Packit 5c3484
Packit 5c3484
  if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
Packit 5c3484
    {
Packit 5c3484
      /* Allocate workspace of fixed size on stack: fast! */
Packit 5c3484
      mp_limb_t tp[MUL_BASECASE_ALLOC];
Packit 5c3484
      mpn_mul_basecase (tp, xp, n, yp, n);
Packit 5c3484
      MPN_COPY (rp, tp, n);
Packit 5c3484
    }
Packit 5c3484
  else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
Packit 5c3484
    {
Packit 5c3484
      mpn_mullo_basecase (rp, xp, yp, n);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      mp_ptr tp;
Packit 5c3484
      TMP_DECL;
Packit 5c3484
      TMP_MARK;
Packit 5c3484
      tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
Packit 5c3484
      if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
Packit 5c3484
	{
Packit 5c3484
	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
Packit 5c3484
	}
Packit 5c3484
      else
Packit 5c3484
	{
Packit 5c3484
	  /* For really large operands, use plain mpn_mul_n but throw away upper n
Packit 5c3484
	     limbs of result.  */
Packit 5c3484
#if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
Packit 5c3484
	  mpn_fft_mul (tp, xp, n, yp, n);
Packit 5c3484
#else
Packit 5c3484
	  mpn_mul_n (tp, xp, yp, n);
Packit 5c3484
#endif
Packit 5c3484
	  MPN_COPY (rp, tp, n);
Packit 5c3484
	}
Packit 5c3484
      TMP_FREE;
Packit 5c3484
    }
Packit 5c3484
}