Blame mpz/bin_uiui.c

Packit 5c3484
/* mpz_bin_uiui - compute n over k.
Packit 5c3484
Packit 5c3484
Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
Packit 5c3484
Packit 5c3484
Copyright 2010-2012 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
#ifndef BIN_GOETGHELUCK_THRESHOLD
Packit 5c3484
#define BIN_GOETGHELUCK_THRESHOLD  1000
Packit 5c3484
#endif
Packit 5c3484
#ifndef BIN_UIUI_ENABLE_SMALLDC
Packit 5c3484
#define BIN_UIUI_ENABLE_SMALLDC    1
Packit 5c3484
#endif
Packit 5c3484
#ifndef BIN_UIUI_RECURSIVE_SMALLDC
Packit 5c3484
#define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* Algorithm:
Packit 5c3484
Packit 5c3484
   Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
Packit 5c3484
   which are then accumulated into mpn numbers.  The first inner loop
Packit 5c3484
   accumulates divisor factors, the 2nd inner loop accumulates exactly the same
Packit 5c3484
   number of dividend factors.  We avoid accumulating more for the divisor,
Packit 5c3484
   even with its smaller factors, since we else cannot guarantee divisibility.
Packit 5c3484
Packit 5c3484
   Since we know each division will yield an integer, we compute the quotient
Packit 5c3484
   using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
Packit 5c3484
   2^t.
Packit 5c3484
Packit 5c3484
   Improvements:
Packit 5c3484
Packit 5c3484
   (1) An obvious improvement to this code would be to compute mod 2^t
Packit 5c3484
   everywhere.  Unfortunately, we cannot determine t beforehand, unless we
Packit 5c3484
   invoke some approximation, such as Stirling's formula.  Of course, we don't
Packit 5c3484
   need t to be tight.  However, it is not clear that this would help much,
Packit 5c3484
   our numbers are kept reasonably small already.
Packit 5c3484
Packit 5c3484
   (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
Packit 5c3484
   Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
Packit 5c3484
   would make it both reasonably accurate and fast.  (We could use a table
Packit 5c3484
   stored into a limb, perhaps.)  The table should take the removed factors of
Packit 5c3484
   2 into account (those done on-the-fly in mulN).
Packit 5c3484
Packit 5c3484
   (3) The first time in the loop we compute the odd part of a
Packit 5c3484
   factorial in kp, we might use oddfac_1 for this task.
Packit 5c3484
 */
Packit 5c3484
Packit 5c3484
/* This threshold determines how large divisor to accumulate before we call
Packit 5c3484
   bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
Packit 5c3484
   since we are just basecase code anyway?  Presumably, this depends on the
Packit 5c3484
   relative speed of the asymptotically fast code and this code.  */
Packit 5c3484
#define SOME_THRESHOLD 20
Packit 5c3484
Packit 5c3484
/* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
Packit 5c3484
   All versions of MAXFACS don't take this 2 removal into account now, meaning
Packit 5c3484
   that then, shifting just adds some overhead.  (We remove factors from the
Packit 5c3484
   completed limb anyway.)  */
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul1 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  return m;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul2 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  /* We need to shift before multiplying, to avoid an overflow. */
Packit 5c3484
  mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
Packit 5c3484
  return m01;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul3 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
Packit 5c3484
  mp_limb_t m2 = (m + 2);
Packit 5c3484
  return m01 * m2;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul4 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
Packit 5c3484
  mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
Packit 5c3484
  return m01 * m23;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul5 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
Packit 5c3484
  mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
Packit 5c3484
  return m012 * m34;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul6 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m01 = (m + 0) * (m + 1);
Packit 5c3484
  mp_limb_t m23 = (m + 2) * (m + 3);
Packit 5c3484
  mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
Packit 5c3484
  mp_limb_t m0123 = m01 * m23 >> 3;
Packit 5c3484
  return m0123 * m45;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul7 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m01 = (m + 0) * (m + 1);
Packit 5c3484
  mp_limb_t m23 = (m + 2) * (m + 3);
Packit 5c3484
  mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
Packit 5c3484
  mp_limb_t m0123 = m01 * m23 >> 3;
Packit 5c3484
  return m0123 * m456;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
mul8 (mp_limb_t m)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t m01 = (m + 0) * (m + 1);
Packit 5c3484
  mp_limb_t m23 = (m + 2) * (m + 3);
Packit 5c3484
  mp_limb_t m45 = (m + 4) * (m + 5);
Packit 5c3484
  mp_limb_t m67 = (m + 6) * (m + 7);
Packit 5c3484
  mp_limb_t m0123 = m01 * m23 >> 3;
Packit 5c3484
  mp_limb_t m4567 = m45 * m67 >> 3;
Packit 5c3484
  return m0123 * m4567;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
Packit 5c3484
Packit 5c3484
static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
Packit 5c3484
#define M (numberof(mulfunc))
Packit 5c3484
Packit 5c3484
/* Number of factors-of-2 removed by the corresponding mulN function.  */
Packit 5c3484
static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6};
Packit 5c3484
Packit 5c3484
#if 1
Packit 5c3484
/* This variant is inaccurate but share the code with other functions.  */
Packit 5c3484
#define MAXFACS(max,l)							\
Packit 5c3484
  do {									\
Packit 5c3484
    (max) = log_n_max (l);						\
Packit 5c3484
  } while (0)
Packit 5c3484
#else
Packit 5c3484
Packit 5c3484
/* This variant is exact(?) but uses a loop.  It takes the 2 removal
Packit 5c3484
 of mulN into account.  */
Packit 5c3484
static const unsigned long ftab[] =
Packit 5c3484
#if GMP_NUMB_BITS == 64
Packit 5c3484
  /* 1 to 8 factors per iteration */
Packit 5c3484
  {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x100000000),0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
Packit 5c3484
#endif
Packit 5c3484
#if GMP_NUMB_BITS == 32
Packit 5c3484
  /* 1 to 7 factors per iteration */
Packit 5c3484
  {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
#define MAXFACS(max,l)							\
Packit 5c3484
  do {									\
Packit 5c3484
    int __i;								\
Packit 5c3484
    for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
Packit 5c3484
      ;									\
Packit 5c3484
    (max) = __i + 1;							\
Packit 5c3484
  } while (0)
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
Packit 5c3484
   is an odd integer. */
Packit 5c3484
static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
Packit 5c3484
Packit 5c3484
static void
Packit 5c3484
mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
Packit 5c3484
{
Packit 5c3484
  int nmax, kmax, nmaxnow, numfac;
Packit 5c3484
  mp_ptr np, kp;
Packit 5c3484
  mp_size_t nn, kn, alloc;
Packit 5c3484
  mp_limb_t i, j, t, iii, jjj, cy, dinv;
Packit 5c3484
  mp_bitcnt_t i2cnt, j2cnt;
Packit 5c3484
  int cnt;
Packit 5c3484
  mp_size_t maxn;
Packit 5c3484
  TMP_DECL;
Packit 5c3484
Packit 5c3484
  ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
Packit 5c3484
  TMP_MARK;
Packit 5c3484
Packit 5c3484
  maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
Packit 5c3484
Packit 5c3484
  /* FIXME: This allocation might be insufficient, but is usually way too
Packit 5c3484
     large.  */
Packit 5c3484
  alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
Packit 5c3484
  alloc = MIN (alloc, k) + 1;
Packit 5c3484
  np = TMP_ALLOC_LIMBS (alloc);
Packit 5c3484
  kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
Packit 5c3484
Packit 5c3484
  MAXFACS (nmax, n);
Packit 5c3484
  ASSERT (nmax <= M);
Packit 5c3484
  MAXFACS (kmax, k);
Packit 5c3484
  ASSERT (kmax <= M);
Packit 5c3484
  ASSERT (k >= M);
Packit 5c3484
Packit 5c3484
  i = n - k + 1;
Packit 5c3484
Packit 5c3484
  np[0] = 1; nn = 1;
Packit 5c3484
Packit 5c3484
  i2cnt = 0;				/* total low zeros in dividend */
Packit 5c3484
  j2cnt = __gmp_fac2cnt_table[ODD_FACTORIAL_TABLE_LIMIT / 2 - 1];
Packit 5c3484
					/* total low zeros in divisor */
Packit 5c3484
Packit 5c3484
  numfac = 1;
Packit 5c3484
  j = ODD_FACTORIAL_TABLE_LIMIT + 1;
Packit 5c3484
  jjj = ODD_FACTORIAL_TABLE_MAX;
Packit 5c3484
  ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
Packit 5c3484
Packit 5c3484
  while (1)
Packit 5c3484
    {
Packit 5c3484
      kp[0] = jjj;				/* store new factors */
Packit 5c3484
      kn = 1;
Packit 5c3484
      t = k - j + 1;
Packit 5c3484
      kmax = MIN (kmax, t);
Packit 5c3484
Packit 5c3484
      while (kmax != 0 && kn < SOME_THRESHOLD)
Packit 5c3484
	{
Packit 5c3484
	  jjj = mulfunc[kmax - 1] (j);
Packit 5c3484
	  j += kmax;				/* number of factors used */
Packit 5c3484
	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
Packit 5c3484
	  jjj >>= cnt;				/* remove remaining low zeros */
Packit 5c3484
	  j2cnt += tcnttab[kmax - 1] + cnt;	/* update low zeros count */
Packit 5c3484
	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
Packit 5c3484
	  kp[kn] = cy;
Packit 5c3484
	  kn += cy != 0;
Packit 5c3484
	  t = k - j + 1;
Packit 5c3484
	  kmax = MIN (kmax, t);
Packit 5c3484
	}
Packit 5c3484
      numfac = j - numfac;
Packit 5c3484
Packit 5c3484
      while (numfac != 0)
Packit 5c3484
	{
Packit 5c3484
	  nmaxnow = MIN (nmax, numfac);
Packit 5c3484
	  iii = mulfunc[nmaxnow - 1] (i);
Packit 5c3484
	  i += nmaxnow;				/* number of factors used */
Packit 5c3484
	  count_trailing_zeros (cnt, iii);	/* count low zeros */
Packit 5c3484
	  iii >>= cnt;				/* remove remaining low zeros */
Packit 5c3484
	  i2cnt += tcnttab[nmaxnow - 1] + cnt;	/* update low zeros count */
Packit 5c3484
	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
Packit 5c3484
	  np[nn] = cy;
Packit 5c3484
	  nn += cy != 0;
Packit 5c3484
	  numfac -= nmaxnow;
Packit 5c3484
	}
Packit 5c3484
Packit 5c3484
      ASSERT (nn < alloc);
Packit 5c3484
Packit 5c3484
      binvert_limb (dinv, kp[0]);
Packit 5c3484
      nn += (np[nn - 1] >= kp[kn - 1]);
Packit 5c3484
      nn -= kn;
Packit 5c3484
      mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
Packit 5c3484
Packit 5c3484
      if (kmax == 0)
Packit 5c3484
	break;
Packit 5c3484
      numfac = j;
Packit 5c3484
Packit 5c3484
      jjj = mulfunc[kmax - 1] (j);
Packit 5c3484
      j += kmax;				/* number of factors used */
Packit 5c3484
      count_trailing_zeros (cnt, jjj);		/* count low zeros */
Packit 5c3484
      jjj >>= cnt;				/* remove remaining low zeros */
Packit 5c3484
      j2cnt += tcnttab[kmax - 1] + cnt;		/* update low zeros count */
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* Put back the right number of factors of 2.  */
Packit 5c3484
  cnt = i2cnt - j2cnt;
Packit 5c3484
  if (cnt != 0)
Packit 5c3484
    {
Packit 5c3484
      ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
Packit 5c3484
      cy = mpn_lshift (np, np, nn, cnt);
Packit 5c3484
      np[nn] = cy;
Packit 5c3484
      nn += cy != 0;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  nn -= np[nn - 1] == 0;	/* normalisation */
Packit 5c3484
Packit 5c3484
  kp = MPZ_NEWALLOC (r, nn);
Packit 5c3484
  SIZ(r) = nn;
Packit 5c3484
  MPN_COPY (kp, np, nn);
Packit 5c3484
  TMP_FREE;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static void
Packit 5c3484
mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
Packit 5c3484
{
Packit 5c3484
  int nmax, numfac;
Packit 5c3484
  mp_ptr rp;
Packit 5c3484
  mp_size_t rn, alloc;
Packit 5c3484
  mp_limb_t i, iii, cy;
Packit 5c3484
  mp_bitcnt_t i2cnt, cnt;
Packit 5c3484
Packit 5c3484
  count_leading_zeros (cnt, (mp_limb_t) n);
Packit 5c3484
  cnt = GMP_LIMB_BITS - cnt;
Packit 5c3484
  alloc = cnt * k / GMP_NUMB_BITS + 3;	/* FIXME: ensure rounding is enough. */
Packit 5c3484
  rp = MPZ_NEWALLOC (r, alloc);
Packit 5c3484
Packit 5c3484
  MAXFACS (nmax, n);
Packit 5c3484
  nmax = MIN (nmax, M);
Packit 5c3484
Packit 5c3484
  i = n - k + 1;
Packit 5c3484
Packit 5c3484
  nmax = MIN (nmax, k);
Packit 5c3484
  rp[0] = mulfunc[nmax - 1] (i);
Packit 5c3484
  rn = 1;
Packit 5c3484
  i += nmax;				/* number of factors used */
Packit 5c3484
  i2cnt = tcnttab[nmax - 1];		/* low zeros count */
Packit 5c3484
  numfac = k - nmax;
Packit 5c3484
  while (numfac != 0)
Packit 5c3484
    {
Packit 5c3484
      nmax = MIN (nmax, numfac);
Packit 5c3484
      iii = mulfunc[nmax - 1] (i);
Packit 5c3484
      i += nmax;			/* number of factors used */
Packit 5c3484
      i2cnt += tcnttab[nmax - 1];	/* update low zeros count */
Packit 5c3484
      cy = mpn_mul_1 (rp, rp, rn, iii);	/* accumulate new factors */
Packit 5c3484
      rp[rn] = cy;
Packit 5c3484
      rn += cy != 0;
Packit 5c3484
      numfac -= nmax;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  ASSERT (rn < alloc);
Packit 5c3484
Packit 5c3484
  mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2],
Packit 5c3484
		    __gmp_fac2cnt_table[k / 2 - 1] - i2cnt);
Packit 5c3484
  /* A two-fold, branch-free normalisation is possible :*/
Packit 5c3484
  /* rn -= rp[rn - 1] == 0; */
Packit 5c3484
  /* rn -= rp[rn - 1] == 0; */
Packit 5c3484
  MPN_NORMALIZE_NOT_ZERO (rp, rn);
Packit 5c3484
Packit 5c3484
  SIZ(r) = rn;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/* Algorithm:
Packit 5c3484
Packit 5c3484
   Plain and simply multiply things together.
Packit 5c3484
Packit 5c3484
   We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
Packit 5c3484
   that k!/2^t is odd).
Packit 5c3484
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
static mp_limb_t
Packit 5c3484
bc_bin_uiui (unsigned int n, unsigned int k)
Packit 5c3484
{
Packit 5c3484
  return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
Packit 5c3484
    << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
Packit 5c3484
    & GMP_NUMB_MASK;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/* Algorithm:
Packit 5c3484
Packit 5c3484
   Recursively exploit the relation
Packit 5c3484
   bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
Packit 5c3484
Packit 5c3484
   Values for binomial(k,k>>1) that fit in a limb are precomputed
Packit 5c3484
   (with inverses).
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
/* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
Packit 5c3484
   binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
Packit 5c3484
static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
Packit 5c3484
Packit 5c3484
/* bin2kkinv[i] = bin2kk[i]^-1 mod B */
Packit 5c3484
static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
Packit 5c3484
Packit 5c3484
/* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
Packit 5c3484
static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
Packit 5c3484
Packit 5c3484
static void
Packit 5c3484
mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
Packit 5c3484
{
Packit 5c3484
  mp_ptr rp;
Packit 5c3484
  mp_size_t rn;
Packit 5c3484
  unsigned long int hk;
Packit 5c3484
Packit 5c3484
  hk = k >> 1;
Packit 5c3484
Packit 5c3484
  if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
Packit 5c3484
    mpz_smallk_bin_uiui (r, n, hk);
Packit 5c3484
  else
Packit 5c3484
    mpz_smallkdc_bin_uiui (r, n, hk);
Packit 5c3484
  k -= hk;
Packit 5c3484
  n -= hk;
Packit 5c3484
  if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
Packit 5c3484
    mp_limb_t cy;
Packit 5c3484
    rn = SIZ (r);
Packit 5c3484
    rp = MPZ_REALLOC (r, rn + 1);
Packit 5c3484
    cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
Packit 5c3484
    rp [rn] = cy;
Packit 5c3484
    rn += cy != 0;
Packit 5c3484
  } else {
Packit 5c3484
    mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
Packit 5c3484
    mpz_t t;
Packit 5c3484
Packit 5c3484
    ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
Packit 5c3484
    PTR (t) = buffer;
Packit 5c3484
    if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
Packit 5c3484
      mpz_smallk_bin_uiui (t, n, k);
Packit 5c3484
    else
Packit 5c3484
      mpz_smallkdc_bin_uiui (t, n, k);
Packit 5c3484
    mpz_mul (r, r, t);
Packit 5c3484
    rp = PTR (r);
Packit 5c3484
    rn = SIZ (r);
Packit 5c3484
  }
Packit 5c3484
Packit 5c3484
  mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
Packit 5c3484
		    bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
Packit 5c3484
		    fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
Packit 5c3484
  /* A two-fold, branch-free normalisation is possible :*/
Packit 5c3484
  /* rn -= rp[rn - 1] == 0; */
Packit 5c3484
  /* rn -= rp[rn - 1] == 0; */
Packit 5c3484
  MPN_NORMALIZE_NOT_ZERO (rp, rn);
Packit 5c3484
Packit 5c3484
  SIZ(r) = rn;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
/* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
Packit 5c3484
 *
Packit 5c3484
 * Contributed to the GNU project by Marco Bodrato.
Packit 5c3484
 *
Packit 5c3484
 * Implementation of the algorithm by P. Goetgheluck, "Computing
Packit 5c3484
 * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
Packit 5c3484
 * No. 4 (April 1987), pp. 360-365.
Packit 5c3484
 *
Packit 5c3484
 * Acknowledgment: Peter Luschny did spot the slowness of the previous
Packit 5c3484
 * code and suggested the reference.
Packit 5c3484
 */
Packit 5c3484
Packit 5c3484
/* TODO: Remove duplicated constants / macros / static functions...
Packit 5c3484
 */
Packit 5c3484
Packit 5c3484
/*************************************************************/
Packit 5c3484
/* Section macros: common macros, for swing/fac/bin (&sieve) */
Packit 5c3484
/*************************************************************/
Packit 5c3484
Packit 5c3484
#define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)			\
Packit 5c3484
  if ((PR) > (MAX_PR)) {					\
Packit 5c3484
    (VEC)[(I)++] = (PR);					\
Packit 5c3484
    (PR) = 1;							\
Packit 5c3484
  }
Packit 5c3484
Packit 5c3484
#define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)		\
Packit 5c3484
  do {								\
Packit 5c3484
    if ((PR) > (MAX_PR)) {					\
Packit 5c3484
      (VEC)[(I)++] = (PR);					\
Packit 5c3484
      (PR) = (P);						\
Packit 5c3484
    } else							\
Packit 5c3484
      (PR) *= (P);						\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
#define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)			\
Packit 5c3484
    __max_i = (end);						\
Packit 5c3484
								\
Packit 5c3484
    do {							\
Packit 5c3484
      ++__i;							\
Packit 5c3484
      if (((sieve)[__index] & __mask) == 0)			\
Packit 5c3484
	{							\
Packit 5c3484
	  (prime) = id_to_n(__i)
Packit 5c3484
Packit 5c3484
#define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)		\
Packit 5c3484
  do {								\
Packit 5c3484
    mp_limb_t __mask, __index, __max_i, __i;			\
Packit 5c3484
								\
Packit 5c3484
    __i = (start)-(off);					\
Packit 5c3484
    __index = __i / GMP_LIMB_BITS;				\
Packit 5c3484
    __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);		\
Packit 5c3484
    __i += (off);						\
Packit 5c3484
								\
Packit 5c3484
    LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
Packit 5c3484
Packit 5c3484
#define LOOP_ON_SIEVE_STOP					\
Packit 5c3484
	}							\
Packit 5c3484
      __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);	\
Packit 5c3484
      __index += __mask & 1;					\
Packit 5c3484
    }  while (__i <= __max_i)					\
Packit 5c3484
Packit 5c3484
#define LOOP_ON_SIEVE_END					\
Packit 5c3484
    LOOP_ON_SIEVE_STOP;						\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
/*********************************************************/
Packit 5c3484
/* Section sieve: sieving functions and tools for primes */
Packit 5c3484
/*********************************************************/
Packit 5c3484
Packit 5c3484
#if WANT_ASSERT
Packit 5c3484
static mp_limb_t
Packit 5c3484
bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
/* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
Packit 5c3484
static mp_limb_t
Packit 5c3484
id_to_n  (mp_limb_t id)  { return id*3+1+(id&1;; }
Packit 5c3484
Packit 5c3484
/* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
Packit 5c3484
static mp_limb_t
Packit 5c3484
n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
Packit 5c3484
Packit 5c3484
static mp_size_t
Packit 5c3484
primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
Packit 5c3484
Packit 5c3484
/*********************************************************/
Packit 5c3484
/* Section binomial: fast binomial implementation        */
Packit 5c3484
/*********************************************************/
Packit 5c3484
Packit 5c3484
#define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
Packit 5c3484
  do {							\
Packit 5c3484
    mp_limb_t __a, __b, __prime, __ma,__mb;		\
Packit 5c3484
    __prime = (P);					\
Packit 5c3484
    __a = (N); __b = (K); __mb = 0;			\
Packit 5c3484
    FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);		\
Packit 5c3484
    do {						\
Packit 5c3484
      __mb += __b % __prime; __b /= __prime;		\
Packit 5c3484
      __ma = __a % __prime; __a /= __prime;		\
Packit 5c3484
      if (__ma < __mb) {				\
Packit 5c3484
        __mb = 1; (PR) *= __prime;			\
Packit 5c3484
      } else  __mb = 0;					\
Packit 5c3484
    } while (__a >= __prime);				\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
#define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
Packit 5c3484
  do {							\
Packit 5c3484
    mp_limb_t __prime;					\
Packit 5c3484
    __prime = (P);					\
Packit 5c3484
    if (((N) % __prime) < ((K) % __prime)) {		\
Packit 5c3484
      FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);	\
Packit 5c3484
    }							\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
/* Returns an approximation of the sqare root of x.  *
Packit 5c3484
 * It gives: x <= limb_apprsqrt (x) ^ 2 < x * 9/4    */
Packit 5c3484
static mp_limb_t
Packit 5c3484
limb_apprsqrt (mp_limb_t x)
Packit 5c3484
{
Packit 5c3484
  int s;
Packit 5c3484
Packit 5c3484
  ASSERT (x > 2);
Packit 5c3484
  count_leading_zeros (s, x - 1);
Packit 5c3484
  s = GMP_LIMB_BITS - 1 - s;
Packit 5c3484
  return (CNST_LIMB(1) << (s >> 1)) + (CNST_LIMB(1) << ((s - 1) >> 1));
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
static void
Packit 5c3484
mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t *sieve, *factors, count;
Packit 5c3484
  mp_limb_t prod, max_prod, j;
Packit 5c3484
  TMP_DECL;
Packit 5c3484
Packit 5c3484
  ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
Packit 5c3484
  ASSERT (n >= 25);
Packit 5c3484
Packit 5c3484
  TMP_MARK;
Packit 5c3484
  sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
Packit 5c3484
Packit 5c3484
  count = gmp_primesieve (sieve, n) + 1;
Packit 5c3484
  factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
Packit 5c3484
Packit 5c3484
  max_prod = GMP_NUMB_MAX / n;
Packit 5c3484
Packit 5c3484
  /* Handle primes = 2, 3 separately. */
Packit 5c3484
  popc_limb (count, n - k);
Packit 5c3484
  popc_limb (j, k);
Packit 5c3484
  count += j;
Packit 5c3484
  popc_limb (j, n);
Packit 5c3484
  count -= j;
Packit 5c3484
  prod = CNST_LIMB(1) << count;
Packit 5c3484
Packit 5c3484
  j = 0;
Packit 5c3484
  COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
Packit 5c3484
Packit 5c3484
  /* Accumulate prime factors from 5 to n/2 */
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t s;
Packit 5c3484
Packit 5c3484
      {
Packit 5c3484
	mp_limb_t prime;
Packit 5c3484
	s = limb_apprsqrt(n);
Packit 5c3484
	s = n_to_bit (s);
Packit 5c3484
	LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
Packit 5c3484
	COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
Packit 5c3484
	LOOP_ON_SIEVE_END;
Packit 5c3484
	s++;
Packit 5c3484
      }
Packit 5c3484
Packit 5c3484
      ASSERT (max_prod <= GMP_NUMB_MAX / 2);
Packit 5c3484
      max_prod <<= 1;
Packit 5c3484
      ASSERT (bit_to_n (s) * bit_to_n (s) > n);
Packit 5c3484
      ASSERT (s <= n_to_bit (n >> 1));
Packit 5c3484
      {
Packit 5c3484
	mp_limb_t prime;
Packit 5c3484
Packit 5c3484
	LOOP_ON_SIEVE_BEGIN (prime, s, n_to_bit (n >> 1), 0,sieve);
Packit 5c3484
	SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
Packit 5c3484
	LOOP_ON_SIEVE_END;
Packit 5c3484
      }
Packit 5c3484
      max_prod >>= 1;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* Store primes from (n-k)+1 to n */
Packit 5c3484
  ASSERT (n_to_bit (n - k) < n_to_bit (n));
Packit 5c3484
    {
Packit 5c3484
      mp_limb_t prime;
Packit 5c3484
      LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
Packit 5c3484
      FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
Packit 5c3484
      LOOP_ON_SIEVE_END;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  if (LIKELY (j != 0))
Packit 5c3484
    {
Packit 5c3484
      factors[j++] = prod;
Packit 5c3484
      mpz_prodlimbs (r, factors, j);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      PTR (r)[0] = prod;
Packit 5c3484
      SIZ (r) = 1;
Packit 5c3484
    }
Packit 5c3484
  TMP_FREE;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
#undef COUNT_A_PRIME
Packit 5c3484
#undef SH_COUNT_A_PRIME
Packit 5c3484
#undef LOOP_ON_SIEVE_END
Packit 5c3484
#undef LOOP_ON_SIEVE_STOP
Packit 5c3484
#undef LOOP_ON_SIEVE_BEGIN
Packit 5c3484
#undef LOOP_ON_SIEVE_CONTINUE
Packit 5c3484
Packit 5c3484
/*********************************************************/
Packit 5c3484
/* End of implementation of Goetgheluck's algorithm      */
Packit 5c3484
/*********************************************************/
Packit 5c3484
Packit 5c3484
void
Packit 5c3484
mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
Packit 5c3484
{
Packit 5c3484
  if (UNLIKELY (n < k)) {
Packit 5c3484
    SIZ (r) = 0;
Packit 5c3484
#if BITS_PER_ULONG > GMP_NUMB_BITS
Packit 5c3484
  } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
Packit 5c3484
    mpz_t tmp;
Packit 5c3484
Packit 5c3484
    mpz_init_set_ui (tmp, n);
Packit 5c3484
    mpz_bin_ui (r, tmp, k);
Packit 5c3484
    mpz_clear (tmp);
Packit 5c3484
#endif
Packit 5c3484
  } else {
Packit 5c3484
    ASSERT (n <= GMP_NUMB_MAX);
Packit 5c3484
    /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
Packit 5c3484
    k = MIN (k, n - k);
Packit 5c3484
    if (k < 2) {
Packit 5c3484
      PTR(r)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
Packit 5c3484
      SIZ(r) = 1;
Packit 5c3484
    } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
Packit 5c3484
      PTR(r)[0] = bc_bin_uiui (n, k);
Packit 5c3484
      SIZ(r) = 1;
Packit 5c3484
    } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
Packit 5c3484
      mpz_smallk_bin_uiui (r, n, k);
Packit 5c3484
    else if (BIN_UIUI_ENABLE_SMALLDC &&
Packit 5c3484
	     k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
Packit 5c3484
      mpz_smallkdc_bin_uiui (r, n, k);
Packit 5c3484
    else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
Packit 5c3484
	     k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
Packit 5c3484
      mpz_goetgheluck_bin_uiui (r, n, k);
Packit 5c3484
    else
Packit 5c3484
      mpz_bdiv_bin_uiui (r, n, k);
Packit 5c3484
  }
Packit 5c3484
}