|
Packit |
5c3484 |
/* mpn_mul -- Multiply two natural numbers.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Contributed to the GNU project by Torbjorn Granlund.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
|
|
Packit |
5c3484 |
2014 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
it under the terms of either:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
option) any later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or both in parallel, as here.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
for more details.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
see https://www.gnu.org/licenses/. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#include "gmp.h"
|
|
Packit |
5c3484 |
#include "gmp-impl.h"
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#ifndef MUL_BASECASE_MAX_UN
|
|
Packit |
5c3484 |
#define MUL_BASECASE_MAX_UN 500
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Areas where the different toom algorithms can be called (extracted
|
|
Packit |
5c3484 |
from the t-toom*.c files, and ignoring small constant offsets):
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
|
|
Packit |
5c3484 |
4/7 6/7
|
|
Packit |
5c3484 |
6/11
|
|
Packit |
5c3484 |
|--------------------| toom22 (small)
|
|
Packit |
5c3484 |
|| toom22 (large)
|
|
Packit |
5c3484 |
|xxxx| toom22 called
|
|
Packit |
5c3484 |
|-------------------------------------| toom32
|
|
Packit |
5c3484 |
|xxxxxxxxxxxxxxxx| | toom32 called
|
|
Packit |
5c3484 |
|------------| toom33
|
|
Packit |
5c3484 |
|x| toom33 called
|
|
Packit |
5c3484 |
|---------------------------------| | toom42
|
|
Packit |
5c3484 |
|xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
|
|
Packit |
5c3484 |
|--------------------| toom43
|
|
Packit |
5c3484 |
|xxxxxxxxxx| toom43 called
|
|
Packit |
5c3484 |
|-----------------------------| toom52 (unused)
|
|
Packit |
5c3484 |
|--------| toom44
|
|
Packit |
5c3484 |
|xxxxxxxx| toom44 called
|
|
Packit |
5c3484 |
|--------------------| | toom53
|
|
Packit |
5c3484 |
|xxxxxx| toom53 called
|
|
Packit |
5c3484 |
|-------------------------| toom62 (unused)
|
|
Packit |
5c3484 |
|----------------| toom54 (unused)
|
|
Packit |
5c3484 |
|--------------------| toom63
|
|
Packit |
5c3484 |
|xxxxxxxxx| | toom63 called
|
|
Packit |
5c3484 |
|---------------------------------| toom6h
|
|
Packit |
5c3484 |
|xxxxxxxx| toom6h called
|
|
Packit |
5c3484 |
|-------------------------| toom8h (32 bit)
|
|
Packit |
5c3484 |
|------------------------------------------| toom8h (64 bit)
|
|
Packit |
5c3484 |
|xxxxxxxx| toom8h called
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
|
|
Packit |
5c3484 |
#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
|
|
Packit |
5c3484 |
(pointed to by VP, with VN limbs), and store the result at PRODP. The
|
|
Packit |
5c3484 |
result is UN + VN limbs. Return the most significant limb of the result.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
NOTE: The space pointed to by PRODP is overwritten before finished with U
|
|
Packit |
5c3484 |
and V, so overlap is an error.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Argument constraints:
|
|
Packit |
5c3484 |
1. UN >= VN.
|
|
Packit |
5c3484 |
2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
|
|
Packit |
5c3484 |
the multiplier and the multiplicand. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
* The cutoff lines in the toomX2 and toomX3 code are now exactly between the
|
|
Packit |
5c3484 |
ideal lines of the surrounding algorithms. Is that optimal?
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* The toomX3 code now uses a structure similar to the one of toomX2, except
|
|
Packit |
5c3484 |
that it loops longer in the unbalanced case. The result is that the
|
|
Packit |
5c3484 |
remaining area might have un < vn. Should we fix the toomX2 code in a
|
|
Packit |
5c3484 |
similar way?
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* The toomX3 code is used for the largest non-FFT unbalanced operands. It
|
|
Packit |
5c3484 |
therefore calls mpn_mul recursively for certain cases.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* Allocate static temp space using THRESHOLD variables (except for toom44
|
|
Packit |
5c3484 |
when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
|
|
Packit |
5c3484 |
have the same vn threshold. This is not true, we should actually use
|
|
Packit |
5c3484 |
mul_basecase for slightly larger operands for toom32 than for toom22, and
|
|
Packit |
5c3484 |
even larger for toom42.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* That problem is even more prevalent for toomX3. We therefore use special
|
|
Packit |
5c3484 |
THRESHOLD variables there.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* Is our ITCH allocation correct?
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define ITCH (16*vn + 100)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mp_limb_t
|
|
Packit |
5c3484 |
mpn_mul (mp_ptr prodp,
|
|
Packit |
5c3484 |
mp_srcptr up, mp_size_t un,
|
|
Packit |
5c3484 |
mp_srcptr vp, mp_size_t vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT (un >= vn);
|
|
Packit |
5c3484 |
ASSERT (vn >= 1);
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (un == vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (up == vp)
|
|
Packit |
5c3484 |
mpn_sqr (prodp, up, un);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_mul_n (prodp, up, vp, un);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (vn < MUL_TOOM22_THRESHOLD)
|
|
Packit |
5c3484 |
{ /* plain schoolbook multiplication */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Unless un is very large, or else if have an applicable mpn_mul_N,
|
|
Packit |
5c3484 |
perform basecase multiply directly. */
|
|
Packit |
5c3484 |
if (un <= MUL_BASECASE_MAX_UN
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_mul_2
|
|
Packit |
5c3484 |
|| vn <= 2
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
|| vn == 1
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
)
|
|
Packit |
5c3484 |
mpn_mul_basecase (prodp, up, un, vp, vn);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
|
|
Packit |
5c3484 |
locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
|
|
Packit |
5c3484 |
these pieces with the vp[] operand. After each such partial
|
|
Packit |
5c3484 |
multiplication (but the last) we copy the most significant vn
|
|
Packit |
5c3484 |
limbs into a temporary buffer since that part would otherwise be
|
|
Packit |
5c3484 |
overwritten by the next multiplication. After the next
|
|
Packit |
5c3484 |
multiplication, we add it back. This illustrates the situation:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
-->vn<--
|
|
Packit |
5c3484 |
| |<------- un ------->|
|
|
Packit |
5c3484 |
_____________________|
|
|
Packit |
5c3484 |
X /|
|
|
Packit |
5c3484 |
/XX__________________/ |
|
|
Packit |
5c3484 |
_____________________ |
|
|
Packit |
5c3484 |
X / |
|
|
Packit |
5c3484 |
/XX__________________/ |
|
|
Packit |
5c3484 |
_____________________ |
|
|
Packit |
5c3484 |
/ / |
|
|
Packit |
5c3484 |
/____________________/ |
|
|
Packit |
5c3484 |
==================================================================
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The parts marked with X are the parts whose sums are copied into
|
|
Packit |
5c3484 |
the temporary buffer. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
|
|
Packit |
5c3484 |
mp_limb_t cy;
|
|
Packit |
5c3484 |
ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
|
|
Packit |
5c3484 |
prodp += MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
MPN_COPY (tp, prodp, vn); /* preserve high triangle */
|
|
Packit |
5c3484 |
up += MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
un -= MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
while (un > MUL_BASECASE_MAX_UN)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
prodp += MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
MPN_COPY (tp, prodp, vn); /* preserve high triangle */
|
|
Packit |
5c3484 |
up += MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
un -= MUL_BASECASE_MAX_UN;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
if (un > vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_mul_basecase (prodp, up, un, vp, vn);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT (un > 0);
|
|
Packit |
5c3484 |
mpn_mul_basecase (prodp, vp, vn, up, un);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* Use ToomX2 variants */
|
|
Packit |
5c3484 |
mp_ptr scratch;
|
|
Packit |
5c3484 |
TMP_SDECL; TMP_SMARK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
|
|
Packit |
5c3484 |
scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
|
|
Packit |
5c3484 |
ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
|
|
Packit |
5c3484 |
#undef ITCH_TOOMX2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
|
|
Packit |
5c3484 |
square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
|
|
Packit |
5c3484 |
wise; we would get better balance by slightly moving the bound. We
|
|
Packit |
5c3484 |
will sometimes end up with un < vn, like in the X3 arm below. */
|
|
Packit |
5c3484 |
if (un >= 3 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t cy;
|
|
Packit |
5c3484 |
mp_ptr ws;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* The maximum ws usage is for the mpn_mul result. */
|
|
Packit |
5c3484 |
ws = TMP_SALLOC_LIMBS (4 * vn);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
un -= 2 * vn;
|
|
Packit |
5c3484 |
up += 2 * vn;
|
|
Packit |
5c3484 |
prodp += 2 * vn;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
while (un >= 3 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
un -= 2 * vn;
|
|
Packit |
5c3484 |
up += 2 * vn;
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, 2 * vn);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
prodp += 2 * vn;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* vn <= un < 3vn */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (4 * un < 5 * vn)
|
|
Packit |
5c3484 |
mpn_toom22_mul (ws, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else if (4 * un < 7 * vn)
|
|
Packit |
5c3484 |
mpn_toom32_mul (ws, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom42_mul (ws, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, un);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (4 * un < 5 * vn)
|
|
Packit |
5c3484 |
mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else if (4 * un < 7 * vn)
|
|
Packit |
5c3484 |
mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
TMP_SFREE;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
|
|
Packit |
5c3484 |
BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* Handle the largest operands that are not in the FFT range. The 2nd
|
|
Packit |
5c3484 |
condition makes very unbalanced operands avoid the FFT code (except
|
|
Packit |
5c3484 |
perhaps as coefficient products of the Toom code. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* Use ToomX3 variants */
|
|
Packit |
5c3484 |
mp_ptr scratch;
|
|
Packit |
5c3484 |
TMP_DECL; TMP_MARK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
|
|
Packit |
5c3484 |
scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
|
|
Packit |
5c3484 |
ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
|
|
Packit |
5c3484 |
ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
|
|
Packit |
5c3484 |
#undef ITCH_TOOMX3
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (2 * un >= 5 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t cy;
|
|
Packit |
5c3484 |
mp_ptr ws;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* The maximum ws usage is for the mpn_mul result. */
|
|
Packit |
5c3484 |
ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
un -= 2 * vn;
|
|
Packit |
5c3484 |
up += 2 * vn;
|
|
Packit |
5c3484 |
prodp += 2 * vn;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
while (2 * un >= 5 * vn) /* un >= 2.5vn */
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
|
|
Packit |
5c3484 |
un -= 2 * vn;
|
|
Packit |
5c3484 |
up += 2 * vn;
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, 2 * vn);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
prodp += 2 * vn;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* vn / 2 <= un < 2.5vn */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (un < vn)
|
|
Packit |
5c3484 |
mpn_mul (ws, vp, vn, up, un);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_mul (ws, up, un, vp, vn);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, un);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (6 * un < 7 * vn)
|
|
Packit |
5c3484 |
mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else if (2 * un < 3 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (6 * un < 11 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (4 * un < 7 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
TMP_FREE;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_ptr scratch;
|
|
Packit |
5c3484 |
TMP_DECL; TMP_MARK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
|
|
Packit |
5c3484 |
mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
|
|
Packit |
5c3484 |
mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
|
|
Packit |
5c3484 |
mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
TMP_FREE;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (un >= 8 * vn)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t cy;
|
|
Packit |
5c3484 |
mp_ptr ws;
|
|
Packit |
5c3484 |
TMP_DECL; TMP_MARK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* The maximum ws usage is for the mpn_mul result. */
|
|
Packit |
5c3484 |
ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
|
|
Packit |
5c3484 |
un -= 3 * vn;
|
|
Packit |
5c3484 |
up += 3 * vn;
|
|
Packit |
5c3484 |
prodp += 3 * vn;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
while (2 * un >= 7 * vn) /* un >= 3.5vn */
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_fft_mul (ws, up, 3 * vn, vp, vn);
|
|
Packit |
5c3484 |
un -= 3 * vn;
|
|
Packit |
5c3484 |
up += 3 * vn;
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, 3 * vn);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
prodp += 3 * vn;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* vn / 2 <= un < 3.5vn */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (un < vn)
|
|
Packit |
5c3484 |
mpn_mul (ws, vp, vn, up, un);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_mul (ws, up, un, vp, vn);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cy = mpn_add_n (prodp, prodp, ws, vn);
|
|
Packit |
5c3484 |
MPN_COPY (prodp + vn, ws + vn, un);
|
|
Packit |
5c3484 |
mpn_incr_u (prodp + vn, cy);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
TMP_FREE;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_fft_mul (prodp, up, un, vp, vn);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
return prodp[un + vn - 1]; /* historic */
|
|
Packit |
5c3484 |
}
|