|
Packit |
5c3484 |
/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
|
|
Packit |
5c3484 |
of the result.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
|
|
Packit |
5c3484 |
FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
|
|
Packit |
5c3484 |
THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
it under the terms of either:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
option) any later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or both in parallel, as here.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
for more details.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
see https://www.gnu.org/licenses/. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#include "gmp.h"
|
|
Packit |
5c3484 |
#include "gmp-impl.h"
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
|
|
Packit |
5c3484 |
#define MAYBE_range_basecase 1
|
|
Packit |
5c3484 |
#define MAYBE_range_toom22 1
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
#define MAYBE_range_basecase \
|
|
Packit |
5c3484 |
((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
|
|
Packit |
5c3484 |
#define MAYBE_range_toom22 \
|
|
Packit |
5c3484 |
((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* THINK: The DC strategy uses different constants in different Toom's
|
|
Packit |
5c3484 |
ranges. Something smoother?
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
Compute the least significant half of the product {xy,n}*{yp,n}, or
|
|
Packit |
5c3484 |
formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Above the given threshold, the Divide and Conquer strategy is used.
|
|
Packit |
5c3484 |
The operand is split in two, and a full square plus a mullo
|
|
Packit |
5c3484 |
is used to obtain the final result. The more natural strategy is to
|
|
Packit |
5c3484 |
split in two halves, but this is far from optimal when a
|
|
Packit |
5c3484 |
sub-quadratic multiplication is used.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Mulders suggests an unbalanced split in favour of the full product,
|
|
Packit |
5c3484 |
split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
To compute the value of a, we assume that the cost of mullo for a
|
|
Packit |
5c3484 |
given size ML(n) is a fraction of the cost of a full product with
|
|
Packit |
5c3484 |
same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
|
|
Packit |
5c3484 |
then we can write:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Given a value for e, want to minimise the value of k, i.e. the
|
|
Packit |
5c3484 |
function k=(1-a)^e/(1-2*a^e).
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
With e=2, the exponent for schoolbook multiplication, the minimum is
|
|
Packit |
5c3484 |
given by the values a=1-a=1/2.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
|
|
Packit |
5c3484 |
Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Other possible approximations follow:
|
|
Packit |
5c3484 |
e=log(5)/log(3) [Toom-3] -> a ~= 9/40
|
|
Packit |
5c3484 |
e=log(7)/log(4) [Toom-4] -> a ~= 7/39
|
|
Packit |
5c3484 |
e=log(11)/log(6) [Toom-6] -> a ~= 1/8
|
|
Packit |
5c3484 |
e=log(15)/log(8) [Toom-8] -> a ~= 1/10
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The values above where obtained with the following trivial commands
|
|
Packit |
5c3484 |
in the gp-pari shell:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
fun(e,a)=(1-a)^e/(1-2*a^e)
|
|
Packit |
5c3484 |
mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)
|
|
Packit |
5c3484 |
contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
|
|
Packit |
5c3484 |
contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
|
|
Packit |
5c3484 |
contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
|
|
Packit |
5c3484 |
contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
|
|
Packit |
5c3484 |
contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
,
|
|
Packit |
5c3484 |
|\
|
|
Packit |
5c3484 |
| \
|
|
Packit |
5c3484 |
+----,
|
|
Packit |
5c3484 |
| |
|
|
Packit |
5c3484 |
| |
|
|
Packit |
5c3484 |
| |\
|
|
Packit |
5c3484 |
| | \
|
|
Packit |
5c3484 |
+----+--`
|
|
Packit |
5c3484 |
^ n2 ^n1^
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
For an actual implementation, the assumption that M(n)=n^e is
|
|
Packit |
5c3484 |
incorrect, as a consequence also the assumption that ML(n)=k*M(n)
|
|
Packit |
5c3484 |
with a constant k is wrong.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
But theory suggest us two things:
|
|
Packit |
5c3484 |
- the best the multiplication product is (lower e), the more k
|
|
Packit |
5c3484 |
approaches 1, and a approaches 0.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
- A value for a smaller than optimal is probably less bad than a
|
|
Packit |
5c3484 |
bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
|
|
Packit |
5c3484 |
value, and k(a)=0.808_ the mul/mullo speed ratio. We get
|
|
Packit |
5c3484 |
k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
static mp_size_t
|
|
Packit |
5c3484 |
mpn_sqrlo_itch (mp_size_t n)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
return 2*n;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
|
|
Packit |
5c3484 |
It accepts tp == rp.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
static void
|
|
Packit |
5c3484 |
mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_size_t n2, n1;
|
|
Packit |
5c3484 |
ASSERT (n >= 2);
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
|
|
Packit |
5c3484 |
ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Divide-and-conquer */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* We need fractional approximation of the value 0 < a <= 1/2
|
|
Packit |
5c3484 |
giving the minimum in the function k=(1-a)^e/(1-2*a^e).
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
|
|
Packit |
5c3484 |
n1 = n >> 1;
|
|
Packit |
5c3484 |
else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
|
|
Packit |
5c3484 |
n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
|
|
Packit |
5c3484 |
n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
|
|
Packit |
5c3484 |
n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
|
|
Packit |
5c3484 |
/* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
n2 = n - n1;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* x0 ^ 2 */
|
|
Packit |
5c3484 |
mpn_sqr (tp, xp, n2);
|
|
Packit |
5c3484 |
MPN_COPY (rp, tp, n2);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
|
|
Packit |
5c3484 |
mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
mpn_mullo_n (tp + n, xp + n2, xp, n1);
|
|
Packit |
5c3484 |
/* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_addlsh1_n
|
|
Packit |
5c3484 |
mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_lshift (rp + n2, tp + n, n1, 1);
|
|
Packit |
5c3484 |
mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
|
|
Packit |
5c3484 |
#define SQR_BASECASE_ALLOC \
|
|
Packit |
5c3484 |
(SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* FIXME: This function should accept a temporary area; dc_sqrlo
|
|
Packit |
5c3484 |
accepts a pointer tp, and handle the case tp == rp, do the same here.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
void
|
|
Packit |
5c3484 |
mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT (n >= 1);
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* FIXME: smarter criteria? */
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
|
|
Packit |
5c3484 |
/* mullo computes as many products as sqr, but directly writes
|
|
Packit |
5c3484 |
on the result area. */
|
|
Packit |
5c3484 |
mpn_mullo_basecase (rp, xp, xp, n);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
/* Allocate workspace of fixed size on stack: fast! */
|
|
Packit |
5c3484 |
mp_limb_t tp[SQR_BASECASE_ALLOC];
|
|
Packit |
5c3484 |
mpn_sqr_basecase (tp, xp, n);
|
|
Packit |
5c3484 |
MPN_COPY (rp, tp, n);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_sqrlo_basecase (rp, xp, n);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_ptr tp;
|
|
Packit |
5c3484 |
TMP_DECL;
|
|
Packit |
5c3484 |
TMP_MARK;
|
|
Packit |
5c3484 |
tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
|
|
Packit |
5c3484 |
if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_dc_sqrlo (rp, xp, n, tp);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* For really large operands, use plain mpn_mul_n but throw away upper n
|
|
Packit |
5c3484 |
limbs of result. */
|
|
Packit |
5c3484 |
#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
|
|
Packit |
5c3484 |
mpn_fft_mul (tp, xp, n, xp, n);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_sqr (tp, xp, n);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
MPN_COPY (rp, tp, n);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
TMP_FREE;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|