|
Packit |
5c3484 |
/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Contributed to the GNU project by Niels Möller.
|
|
Packit |
5c3484 |
Improvements by Marco Bodrato.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
|
|
Packit |
5c3484 |
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
|
|
Packit |
5c3484 |
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Copyright 2006, 2007, 2009, 2014 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
it under the terms of either:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
option) any later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or both in parallel, as here.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
for more details.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
see https://www.gnu.org/licenses/. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#include "gmp.h"
|
|
Packit |
5c3484 |
#include "gmp-impl.h"
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define BINVERT_3 MODLIMB_INVERSE_3
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define BINVERT_9 \
|
|
Packit |
5c3484 |
((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define BINVERT_15 \
|
|
Packit |
5c3484 |
((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* For the various mpn_divexact_byN here, fall back to using either
|
|
Packit |
5c3484 |
mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is
|
|
Packit |
5c3484 |
many faster if it is native. For now, since mpn_divexact_1 is native on
|
|
Packit |
5c3484 |
several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
|
|
Packit |
5c3484 |
mpn_pi1_bdiv_q_1 unconditionally. FIXME. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
|
|
Packit |
5c3484 |
#ifndef mpn_divexact_by3
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
|
|
Packit |
5c3484 |
#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#ifndef mpn_divexact_by9
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
|
|
Packit |
5c3484 |
#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#ifndef mpn_divexact_by15
|
|
Packit |
5c3484 |
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
|
|
Packit |
5c3484 |
#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Interpolation for toom4, using the evaluation points 0, infinity,
|
|
Packit |
5c3484 |
1, -1, 2, -2, 1/2. More precisely, we want to compute
|
|
Packit |
5c3484 |
f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
|
|
Packit |
5c3484 |
seven values
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
w0 = f(0),
|
|
Packit |
5c3484 |
w1 = f(-2),
|
|
Packit |
5c3484 |
w2 = f(1),
|
|
Packit |
5c3484 |
w3 = f(-1),
|
|
Packit |
5c3484 |
w4 = f(2)
|
|
Packit |
5c3484 |
w5 = 64 * f(1/2)
|
|
Packit |
5c3484 |
w6 = limit at infinity of f(x) / x^6,
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
|
|
Packit |
5c3484 |
w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
|
|
Packit |
5c3484 |
w6n }. The other values are 2n + 1 limbs each (with most
|
|
Packit |
5c3484 |
significant limbs small). f(-1) and f(-1/2) may be negative, signs
|
|
Packit |
5c3484 |
determined by the flag bits. Inputs are destroyed.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Needs (2*n + 1) limbs of temporary storage.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
void
|
|
Packit |
5c3484 |
mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
|
|
Packit |
5c3484 |
mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
|
|
Packit |
5c3484 |
mp_size_t w6n, mp_ptr tp)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_size_t m;
|
|
Packit |
5c3484 |
mp_limb_t cy;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
m = 2*n + 1;
|
|
Packit |
5c3484 |
#define w0 rp
|
|
Packit |
5c3484 |
#define w2 (rp + 2*n)
|
|
Packit |
5c3484 |
#define w6 (rp + 6*n)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT (w6n > 0);
|
|
Packit |
5c3484 |
ASSERT (w6n <= 2*n);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Using formulas similar to Marco Bodrato's
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
W5 = W5 + W4
|
|
Packit |
5c3484 |
W1 =(W4 - W1)/2
|
|
Packit |
5c3484 |
W4 = W4 - W0
|
|
Packit |
5c3484 |
W4 =(W4 - W1)/4 - W6*16
|
|
Packit |
5c3484 |
W3 =(W2 - W3)/2
|
|
Packit |
5c3484 |
W2 = W2 - W3
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
W5 = W5 - W2*65 May be negative.
|
|
Packit |
5c3484 |
W2 = W2 - W6 - W0
|
|
Packit |
5c3484 |
W5 =(W5 + W2*45)/2 Now >= 0 again.
|
|
Packit |
5c3484 |
W4 =(W4 - W2)/3
|
|
Packit |
5c3484 |
W2 = W2 - W4
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
W1 = W5 - W1 May be negative.
|
|
Packit |
5c3484 |
W5 =(W5 - W3*8)/9
|
|
Packit |
5c3484 |
W3 = W3 - W5
|
|
Packit |
5c3484 |
W1 =(W1/15 + W5)/2 Now >= 0 again.
|
|
Packit |
5c3484 |
W5 = W5 - W1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
|
|
Packit |
5c3484 |
W4 = f(2), W5 = f(1/2), W6 = f(oo),
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Note that most intermediate results are positive; the ones that
|
|
Packit |
5c3484 |
may be negative are represented in two's complement. We must
|
|
Packit |
5c3484 |
never shift right a value that may be negative, since that would
|
|
Packit |
5c3484 |
invalidate the sign bit. On the other hand, divexact by odd
|
|
Packit |
5c3484 |
numbers work fine with two's complement.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_add_n (w5, w5, w4, m);
|
|
Packit |
5c3484 |
if (flags & toom7_w1_neg)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
#ifdef HAVE_NATIVE_mpn_rsh1add_n
|
|
Packit |
5c3484 |
mpn_rsh1add_n (w1, w1, w4, m);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w1, w1, m, 1);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
|
|
Packit |
5c3484 |
mpn_rsh1sub_n (w1, w4, w1, m);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w1, w1, m, 1);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
mpn_sub (w4, w4, m, w0, 2*n);
|
|
Packit |
5c3484 |
mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3));
|
|
Packit |
5c3484 |
mpn_rshift (w4, w4, m, 2); /* w4>=0 */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
|
|
Packit |
5c3484 |
mpn_sub (w4, w4, m, tp, w6n+1);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (flags & toom7_w3_neg)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
#ifdef HAVE_NATIVE_mpn_rsh1add_n
|
|
Packit |
5c3484 |
mpn_rsh1add_n (w3, w3, w2, m);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w3, w3, m, 1);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
|
|
Packit |
5c3484 |
mpn_rsh1sub_n (w3, w2, w3, m);
|
|
Packit |
5c3484 |
#else
|
|
Packit |
5c3484 |
mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w3, w3, m, 1);
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_sub_n (w2, w2, w3, m);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_submul_1 (w5, w2, m, 65);
|
|
Packit |
5c3484 |
mpn_sub (w2, w2, m, w6, w6n);
|
|
Packit |
5c3484 |
mpn_sub (w2, w2, m, w0, 2*n);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w5, w5, m, 1);
|
|
Packit |
5c3484 |
mpn_sub_n (w4, w4, w2, m);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_divexact_by3 (w4, w4, m);
|
|
Packit |
5c3484 |
mpn_sub_n (w2, w2, w4, m);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_sub_n (w1, w5, w1, m);
|
|
Packit |
5c3484 |
mpn_lshift (tp, w3, m, 3);
|
|
Packit |
5c3484 |
mpn_sub_n (w5, w5, tp, m);
|
|
Packit |
5c3484 |
mpn_divexact_by9 (w5, w5, m);
|
|
Packit |
5c3484 |
mpn_sub_n (w3, w3, w5, m);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mpn_divexact_by15 (w1, w1, m);
|
|
Packit |
5c3484 |
mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1));
|
|
Packit |
5c3484 |
mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
|
|
Packit |
5c3484 |
mpn_sub_n (w5, w5, w1, m);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* These bounds are valid for the 4x4 polynomial product of toom44,
|
|
Packit |
5c3484 |
* and they are conservative for toom53 and toom62. */
|
|
Packit |
5c3484 |
ASSERT (w1[2*n] < 2);
|
|
Packit |
5c3484 |
ASSERT (w2[2*n] < 3);
|
|
Packit |
5c3484 |
ASSERT (w3[2*n] < 4);
|
|
Packit |
5c3484 |
ASSERT (w4[2*n] < 3);
|
|
Packit |
5c3484 |
ASSERT (w5[2*n] < 2);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Addition chain. Note carries and the 2n'th limbs that need to be
|
|
Packit |
5c3484 |
* added in.
|
|
Packit |
5c3484 |
*
|
|
Packit |
5c3484 |
* Special care is needed for w2[2n] and the corresponding carry,
|
|
Packit |
5c3484 |
* since the "simple" way of adding it all together would overwrite
|
|
Packit |
5c3484 |
* the limb at wp[2*n] and rp[4*n] (same location) with the sum of
|
|
Packit |
5c3484 |
* the high half of w3 and the low half of w4.
|
|
Packit |
5c3484 |
*
|
|
Packit |
5c3484 |
* 7 6 5 4 3 2 1 0
|
|
Packit |
5c3484 |
* | | | | | | | | |
|
|
Packit |
5c3484 |
* ||w3 (2n+1)|
|
|
Packit |
5c3484 |
* ||w4 (2n+1)|
|
|
Packit |
5c3484 |
* ||w5 (2n+1)| ||w1 (2n+1)|
|
|
Packit |
5c3484 |
* + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r)
|
|
Packit |
5c3484 |
* -----------------------------------------------
|
|
Packit |
5c3484 |
* r | | | | | | | | |
|
|
Packit |
5c3484 |
* c7 c6 c5 c4 c3 Carries to propagate
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cy = mpn_add_n (rp + n, rp + n, w1, m);
|
|
Packit |
5c3484 |
MPN_INCR_U (w2 + n + 1, n , cy);
|
|
Packit |
5c3484 |
cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
|
|
Packit |
5c3484 |
MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
|
|
Packit |
5c3484 |
cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
|
|
Packit |
5c3484 |
MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
|
|
Packit |
5c3484 |
cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
|
|
Packit |
5c3484 |
MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
|
|
Packit |
5c3484 |
if (w6n > n + 1)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
|
|
Packit |
5c3484 |
MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
|
|
Packit |
5c3484 |
#if WANT_ASSERT
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_size_t i;
|
|
Packit |
5c3484 |
for (i = w6n; i <= n; i++)
|
|
Packit |
5c3484 |
ASSERT (w5[n + i] == 0);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|