|
Packit |
5c3484 |
/* mpn_toom42_mulmid -- toom42 middle product
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Contributed by David Harvey.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
|
|
Packit |
5c3484 |
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
|
|
Packit |
5c3484 |
GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Copyright 2011 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
it under the terms of either:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
option) any later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or both in parallel, as here.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
for more details.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
see https://www.gnu.org/licenses/. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#include "gmp.h"
|
|
Packit |
5c3484 |
#include "gmp-impl.h"
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Neither ap nor bp may overlap rp.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Must have n >= 4.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Amount of scratch space required is given by mpn_toom42_mulmid_itch().
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
|
|
Packit |
5c3484 |
requirements should be clarified.
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
void
|
|
Packit |
5c3484 |
mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
|
|
Packit |
5c3484 |
mp_ptr scratch)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t cy, e[12], zh, zl;
|
|
Packit |
5c3484 |
mp_size_t m;
|
|
Packit |
5c3484 |
int neg;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT (n >= 4);
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
|
|
Packit |
5c3484 |
ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ap += n & 1; /* handle odd row and diagonal later */
|
|
Packit |
5c3484 |
m = n / 2;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* (e0h:e0l) etc are correction terms, in 2's complement */
|
|
Packit |
5c3484 |
#define e0l (e[0])
|
|
Packit |
5c3484 |
#define e0h (e[1])
|
|
Packit |
5c3484 |
#define e1l (e[2])
|
|
Packit |
5c3484 |
#define e1h (e[3])
|
|
Packit |
5c3484 |
#define e2l (e[4])
|
|
Packit |
5c3484 |
#define e2h (e[5])
|
|
Packit |
5c3484 |
#define e3l (e[6])
|
|
Packit |
5c3484 |
#define e3h (e[7])
|
|
Packit |
5c3484 |
#define e4l (e[8])
|
|
Packit |
5c3484 |
#define e4h (e[9])
|
|
Packit |
5c3484 |
#define e5l (e[10])
|
|
Packit |
5c3484 |
#define e5h (e[11])
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#define s (scratch + 2)
|
|
Packit |
5c3484 |
#define t (rp + m + 2)
|
|
Packit |
5c3484 |
#define p0 rp
|
|
Packit |
5c3484 |
#define p1 scratch
|
|
Packit |
5c3484 |
#define p2 (rp + m)
|
|
Packit |
5c3484 |
#define next_scratch (scratch + 3*m + 1)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
rp scratch
|
|
Packit |
5c3484 |
|---------|-----------| |---------|---------|----------|
|
|
Packit |
5c3484 |
0 m 2m+2 0 m 2m 3m+1
|
|
Packit |
5c3484 |
<----p2----> <-------------s------------->
|
|
Packit |
5c3484 |
<----p0----><---t----> <----p1---->
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
|
|
Packit |
5c3484 |
cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
|
|
Packit |
5c3484 |
cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
|
|
Packit |
5c3484 |
bp + m, bp, m, cy);
|
|
Packit |
5c3484 |
mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
|
|
Packit |
5c3484 |
if (mpn_cmp (bp + m, bp, m) < 0)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
|
|
Packit |
5c3484 |
ap + m - 1, ap + 2*m - 1, m, 0));
|
|
Packit |
5c3484 |
neg = 1;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
|
|
Packit |
5c3484 |
ap + m - 1, ap + 2*m - 1, m, 0));
|
|
Packit |
5c3484 |
neg = 0;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* recursive middle products. The picture is:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
b[2m-1] A A A B B B - - - - -
|
|
Packit |
5c3484 |
... - A A A B B B - - - -
|
|
Packit |
5c3484 |
b[m] - - A A A B B B - - -
|
|
Packit |
5c3484 |
b[m-1] - - - C C C D D D - -
|
|
Packit |
5c3484 |
... - - - - C C C D D D -
|
|
Packit |
5c3484 |
b[0] - - - - - C C C D D D
|
|
Packit |
5c3484 |
a[0] ... a[m] ... a[2m] ... a[4m-2]
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
if (m < MULMID_TOOM42_THRESHOLD)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* A + B */
|
|
Packit |
5c3484 |
mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
|
|
Packit |
5c3484 |
/* accumulate high limbs of p0 into e1 */
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, e1l, e1l, p0[m]);
|
|
Packit |
5c3484 |
e1h += p0[m + 1] + cy;
|
|
Packit |
5c3484 |
/* (-1)^neg * (B - C) (overwrites first m limbs of s) */
|
|
Packit |
5c3484 |
mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
|
|
Packit |
5c3484 |
/* C + D (overwrites t) */
|
|
Packit |
5c3484 |
mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/* as above, but use toom42 instead */
|
|
Packit |
5c3484 |
mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, e1l, e1l, p0[m]);
|
|
Packit |
5c3484 |
e1h += p0[m + 1] + cy;
|
|
Packit |
5c3484 |
mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
|
|
Packit |
5c3484 |
mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* apply error terms */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* -e0 at rp[0] */
|
|
Packit |
5c3484 |
SUBC_LIMB (cy, rp[0], rp[0], e0l);
|
|
Packit |
5c3484 |
SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
|
|
Packit |
5c3484 |
if (UNLIKELY (cy))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
|
|
Packit |
5c3484 |
SUBC_LIMB (cy, e1l, e1l, cy);
|
|
Packit |
5c3484 |
e1h -= cy;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* z = e1 - e2 + high(p0) */
|
|
Packit |
5c3484 |
SUBC_LIMB (cy, zl, e1l, e2l);
|
|
Packit |
5c3484 |
zh = e1h - e2h - cy;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* z at rp[m] */
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, rp[m], rp[m], zl);
|
|
Packit |
5c3484 |
zh = (zh + cy) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
|
|
Packit |
5c3484 |
cy -= (zh >> (GMP_NUMB_BITS - 1));
|
|
Packit |
5c3484 |
if (UNLIKELY (cy))
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
if (cy == 1)
|
|
Packit |
5c3484 |
mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
|
|
Packit |
5c3484 |
else /* cy == -1 */
|
|
Packit |
5c3484 |
mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* e3 at rp[2*m] */
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
|
|
Packit |
5c3484 |
rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* e4 at p1[0] */
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, p1[0], p1[0], e4l);
|
|
Packit |
5c3484 |
ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
|
|
Packit |
5c3484 |
if (UNLIKELY (cy))
|
|
Packit |
5c3484 |
mpn_add_1 (p1 + 2, p1 + 2, m, 1);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* -e5 at p1[m] */
|
|
Packit |
5c3484 |
SUBC_LIMB (cy, p1[m], p1[m], e5l);
|
|
Packit |
5c3484 |
p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* adjustment if p1 ends up negative */
|
|
Packit |
5c3484 |
cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* add (-1)^neg * (p1 - B^m * p1) to output */
|
|
Packit |
5c3484 |
if (neg)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
|
|
Packit |
5c3484 |
mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
|
|
Packit |
5c3484 |
mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
else
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
|
|
Packit |
5c3484 |
mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
|
|
Packit |
5c3484 |
mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* odd row and diagonal */
|
|
Packit |
5c3484 |
if (n & 1)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
/*
|
|
Packit |
5c3484 |
Products marked E are already done. We need to do products marked O.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
OOOOO----
|
|
Packit |
5c3484 |
-EEEEO---
|
|
Packit |
5c3484 |
--EEEEO--
|
|
Packit |
5c3484 |
---EEEEO-
|
|
Packit |
5c3484 |
----EEEEO
|
|
Packit |
5c3484 |
*/
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* first row of O's */
|
|
Packit |
5c3484 |
cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
|
|
Packit |
5c3484 |
ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* O's on diagonal */
|
|
Packit |
5c3484 |
/* FIXME: should probably define an interface "mpn_mulmid_diag_1"
|
|
Packit |
5c3484 |
that can handle the sum below. Currently we're relying on
|
|
Packit |
5c3484 |
mulmid_basecase being pretty fast for a diagonal sum like this,
|
|
Packit |
5c3484 |
which is true at least for the K8 asm version, but surely false
|
|
Packit |
5c3484 |
for the generic version. */
|
|
Packit |
5c3484 |
mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
|
|
Packit |
5c3484 |
mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
}
|