Blame mpn/generic/toom42_mulmid.c

Packit 5c3484
/* mpn_toom42_mulmid -- toom42 middle product
Packit 5c3484
Packit 5c3484
   Contributed by David Harvey.
Packit 5c3484
Packit 5c3484
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 2011 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
Packit 5c3484
Packit 5c3484
/*
Packit 5c3484
  Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
Packit 5c3484
Packit 5c3484
  Neither ap nor bp may overlap rp.
Packit 5c3484
Packit 5c3484
  Must have n >= 4.
Packit 5c3484
Packit 5c3484
  Amount of scratch space required is given by mpn_toom42_mulmid_itch().
Packit 5c3484
Packit 5c3484
  FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
Packit 5c3484
  requirements should be clarified.
Packit 5c3484
*/
Packit 5c3484
void
Packit 5c3484
mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
Packit 5c3484
                   mp_ptr scratch)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t cy, e[12], zh, zl;
Packit 5c3484
  mp_size_t m;
Packit 5c3484
  int neg;
Packit 5c3484
Packit 5c3484
  ASSERT (n >= 4);
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
Packit 5c3484
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
Packit 5c3484
Packit 5c3484
  ap += n & 1;   /* handle odd row and diagonal later */
Packit 5c3484
  m = n / 2;
Packit 5c3484
Packit 5c3484
  /* (e0h:e0l) etc are correction terms, in 2's complement */
Packit 5c3484
#define e0l (e[0])
Packit 5c3484
#define e0h (e[1])
Packit 5c3484
#define e1l (e[2])
Packit 5c3484
#define e1h (e[3])
Packit 5c3484
#define e2l (e[4])
Packit 5c3484
#define e2h (e[5])
Packit 5c3484
#define e3l (e[6])
Packit 5c3484
#define e3h (e[7])
Packit 5c3484
#define e4l (e[8])
Packit 5c3484
#define e4h (e[9])
Packit 5c3484
#define e5l (e[10])
Packit 5c3484
#define e5h (e[11])
Packit 5c3484
Packit 5c3484
#define s (scratch + 2)
Packit 5c3484
#define t (rp + m + 2)
Packit 5c3484
#define p0 rp
Packit 5c3484
#define p1 scratch
Packit 5c3484
#define p2 (rp + m)
Packit 5c3484
#define next_scratch (scratch + 3*m + 1)
Packit 5c3484
Packit 5c3484
  /*
Packit 5c3484
            rp                            scratch
Packit 5c3484
  |---------|-----------|    |---------|---------|----------|
Packit 5c3484
  0         m         2m+2   0         m         2m        3m+1
Packit 5c3484
            <----p2---->       <-------------s------------->
Packit 5c3484
  <----p0----><---t---->     <----p1---->
Packit 5c3484
  */
Packit 5c3484
Packit 5c3484
  /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
Packit 5c3484
  cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
Packit 5c3484
  cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
Packit 5c3484
		       bp + m, bp, m, cy);
Packit 5c3484
  mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
Packit 5c3484
Packit 5c3484
  /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
Packit 5c3484
  if (mpn_cmp (bp + m, bp, m) < 0)
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
Packit 5c3484
				      ap + m - 1, ap + 2*m - 1, m, 0));
Packit 5c3484
      neg = 1;
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
Packit 5c3484
				      ap + m - 1, ap + 2*m - 1, m, 0));
Packit 5c3484
      neg = 0;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* recursive middle products. The picture is:
Packit 5c3484
Packit 5c3484
      b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
Packit 5c3484
      ...       -   A   A   A   B   B   B   -   -   -   -
Packit 5c3484
      b[m]      -   -   A   A   A   B   B   B   -   -   -
Packit 5c3484
      b[m-1]    -   -   -   C   C   C   D   D   D   -   -
Packit 5c3484
      ...       -   -   -   -   C   C   C   D   D   D   -
Packit 5c3484
      b[0]      -   -   -   -   -   C   C   C   D   D   D
Packit 5c3484
               a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
Packit 5c3484
  */
Packit 5c3484
Packit 5c3484
  if (m < MULMID_TOOM42_THRESHOLD)
Packit 5c3484
    {
Packit 5c3484
      /* A + B */
Packit 5c3484
      mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
Packit 5c3484
      /* accumulate high limbs of p0 into e1 */
Packit 5c3484
      ADDC_LIMB (cy, e1l, e1l, p0[m]);
Packit 5c3484
      e1h += p0[m + 1] + cy;
Packit 5c3484
      /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
Packit 5c3484
      mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
Packit 5c3484
      /* C + D   (overwrites t) */
Packit 5c3484
      mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      /* as above, but use toom42 instead */
Packit 5c3484
      mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
Packit 5c3484
      ADDC_LIMB (cy, e1l, e1l, p0[m]);
Packit 5c3484
      e1h += p0[m + 1] + cy;
Packit 5c3484
      mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
Packit 5c3484
      mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* apply error terms */
Packit 5c3484
Packit 5c3484
  /* -e0 at rp[0] */
Packit 5c3484
  SUBC_LIMB (cy, rp[0], rp[0], e0l);
Packit 5c3484
  SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
Packit 5c3484
  if (UNLIKELY (cy))
Packit 5c3484
    {
Packit 5c3484
      cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
Packit 5c3484
      SUBC_LIMB (cy, e1l, e1l, cy);
Packit 5c3484
      e1h -= cy;
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* z = e1 - e2 + high(p0) */
Packit 5c3484
  SUBC_LIMB (cy, zl, e1l, e2l);
Packit 5c3484
  zh = e1h - e2h - cy;
Packit 5c3484
Packit 5c3484
  /* z at rp[m] */
Packit 5c3484
  ADDC_LIMB (cy, rp[m], rp[m], zl);
Packit 5c3484
  zh = (zh + cy) & GMP_NUMB_MASK;
Packit 5c3484
  ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
Packit 5c3484
  cy -= (zh >> (GMP_NUMB_BITS - 1));
Packit 5c3484
  if (UNLIKELY (cy))
Packit 5c3484
    {
Packit 5c3484
      if (cy == 1)
Packit 5c3484
	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
Packit 5c3484
      else /* cy == -1 */
Packit 5c3484
	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* e3 at rp[2*m] */
Packit 5c3484
  ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
Packit 5c3484
  rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
Packit 5c3484
Packit 5c3484
  /* e4 at p1[0] */
Packit 5c3484
  ADDC_LIMB (cy, p1[0], p1[0], e4l);
Packit 5c3484
  ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
Packit 5c3484
  if (UNLIKELY (cy))
Packit 5c3484
    mpn_add_1 (p1 + 2, p1 + 2, m, 1);
Packit 5c3484
Packit 5c3484
  /* -e5 at p1[m] */
Packit 5c3484
  SUBC_LIMB (cy, p1[m], p1[m], e5l);
Packit 5c3484
  p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
Packit 5c3484
Packit 5c3484
  /* adjustment if p1 ends up negative */
Packit 5c3484
  cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
Packit 5c3484
Packit 5c3484
  /* add (-1)^neg * (p1 - B^m * p1) to output */
Packit 5c3484
  if (neg)
Packit 5c3484
    {
Packit 5c3484
      mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
Packit 5c3484
      mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
Packit 5c3484
      mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
Packit 5c3484
      mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
Packit 5c3484
      mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  /* odd row and diagonal */
Packit 5c3484
  if (n & 1)
Packit 5c3484
    {
Packit 5c3484
      /*
Packit 5c3484
        Products marked E are already done. We need to do products marked O.
Packit 5c3484
Packit 5c3484
        OOOOO----
Packit 5c3484
        -EEEEO---
Packit 5c3484
        --EEEEO--
Packit 5c3484
        ---EEEEO-
Packit 5c3484
        ----EEEEO
Packit 5c3484
       */
Packit 5c3484
Packit 5c3484
      /* first row of O's */
Packit 5c3484
      cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
Packit 5c3484
      ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
Packit 5c3484
Packit 5c3484
      /* O's on diagonal */
Packit 5c3484
      /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
Packit 5c3484
         that can handle the sum below. Currently we're relying on
Packit 5c3484
         mulmid_basecase being pretty fast for a diagonal sum like this,
Packit 5c3484
	 which is true at least for the K8 asm version, but surely false
Packit 5c3484
	 for the generic version. */
Packit 5c3484
      mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
Packit 5c3484
      mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
Packit 5c3484
    }
Packit 5c3484
}