/* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2011, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

#define BINVERT_3 MODLIMB_INVERSE_3

#define BINVERT_15 \
  ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)

#define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)

#ifndef mpn_divexact_by3
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
#else
#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
#endif
#endif

#ifndef mpn_divexact_by45
#if GMP_NUMB_BITS % 12 == 0
#define mpn_divexact_by45(dst,src,size) \
  (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
#else
#define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
#endif
#endif
#endif

#if HAVE_NATIVE_mpn_sublsh2_n_ip1
#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
#else
#define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
#endif

#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s)
#else
static mp_limb_t
DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift (ws,src,n,s);
  return __cy + mpn_sub_n (dst,dst,ws,n);
#endif
}
#endif


#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
#else
/* This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif

/* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
   points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 7 (or 6), given the 8 (rsp. 7) values:

     r1 = limit at infinity of f(x) / x^7,
     r2 = f(4),
     r3 = f(-4),
     r4 = f(2),
     r5 = f(-2),
     r6 = f(1),
     r7 = f(-1),
     r8 = f(0).

   All couples of the form f(n),f(-n) must be already mixed with
   toom_couple_handling(f(n),...,f(-n),...)

   The result is stored in {pp, spt + 7*n (or 6*n)}.
   At entry, r8 is stored at {pp, 2n},
   r5 is stored at {pp + 3n, 3n + 1}.

   The other values are 2n+... limbs each (with most significant limbs small).

   All intermediate results are positive.
   Inputs are destroyed.
*/

void
mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
			   mp_ptr r3, mp_ptr r7,
			   mp_size_t spt, mp_ptr ws)
{
  mp_limb_signed_t cy;
  mp_ptr r5, r1;
  r5 = (pp + 3 * n);			/* 3n+1 */
  r1 = (pp + 7 * n);			/* spt */

  /******************************* interpolation *****************************/

  DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
  cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
  MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);

  DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
  cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
  MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);

  r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
  cy = mpn_sub_n (r7, r7, r1, spt);
  MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
  ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));

  ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));

  mpn_divexact_by45 (r3, r3, 3 * n + 1);

  ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));

  ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));

  /* last interpolation steps... */
  /* ... are mixed with recomposition */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
     |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp

    summation scheme for remaining operations:
     |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
     |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
	  ||_H r3|_M r3|_L*r3|
				  ||_H_r7|_M_r7|_L_r7|
		      ||-H r3|-M r3|-L*r3|
				  ||-H*r5|-M_r5|-L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
  cy-= mpn_sub_n (pp + n, pp + n, r5, n);
  if (0 > cy)
    MPN_DECR_U (r7 + n, 2*n + 1, 1);
  else
    MPN_INCR_U (r7 + n, 2*n + 1, cy);

  cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
  MPN_DECR_U (r7 + 2*n, n + 1, cy);

  cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
  r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
  cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
  if (UNLIKELY(0 > cy))
    MPN_DECR_U (r5 + n + 1, 2*n, 1);
  else
    MPN_INCR_U (r5 + n + 1, 2*n, cy);

  ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */

  cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
  MPN_INCR_U (r3 + 2*n, n + 1, cy);
  cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
  if (LIKELY(spt != n))
    MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
  else
    ASSERT (r3[3*n] + cy == 0);
}