Blame mpn/generic/add_n_sub_n.c

Packit 5c3484
/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length.
Packit 5c3484
Packit 5c3484
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
Packit 5c3484
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
Packit 5c3484
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Packit 5c3484
Packit 5c3484
Copyright 1999-2001, 2006 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
Packit 5c3484
#ifndef L1_CACHE_SIZE
Packit 5c3484
#define L1_CACHE_SIZE 8192	/* only 68040 has less than this */
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6)
Packit 5c3484
Packit 5c3484
Packit 5c3484
/* mpn_add_n_sub_n.
Packit 5c3484
   r1[] = s1[] + s2[]
Packit 5c3484
   r2[] = s1[] - s2[]
Packit 5c3484
   All operands have n limbs.
Packit 5c3484
   In-place operations allowed.  */
Packit 5c3484
mp_limb_t
Packit 5c3484
mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
Packit 5c3484
{
Packit 5c3484
  mp_limb_t acyn, acyo;		/* carry for add */
Packit 5c3484
  mp_limb_t scyn, scyo;		/* carry for subtract */
Packit 5c3484
  mp_size_t off;		/* offset in operands */
Packit 5c3484
  mp_size_t this_n;		/* size of current chunk */
Packit 5c3484
Packit 5c3484
  /* We alternatingly add and subtract in chunks that fit into the (L1)
Packit 5c3484
     cache.  Since the chunks are several hundred limbs, the function call
Packit 5c3484
     overhead is insignificant, but we get much better locality.  */
Packit 5c3484
Packit 5c3484
  /* We have three variant of the inner loop, the proper loop is chosen
Packit 5c3484
     depending on whether r1 or r2 are the same operand as s1 or s2.  */
Packit 5c3484
Packit 5c3484
  if (r1p != s1p && r1p != s2p)
Packit 5c3484
    {
Packit 5c3484
      /* r1 is not identical to either input operand.  We can therefore write
Packit 5c3484
	 to r1 directly, without using temporary storage.  */
Packit 5c3484
      acyo = 0;
Packit 5c3484
      scyo = 0;
Packit 5c3484
      for (off = 0; off < n; off += PART_SIZE)
Packit 5c3484
	{
Packit 5c3484
	  this_n = MIN (n - off, PART_SIZE);
Packit 5c3484
#if HAVE_NATIVE_mpn_add_nc
Packit 5c3484
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
Packit 5c3484
#else
Packit 5c3484
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
Packit 5c3484
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
Packit 5c3484
#endif
Packit 5c3484
#if HAVE_NATIVE_mpn_sub_nc
Packit 5c3484
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
Packit 5c3484
#else
Packit 5c3484
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
Packit 5c3484
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
Packit 5c3484
#endif
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
  else if (r2p != s1p && r2p != s2p)
Packit 5c3484
    {
Packit 5c3484
      /* r2 is not identical to either input operand.  We can therefore write
Packit 5c3484
	 to r2 directly, without using temporary storage.  */
Packit 5c3484
      acyo = 0;
Packit 5c3484
      scyo = 0;
Packit 5c3484
      for (off = 0; off < n; off += PART_SIZE)
Packit 5c3484
	{
Packit 5c3484
	  this_n = MIN (n - off, PART_SIZE);
Packit 5c3484
#if HAVE_NATIVE_mpn_sub_nc
Packit 5c3484
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
Packit 5c3484
#else
Packit 5c3484
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
Packit 5c3484
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
Packit 5c3484
#endif
Packit 5c3484
#if HAVE_NATIVE_mpn_add_nc
Packit 5c3484
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
Packit 5c3484
#else
Packit 5c3484
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
Packit 5c3484
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
Packit 5c3484
#endif
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
  else
Packit 5c3484
    {
Packit 5c3484
      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa)
Packit 5c3484
	 Need temporary storage.  */
Packit 5c3484
      mp_limb_t tp[PART_SIZE];
Packit 5c3484
      acyo = 0;
Packit 5c3484
      scyo = 0;
Packit 5c3484
      for (off = 0; off < n; off += PART_SIZE)
Packit 5c3484
	{
Packit 5c3484
	  this_n = MIN (n - off, PART_SIZE);
Packit 5c3484
#if HAVE_NATIVE_mpn_add_nc
Packit 5c3484
	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
Packit 5c3484
#else
Packit 5c3484
	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
Packit 5c3484
	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
Packit 5c3484
#endif
Packit 5c3484
#if HAVE_NATIVE_mpn_sub_nc
Packit 5c3484
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
Packit 5c3484
#else
Packit 5c3484
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
Packit 5c3484
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
Packit 5c3484
#endif
Packit 5c3484
	  MPN_COPY (r1p + off, tp, this_n);
Packit 5c3484
	}
Packit 5c3484
    }
Packit 5c3484
Packit 5c3484
  return 2 * acyo + scyo;
Packit 5c3484
}
Packit 5c3484
Packit 5c3484
#ifdef MAIN
Packit 5c3484
#include <stdlib.h>
Packit 5c3484
#include <stdio.h>
Packit 5c3484
#include "timing.h"
Packit 5c3484
Packit 5c3484
long cputime ();
Packit 5c3484
Packit 5c3484
int
Packit 5c3484
main (int argc, char **argv)
Packit 5c3484
{
Packit 5c3484
  mp_ptr r1p, r2p, s1p, s2p;
Packit 5c3484
  double t;
Packit 5c3484
  mp_size_t n;
Packit 5c3484
Packit 5c3484
  n = strtol (argv[1], 0, 0);
Packit 5c3484
Packit 5c3484
  r1p = malloc (n * GMP_LIMB_BYTES);
Packit 5c3484
  r2p = malloc (n * GMP_LIMB_BYTES);
Packit 5c3484
  s1p = malloc (n * GMP_LIMB_BYTES);
Packit 5c3484
  s2p = malloc (n * GMP_LIMB_BYTES);
Packit 5c3484
  TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
Packit 5c3484
  printf ("              separate add and sub: %.3f\n", t);
Packit 5c3484
  TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n));
Packit 5c3484
  printf ("combined addsub separate variables: %.3f\n", t);
Packit 5c3484
  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
Packit 5c3484
  printf ("        combined addsub r1 overlap: %.3f\n", t);
Packit 5c3484
  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
Packit 5c3484
  printf ("        combined addsub r2 overlap: %.3f\n", t);
Packit 5c3484
  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n));
Packit 5c3484
  printf ("          combined addsub in-place: %.3f\n", t);
Packit 5c3484
Packit 5c3484
  return 0;
Packit 5c3484
}
Packit 5c3484
#endif