Blame mpn/generic/perfsqr.c

Packit 5c3484
/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
Packit 5c3484
   zero otherwise.
Packit 5c3484
Packit 5c3484
Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
Packit 5c3484
Foundation, Inc.
Packit 5c3484
Packit 5c3484
This file is part of the GNU MP Library.
Packit 5c3484
Packit 5c3484
The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
it under the terms of either:
Packit 5c3484
Packit 5c3484
  * the GNU Lesser General Public License as published by the Free
Packit 5c3484
    Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
    option) any later version.
Packit 5c3484
Packit 5c3484
or
Packit 5c3484
Packit 5c3484
  * the GNU General Public License as published by the Free Software
Packit 5c3484
    Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
    later version.
Packit 5c3484
Packit 5c3484
or both in parallel, as here.
Packit 5c3484
Packit 5c3484
The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
for more details.
Packit 5c3484
Packit 5c3484
You should have received copies of the GNU General Public License and the
Packit 5c3484
GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
see https://www.gnu.org/licenses/.  */
Packit 5c3484
Packit 5c3484
#include <stdio.h> /* for NULL */
Packit 5c3484
#include "gmp.h"
Packit 5c3484
#include "gmp-impl.h"
Packit 5c3484
#include "longlong.h"
Packit 5c3484
Packit 5c3484
#include "perfsqr.h"
Packit 5c3484
Packit 5c3484
Packit 5c3484
/* change this to "#define TRACE(x) x" for diagnostics */
Packit 5c3484
#define TRACE(x)
Packit 5c3484
Packit 5c3484
Packit 5c3484
Packit 5c3484
/* PERFSQR_MOD_* detects non-squares using residue tests.
Packit 5c3484
Packit 5c3484
   A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
Packit 5c3484
   {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
Packit 5c3484
   64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
Packit 5c3484
   or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
Packit 5c3484
   used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
Packit 5c3484
   case too.
Packit 5c3484
Packit 5c3484
   PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
Packit 5c3484
   PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
Packit 5c3484
   data indicating residues and non-residues modulo those divisors.  The
Packit 5c3484
   table data is in 1 or 2 limbs worth of bits respectively, per the size of
Packit 5c3484
   each d.
Packit 5c3484
Packit 5c3484
   A "modexact" style remainder is taken to reduce r modulo d.
Packit 5c3484
   PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
Packit 5c3484
   the table data.  Notice there's just one multiplication by a constant
Packit 5c3484
   "inv", for each d.
Packit 5c3484
Packit 5c3484
   The modexact doesn't produce a true r%d remainder, instead idx satisfies
Packit 5c3484
   "-(idx<
Packit 5c3484
   -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
Packit 5c3484
   accounted for by having the table data suitably permuted.
Packit 5c3484
Packit 5c3484
   The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
Packit 5c3484
   In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
Packit 5c3484
   each divisor d meaning the modexact multiply can take place entirely
Packit 5c3484
   within one limb, giving the compiler the chance to optimize it, in a way
Packit 5c3484
   that say umul_ppmm would not give.
Packit 5c3484
Packit 5c3484
   There's no need for the divisors d to be prime, in fact gen-psqr.c makes
Packit 5c3484
   a deliberate effort to combine factors so as to reduce the number of
Packit 5c3484
   separate tests done on r.  But such combining is limited to d <=
Packit 5c3484
   2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
Packit 5c3484
Packit 5c3484
   Alternatives:
Packit 5c3484
Packit 5c3484
   It'd be possible to use bigger divisors d, and more than 2 limbs of table
Packit 5c3484
   data, but this doesn't look like it would be of much help to the prime
Packit 5c3484
   factors in the usual moduli 2^24-1 or 2^48-1.
Packit 5c3484
Packit 5c3484
   The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
Packit 5c3484
   just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
Packit 5c3484
   factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
Packit 5c3484
   fewer prime factors.
Packit 5c3484
Packit 5c3484
   The nails case usually ends up using mpn_mod_1, which is a lot slower
Packit 5c3484
   than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
Packit 5c3484
   for the nails case.  Two-term things like 2^30-2^15-1 might be
Packit 5c3484
   candidates.  Or at worst some on-the-fly de-nailing would allow the plain
Packit 5c3484
   2^24-1 to be used.  Currently nails are too preliminary to be worried
Packit 5c3484
   about.
Packit 5c3484
Packit 5c3484
*/
Packit 5c3484
Packit 5c3484
#define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
Packit 5c3484
Packit 5c3484
#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
Packit 5c3484
#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
Packit 5c3484
Packit 5c3484
#define PERFSQR_MOD_34(r, up, usize)				\
Packit 5c3484
  do {								\
Packit 5c3484
    (r) = mpn_mod_34lsub1 (up, usize);				\
Packit 5c3484
    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);		\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
/* FIXME: The %= here isn't good, and might destroy any savings from keeping
Packit 5c3484
   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
Packit 5c3484
   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
Packit 5c3484
   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
Packit 5c3484
   normal case, so lets not worry too much about mod_1.  */
Packit 5c3484
#define PERFSQR_MOD_PP(r, up, usize)					\
Packit 5c3484
  do {									\
Packit 5c3484
    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))	\
Packit 5c3484
      {									\
Packit 5c3484
	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,		\
Packit 5c3484
				PERFSQR_PP_INVERTED);			\
Packit 5c3484
	(r) %= PERFSQR_PP;						\
Packit 5c3484
      }									\
Packit 5c3484
    else								\
Packit 5c3484
      {									\
Packit 5c3484
	(r) = mpn_mod_1 (up, usize, PERFSQR_PP);			\
Packit 5c3484
      }									\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
#define PERFSQR_MOD_IDX(idx, r, d, inv)				\
Packit 5c3484
  do {								\
Packit 5c3484
    mp_limb_t  q;						\
Packit 5c3484
    ASSERT ((r) <= PERFSQR_MOD_MASK);				\
Packit 5c3484
    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
Packit 5c3484
    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
Packit 5c3484
								\
Packit 5c3484
    q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
Packit 5c3484
    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
Packit 5c3484
    (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
#define PERFSQR_MOD_1(r, d, inv, mask)				\
Packit 5c3484
  do {								\
Packit 5c3484
    unsigned   idx;						\
Packit 5c3484
    ASSERT ((d) <= GMP_LIMB_BITS);				\
Packit 5c3484
    PERFSQR_MOD_IDX(idx, r, d, inv);				\
Packit 5c3484
    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
Packit 5c3484
		   d, r%d, idx));				\
Packit 5c3484
    if ((((mask) >> idx) & 1) == 0)				\
Packit 5c3484
      {								\
Packit 5c3484
	TRACE (printf ("  non-square\n"));			\
Packit 5c3484
	return 0;						\
Packit 5c3484
      }								\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
Packit 5c3484
   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
Packit 5c3484
#define PERFSQR_MOD_2(r, d, inv, mhi, mlo)			\
Packit 5c3484
  do {								\
Packit 5c3484
    mp_limb_t  m;						\
Packit 5c3484
    unsigned   idx;						\
Packit 5c3484
    ASSERT ((d) <= 2*GMP_LIMB_BITS);				\
Packit 5c3484
								\
Packit 5c3484
    PERFSQR_MOD_IDX (idx, r, d, inv);				\
Packit 5c3484
    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",	\
Packit 5c3484
		   d, r%d, idx));				\
Packit 5c3484
    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
Packit 5c3484
    idx %= GMP_LIMB_BITS;					\
Packit 5c3484
    if (((m >> idx) & 1) == 0)					\
Packit 5c3484
      {								\
Packit 5c3484
	TRACE (printf ("  non-square\n"));			\
Packit 5c3484
	return 0;						\
Packit 5c3484
      }								\
Packit 5c3484
  } while (0)
Packit 5c3484
Packit 5c3484
Packit 5c3484
int
Packit 5c3484
mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
Packit 5c3484
{
Packit 5c3484
  ASSERT (usize >= 1);
Packit 5c3484
Packit 5c3484
  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
Packit 5c3484
Packit 5c3484
  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
Packit 5c3484
     in O(1) time.  */
Packit 5c3484
  {
Packit 5c3484
    unsigned  idx = up[0] % 0x100;
Packit 5c3484
    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
Packit 5c3484
	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
Packit 5c3484
      return 0;
Packit 5c3484
  }
Packit 5c3484
Packit 5c3484
#if 0
Packit 5c3484
  /* Check that we have even multiplicity of 2, and then check that the rest is
Packit 5c3484
     a possible perfect square.  Leave disabled until we can determine this
Packit 5c3484
     really is an improvement.  It it is, it could completely replace the
Packit 5c3484
     simple probe above, since this should throw out more non-squares, but at
Packit 5c3484
     the expense of somewhat more cycles.  */
Packit 5c3484
  {
Packit 5c3484
    mp_limb_t lo;
Packit 5c3484
    int cnt;
Packit 5c3484
    lo = up[0];
Packit 5c3484
    while (lo == 0)
Packit 5c3484
      up++, lo = up[0], usize--;
Packit 5c3484
    count_trailing_zeros (cnt, lo);
Packit 5c3484
    if ((cnt & 1) != 0)
Packit 5c3484
      return 0;			/* return of not even multiplicity of 2 */
Packit 5c3484
    lo >>= cnt;			/* shift down to align lowest non-zero bit */
Packit 5c3484
    lo >>= 1;			/* shift away lowest non-zero bit */
Packit 5c3484
    if ((lo & 3) != 0)
Packit 5c3484
      return 0;
Packit 5c3484
  }
Packit 5c3484
#endif
Packit 5c3484
Packit 5c3484
Packit 5c3484
  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
Packit 5c3484
     according to their residues modulo small primes (or powers of
Packit 5c3484
     primes).  See perfsqr.h.  */
Packit 5c3484
  PERFSQR_MOD_TEST (up, usize);
Packit 5c3484
Packit 5c3484
Packit 5c3484
  /* For the third and last test, we finally compute the square root,
Packit 5c3484
     to make sure we've really got a perfect square.  */
Packit 5c3484
  {
Packit 5c3484
    mp_ptr root_ptr;
Packit 5c3484
    int res;
Packit 5c3484
    TMP_DECL;
Packit 5c3484
Packit 5c3484
    TMP_MARK;
Packit 5c3484
    root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
Packit 5c3484
Packit 5c3484
    /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
Packit 5c3484
    res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
Packit 5c3484
    TMP_FREE;
Packit 5c3484
Packit 5c3484
    return res;
Packit 5c3484
  }
Packit 5c3484
}