|
Packit |
5c3484 |
/* mpn_divexact_by3c -- mpn exact division by 3.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Copyright 2000-2003, 2008 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
it under the terms of either:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
option) any later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
* the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
later version.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
or both in parallel, as here.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
for more details.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
see https://www.gnu.org/licenses/. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#include "gmp.h"
|
|
Packit |
5c3484 |
#include "gmp-impl.h"
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#if DIVEXACT_BY3_METHOD == 0
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mp_limb_t
|
|
Packit |
5c3484 |
mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t r;
|
|
Packit |
5c3484 |
r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
|
|
Packit |
5c3484 |
We want to return C. We compute the remainder mod 4 and notice that the
|
|
Packit |
5c3484 |
inverse of (2^(2k)-1)/3 mod 4 is 1. */
|
|
Packit |
5c3484 |
return r & 3;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#if DIVEXACT_BY3_METHOD == 1
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* The algorithm here is basically the same as mpn_divexact_1, as described
|
|
Packit |
5c3484 |
in the manual. Namely at each step q = (src[i]-c)*inverse, and new c =
|
|
Packit |
5c3484 |
borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3,
|
|
Packit |
5c3484 |
high(divisor*q) can be determined with two comparisons instead of a
|
|
Packit |
5c3484 |
multiply.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The "c += ..."s add the high limb of 3*l to c. That high limb will be 0,
|
|
Packit |
5c3484 |
1 or 2. Doing two separate "+="s seems to give better code on gcc (as of
|
|
Packit |
5c3484 |
2.95.2 at least).
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
It will be noted that the new c is formed by adding three values each 0
|
|
Packit |
5c3484 |
or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c
|
|
Packit |
5c3484 |
causes a borrow, that leaves a limb value of either 0xFF...FF or
|
|
Packit |
5c3484 |
0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
|
|
Packit |
5c3484 |
0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
|
|
Packit |
5c3484 |
respectively, hence a total of no more than 2.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Alternatives:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This implementation has each multiply on the dependent chain, due to
|
|
Packit |
5c3484 |
"l=s-c". See below for alternative code which avoids that. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mp_limb_t
|
|
Packit |
5c3484 |
mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t l, q, s;
|
|
Packit |
5c3484 |
mp_size_t i;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT (un >= 1);
|
|
Packit |
5c3484 |
ASSERT (c == 0 || c == 1 || c == 2);
|
|
Packit |
5c3484 |
ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
i = 0;
|
|
Packit |
5c3484 |
do
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
s = up[i];
|
|
Packit |
5c3484 |
SUBC_LIMB (c, l, s, c);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
rp[i] = q;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
|
|
Packit |
5c3484 |
c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
while (++i < un);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT (c == 0 || c == 1 || c == 2);
|
|
Packit |
5c3484 |
return c;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#endif
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#if DIVEXACT_BY3_METHOD == 2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
/* The following alternative code re-arranges the quotient calculation from
|
|
Packit |
5c3484 |
(src[i]-c)*inverse to instead
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
q = src[i]*inverse - c*inverse
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
thereby allowing src[i]*inverse to be scheduled back as far as desired,
|
|
Packit |
5c3484 |
making full use of multiplier throughput and leaving just some carry
|
|
Packit |
5c3484 |
handing on the dependent chain.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The carry handling consists of determining the c for the next iteration.
|
|
Packit |
5c3484 |
This is the same as described above, namely look for any borrow from
|
|
Packit |
5c3484 |
src[i]-c, and at the high of 3*q.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
high(3*q) is done with two comparisons as above (in c2 and c3). The
|
|
Packit |
5c3484 |
borrow from src[i]-c is incorporated into those by noting that if there's
|
|
Packit |
5c3484 |
a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
|
|
Packit |
5c3484 |
giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is
|
|
Packit |
5c3484 |
enough to make high(3*q) come out 1 bigger, as required.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
l = -c*inverse is calculated at the same time as c, since for most chips
|
|
Packit |
5c3484 |
it can be more conveniently derived from separate c1/c2/c3 values than
|
|
Packit |
5c3484 |
from a combined c equal to 0, 1 or 2.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
The net effect is that with good pipelining this loop should be able to
|
|
Packit |
5c3484 |
run at perhaps 4 cycles/limb, depending on available execute resources
|
|
Packit |
5c3484 |
etc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Usage:
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
This code is not used by default, since we really can't rely on the
|
|
Packit |
5c3484 |
compiler generating a good software pipeline, nor on such an approach
|
|
Packit |
5c3484 |
even being worthwhile on all CPUs.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
Itanium is one chip where this algorithm helps though, see
|
|
Packit |
5c3484 |
mpn/ia64/diveby3.asm. */
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mp_limb_t
|
|
Packit |
5c3484 |
mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
mp_limb_t s, sm, cl, q, qx, c2, c3;
|
|
Packit |
5c3484 |
mp_size_t i;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASSERT (un >= 1);
|
|
Packit |
5c3484 |
ASSERT (cy == 0 || cy == 1 || cy == 2);
|
|
Packit |
5c3484 |
ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
for (i = 0; i < un; i++)
|
|
Packit |
5c3484 |
{
|
|
Packit |
5c3484 |
s = up[i];
|
|
Packit |
5c3484 |
sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
q = (cl + sm) & GMP_NUMB_MASK;
|
|
Packit |
5c3484 |
rp[i] = q;
|
|
Packit |
5c3484 |
qx = q + (s < cy);
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
|
|
Packit |
5c3484 |
c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cy = c2 + c3;
|
|
Packit |
5c3484 |
cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
return cy;
|
|
Packit |
5c3484 |
}
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
#endif
|