Tree - source-git/gmp - CentOS Git server

source-git / gmp

Blame mpn/generic/diveby3.c

Blob History Raw

Packit	5c3484	`/* mpn_divexact_by3c -- mpn exact division by 3.`
Packit	5c3484
Packit	5c3484	`Copyright 2000-2003, 2008 Free Software Foundation, Inc.`
Packit	5c3484
Packit	5c3484	`This file is part of the GNU MP Library.`
Packit	5c3484
Packit	5c3484	`The GNU MP Library is free software; you can redistribute it and/or modify`
Packit	5c3484	`it under the terms of either:`
Packit	5c3484
Packit	5c3484	`* the GNU Lesser General Public License as published by the Free`
Packit	5c3484	`Software Foundation; either version 3 of the License, or (at your`
Packit	5c3484	`option) any later version.`
Packit	5c3484
Packit	5c3484	`or`
Packit	5c3484
Packit	5c3484	`* the GNU General Public License as published by the Free Software`
Packit	5c3484	`Foundation; either version 2 of the License, or (at your option) any`
Packit	5c3484	`later version.`
Packit	5c3484
Packit	5c3484	`or both in parallel, as here.`
Packit	5c3484
Packit	5c3484	`The GNU MP Library is distributed in the hope that it will be useful, but`
Packit	5c3484	`WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
Packit	5c3484	`or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
Packit	5c3484	`for more details.`
Packit	5c3484
Packit	5c3484	`You should have received copies of the GNU General Public License and the`
Packit	5c3484	`GNU Lesser General Public License along with the GNU MP Library. If not,`
Packit	5c3484	`see https://www.gnu.org/licenses/. */`
Packit	5c3484
Packit	5c3484	`#include "gmp.h"`
Packit	5c3484	`#include "gmp-impl.h"`
Packit	5c3484
Packit	5c3484	`#if DIVEXACT_BY3_METHOD == 0`
Packit	5c3484
Packit	5c3484	`mp_limb_t`
Packit	5c3484	`mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)`
Packit	5c3484	`{`
Packit	5c3484	`mp_limb_t r;`
Packit	5c3484	`r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);`
Packit	5c3484
Packit	5c3484	`/* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.`
Packit	5c3484	`We want to return C. We compute the remainder mod 4 and notice that the`
Packit	5c3484	`inverse of (2^(2k)-1)/3 mod 4 is 1. */`
Packit	5c3484	`return r & 3;`
Packit	5c3484	`}`
Packit	5c3484
Packit	5c3484	`#endif`
Packit	5c3484
Packit	5c3484	`#if DIVEXACT_BY3_METHOD == 1`
Packit	5c3484
Packit	5c3484	`/* The algorithm here is basically the same as mpn_divexact_1, as described`
Packit	5c3484	`in the manual. Namely at each step q = (src[i]-c)*inverse, and new c =`
Packit	5c3484	`borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3,`
Packit	5c3484	`high(divisor*q) can be determined with two comparisons instead of a`
Packit	5c3484	`multiply.`
Packit	5c3484
Packit	5c3484	`The "c += ..."s add the high limb of 3*l to c. That high limb will be 0,`
Packit	5c3484	`1 or 2. Doing two separate "+="s seems to give better code on gcc (as of`
Packit	5c3484	`2.95.2 at least).`
Packit	5c3484
Packit	5c3484	`It will be noted that the new c is formed by adding three values each 0`
Packit	5c3484	`or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c`
Packit	5c3484	`causes a borrow, that leaves a limb value of either 0xFF...FF or`
Packit	5c3484	`0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or`
Packit	5c3484	`0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1`
Packit	5c3484	`respectively, hence a total of no more than 2.`
Packit	5c3484
Packit	5c3484	`Alternatives:`
Packit	5c3484
Packit	5c3484	`This implementation has each multiply on the dependent chain, due to`
Packit	5c3484	`"l=s-c". See below for alternative code which avoids that. */`
Packit	5c3484
Packit	5c3484	`mp_limb_t`
Packit	5c3484	`mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)`
Packit	5c3484	`{`
Packit	5c3484	`mp_limb_t l, q, s;`
Packit	5c3484	`mp_size_t i;`
Packit	5c3484
Packit	5c3484	`ASSERT (un >= 1);`
Packit	5c3484	`ASSERT (c == 0 \|\| c == 1 \|\| c == 2);`
Packit	5c3484	`ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));`
Packit	5c3484
Packit	5c3484	`i = 0;`
Packit	5c3484	`do`
Packit	5c3484	`{`
Packit	5c3484	`s = up[i];`
Packit	5c3484	`SUBC_LIMB (c, l, s, c);`
Packit	5c3484
Packit	5c3484	`q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;`
Packit	5c3484	`rp[i] = q;`
Packit	5c3484
Packit	5c3484	`c += (q >= GMP_NUMB_CEIL_MAX_DIV3);`
Packit	5c3484	`c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);`
Packit	5c3484	`}`
Packit	5c3484	`while (++i < un);`
Packit	5c3484
Packit	5c3484	`ASSERT (c == 0 \|\| c == 1 \|\| c == 2);`
Packit	5c3484	`return c;`
Packit	5c3484	`}`
Packit	5c3484
Packit	5c3484
Packit	5c3484	`#endif`
Packit	5c3484
Packit	5c3484	`#if DIVEXACT_BY3_METHOD == 2`
Packit	5c3484
Packit	5c3484	`/* The following alternative code re-arranges the quotient calculation from`
Packit	5c3484	`(src[i]-c)*inverse to instead`
Packit	5c3484
Packit	5c3484	`q = src[i]inverse - cinverse`
Packit	5c3484
Packit	5c3484	`thereby allowing src[i]*inverse to be scheduled back as far as desired,`
Packit	5c3484	`making full use of multiplier throughput and leaving just some carry`
Packit	5c3484	`handing on the dependent chain.`
Packit	5c3484
Packit	5c3484	`The carry handling consists of determining the c for the next iteration.`
Packit	5c3484	`This is the same as described above, namely look for any borrow from`
Packit	5c3484	`src[i]-c, and at the high of 3*q.`
Packit	5c3484
Packit	5c3484	`high(3*q) is done with two comparisons as above (in c2 and c3). The`
Packit	5c3484	`borrow from src[i]-c is incorporated into those by noting that if there's`
Packit	5c3484	`a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn`
Packit	5c3484	`giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is`
Packit	5c3484	`enough to make high(3*q) come out 1 bigger, as required.`
Packit	5c3484
Packit	5c3484	`l = -c*inverse is calculated at the same time as c, since for most chips`
Packit	5c3484	`it can be more conveniently derived from separate c1/c2/c3 values than`
Packit	5c3484	`from a combined c equal to 0, 1 or 2.`
Packit	5c3484
Packit	5c3484	`The net effect is that with good pipelining this loop should be able to`
Packit	5c3484	`run at perhaps 4 cycles/limb, depending on available execute resources`
Packit	5c3484	`etc.`
Packit	5c3484
Packit	5c3484	`Usage:`
Packit	5c3484
Packit	5c3484	`This code is not used by default, since we really can't rely on the`
Packit	5c3484	`compiler generating a good software pipeline, nor on such an approach`
Packit	5c3484	`even being worthwhile on all CPUs.`
Packit	5c3484
Packit	5c3484	`Itanium is one chip where this algorithm helps though, see`
Packit	5c3484	`mpn/ia64/diveby3.asm. */`
Packit	5c3484
Packit	5c3484	`mp_limb_t`
Packit	5c3484	`mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)`
Packit	5c3484	`{`
Packit	5c3484	`mp_limb_t s, sm, cl, q, qx, c2, c3;`
Packit	5c3484	`mp_size_t i;`
Packit	5c3484
Packit	5c3484	`ASSERT (un >= 1);`
Packit	5c3484	`ASSERT (cy == 0 \|\| cy == 1 \|\| cy == 2);`
Packit	5c3484	`ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));`
Packit	5c3484
Packit	5c3484	`cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;`
Packit	5c3484
Packit	5c3484	`for (i = 0; i < un; i++)`
Packit	5c3484	`{`
Packit	5c3484	`s = up[i];`
Packit	5c3484	`sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;`
Packit	5c3484
Packit	5c3484	`q = (cl + sm) & GMP_NUMB_MASK;`
Packit	5c3484	`rp[i] = q;`
Packit	5c3484	`qx = q + (s < cy);`
Packit	5c3484
Packit	5c3484	`c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;`
Packit	5c3484	`c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;`
Packit	5c3484
Packit	5c3484	`cy = c2 + c3;`
Packit	5c3484	`cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);`
Packit	5c3484	`}`
Packit	5c3484
Packit	5c3484	`return cy;`
Packit	5c3484	`}`
Packit	5c3484
Packit	5c3484	`#endif`

source-git / gmp

Source Code

Blame mpn/generic/diveby3.c