dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder. dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C AMD K8,K9 10 C AMD K10 10 C Intel P4 33 C Intel core2 13 C Intel corei 14.5 C Intel atom 35 C VIA nano ? C The dependent chain in the main loop is C C cycles C sub %rdx, %rax 1 C imul %r9, %rax 4 C mul %r8 5 C ---- C total 10 C C The mov load from src seems to need to be scheduled back before the jz to C achieve this speed, out-of-order execution apparently can't completely hide C the latency otherwise. C C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it C for the first iteration (where there's no cbit). C C The code alignment used (32-byte) for the loop also seems necessary. Without C that the non-PIC case has adc crossing the 0x60 offset, apparently making it C run at 11 cycles instead of 10. 
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)

C mpn_modexact_1_odd  -- entry with arguments src, size, divisor.
C mpn_modexact_1c_odd -- entry with arguments src, size, divisor, carry.
C
C The two entry points share one body.  mpn_modexact_1_odd only clears the
C carry register and then continues at L(ent) as if it had been called with
C carry = 0.
C
C Outline of the shared body:
C   1. Build inverse with d*inverse == 1 mod 2^64.  An 8-bit seed is read
C      from binvert_limb_table and refined by three steps of
C      inv = 2*inv - inv*inv*d, reaching 16, 32 and then 64 bits.
C   2. For every limb:  l = src[i] - cbit - climb
C                       q = l * inverse          (low 64 bits)
C                       climb = high 64 bits of q * d
C      where cbit collects the borrows of the subtractions.
C   3. Return climb + cbit of the last limb in rax.
C
C src[0] is loaded before size is examined, so callers presumably must pass
C size >= 1, and the divisor is presumably required to be odd as the function
C name says -- TODO confirm against the C prototype documentation.
C
C Registers written: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 and flags.
C r10 is written a second time only when the ASSERT block is enabled.

PROLOGUE(mpn_modexact_1_odd)
	FUNC_ENTRY(3)
	mov	$0, R32(%rcx)		C no carry-in for this entry point
	C The jmp is wrapped in IFDOS, so it is presumably emitted for the DOS64
	C ABI only, skipping the FUNC_ENTRY of the second entry point.  When it
	C is not emitted execution continues straight into the code below.
IFDOS(`	jmp	L(ent)		')

PROLOGUE(mpn_modexact_1c_odd)
	FUNC_ENTRY(4)
L(ent):
	C rdi	src
	C rsi	size
	C rdx	divisor
	C rcx	carry

	mov	%rdx, %r8		C d
	shr	R32(%rdx)		C d/2

	LEA(	binvert_limb_table, %r9)	C r9 = address of the seed table

	and	$127, R32(%rdx)		C table index = (d/2) mod 128
	mov	%rcx, %r10		C initial carry

	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits

	C The setup of the limb loop (first load, end pointer, negated count)
	C is interleaved with the refinement of the inverse.
	mov	(%rdi), %rax		C src[0]
	lea	(%rdi,%rsi,8), %r11	C src end
	mov	%r8, %rdi		C d, made available to imull

	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv

	neg	%rsi			C -size

	imul	R32(%rdi), R32(%rdx)	C inv*inv*d

	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
	imul	R32(%rcx), R32(%rcx)	C inv*inv

	imul	R32(%rdi), R32(%rcx)	C inv*inv*d

	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
	xor	R32(%rcx), R32(%rcx)	C initial cbit

	C Last refinement step uses full 64-bit registers, the result stays in
	C r9 for the rest of the function.
	lea	(%rdx,%rdx), %r9	C 2*inv
	imul	%rdx, %rdx		C inv*inv

	imul	%r8, %rdx		C inv*inv*d

	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
	mov	%r10, %rdx		C initial climb

	ASSERT(e,`	C d*inv == 1 mod 2^64
	mov	%r8, %r10
	imul	%r9, %r10
	cmp	$1, %r10')

	C rsi becomes -(size-1).  Zero means there is exactly one limb, which
	C is handled entirely by the tail at L(one).
	inc	%rsi
	jz	L(one)


	C NOTE(review): the notes at the top of this file speak of 32-byte
	C alignment for the loop while this directive asks for 16 -- confirm
	C which one is intended.
	ALIGN(16)
L(top):
	C rax	l = src[i]-cbit
	C rcx	new cbit, 0 or 1
	C rdx	climb, high of last product
	C rsi	counter, limbs, negative
	C rdi
	C r8	divisor
	C r9	inverse
	C r11	src end ptr

	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	C The low half of the product left in rax by mul is not used, it is
	C overwritten by the next limb.  r11 + 8*rsi addresses src[i+1] because
	C rsi counts up from -(size-1) towards zero.
	mov	(%r11,%rsi,8), %rax	C src[i+1]
	sub	%rcx, %rax		C next l = src[i+1] - cbit
	C setc writes only the low byte of rcx.  That is enough since rcx is at
	C most 2 at this point, so its upper bits are already zero.
	setc	R8(%rcx)		C new cbit

	inc	%rsi
	jnz	L(top)


	C Final limb: same step as the loop body but without loading a further
	C limb.  This is also the whole computation when size is 1.
L(one):
	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	lea	(%rcx,%rdx), %rax	C climb+cbit
	FUNC_EXIT()
	ret

EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)