dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the dnl result from a second limb vector. dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2000-2004 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C Itanium: 4.0 C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) C TODO C * Optimize feed-in and wind-down code, both for speed and code size. C * Handle low limb input and results specially, using a common stf8 in the C epilogue. C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and C save a cycle. 
C INPUT PARAMETERS
C   rp  (r32)  limb vector that is read and then overwritten with the result
C   up  (r33)  limb vector that is multiplied by vl
C   n   (r34)  number of limbs to process
C              NOTE(review): rp[0] and up[0] are loaded unconditionally, so
C              presumably n >= 1 is required -- confirm against the callers.
C   vl  (r35)  the single multiplier limb
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

C METHOD (reviewer derivation from the code below -- please verify)
C   f6 holds -vl mod 2^64.  For every limb, xma.l and xma.hu form the two
C   halves of the 128-bit value
C       T = up[i] * (2^64 - vl) + rp[i]
C         = up[i] * 2^64 - (up[i] * vl - rp[i])
C   lo(T) is (rp[i] - lo(up[i]*vl)) mod 2^64, and hi(T) is
C   up[i] - hi(up[i]*vl) - b, where b is the borrow out of the low half.
C   The "hi sub" instructions compute up[i] - hi(T) = hi(up[i]*vl) + b, which
C   is the amount to subtract from the next limb position.  The "lo sub"
C   instructions subtract the incoming borrow limb r8 from lo(T); if that
C   wraps (p6 set by cmp.ltu), one more is added to the outgoing r8.
C   NOTE(review): the identity needs vl != 0 (with vl == 0, f6 would be 0 and
C   up[i] - hi(T) would be up[i]), which is presumably why vl == 0 branches
C   straight to .Ldone.
C
C REGISTER ROLES
C   r8        running borrow limb ("cylimb"), starts at 0
C             NOTE(review): presumably the function return value -- confirm
C             against the IA-64 calling convention.
C   r9        copy of up, used by the integer ld8 loads of up[i]
C   r10       copy of rp, used as the st8 store pointer
C   rp, up    advanced by the post-incrementing ldf8 loads
C   r2        saved ar.lc, restored before each br.ret
C   r14       n mod 4 during dispatch, afterwards the limb about to be stored
C   r19       (n - 1) >> 2, moved into ar.lc as the br.cloop count
C   r20-r23   up[i] as integers (minuends of the "hi sub")
C   r24-r27   low halves, fetched from f36-f39
C   r28-r31   high halves, fetched from f40-f43
C   f7, f8    first limb of up and of rp
C   f32-f35   later limbs of up; f44-f47 later limbs of rp

ASM_START()
PROLOGUE(mpn_submul_1)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')

C Set up pointer copies, negate vl, load the first limb of each vector, and
C compute the loop count and n mod 4.
{.mmi
	mov		r10 = rp		C M I	store pointer
	mov		r9 = up			C M I	pointer for integer loads of up[i]
	sub		vl = r0, vl		C M I	negate vl
}
{.mmi
	ldf8		f8 = [rp], 8		C M	rp[0]
	ldf8		f7 = [up], 8		C M	up[0]
	add		r19 = -1, n		C M I	n - 1
	;;
}
{.mmi
	cmp.eq		p6, p0 = 0, vl		C M I	p6 = (negated vl is zero)
	mov		r8 = 0			C M I	zero cylimb
	mov		r2 = ar.lc		C I0	save ar.lc
}
{.mmi
	setf.sig	f6 = vl			C M2 M3	f6 = negated vl
	and		r14 = 3, n		C M I	n mod 4
	shr.u		r19 = r19, 2		C I0	(n - 1) >> 2
	;;
}
{.mmb
	nop		0
	cmp.eq		p10, p0 = 0, r14	C M I	n mod 4 == 0
   (p6)	br.spnt		.Ldone			C B	vl == 0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I	n mod 4 == 2
	cmp.eq		p12, p0 = 3, r14	C M I	n mod 4 == 3
	mov		ar.lc = r19		C I0	loop count for br.cloop
}
C Dispatch on n mod 4; the remaining case n mod 4 == 1 falls through to .Lb01.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 == 1.  The br.cloop is not taken when ar.lc is 0,
C which here means n == 1.
.Lb01:	br.cloop.dptk	.grt1

	xma.l		f39 = f7, f6, f8
	xma.hu		f43 = f7, f6, f8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj1

C n mod 4 == 1 and n >= 5.
.grt1:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f34 = [up], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f32, f6, f44
	ldf8		f35 = [up], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt5
	;;
C Fall through when n == 5: finish the products and join the wind-down.
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	br		.Lcj5

C n mod 4 == 1 and n >= 9: fill the pipeline, then enter the loop at its top
C (or go directly to .Lend when the loop count is already exhausted).
.grt5:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.Loop
	br		.Lend

C Feed-in for n mod 4 == 2.  Falls through past the br.cloop when n == 2.
.Lb10:	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj2

C n mod 4 == 2 and n >= 6.
.grt2:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt6

C Fall through when n == 6.
	getf.sig	r30 = f42			C hi
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C n mod 4 == 2 and n >= 10: fill the pipeline and enter the loop at .LL10.
.grt6:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.LL10

C Feed-in for n mod 4 == 3.  Falls through past the br.cloop when n == 3.
.Lb11:	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	;;
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	ld8		r22 = [r9], 8
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj3

C n mod 4 == 3 and n >= 7.
.grt3:	ldf8		f44 = [rp], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f32 = [up], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f45 = [rp], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f33 = [up], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt7
	;;
C Fall through when n == 7.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C n mod 4 == 3 and n >= 11: fill the pipeline and enter the loop at .LL11.
.grt7:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.LL11

C Feed-in for n mod 4 == 0.  Falls through past the br.cloop when n == 4.
.Lb00:	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f35 = [up], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	;;
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	ld8		r20 = [r9], 8
	br		.Lcj4

C n mod 4 == 0 and n >= 8.
.grt4:	ldf8		f44 = [rp], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f32 = [up], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36			C lo
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt8
	;;
C Fall through when n == 8.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C n mod 4 == 0 and n >= 12: fill the pipeline and enter the loop at .LL00.
.grt8:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.LL00

C Main loop, four limbs per iteration.  Each quarter stores one finished limb,
C derives the next borrow limb, and issues loads and multiplies for limbs
C further ahead.  .LL00, .LL11 and .LL10 are the mid-loop entry points used by
C the feed-in paths above.
	ALIGN(32)
.Loop:
{.mmi
	ldf8		f44 = [rp], 8
	cmp.ltu		p6, p0 = r27, r8	C lo cmp
	sub		r14 = r27, r8		C lo sub
}
{.mmi
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	sub		r8 = r20, r31		C hi sub
	;;				C 01
}
{.mmf
	getf.sig	r27 = f39			C lo
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
}
{.mfi
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8		C propagate borrow from lo sub
	;;				C 02
}
{.mmi
.LL00:	ldf8		f45 = [rp], 8
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
}
{.mmi
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	sub		r8 = r21, r28
	;;				C 03
}
{.mmf
	getf.sig	r24 = f36			C lo
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
}
{.mfi
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;				C 04
}
{.mmi
.LL11:	ldf8		f46 = [rp], 8
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
}
{.mmi
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	sub		r8 = r22, r29
	;;				C 05
}
{.mmf
	getf.sig	r25 = f37			C lo
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
}
{.mfi
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;				C 06
}
{.mmi
.LL10:	ldf8		f47 = [rp], 8
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
}
{.mmi
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	sub		r8 = r23, r30
	;;				C 07
}
{.mmf
	getf.sig	r26 = f38			C lo
	st8		[r10] = r14, 8
	xma.l		f36 = f32, f6, f44
}
{.mfi
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
   (p6)	add		r8 = 1, r8
}
	br.cloop.dptk	.Loop
	;;

C Wind-down.  Same per-limb sequence as the loop but without further ldf8
C loads; the multiplies and integer loads taper off stage by stage.  Each
C .LcjK label is entered with K limbs still to be stored (.Lend with nine).
.Lend:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;
.Lcj8:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	getf.sig	r24 = f36
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;
.Lcj7:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	getf.sig	r28 = f40
	sub		r8 = r22, r29
	;;
	getf.sig	r25 = f37
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;
.Lcj6:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	getf.sig	r29 = f41
	sub		r8 = r23, r30
	;;
	getf.sig	r26 = f38
	st8		[r10] = r14, 8
	ld8		r23 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj5:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	ld8		r20 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj4:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj3:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	sub		r8 = r22, r29
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj2:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	sub		r8 = r23, r30
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
C Last limb: store it, restore ar.lc, and leave the final borrow limb in r8.
.Lcj1:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	sub		r8 = r20, r31
	;;
	st8		[r10] = r14, 8
	mov		ar.lc = r2
   (p6)	add		r8 = 1, r8
	br.ret.sptk.many b0
C Early exit taken when vl == 0: nothing is stored and r8 is still 0.
.Ldone:	mov		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()