dnl
dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
dnl  result from a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4.0
C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)

C TODO
C  * Optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
C    2nd bundle.  This will allow the bbb bundle to be one cycle earlier and
C    save a cycle.

C INPUT PARAMETERS
C  rp  limb vector that is read with ldf8 and overwritten with st8 (the
C      stores go through a copy of this pointer kept in r10)
C  up  limb vector whose limbs are multiplied by vl (read with ldf8, and
C      again with ld8 through a copy of this pointer kept in r9)
C  n   limb count; the code derives n - 1 and n & 3 from it
C  vl  the single limb multiplier; it is negated on entry
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`vl', `r35')

ASM_START()
C mpn_submul_1 -- entry code and dispatch on n & 3.
C
C Values set up here:
C   r10    copy of rp taken before rp is advanced; the st8 pointer later on
C   r9     copy of up taken before up is advanced; the ld8 pointer later on
C   f8,f7  first limb fetched through rp and through up respectively
C   f6     the negated vl, used as the common multiplier of every xma
C   r8     carry limb, cleared to zero here
C   r2     ar.lc as found on entry, so that it can be put back before return
C   ar.lc  (n - 1) >> 2, consumed by the br.cloop instructions
C   r14    n & 3, which selects one of four feed-in paths
C
C NOTE(review): n == 0 is not treated specially here (n - 1 would wrap in the
C unsigned shift), so n is presumably required to be at least 1 -- confirm
C against the callers.
PROLOGUE(mpn_submul_1)
	.prologue
	.save	ar.lc, r2
	.body

C With HAVE_ABI_32 defined, addp4 is applied to both pointers and zxt4 to n
C before anything else uses them.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mmi
	mov		r10 = rp		C M I
	mov		r9 = up			C M I
	sub		vl = r0, vl		C M I	negate vl
}
{.mmi
	ldf8		f8 = [rp], 8		C M
	ldf8		f7 = [up], 8		C M
	add		r19 = -1, n		C M I	n - 1
	;;
}
C p6 is set when the negated vl is zero, i.e. when vl itself is zero.
{.mmi
	cmp.eq		p6, p0 = 0, vl		C M I
	mov		r8 = 0			C M I	zero cylimb
	mov		r2 = ar.lc		C I0
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	and		r14 = 3, n		C M I
	shr.u		r19 = r19, 2		C I0
	;;
}
C Early exit for vl == 0: .Ldone returns with r8 still zero and without
C storing anything.
{.mmb
	nop		0
	cmp.eq		p10, p0 = 0, r14	C M I
   (p6)	br.spnt		.Ldone			C B	vl == 0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	mov		ar.lc = r19		C I0
}
C Dispatch: n & 3 == 0 goes to .Lb00, == 2 to .Lb10, == 3 to .Lb11.  When
C none of the three branches is taken (n & 3 == 1) execution falls through
C into the code that follows this bundle.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n & 3 == 1.  ar.lc was set to (n - 1) >> 2, so the br.cloop
C right below falls through only when n == 1.
.Lb01:	br.cloop.dptk	.grt1

C n == 1: form the single product f7 * f6 + f8, fetch its low and high
C halves and the up limb, then jump to the last wind-down stage.
	xma.l		f39 = f7, f6, f8
	xma.hu		f43 = f7, f6, f8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj1

C n >= 5: fetch the next four limb pairs through rp and up while the first
C two products are started.
.grt1:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f34 = [up], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f32, f6, f44
	ldf8		f35 = [up], 8
	xma.hu		f40 = f32, f6, f44
C Taken when n >= 9; falls through when n == 5.
	br.cloop.dptk	.grt5
	;;

C n == 5: finish all remaining products without further ldf8 fetches and
C enter the wind-down at .Lcj5.
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	br		.Lcj5

C n >= 9: keep fetching four more limb pairs while the products started so
C far are moved to the integer registers.
.grt5:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
C Enter the main loop at its top when n >= 13; with n == 9 go directly to
C the wind-down code at .Lend.
	br.cloop.dptk	.Loop
	br		.Lend


C Feed-in for n & 3 == 2.  The second limb pair is fetched first; the
C br.cloop falls through only when n == 2 since ar.lc is (n - 1) >> 2.
.Lb10:	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt2

C n == 2: two products, then the last two wind-down stages via .Lcj2.
	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj2

C n >= 6: fetch four more limb pairs while the first two products are formed.
.grt2:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
C Taken when n >= 10; falls through when n == 6.
	br.cloop.dptk	.grt6

C n == 6: finish the remaining products without further ldf8 fetches and
C enter the wind-down at .Lcj6.
	getf.sig	r30 = f42			C hi
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C n >= 10: fetch three more limb pairs, then join the main loop at its
C fourth stage, .LL10.
.grt6:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.LL10


C Feed-in for n & 3 == 3.  The second and third limb pairs are fetched
C first; the br.cloop falls through only when n == 3 since ar.lc is
C (n - 1) >> 2.
.Lb11:	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3

C n == 3: three products, then the last three wind-down stages via .Lcj3.
	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	;;
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	ld8		r22 = [r9], 8
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj3

C n >= 7: fetch four more limb pairs while the first three products are
C formed.
.grt3:	ldf8		f44 = [rp], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f32 = [up], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f45 = [rp], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f33 = [up], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
C Taken when n >= 11; falls through when n == 7.
	br.cloop.dptk	.grt7
	;;

C n == 7: no further ldf8 fetches; enter the wind-down at .Lcj7.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C n >= 11: fetch two more limb pairs, then join the main loop at its third
C stage, .LL11.
.grt7:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.LL11


C Feed-in for n & 3 == 0.  Limb pairs two to four are fetched first; the
C br.cloop falls through only when n == 4 since ar.lc is (n - 1) >> 2.
C NOTE(review): n == 0 would also arrive here with a wrapped loop count, so
C n is presumably never zero -- confirm against the callers.
.Lb00:	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f35 = [up], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

C n == 4: four products, then the last four wind-down stages via .Lcj4.
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	;;
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	ld8		r20 = [r9], 8
	br		.Lcj4

C n >= 8: fetch four more limb pairs while products two to four are formed
C and the first results are moved to the integer registers.
.grt4:	ldf8		f44 = [rp], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f32 = [up], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36			C lo
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
C Taken when n >= 12; falls through when n == 8.
	br.cloop.dptk	.grt8
	;;

C n == 8: no further ldf8 fetches; enter the wind-down at .Lcj8.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C n >= 12: fetch one more limb pair, then join the main loop at its second
C stage, .LL00.
.grt8:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.LL00

	ALIGN(32)
C Main loop: four stages per iteration, one result limb stored per stage.
C The labels .LL00, .LL11 and .LL10 are the entry points of stages two to
C four and are branched to by the feed-in code.
C
C Register lanes (product halves, their integer copies, and the matching
C limb loaded through r9):
C   xma.l / xma.hu    lo     hi     limb via r9
C   f36   / f40       r24    r28    r21
C   f37   / f41       r25    r29    r22
C   f38   / f42       r26    r30    r23
C   f39   / f43       r27    r31    r20
C
C Each stage does, for its lane:
C   cmp.ltu p6 = lo, r8     borrow when lo is below the incoming carry
C   sub     r14 = lo, r8    result limb, stored through r10
C   sub     r8 = limb, hi   next carry before the borrow adjustment
C   (p6) add r8 = 1, r8     borrow adjustment
C while the ldf8, xma, getf.sig and ld8 instructions refill the lanes for
C later stages.
C
C NOTE(review): f6 holds the negated vl, so each xma pair forms
C u * (2^64 - vl) + r for limb u from up and limb r from rp; the low half is
C r - u*vl mod 2^64 and the high half is u minus the borrow limb, which is
C why the carry is computed as limb minus hi.  This reading relies on
C vl != 0, the case that exits early at .Ldone -- worth confirming.
.Loop:
{.mmi
	ldf8		f44 = [rp], 8
	cmp.ltu		p6, p0 = r27, r8	C lo cmp
	sub		r14 = r27, r8		C lo sub
}
{.mmi
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	sub		r8 = r20, r31		C hi sub
	;;				C 01
}
{.mmf
	getf.sig	r27 = f39			C lo
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
}
{.mfi
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;				C 02
}
C Stage two (lane r24 / r28 / r21).
{.mmi
.LL00:	ldf8		f45 = [rp], 8
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
}
{.mmi
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	sub		r8 = r21, r28
	;;				C 03
}
{.mmf
	getf.sig	r24 = f36			C lo
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
}
{.mfi
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;				C 04
}
C Stage three (lane r25 / r29 / r22).
{.mmi
.LL11:	ldf8		f46 = [rp], 8
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
}
{.mmi
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	sub		r8 = r22, r29
	;;				C 05
}
{.mmf
	getf.sig	r25 = f37			C lo
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
}
{.mfi
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;				C 06
}
C Stage four (lane r26 / r30 / r23).
{.mmi
.LL10:	ldf8		f47 = [rp], 8
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
}
{.mmi
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	sub		r8 = r23, r30
	;;				C 07
}
{.mmf
	getf.sig	r26 = f38			C lo
	st8		[r10] = r14, 8
	xma.l		f36 = f32, f6, f44
}
{.mfi
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
   (p6)	add		r8 = 1, r8
}
C Repeat while ar.lc is nonzero; otherwise fall into the wind-down code.
	br.cloop.dptk	.Loop
	;;

C Wind-down.  Execution falls through the stages .Lend, .Lcj8, .Lcj7, ...,
C .Lcj1 in order, one stored limb per stage.  .LcjK is the entry point used
C by the feed-in code when K limbs are still to be stored; .Lend is reached
C from the main loop and from the n == 9 feed-in path.
C
C Every stage repeats the same carry step on its lane: cmp.ltu records a
C borrow in p6, sub forms the limb stored through r10, the next r8 is the
C limb loaded through r9 minus the high product half, and r8 is bumped by
C one when p6 is set.  The xma, getf.sig and ld8 instructions thin out
C towards the end as nothing further needs to be fetched.
.Lend:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;
.Lcj8:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	getf.sig	r24 = f36
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;
.Lcj7:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	getf.sig	r28 = f40
	sub		r8 = r22, r29
	;;
	getf.sig	r25 = f37
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;
C From here on no new products are started.
.Lcj6:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	getf.sig	r29 = f41
	sub		r8 = r23, r30
	;;
	getf.sig	r26 = f38
	st8		[r10] = r14, 8
	ld8		r23 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj5:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	ld8		r20 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
C The last high half is fetched here; the remaining stages only do the
C carry step and the store.
.Lcj4:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj3:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	sub		r8 = r22, r29
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj2:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	sub		r8 = r23, r30
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
C Final limb: store it, put back the ar.lc value saved in r2 on entry, and
C return with the final carry limb in r8 (presumably the return value under
C the IA-64 calling convention -- confirm).
.Lcj1:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	sub		r8 = r20, r31
	;;
	st8		[r10] = r14, 8
	mov		ar.lc = r2
   (p6)	add		r8 = 1, r8
	br.ret.sptk.many b0
C Early exit taken from the entry code when vl == 0: nothing is stored and
C r8 still holds the zero written at entry.
.Ldone:	mov		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()