dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
dnl  add the result to a (n+1)-limb number.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.65
C Itanium 2:  1.625

C TODO
C  * Clean up variable names, and try to decrease the number of distinct
C    registers used.
C  * Clean up feed-in code to not require zeroing several registers.
C  * Make sure we don't depend on uninitialised predicate registers.
C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
C    wind-down code.
C  * Ultimately rewrite.  The problem with this code is that it first uses a
C    loaded u value in one xma pair, then leaves it live over several unrelated
C    xma pairs, before it uses it again.  It should actually be quite possible
C    to just swap some aligned xma pairs around.  But we should then schedule
C    u loads further from the first use.

C INPUT PARAMETERS
C   rp  limbs that are read, accumulated into and stored back
C   up  the n-limb multiplicand
C   n   limb count of up
C   vp  the 2-limb multiplier
define(`rp',`r32')
define(`up',`r33')
define(`n',`r34')
define(`vp',`r35')

C Read pointer into the rp operand.  The entry code sets it to rp+16 and every
C r_k load post-increments it, so it runs ahead of rp, which is used for the
C stores.
define(`srp',`r3')

C The two multiplier limbs, loaded from vp[0] and vp[1].
define(`v0',`f6')
define(`v1',`f7')

C s0 is the finished limb that is stored through rp; acc0 is the partial column
C sum that s0 is built from.  The entry code also uses r14 under its raw name
C to hold n mod 4 before s0 becomes live.
define(`s0',`r14')
define(`acc0',`r15')

C Integer copies (getfsig) of fp0b_k, the low half of u*v0 + r.
define(`pr0_0',`r16') define(`pr0_1',`r17')
define(`pr0_2',`r18') define(`pr0_3',`r19')

C Integer copies (getfsig) of fp1b_k, the low half of u*v1 + fp1a_k.
define(`pr1_0',`r20') define(`pr1_1',`r21')
define(`pr1_2',`r22') define(`pr1_3',`r23')

C Integer copies (getfsig) of fp2a_k, the high half of u*v1 + fp1a_k.
define(`acc1_0',`r24') define(`acc1_1',`r25')
define(`acc1_2',`r26') define(`acc1_3',`r27')

C r28-r31 carry no symbolic name.  r31 is used directly as scratch in the
C n = 2 path at L(b10).
dnl define(`',`r28')
dnl define(`',`r29')
dnl define(`',`r30')
dnl define(`',`r31')

C xma.l results: low 64 bits of u*v0 + r.
define(`fp0b_0',`f8') define(`fp0b_1',`f9')
define(`fp0b_2',`f10') define(`fp0b_3',`f11')

C xma.hu results: high 64 bits of u*v0 + r.  Fed as the addend into the
C u*v1 products.
define(`fp1a_0',`f12') define(`fp1a_1',`f13')
define(`fp1a_2',`f14') define(`fp1a_3',`f15')

C xma.l results: low 64 bits of u*v1 + fp1a_k.  For the very first limb the
C same register first holds the high half of ux*v0 + rx and is then used as
C both addend and destination.
define(`fp1b_0',`f32') define(`fp1b_1',`f33')
define(`fp1b_2',`f34') define(`fp1b_3',`f35')

C xma.hu results: high 64 bits of u*v1 + fp1a_k.
define(`fp2a_0',`f36') define(`fp2a_1',`f37')
define(`fp2a_2',`f38') define(`fp2a_3',`f39')

C Limbs of the rp operand loaded through srp, four in rotation.
define(`r_0',`f40') define(`r_1',`f41')
define(`r_2',`f42') define(`r_3',`f43')

C Limbs of the up operand, four in rotation.
define(`u_0',`f44') define(`u_1',`f45')
define(`u_2',`f46') define(`u_3',`f47')

C The first two limbs of each operand, loaded by the entry code:
C rx = rp[0], ux = up[0], ry = rp[1], uy = up[1].
define(`rx',`f48')
define(`ux',`f49')
define(`ry',`f50')
define(`uy',`f51')

ASM_START()

C mpn_addmul_2s -- second entry point.  It runs the same entry sequence as
C mpn_addmul_2 and then continues in the code that follows L(b00), L(b01),
C L(b10) or L(b11) inside mpn_addmul_2.  The only difference is that the xma
C pair forming ux*v1 (marked "suppressed for addmul_2s" in the shared code) is
C switched off: each L(x..) stub below clears the predicate guarding that pair
C and zeroes the fp2a register the pair would have written.
C
C Entry sequence:
C   group 1: ux = up[0], v0 = vp[0], rx = rp[0]; r2 = ar.lc (saved);
C            r14 = n mod 4; n = n - 2
C   group 2: uy = up[1], v1 = vp[1], ry = rp[1], the post-decrement putting rp
C            back on rp[0]; n = n >> 2 (unsigned);
C            p14 = (r14 == 1), p11 = (r14 == 2)
C   group 3: srp = rp + 16; p15 = (r14 == 3); ar.lc = n;
C            branch to L(x01), L(x10) or L(x11), else fall into L(x00)
C With HAVE_ABI_32 the three pointers are first widened with addp4 and n is
C zero-extended from 32 bits.
PROLOGUE(mpn_addmul_2s)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',`
 {.mmi;		addp4	rp = 0, rp		C			M I
		addp4	up = 0, up		C			M I
		addp4	vp = 0, vp		C			M I
}{.mmi;		nop	1
		nop	1
		zxt4	n = n			C			I
	;;
}')

 {.mmi;		ldf8	ux = [up], 8		C			M
		ldf8	v0 = [vp], 8		C			M
		mov	r2 = ar.lc		C			I0
}{.mmi;		ldf8	rx = [rp], 8		C			M
		and	r14 = 3, n		C			M I
		add	n = -2, n		C			M I
	;;
}{.mmi;		ldf8	uy = [up], 8		C			M
		ldf8	v1 = [vp]		C			M
		shr.u	n = n, 2		C			I0
}{.mmi;		ldf8	ry = [rp], -8		C			M
		cmp.eq	p14, p0 = 1, r14	C			M I
		cmp.eq	p11, p0 = 2, r14	C			M I
	;;
}{.mmi;		add	srp = 16, rp		C			M I
		cmp.eq	p15, p0 = 3, r14	C			M I
		mov	ar.lc = n		C			I0
}{.bbb;	(p14)	br.dptk	L(x01)			C			B
	(p11)	br.dptk	L(x10)			C			B
	(p15)	br.dptk	L(x11)			C			B
	;;
}
C Each stub clears the predicate that L(bNN) uses on its first ux*v1 xma pair
C (p6, p14, p11, p15 respectively) and zeroes the matching fp2a register.
L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
		mov	fp2a_3 = f0
		br	L(b00)
L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_2 = f0
		br	L(b01)
L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_1 = f0
		br	L(b10)
L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_0 = f0
		br	L(b11)

EPILOGUE()

C mpn_addmul_2 -- entry and dispatch on n mod 4.
C
C   group 1: ux = up[0], v0 = vp[0], rx = rp[0]; r2 = ar.lc (saved, and
C            restored before every return); r14 = n mod 4; n = n - 2
C   group 2: uy = up[1], v1 = vp[1], ry = rp[1], the post-decrement putting rp
C            back on rp[0]; n = n >> 2 (unsigned);
C            p14 = (r14 == 1), p11 = (r14 == 2)
C   group 3: srp = rp + 16, the address of rp[2]; p15 = (r14 == 3) and
C            p6 = (r14 != 3); ar.lc = n, which is floor((n-2)/4) in terms of
C            the incoming n
C The feed-in code guards its first ux*v1 xma pair with p14, p11, p15 or p6
C depending on the residue, so the predicate that selects a path is also the
C one that enables that pair.  p6 is written as the second compare target so
C that it is set when control falls through to L(b00).
C
C Because n - 2 is shifted as an unsigned value, n < 2 would give a huge loop
C count; the code relies on n >= 2.
C With HAVE_ABI_32 the three pointers are first widened with addp4 and n is
C zero-extended from 32 bits.
PROLOGUE(mpn_addmul_2)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',`
 {.mmi;		addp4	rp = 0, rp		C			M I
		addp4	up = 0, up		C			M I
		addp4	vp = 0, vp		C			M I
}{.mmi;		nop	1
		nop	1
		zxt4	n = n			C			I
	;;
}')

 {.mmi;		ldf8	ux = [up], 8		C			M
		ldf8	v0 = [vp], 8		C			M
		mov	r2 = ar.lc		C			I0
}{.mmi;		ldf8	rx = [rp], 8		C			M
		and	r14 = 3, n		C			M I
		add	n = -2, n		C			M I
	;;
}{.mmi;		ldf8	uy = [up], 8		C			M
		ldf8	v1 = [vp]		C			M
		shr.u	n = n, 2		C			I0
}{.mmi;		ldf8	ry = [rp], -8		C			M
		cmp.eq	p14, p0 = 1, r14	C			M I
		cmp.eq	p11, p0 = 2, r14	C			M I
	;;
}{.mmi;		add	srp = 16, rp		C			M I
		cmp.eq	p15, p6 = 3, r14	C			M I
		mov	ar.lc = n		C			I0
}{.bbb;	(p14)	br.dptk	L(b01)			C			B
	(p11)	br.dptk	L(b10)			C			B
	(p15)	br.dptk	L(b11)			C			B
	;;
}
	ALIGN(32)
C Feed-in for n mod 4 = 0.
C Zeroing acc1_2, pr1_2 and pr0_3 and clearing p8 and p12 (which sets p9 and
C p13) makes the first pass through the shared carry chain at L(00) or L(cj4)
C add nothing but the freshly computed products.
C br.cloop falls through when ar.lc is zero, i.e. n = 4; that tail computes the
C remaining products without further loads and joins the wind-down code at
C L(cj4).  Otherwise L(gt4) does the same work interleaved with the loads for
C the next limbs and joins the main loop at L(00).
L(b00):
 {.mmi;		ldf8	r_1 = [srp], 8
		ldf8	u_1 = [up], 8
		mov	acc1_2 = 0
}{.mmi;		mov	pr1_2 = 0
		mov	pr0_3 = 0
		cmp.ne	p8, p9 = r0, r0
	;;
}{.mfi;		ldf8	r_2 = [srp], 8
		xma.l	fp0b_3 = ux, v0, rx
		cmp.ne	p12, p13 = r0, r0
}{.mfb;		ldf8	u_2 = [up], 8
		xma.hu	fp1b_3 = ux, v0, rx
		br.cloop.dptk	L(gt4)
}
		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		getfsig	pr1_3 = fp1b_3
		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(cj4)

L(gt4):		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		ldf8	r_3 = [srp], 8
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
		ldf8	u_3 = [up], 8
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		ldf8	r_0 = [srp], 8
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
		xma.l	fp0b_2 = u_2, v0, r_2
	;;
		getfsig	acc1_3 = fp2a_3
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(00)


	ALIGN(32)
C Feed-in for n mod 4 = 1.
C Zeroing acc1_1, pr1_1 and pr0_2 and clearing p6 and p10 (which sets p7 and
C p11) makes the first pass through the shared carry chain at L(01) or L(cj5)
C add nothing but the freshly computed products.
C br.cloop falls through when ar.lc is zero, i.e. n = 5; that tail joins the
C wind-down code at L(cj5).  Otherwise L(gt5) continues with further loads and
C joins the main loop at L(01).
L(b01):
 {.mmi;		ldf8	r_0 = [srp], 8		C M
		ldf8	u_0 = [up], 8		C M
		mov	acc1_1 = 0		C M I
}{.mmi;		mov	pr1_1 = 0		C M I
		mov	pr0_2 = 0		C M I
		cmp.ne	p6, p7 = r0, r0		C M I
	;;
}{.mfi;		ldf8	r_1 = [srp], 8		C M
		xma.l	fp0b_2 = ux, v0, rx	C F
		cmp.ne	p10, p11 = r0, r0	C M I
}{.mfi;		ldf8	u_1 = [up], 8		C M
		xma.hu	fp1b_2 = ux, v0, rx	C F
		nop	1
	;;
}		xma.l	fp0b_3 = uy, v0, ry	C F
		xma.hu	fp1a_3 = uy, v0, ry	C F
	;;
 {.mmf;		getfsig	acc0 = fp0b_2		C M
		ldf8	r_2 = [srp], 8		C M
	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
}{.mfb;		ldf8	u_2 = [up], 8		C M
	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
		br.cloop.dptk	L(gt5)
}
		xma.l	fp0b_0 = u_0, v0, r_0	C F
		xma.hu	fp1a_0 = u_0, v0, r_0	C F
	;;
		getfsig	pr0_3 = fp0b_3		C M
		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
	;;
		getfsig	pr1_2 = fp1b_2		C M
		getfsig	acc1_2 = fp2a_2		C M
		xma.l	fp0b_1 = u_1, v0, r_1	C F
		xma.hu	fp1a_1 = u_1, v0, r_1	C F
		br	L(cj5)

L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
		xma.hu	fp1a_0 = u_0, v0, r_0
	;;
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = uy, v1, fp1a_3
		xma.hu	fp2a_3 = uy, v1, fp1a_3
	;;
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
		xma.l	fp0b_1 = u_1, v0, r_1
	;;
		getfsig	acc1_2 = fp2a_2
		xma.hu	fp1a_1 = u_1, v0, r_1
		br	L(01)


	ALIGN(32)
C Feed-in for n mod 4 = 2.
C br.cloop falls through when ar.lc is zero, i.e. n = 2.  That case is handled
C completely here and returns without touching the loop or wind-down code:
C   limb 0  low half of ux*v0 + rx, stored straight from the FP register
C   limb 1  s0 = pr1_1 + acc0, with p8 = carry out (s0 < pr1_1)
C   limb 2  acc0 = pr1_2 + acc1_1 + carry
C   r8      high half of uy*v1 + fp1a_2, plus one if limb 2 carried out
C r31 = -1 - acc1_1 is the largest pr1_2 that can be added to acc1_1 without
C wrapping, so the carry out of limb 2 is r31 < pr1_2 when no carry came in
C and r31 <= pr1_2 when one did.
L(b10):		br.cloop.dptk	L(gt2)
		xma.l	fp0b_1 = ux, v0, rx
		xma.hu	fp1b_1 = ux, v0, rx
	;;
		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		stf8	[rp] = fp0b_1, 8
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	;;
		getfsig	acc0 = fp0b_2
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		getfsig	pr1_1 = fp1b_1
		getfsig	acc1_1 = fp2a_1
		mov	ar.lc = r2
		getfsig	pr1_2 = fp1b_2
		getfsig	r8 = fp2a_2
	;;
		add	s0 = pr1_1, acc0
	;;
		st8	[rp] = s0, 8
		cmp.ltu	p8, p9 = s0, pr1_1
		sub	r31 = -1, acc1_1
	;;
	.pred.rel "mutex", p8, p9
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	(p8)	cmp.leu	p10, p0 = r31, pr1_2
	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
	;;
		st8	[rp] = acc0, 8
	(p10)	add	r8 = 1, r8
		br.ret.sptk.many b0


C n > 2.  Zeroing acc1_0, pr1_0 and pr0_1 and clearing p8 and p12 (which sets
C p9 and p13) makes the first pass through the shared carry chain at L(top) or
C L(end) add nothing but the freshly computed products.  The closing br.cloop
C enters the main loop while ar.lc is nonzero; otherwise control goes to the
C wind-down code at L(end).
L(gt2):
 {.mmi;		ldf8	r_3 = [srp], 8
		ldf8	u_3 = [up], 8
		mov	acc1_0 = 0
	;;
}{.mfi;		ldf8	r_0 = [srp], 8
		xma.l	fp0b_1 = ux, v0, rx
		mov	pr1_0 = 0
}{.mfi;		ldf8	u_0 = [up], 8
		xma.hu	fp1b_1 = ux, v0, rx
		mov	pr0_1 = 0
	;;
}		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_1
		ldf8	r_1 = [srp], 8
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	;;
		ldf8	u_1 = [up], 8
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	;;
 {.mfi;		getfsig	acc1_1 = fp2a_1
		xma.l	fp0b_0 = u_0, v0, r_0
		cmp.ne	p8, p9 = r0, r0
}{.mfb;		cmp.ne	p12, p13 = r0, r0
		xma.hu	fp1a_0 = u_0, v0, r_0
		br.cloop.sptk.clr	L(top)
}
		br.many	L(end)


	ALIGN(32)
C Feed-in for n mod 4 = 3.
C Zeroing pr1_3, pr0_0 and acc1_3 and clearing p6 and p10 (which sets p7 and
C p11) makes the first pass through the shared carry chain at L(11) or L(cj3)
C add nothing but the freshly computed products.
C br.cloop falls through when ar.lc is zero, i.e. n = 3; that tail joins the
C wind-down code at L(cj3).  Otherwise L(gt3) continues with further loads and
C joins the main loop at L(11).
L(b11):		ldf8	r_2 = [srp], 8
		mov	pr1_3 = 0
		mov	pr0_0 = 0
	;;
		ldf8	u_2 = [up], 8
		mov	acc1_3 = 0
		br.cloop.dptk	L(gt3)
	;;
		cmp.ne	p6, p7 = r0, r0
		xma.l	fp0b_0 = ux, v0, rx
		xma.hu	fp1b_0 = ux, v0, rx
	;;
		cmp.ne	p10, p11 = r0, r0
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	;;
C In this n = 3 tail the two products of the third column are formed with
C their addends exchanged compared with L(gt3): uy*v1 takes r_2 and u_2*v0
C takes fp1a_1.  Both low halves still land in the same column and both high
C halves in the next one, so the column sums are unchanged.
		xma.l	fp0b_2 = uy, v1, r_2
		xma.hu	fp1a_2 = uy, v1, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_2, v0, fp1a_1
		xma.hu	fp2a_1 = u_2, v0, fp1a_1
	;;
		getfsig	pr1_0 = fp1b_0
		getfsig	acc1_0 = fp2a_0
		br	L(cj3)

L(gt3):		ldf8	r_3 = [srp], 8
		xma.l	fp0b_0 = ux, v0, rx
		cmp.ne	p10, p11 = r0, r0
		ldf8	u_3 = [up], 8
		xma.hu	fp1b_0 = ux, v0, rx
		cmp.ne	p6, p7 = r0, r0
	;;
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
		ldf8	r_0 = [srp], 8
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
		ldf8	u_0 = [up], 8
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	;;
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = uy, v1, fp1a_1
		xma.hu	fp2a_1 = uy, v1, fp1a_1
	;;
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	;;
		getfsig	acc1_0 = fp2a_0
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
		br	L(11)


C *** MAIN LOOP START ***
C Four-way unrolled.  Each of the four steps below spans three instruction
C groups, stores one result limb and has the same shape, with the register
C subscripts and carry predicates rotated from step to step:
C   group 0  s0 = pr1_j + acc0, plus one if the incoming carry predicate is
C            set; start the u*v1 + fp1a xma pair; load the next r limb
C   group 1  derive both carries by comparing each sum with one of its addends
C            (cmp.leu when a carry came in, cmp.ltu when not); load the next
C            u limb
C   group 2  store s0; acc0 = pr0_j + acc1_j, plus one on carry; start the
C            u*v0 + r xma pair for a later limb
C The feed-in code joins at L(top) for n mod 4 = 2, L(01) for 1, L(00) for 0
C and L(11) for 3.  L(10) is not referenced anywhere in this file.
C The trailing "C nn" remarks number the instruction groups of one iteration.
	ALIGN(32)
L(top):						C 00
	.pred.rel "mutex", p12, p13
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
	;;					C 01
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;					C 02
	.pred.rel "mutex", p6, p7
		getfsig	acc1_2 = fp2a_2
		st8	[rp] = s0, 8
		xma.l	fp0b_1 = u_1, v0, r_1
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;					C 03
L(01):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_0 = fp0b_0
		ldf8	r_0 = [srp], 8
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
	;;					C 04
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;					C 05
	.pred.rel "mutex", p8, p9
		getfsig	acc1_3 = fp2a_3
		st8	[rp] = s0, 8
		xma.l	fp0b_2 = u_2, v0, r_2
	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;					C 06
L(00):
	.pred.rel "mutex", p12, p13
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
	;;					C 07
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;					C 08
	.pred.rel "mutex", p6, p7
		getfsig	acc1_0 = fp2a_0
		st8	[rp] = s0, 8
		xma.l	fp0b_3 = u_3, v0, r_3
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;					C 09
L(11):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
	;;					C 10
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;					C 11
	.pred.rel "mutex", p8, p9
		getfsig	acc1_1 = fp2a_1
		st8	[rp] = s0, 8
		xma.l	fp0b_0 = u_0, v0, r_0
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
		xma.hu	fp1a_0 = u_0, v0, r_0
L(10):		br.cloop.sptk.clr	L(top)	C 12
	;;
C *** MAIN LOOP END ***
C Wind-down.  L(end) is reached when the loop counter runs out (or directly
C from L(gt2)).  The code repeats the loop steps in the same order but with no
C further loads, and with fewer and fewer xma pairs as the operands run out.
C L(cj5), L(cj4) and L(cj3) are where the n = 5, n = 4 and n = 3 feed-in tails
C join.  After L(cj3) only the carry chain remains: the last limbs are stored,
C the top limb is left in r8 and ar.lc is restored from r2 before returning.
L(end):
	.pred.rel "mutex", p12, p13
 {.mfi;		getfsig	pr0_3 = fp0b_3
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
}{.mfi;	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
 {.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
}	.pred.rel "mutex", p6, p7
 {.mfi;		getfsig	acc1_2 = fp2a_2
		xma.l	fp0b_1 = u_1, v0, r_1
		nop	1
}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
}
C Join point for the n = 5 tail of L(b01).
L(cj5):
	.pred.rel "mutex", p10, p11
 {.mfi;		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
}{.mfi;	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
		nop	1
	;;
}	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
 {.mmi;		getfsig	pr1_3 = fp1b_3
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
}	.pred.rel "mutex", p8, p9
 {.mfi;		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		nop	1
}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
}
C Join point for the n = 4 tail of L(b00).
L(cj4):
	.pred.rel "mutex", p12, p13
 {.mfi;		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
}{.mfi;	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
 {.mmi;		getfsig	pr1_0 = fp1b_0
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;
}	.pred.rel "mutex", p6, p7
 {.mmi;		getfsig	acc1_0 = fp2a_0
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
	;;
}
C Join point for the n = 3 tail of L(b11).  The xma pair below is the last
C multiplication; its high half ends up in r8.
L(cj3):
	.pred.rel "mutex", p10, p11
 {.mfi;		getfsig	pr0_2 = fp0b_2
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
}{.mfi;	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
		nop	1
	;;
}	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
 {.mmi;		getfsig	pr1_1 = fp1b_1
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;
}	.pred.rel "mutex", p8, p9
 {.mmi;		getfsig	acc1_1 = fp2a_1
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
	;;
}	.pred.rel "mutex", p12, p13
 {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
 {.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
}	.pred.rel "mutex", p6, p7
 {.mmi;		getfsig	r8 = fp2a_2		C r8 = top limb before carries
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
	;;
}	.pred.rel "mutex", p10, p11
 {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	;;
}	.pred.rel "mutex", p10, p11
 {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
}	.pred.rel "mutex", p8, p9
 {.mmi;		st8	[rp] = s0, 8
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	;;
C The last stored limb takes two carries: p8 (already added above) and p12
C (added below).  p10 collects the carry out of either addition; the cmpeqor
C catches the case where the p12 increment wrapped acc0 round to zero.
}	.pred.rel "mutex", p8, p9
 {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
	(p12)	add	acc0 = 1, acc0
	;;
}{.mmi;		st8	[rp] = acc0, 8
	(p12)	cmpeqor	p10, p0 = 0, acc0
		nop	1
	;;
}{.mib;	(p10)	add	r8 = 1, r8
		mov	ar.lc = r2
		br.ret.sptk.many b0
}
EPILOGUE()
ASM_END()