Blame sysdeps/alpha/alphaev6/addmul_1.S

Packit 6c4009
 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
Packit 6c4009
 # the result to a second limb vector.
Packit 6c4009
 #
Packit 6c4009
 #  Copyright (C) 2000-2018 Free Software Foundation, Inc.
Packit 6c4009
 #
Packit 6c4009
 #  This file is part of the GNU MP Library.
Packit 6c4009
 #
Packit 6c4009
 #  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 6c4009
 #  it under the terms of the GNU Lesser General Public License as published
Packit 6c4009
 #  by the Free Software Foundation; either version 2.1 of the License, or (at
Packit 6c4009
 #  your option) any later version.
Packit 6c4009
 #
Packit 6c4009
 #  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 6c4009
 #  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 6c4009
 #  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
Packit 6c4009
 #  License for more details.
Packit 6c4009
 #
Packit 6c4009
 #  You should have received a copy of the GNU Lesser General Public License
Packit 6c4009
 #  along with the GNU MP Library.  If not, see <http://www.gnu.org/licenses/>.
Packit 6c4009
Packit 6c4009
 #  INPUT PARAMETERS
Packit 6c4009
 #  res_ptr	$16
Packit 6c4009
 #  s1_ptr	$17
Packit 6c4009
 #  size	$18
Packit 6c4009
 #  s2_limb	$19
Packit 6c4009
 #
Packit 6c4009
 #  This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
Packit 6c4009
 #  exactly 3.625 cycles/limb on EV6...
Packit 6c4009
 #
Packit 6c4009
 # This code was written in close cooperation with ev6 pipeline expert
Packit 6c4009
 # Steve Root (root@toober.hlo.dec.com).  Any errors are tege's fault, though.
Packit 6c4009
 #
Packit 6c4009
 #   Register usages for unrolled loop:
Packit 6c4009
 #	  0-3     mul's
Packit 6c4009
 #	  4-7     acc's
Packit 6c4009
 #	  8-15    mul results
Packit 6c4009
 #	  20,21   carry's
Packit 6c4009
 #	  22,23   save for stores
Packit 6c4009
 #
Packit 6c4009
 #   Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
Packit 6c4009
 #
Packit 6c4009
 #   The stores can issue a cycle late so we have paired no-op's to 'catch'
Packit 6c4009
 #   them, so that further disturbance to the schedule is damped.
Packit 6c4009
 #
Packit 6c4009
 #   We couldn't pair the loads, because the entangled schedule of the
Packit 6c4009
 #   carry's has to happen on one side {0} of the machine. Note, the total
Packit 6c4009
 #   use of U0, and the total use of L0 (after attending to the stores).
Packit 6c4009
 #   which is part of the reason why....
Packit 6c4009
 #
Packit 6c4009
 #   This is a great schedule for the d_cache, a poor schedule for the
Packit 6c4009
 #   b_cache. The lockup on U0 means that any stall can't be recovered
Packit 6c4009
 #   from. Consider a ldq in L1.  say that load gets stalled because it
Packit 6c4009
 #   collides with a fill from the b_Cache. On the next cycle, this load
Packit 6c4009
 #   gets priority. It first looks at L0, and goes there. The instruction
Packit 6c4009
 #   we intended for L0 gets to look at L1, which is NOT where we want
Packit 6c4009
 #   it. It either stalls 1, because it can't go in L0, or goes there, and
Packit 6c4009
 #   causes a further instruction to stall.
Packit 6c4009
 #
Packit 6c4009
 #   So for b_cache, we're likely going to want to put one or more cycles
Packit 6c4009
 #   back into the code! And, of course, put in prefetches. For the
Packit 6c4009
 #   accumulator, lds, intent to modify.  For the multiplier, you might
Packit 6c4009
 #   want ldq, evict next, if you're not wanting to use it again soon. Use
Packit 6c4009
 #   256 ahead of present pointer value. At a place where we have an mt
Packit 6c4009
 #   followed by a bookkeeping, put the bookkeeping in upper, and the
Packit 6c4009
 #   prefetch into lower.
Packit 6c4009
 #
Packit 6c4009
 #   Note, the usage of physical registers per cycle is smoothed off, as
Packit 6c4009
 #   much as possible.
Packit 6c4009
 #
Packit 6c4009
 #   Note, the ldq's and stq's are at the end of the quadpacks.  note, we'd
Packit 6c4009
 #   like not to have a ldq or stq to precede a conditional branch in a
Packit 6c4009
 #   quadpack. The conditional branch moves the retire pointer one cycle
Packit 6c4009
 #   later.
Packit 6c4009
 #
Packit 6c4009
 #   Optimization notes:
Packit 6c4009
 #   Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
Packit 6c4009
 #   Reserved regs:	 $29 $30 $31
Packit 6c4009
 #   Free caller-saves regs in unrolled code: $24 $25 $28
Packit 6c4009
 #   We should swap some of the callee-saves regs for some of the free
Packit 6c4009
 #   caller-saves regs, saving some overhead cycles.
Packit 6c4009
 #   Most importantly, we should write fast code for the 0-7 case.
Packit 6c4009
 #   The code we use there is for the 21164, and runs at 7 cycles/limb
Packit 6c4009
 #   on the 21264.  Should not be hard, if we write specialized code for
Packit 6c4009
 #   1-7 limbs (the one for 0 limbs should be straightforward).  We then just
Packit 6c4009
 #   need a jump table indexed by the low 3 bits of the count argument.
Packit 6c4009
Packit 6c4009
	.set	noreorder
Packit 6c4009
	.set	noat
Packit 6c4009
	.text
Packit 6c4009
Packit 6c4009
	.globl	__mpn_addmul_1
Packit 6c4009
	.ent	__mpn_addmul_1
Packit 6c4009
__mpn_addmul_1:
Packit 6c4009
	.frame	$30,0,$26,0
Packit 6c4009
	.prologue 0
Packit 6c4009
Packit 6c4009
 # Dispatch on size: 8 or more limbs take the unrolled code at $Large,
 # fewer than 8 fall through into the one-limb-per-pass loop below.
	cmpult	$18,	8,	$1	# $1 = (size < 8), unsigned compare
Packit 6c4009
	beq	$1,	$Large		# size >= 8 => unrolled path
Packit 6c4009
Packit 6c4009
 # Small path (size < 8).  It touches only $0-$5 and the argument
 # registers, so no stack frame is made.  Limb 0 is peeled off so the
 # loop can start with a product already computed:
 #   $3 / $0 = low / high half of s1[0] * s2_limb,  $5 = res[0].
 # Note that size == 0 is not special-cased: limb 0 is always loaded,
 # multiplied and stored back.
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	subq	$18,	1,	$18	# size--
Packit 6c4009
	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	umulh	$2,	$19,	$0	# $0 = prod_high
Packit 6c4009
	beq	$18,	$Lend0b		# jump if size was == 1
Packit 6c4009
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	subq	$18,	1,	$18	# size--
Packit 6c4009
	addq	$5,	$3,	$3	# $3 = res[0] + prod_low
Packit 6c4009
	cmpult	$3,	$5,	$4	# $4 = carry out of that add (0 or 1)
Packit 6c4009
	stq	$3,	0($16)		# limb 0 is finished
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	beq	$18,	$Lend0a		# jump if size was == 2
Packit 6c4009
Packit 6c4009
	.align 3
Packit 6c4009
 # Loop invariant at the top of each pass:
 #   $2      = s1 limb for this pass (loaded by the previous pass)
 #   $0 + $4 = carry limb to be added into the limb at res_ptr
 #   $18     = number of limbs still to be loaded (non-zero here)
 # Each pass finishes one result limb and loads the s1 limb for the next.
$Loop0:	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
Packit 6c4009
	subq	$18,	1,	$18	# size--
Packit 6c4009
	umulh	$2,	$19,	$4	# $4 = cy_limb
Packit 6c4009
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
Packit 6c4009
	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
Packit 6c4009
	addq	$5,	$3,	$3	# add in the old *res_ptr
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry from that add
Packit 6c4009
	stq	$3,	0($16)		# store the finished limb
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	addq	$5,	$0,	$0	# combine carries
Packit 6c4009
	bne	$18,	$Loop0
Packit 6c4009
 # Last limb: same arithmetic as one loop pass but with no further load.
 # The high half of the last product plus the carries is left in $0.
$Lend0a:
Packit 6c4009
	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
Packit 6c4009
	umulh	$2,	$19,	$4	# $4 = cy_limb
Packit 6c4009
	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
Packit 6c4009
	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
Packit 6c4009
	addq	$5,	$3,	$3	# add in the old *res_ptr
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry from that add
Packit 6c4009
	stq	$3,	0($16)		# store the last limb
Packit 6c4009
	addq	$5,	$0,	$0	# combine carries
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = prod_high + cy
Packit 6c4009
	ret	$31,	($26),	1	# return, carry limb in $0
Packit 6c4009
 # size was 1: finish the single peeled limb and return.
$Lend0b:
Packit 6c4009
	addq	$5,	$3,	$3	# $3 = res[0] + prod_low
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry out of that add
Packit 6c4009
	stq	$3,	0($16)
Packit 6c4009
	addq	$0,	$5,	$0	# $0 = prod_high + carry
Packit 6c4009
	ret	$31,	($26),	1	# return, carry limb in $0
Packit 6c4009
Packit 6c4009
$Large:
Packit 6c4009
 # size >= 8.  Make a 240-byte stack frame and save $9-$15 in it; the
 # unrolled code further down uses those registers for multiply results
 # and they are reloaded from the same slots before the final return.
	lda	$30,	-240($30)
Packit 6c4009
	stq	$9,	8($30)
Packit 6c4009
	stq	$10,	16($30)
Packit 6c4009
	stq	$11,	24($30)
Packit 6c4009
	stq	$12,	32($30)
Packit 6c4009
	stq	$13,	40($30)
Packit 6c4009
	stq	$14,	48($30)
Packit 6c4009
	stq	$15,	56($30)
Packit 6c4009
Packit 6c4009
 # Split size into (size mod 8) leading limbs, handled one at a time
 # here, and (size / 8) groups of 8 limbs for the unrolled code.
	and	$18,	7,	$20	# count for the first loop, 0-7
Packit 6c4009
	srl	$18,	3,	$18	# count for unrolled loop
Packit 6c4009
	bis	$31,	$31,	$0	# $0 = 0: carry-in if there are no leading limbs
Packit 6c4009
	beq	$20,	$Lunroll	# size is a multiple of 8
Packit 6c4009
 # The code from here to $Lunroll is the same one-limb-per-pass scheme
 # as the size < 8 path, counting in $20 instead of $18.  It leaves the
 # carry limb in $0 and both pointers just past the limbs it processed.
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	subq	$20,	1,	$20	# size--
Packit 6c4009
	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	umulh	$2,	$19,	$0	# $0 = prod_high
Packit 6c4009
	beq	$20,	$Lend1b		# jump if size was == 1
Packit 6c4009
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	subq	$20,	1,	$20	# size--
Packit 6c4009
	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
Packit 6c4009
	cmpult	$3,	$5,	$4	# $4 = carry out of that add (0 or 1)
Packit 6c4009
	stq	$3,	0($16)		# first leading limb is finished
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	beq	$20,	$Lend1a		# jump if size was == 2
Packit 6c4009
Packit 6c4009
	.align 3
Packit 6c4009
 # Loop invariant at the top of each pass:
 #   $2      = s1 limb for this pass (loaded by the previous pass)
 #   $0 + $4 = carry limb to be added into the limb at res_ptr
$Loop1:	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
Packit 6c4009
	subq	$20,	1,	$20	# size--
Packit 6c4009
	umulh	$2,	$19,	$4	# $4 = cy_limb
Packit 6c4009
	ldq	$2,	0($17)		# $2 = s1_limb
Packit 6c4009
	addq	$17,	8,	$17	# s1_ptr++
Packit 6c4009
	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
Packit 6c4009
	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
Packit 6c4009
	addq	$5,	$3,	$3	# add in the old *res_ptr
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry from that add
Packit 6c4009
	stq	$3,	0($16)		# store the finished limb
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	addq	$5,	$0,	$0	# combine carries
Packit 6c4009
	bne	$20,	$Loop1
Packit 6c4009
Packit 6c4009
 # Last leading limb.  Unlike the size < 8 path, res_ptr is advanced here
 # as well, because the unrolled code continues from it.
$Lend1a:
Packit 6c4009
	mulq	$2,	$19,	$3	# $3 = prod_low
Packit 6c4009
	ldq	$5,	0($16)		# $5 = *res_ptr
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
Packit 6c4009
	umulh	$2,	$19,	$4	# $4 = cy_limb
Packit 6c4009
	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
Packit 6c4009
	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
Packit 6c4009
	addq	$5,	$3,	$3	# add in the old *res_ptr
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry from that add
Packit 6c4009
	stq	$3,	0($16)		# store the last leading limb
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	addq	$5,	$0,	$0	# combine carries
Packit 6c4009
	addq	$4,	$0,	$0	# cy_limb = prod_high + cy
Packit 6c4009
	br	$31,	$Lunroll	# carry limb in $0 feeds the unrolled code
Packit 6c4009
 # Exactly one leading limb: finish it and fall through into $Lunroll.
$Lend1b:
Packit 6c4009
	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
Packit 6c4009
	cmpult	$3,	$5,	$5	# $5 = carry out of that add
Packit 6c4009
	stq	$3,	0($16)
Packit 6c4009
	addq	$16,	8,	$16	# res_ptr++
Packit 6c4009
	addq	$0,	$5,	$0	# $0 = prod_high + carry
Packit 6c4009
Packit 6c4009
$Lunroll:
Packit 6c4009
 # On entry: $0 = carry limb from the leading limbs (0 if there were
 # none), $18 = number of 8-limb groups (at least 1, since size >= 8),
 # $16 / $17 = address of the first limb of the first group.
 # Both pointers are biased by -16; the loads below then use
 # displacements 16, 24, 32, 40 and, after the +64 bump, -16, -8, 0, 8.
	lda	$17,	-16($17)	# L1 bookkeeping
Packit 6c4009
	lda	$16,	-16($16)	# L1 bookkeeping
Packit 6c4009
	bis	$0,	$31,	$12	# $12 = carry-in, added into limb 0 below
Packit 6c4009
Packit 6c4009
 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
 # Starts the first group of 8: loads all 8 of its s1 limbs and the
 # first 6 of its res limbs, issues the multiplies for limbs 0-5 (the
 # high half of limb 5 is issued after this section), and stores result
 # limbs 0 and 1.  The rest of the group is completed either by the
 # first pass of $Loop or, if this is the only group, by $Lend.
Packit 6c4009
Packit 6c4009
	ldq	$2,	16($17)		# L1
Packit 6c4009
	ldq	$3,	24($17)		# L1
Packit 6c4009
	lda	$18,	-1($18)		# L1 bookkeeping
Packit 6c4009
	ldq	$6,	16($16)		# L1
Packit 6c4009
	ldq	$7,	24($16)		# L1
Packit 6c4009
	ldq	$0,	32($17)		# L1
Packit 6c4009
	mulq	$19,	$2,	$13	# U1
Packit 6c4009
	ldq	$1,	40($17)		# L1
Packit 6c4009
	umulh	$19,	$2,	$14	# U1
Packit 6c4009
	mulq	$19,	$3,	$15	# U1
Packit 6c4009
	lda	$17,	64($17)		# L1 bookkeeping
Packit 6c4009
	ldq	$4,	32($16)		# L1
Packit 6c4009
	ldq	$5,	40($16)		# L1
Packit 6c4009
	umulh	$19,	$3,	$8	# U1
Packit 6c4009
	ldq	$2,	-16($17)	# L1
Packit 6c4009
	mulq	$19,	$0,	$9	# U1
Packit 6c4009
	ldq	$3,	-8($17)		# L1
Packit 6c4009
	umulh	$19,	$0,	$10	# U1
Packit 6c4009
	addq	$6,	$13,	$6	# L0 lo + acc
Packit 6c4009
	mulq	$19,	$1,	$11	# U1
Packit 6c4009
	cmpult	$6,	$13,	$20	# L0 lo add => carry
Packit 6c4009
	lda	$16,	64($16)		# L1 bookkeeping
Packit 6c4009
	addq	$6,	$12,	$22	# U0 hi add => answer
Packit 6c4009
	cmpult	$22,	$12,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$14,	$20,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$6,	-16($16)	# L1
Packit 6c4009
	addq	$7,	$15,	$23	# L0 lo + acc
Packit 6c4009
	addq	$14,	$21,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$7,	-8($16)		# L1
Packit 6c4009
	umulh	$19,	$1,	$12	# U1
Packit 6c4009
	cmpult	$23,	$15,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$14,	$23	# U0 hi add => answer
Packit 6c4009
	ldq	$0,	0($17)		# L1
Packit 6c4009
	mulq	$19,	$2,	$13	# U1
Packit 6c4009
	cmpult	$23,	$14,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$8,	$20,	$8	# U0 hi mul + carry
Packit 6c4009
	ldq	$1,	8($17)		# L1
Packit 6c4009
	umulh	$19,	$2,	$14	# U1
Packit 6c4009
	addq	$4,	$9,	$4	# L0 lo + acc
Packit 6c4009
 # $16 was bumped by 64 above, so -48 / -40 address limbs 0 and 1 of
 # the group.
	stq	$22,	-48($16)	# L0
Packit 6c4009
	stq	$23,	-40($16)	# L1
Packit 6c4009
	mulq	$19,	$3,	$15	# U1
Packit 6c4009
	addq	$8,	$21,	$8	# U0 hi mul + carry
Packit 6c4009
	cmpult	$4,	$9,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$4,	$8,	$22	# U0 hi add => answer
Packit 6c4009
 # $18 now counts the groups after the one in flight; if there are none,
 # skip the main loop and drain the pipeline.
	ble	$18,	$Lend		# U1 bookkeeping
Packit 6c4009
Packit 6c4009
 # ____ MAIN UNROLLED LOOP ____
 # Entered only while at least one more group of 8 limbs follows the
 # group currently in flight.  One pass:
 #   - stores limbs 2-7 of the group in flight and limbs 0-1 of the next
 #     group (8 stq in total);
 #   - issues 8 mulq and 8 umulh;
 #   - loads res limbs 6-7 of the group in flight, all 8 s1 limbs of the
 #     next group and res limbs 0-5 of the next group;
 #   - advances $17 and $16 by 64 bytes each and decrements $18.
 # The instructions are laid out in groups of four.  `bis $31,$31,$31'
 # is a no-op filling a slot; the trailing tags (U0, U1, L0, L1) name the
 # intended slot for each instruction; "mt" presumably stands for "empty".
Packit 6c4009
	.align 4
Packit 6c4009
$Loop:
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	cmpult	$22,	$8,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$10,	$20,	$10	# U0 hi mul + carry
Packit 6c4009
	ldq	$4,	0($16)		# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	addq	$5,	$11,	$23	# L0 lo + acc
Packit 6c4009
	addq	$10,	$21,	$10	# L0 hi mul + carry
Packit 6c4009
	ldq	$5,	8($16)		# L1
Packit 6c4009
Packit 6c4009
	umulh	$19,	$3,	$8	# U1
Packit 6c4009
	cmpult	$23,	$11,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$10,	$23	# U0 hi add => answer
Packit 6c4009
	ldq	$2,	16($17)		# L1
Packit 6c4009
Packit 6c4009
	mulq	$19,	$0,	$9	# U1
Packit 6c4009
	cmpult	$23,	$10,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$12,	$20,	$12	# U0 hi mul + carry
Packit 6c4009
	ldq	$3,	24($17)		# L1
Packit 6c4009
Packit 6c4009
 # The two stores below write limbs 2 and 3 of the group in flight.
	umulh	$19,	$0,	$10	# U1
Packit 6c4009
	addq	$6,	$13,	$6	# L0 lo + acc
Packit 6c4009
	stq	$22,	-32($16)	# L0
Packit 6c4009
	stq	$23,	-24($16)	# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# L0 st slosh
Packit 6c4009
	mulq	$19,	$1,	$11	# U1
Packit 6c4009
	bis	$31,	$31,	$31	# L1 st slosh
Packit 6c4009
	addq	$12,	$21,	$12	# U0 hi mul + carry
Packit 6c4009
Packit 6c4009
	cmpult	$6,	$13,	$20	# L0 lo add => carry
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	lda	$18,	-1($18)		# L1 bookkeeping
Packit 6c4009
	addq	$6,	$12,	$22	# U0 hi add => answer
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	cmpult	$22,	$12,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$14,	$20,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$6,	16($16)		# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	addq	$7,	$15,	$23	# L0 lo + acc
Packit 6c4009
	addq	$14,	$21,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$7,	24($16)		# L1
Packit 6c4009
Packit 6c4009
	umulh	$19,	$1,	$12	# U1
Packit 6c4009
	cmpult	$23,	$15,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$14,	$23	# U0 hi add => answer
Packit 6c4009
	ldq	$0,	32($17)		# L1
Packit 6c4009
Packit 6c4009
	mulq	$19,	$2,	$13	# U1
Packit 6c4009
	cmpult	$23,	$14,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$8,	$20,	$8	# U0 hi mul + carry
Packit 6c4009
	ldq	$1,	40($17)		# L1
Packit 6c4009
Packit 6c4009
 # The two stores below write limbs 4 and 5 of the group in flight.
	umulh	$19,	$2,	$14	# U1
Packit 6c4009
	addq	$4,	$9,	$4	# U0 lo + acc
Packit 6c4009
	stq	$22,	-16($16)	# L0
Packit 6c4009
	stq	$23,	-8($16)		# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# L0 st slosh
Packit 6c4009
	mulq	$19,	$3,	$15	# U1
Packit 6c4009
	bis	$31,	$31,	$31	# L1 st slosh
Packit 6c4009
	addq	$8,	$21,	$8	# L0 hi mul + carry
Packit 6c4009
Packit 6c4009
	cmpult	$4,	$9,	$20	# L0 lo add => carry
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	lda	$17,	64($17)		# L1 bookkeeping
Packit 6c4009
	addq	$4,	$8,	$22	# U0 hi add => answer
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	cmpult	$22,	$8,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$10,	$20,	$10	# U0 hi mul + carry
Packit 6c4009
	ldq	$4,	32($16)		# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	addq	$5,	$11,	$23	# L0 lo + acc
Packit 6c4009
	addq	$10,	$21,	$10	# L0 hi mul + carry
Packit 6c4009
	ldq	$5,	40($16)		# L1
Packit 6c4009
Packit 6c4009
	umulh	$19,	$3,	$8	# U1
Packit 6c4009
	cmpult	$23,	$11,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$10,	$23	# U0 hi add => answer
Packit 6c4009
	ldq	$2,	-16($17)	# L1
Packit 6c4009
Packit 6c4009
	mulq	$19,	$0,	$9	# U1
Packit 6c4009
	cmpult	$23,	$10,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$12,	$20,	$12	# U0 hi mul + carry
Packit 6c4009
	ldq	$3,	-8($17)		# L1
Packit 6c4009
Packit 6c4009
 # The two stores below write limbs 6 and 7 of the group in flight.
	umulh	$19,	$0,	$10	# U1
Packit 6c4009
	addq	$6,	$13,	$6	# L0 lo + acc
Packit 6c4009
	stq	$22,	0($16)		# L0
Packit 6c4009
	stq	$23,	8($16)		# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# L0 st slosh
Packit 6c4009
	mulq	$19,	$1,	$11	# U1
Packit 6c4009
	bis	$31,	$31,	$31	# L1 st slosh
Packit 6c4009
	addq	$12,	$21,	$12	# U0 hi mul + carry
Packit 6c4009
Packit 6c4009
	cmpult	$6,	$13,	$20	# L0 lo add => carry
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	lda	$16,	64($16)		# L1 bookkeeping
Packit 6c4009
	addq	$6,	$12,	$22	# U0 hi add => answer
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	cmpult	$22,	$12,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$14,	$20,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$6,	-16($16)	# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# U1 mt
Packit 6c4009
	addq	$7,	$15,	$23	# L0 lo + acc
Packit 6c4009
	addq	$14,	$21,	$14	# U0 hi mul + carry
Packit 6c4009
	ldq	$7,	-8($16)		# L1
Packit 6c4009
Packit 6c4009
	umulh	$19,	$1,	$12	# U1
Packit 6c4009
	cmpult	$23,	$15,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$14,	$23	# U0 hi add => answer
Packit 6c4009
	ldq	$0,	0($17)		# L1
Packit 6c4009
Packit 6c4009
	mulq	$19,	$2,	$13	# U1
Packit 6c4009
	cmpult	$23,	$14,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$8,	$20,	$8	# U0 hi mul + carry
Packit 6c4009
	ldq	$1,	8($17)		# L1
Packit 6c4009
Packit 6c4009
 # $16 was bumped by 64 above, so the two stores below write limbs 0
 # and 1 of the next group.
	umulh	$19,	$2,	$14	# U1
Packit 6c4009
	addq	$4,	$9,	$4	# L0 lo + acc
Packit 6c4009
	stq	$22,	-48($16)	# L0
Packit 6c4009
	stq	$23,	-40($16)	# L1
Packit 6c4009
Packit 6c4009
	bis	$31,	$31,	$31	# L0 st slosh
Packit 6c4009
	mulq	$19,	$3,	$15	# U1
Packit 6c4009
	bis	$31,	$31,	$31	# L1 st slosh
Packit 6c4009
	addq	$8,	$21,	$8	# U0 hi mul + carry
Packit 6c4009
Packit 6c4009
 # Loop again while further groups remain after the one now in flight.
	cmpult	$4,	$9,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$4,	$8,	$22	# U0 hi add => answer
Packit 6c4009
	bis	$31,	$31,	$31	# L1 mt
Packit 6c4009
	bgt	$18,	$Loop		# U1 bookkeeping
Packit 6c4009
Packit 6c4009
# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
 # Drains the pipeline for the last group of 8.  Limbs 0 and 1 of that
 # group have already been stored; this section loads its last two res
 # limbs, issues the remaining multiplies, stores limbs 2-7 and leaves
 # the final carry limb in $0.  No s1 limbs are loaded here.
Packit 6c4009
$Lend:
Packit 6c4009
	cmpult	$22,	$8,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$10,	$20,	$10	# U0 hi mul + carry
Packit 6c4009
	ldq	$4,	0($16)		# L1
Packit 6c4009
	addq	$5,	$11,	$23	# L0 lo + acc
Packit 6c4009
	addq	$10,	$21,	$10	# L0 hi mul + carry
Packit 6c4009
	ldq	$5,	8($16)		# L1
Packit 6c4009
	umulh	$19,	$3,	$8	# U1
Packit 6c4009
	cmpult	$23,	$11,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$10,	$23	# U0 hi add => answer
Packit 6c4009
	mulq	$19,	$0,	$9	# U1
Packit 6c4009
	cmpult	$23,	$10,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$12,	$20,	$12	# U0 hi mul + carry
Packit 6c4009
	umulh	$19,	$0,	$10	# U1
Packit 6c4009
	addq	$6,	$13,	$6	# L0 lo + acc
Packit 6c4009
 # The two stores below write limbs 2 and 3 of the last group.
	stq	$22,	-32($16)	# L0
Packit 6c4009
	stq	$23,	-24($16)	# L1
Packit 6c4009
	mulq	$19,	$1,	$11	# U1
Packit 6c4009
	addq	$12,	$21,	$12	# U0 hi mul + carry
Packit 6c4009
	cmpult	$6,	$13,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$6,	$12,	$22	# U0 hi add => answer
Packit 6c4009
	cmpult	$22,	$12,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$14,	$20,	$14	# U0 hi mul + carry
Packit 6c4009
	addq	$7,	$15,	$23	# L0 lo + acc
Packit 6c4009
	addq	$14,	$21,	$14	# U0 hi mul + carry
Packit 6c4009
	umulh	$19,	$1,	$12	# U1
Packit 6c4009
	cmpult	$23,	$15,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$14,	$23	# U0 hi add => answer
Packit 6c4009
	cmpult	$23,	$14,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$8,	$20,	$8	# U0 hi mul + carry
Packit 6c4009
	addq	$4,	$9,	$4	# U0 lo + acc
Packit 6c4009
 # The two stores below write limbs 4 and 5 of the last group.
	stq	$22,	-16($16)	# L0
Packit 6c4009
	stq	$23,	-8($16)		# L1
Packit 6c4009
	bis	$31,	$31,	$31	# L0 st slosh
Packit 6c4009
	addq	$8,	$21,	$8	# L0 hi mul + carry
Packit 6c4009
	cmpult	$4,	$9,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$4,	$8,	$22	# U0 hi add => answer
Packit 6c4009
	cmpult	$22,	$8,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$10,	$20,	$10	# U0 hi mul + carry
Packit 6c4009
	addq	$5,	$11,	$23	# L0 lo + acc
Packit 6c4009
	addq	$10,	$21,	$10	# L0 hi mul + carry
Packit 6c4009
	cmpult	$23,	$11,	$20	# L0 lo add => carry
Packit 6c4009
	addq	$23,	$10,	$23	# U0 hi add => answer
Packit 6c4009
	cmpult	$23,	$10,	$21	# L0 hi add => carry
Packit 6c4009
	addq	$12,	$20,	$12	# U0 hi mul + carry
Packit 6c4009
 # The two stores below write limbs 6 and 7, the last limbs of the result.
	stq	$22,	0($16)		# L0
Packit 6c4009
	stq	$23,	8($16)		# L1
Packit 6c4009
 # Final carry limb = high half of the last product + its two carries.
	addq	$12,	$21,	$0	# U0 hi mul + carry
Packit 6c4009
Packit 6c4009
 # Epilogue: reload $9-$15 from the slots written at $Large, release the
 # 240-byte frame and return with the carry limb in $0.
	ldq	$9,	8($30)
Packit 6c4009
	ldq	$10,	16($30)
Packit 6c4009
	ldq	$11,	24($30)
Packit 6c4009
	ldq	$12,	32($30)
Packit 6c4009
	ldq	$13,	40($30)
Packit 6c4009
	ldq	$14,	48($30)
Packit 6c4009
	ldq	$15,	56($30)
Packit 6c4009
	lda	$30,	240($30)
Packit 6c4009
	ret	$31,	($26),	1
Packit 6c4009
Packit 6c4009
	.end	__mpn_addmul_1