dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      18.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the s1 operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      develop mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
C      not be greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   11 MEM
C   9 ISHIFT + 10? IADDLOG
C    1 BRANCH
C   49 insns in total (plus three mov insns that should be optimized out)

C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
C sustain 3.79 instructions/cycle.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Symbolic register names used by mpn_mul_1 below.
C
C   v00 v16 v32 v48	the four 16-bit pieces of v, converted to double
C   u00 u32		the two 32-bit halves of the current up[] limb, as double
C   p00 p16 p32 p48	partial products u00 * v00..v48
C   r32 r48 r64 r80	partial products u32 * v00..v48
C   a00 a16 a32 a48	sums of partial products of equal weight, converted to
C			64-bit integers and passed through the stack
C   i00 i16 i32 i48	the a00..a48 values reloaded into integer registers
C   cy			running carry; also supplies the return value
C   xffff		mask constant, -1 >> 48
C   xffffffff		mask constant, -1 >> 32; set up in the prologue but not
C			otherwise referenced in this routine
C   rlimb		defined here but not referenced in this routine

define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_mul_1)

C mpn_mul_1 (rp, up, n, v) -- multiply the n-limb vector at up by the limb v,
C store the n result limbs at rp, and return the carry-out limb in %o0.
C
C After the save below: %i0 = rp, %i1 = up, %i2 = n, %i3 = v.
C The first limb is loaded unconditionally, so n is assumed to be >= 1
C (presumably guaranteed by callers -- confirm).
C
C Layout of the routine:
C   - prologue: build constants, bias the pointers, start the first limb;
C   - feed-in code (.L_one, .L_two_or_more, .L_two, ...) which fills the
C     software pipeline and, for n = 1, 2, 3, 4, branches to the matching
C     wind-down entry;
C   - .Loop: steady state, one limb per iteration, entered when n >= 5;
C   - wind-down code .L_out_4, .L_out_3, .L_out_2, .L_out_1, each falling
C     through to the next, which drains the pipeline and forms the return
C     value.
C
C Each limb yields four integers i00, i16, i32, i48.  Together with the
C incoming carry cy they are combined by the integer code as
C     x    = i00 + cy
C     hi64 = (i16 >> 48) + (i32 >> 32) + (i48 >> 16)
C     mi64 = i16 + (i32 << 16) + (i48 << 32) - (hi64 << 48) + (x >> 16)
C     limb = (mi64 << 16) | (x & 0xffff)
C     cy   = (mi64 >> 48) + hi64
C
C [%sp+2223+0] .. [%sp+2223+31] is a 32-byte scratch area used to move values
C between the integer and floating-point register files (stx/ldd in the
C prologue, std/ldx afterwards).  The offset 2223 is presumably the SPARC V9
C stack bias 2047 plus 176 bytes of register save and argument area -- confirm
C against the ABI.

C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2		C v bits 0..15
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3		C v bits 16..31
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2		C v bits 32..47
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3		C v bits 48..63
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2		C n * 8, size of the vectors in bytes
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0		C rp + 8n
	add	%i1, %i2, %i1		C up + 8n
	neg	%i2			C negative byte index, counts up to 0
	add	%i1, 4, %i5		C up + 8n + 4, second word of each limb
	add	%i0, -32, %i4		C store base: stores trail loads by 4 limbs
	add	%i0, -16, %i0		C not referenced again in this routine

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0],%f2	C zero f2
	ld	[%sp+2223+0],%f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakingly read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.

C Feed-in stage 1.  The first limb has no predecessor, so its u00*v00 and
C u00*v16 products go straight to a00 and a16.
	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2		C index becomes zero here iff n = 1
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

C n = 1: finish the only limb, then join the wind-down code at .L_out_1.
.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	mov	i00, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2

C Feed-in stage 2.  From here on a00 and a16 also absorb r64 and r80, the
C u32*v32 and u32*v48 products of the preceding limb.
.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2		C index becomes zero here iff n = 2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

C n = 2: finish the second limb, then join the wind-down code at .L_out_2.
.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2

C Feed-in stage 3.  The integer side starts reloading i00..i48 from the
C scratch area.
.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2		C index becomes zero here iff n = 3
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

C n = 3: finish the third limb, then join the wind-down code at .L_out_3.
.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2

C Feed-in stage 4.  Same work as one loop iteration except that no result
C limb is ready to be stored yet.
.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2		C index becomes zero here iff n = 4
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

C n = 4: skip the loop.  The branch is annulled, so no delay slot executes.
.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
C The "C nn" markers are the cycle numbers of the 14-cycle schedule.
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C result limb
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C store result limb
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

C Wind-down.  No more limbs are loaded; each stage below stores one result
C limb and does progressively less floating-point work.
.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C result limb
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C store result limb
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
C The last limb's r64 and r80 have no successor to be added into, so they
C are converted on their own here and reach the carry at the very end.
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C result limb
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C store result limb
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00	C converted r64 of the last limb
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16	C converted r80 of the last limb
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C result limb
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C store result limb
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5		C result limb
	stx	%o5, [%i4+%i2]		C store the most significant result limb

C Fold the last limb's converted r64 (now in i00) and r80 (now in i16, to be
C shifted up by 16 bits) into the carry; the sum is the return value.
	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0			C delay slot: hand back the carry limb
EPILOGUE(mpn_mul_1)