Blame mpn/pa64/submul_1.asm

Packit 5c3484
dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
Packit 5c3484
dnl  subtract the result from a second limb vector.
Packit 5c3484
Packit 5c3484
dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C		    cycles/limb
Packit 5c3484
C 8000,8200:		7
Packit 5c3484
C 8500,8600,8700:	6.5
Packit 5c3484
Packit 5c3484
C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
Packit 5c3484
C  could be saved there per call.
Packit 5c3484
Packit 5c3484
C  DESCRIPTION:
Packit 5c3484
C  The main loop "BIG" is 4-way unrolled, mainly to allow
Packit 5c3484
C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
Packit 5c3484
C  registers to the IU registers, have demanded a deep software pipeline, and
Packit 5c3484
C  a lot of stack slots for partial products in flight.
Packit 5c3484
C
Packit 5c3484
C  CODE STRUCTURE:
Packit 5c3484
C  save-some-registers
Packit 5c3484
C  do 0, 1, 2, or 3 limbs
Packit 5c3484
C  if done, restore-some-regs and return
Packit 5c3484
C  save-many-regs
Packit 5c3484
C  do 4, 8, ... limbs
Packit 5c3484
C  restore-all-regs
Packit 5c3484
Packit 5c3484
C  STACK LAYOUT:
Packit 5c3484
C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
Packit 5c3484
C  slots marked FREE, as well as some slots in the caller's "frame marker".
Packit 5c3484
C
Packit 5c3484
C -00 <- r30
Packit 5c3484
C -08  FREE
Packit 5c3484
C -10  tmp
Packit 5c3484
C -18  tmp
Packit 5c3484
C -20  tmp
Packit 5c3484
C -28  tmp
Packit 5c3484
C -30  tmp
Packit 5c3484
C -38  tmp
Packit 5c3484
C -40  tmp
Packit 5c3484
C -48  tmp
Packit 5c3484
C -50  tmp
Packit 5c3484
C -58  tmp
Packit 5c3484
C -60  tmp
Packit 5c3484
C -68  tmp
Packit 5c3484
C -70  tmp
Packit 5c3484
C -78  tmp
Packit 5c3484
C -80  tmp
Packit 5c3484
C -88  tmp
Packit 5c3484
C -90  FREE
Packit 5c3484
C -98  FREE
Packit 5c3484
C -a0  FREE
Packit 5c3484
C -a8  FREE
Packit 5c3484
C -b0  r13
Packit 5c3484
C -b8  r12
Packit 5c3484
C -c0  r11
Packit 5c3484
C -c8  r10
Packit 5c3484
C -d0  r9
Packit 5c3484
C -d8  r8
Packit 5c3484
C -e0  r7
Packit 5c3484
C -e8  r6
Packit 5c3484
C -f0  r5
Packit 5c3484
C -f8  r4
Packit 5c3484
C -100 r3
Packit 5c3484
C  Previous frame:
Packit 5c3484
C  [unused area]
Packit 5c3484
C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
Packit 5c3484
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS:
Packit 5c3484
define(`rp',`%r26')	C limb vector that is loaded, reduced and stored back in place
Packit 5c3484
define(`up',`%r25')	C limb vector whose limbs are multiplied by vlimb
Packit 5c3484
define(`n',`%r24')	C limb count
Packit 5c3484
define(`vlimb',`%r23')	C multiplier limb
Packit 5c3484
Packit 5c3484
define(`climb',`%r23')	C running carry limb, shares %r23 with vlimb
Packit 5c3484
Packit 5c3484
ifdef(`HAVE_ABI_2_0w',
Packit 5c3484
`	.level	2.0w
Packit 5c3484
',`	.level	2.0
Packit 5c3484
')
Packit 5c3484
PROLOGUE(mpn_submul_1)
C  Entry code.  Saves %r3-%r5, zeroes climb, loads vlimb into %fr8 and then
C  handles the n mod 4 leading limbs one at a time.  When n mod 4 is zero
C  control goes straight to the 4-way unrolled code at BIG.
Packit 5c3484
Packit 5c3484
ifdef(`HAVE_ABI_2_0w',
Packit 5c3484
`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
Packit 5c3484
')
Packit 5c3484
C  std,ma stores %r3 at the old %r30 and then advances %r30 by 0x100, so the
C  frame offsets below and the -0x138 home slot offset are relative to the
C  new %r30.
	std,ma		%r3, 0x100(%r30)
Packit 5c3484
	std		%r4, -0xf8(%r30)
Packit 5c3484
	std		%r5, -0xf0(%r30)
Packit 5c3484
	ldo		0(%r0), climb		C clear climb
Packit 5c3484
	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
Packit 5c3484
Packit 5c3484
C  Register names for the one-limb-at-a-time code.  Note that m032 and ma064
C  are both %r20.
define(`p032a1',`%r1')	C first mid product, reloaded from -0x78
Packit 5c3484
define(`p032a2',`%r19')	C second mid product, reloaded from -0x70
Packit 5c3484
Packit 5c3484
define(`m032',`%r20')	C sum of the two mid products
Packit 5c3484
define(`m096',`%r21')	C carry out of the mid product sum
Packit 5c3484
Packit 5c3484
define(`p000a',`%r22')	C low product, reloaded from -0x80
Packit 5c3484
define(`p064a',`%r29')	C high product, reloaded from -0x68
Packit 5c3484
Packit 5c3484
define(`s000',`%r31')	C result limb being accumulated
Packit 5c3484
Packit 5c3484
define(`ma000',`%r4')	C low 32 bits of m032 moved to the upper half
Packit 5c3484
define(`ma064',`%r20')	C upper 32 bits of m032 with m096 deposited above them
Packit 5c3484
Packit 5c3484
define(`r000',`%r3')	C limb loaded from rp
Packit 5c3484
Packit 5c3484
C  %r5 = low two bits of n, i.e. n mod 4.
	extrd,u		n, 63, 2, %r5
Packit 5c3484
	cmpb,=		%r5, %r0, L(BIG)
Packit 5c3484
	nop
Packit 5c3484
Packit 5c3484
C  First limb: form the four 32x32 partial products and spill them to the
C  tmp slots so they can be reloaded into integer registers.
	fldd		0(up), %fr4
Packit 5c3484
	ldo		8(up), up
Packit 5c3484
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr24
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr25
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
C  Decrement the leftover count; fall through to "one" when it reaches zero.
C  The fstd after the branch sits in its delay slot.
	addib,<>	-1, %r5, L(two_or_more)
Packit 5c3484
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
LDEF(one)
C  Exactly one leftover limb: reload its four partial products into integer
C  registers and join the common tail at 0_one_out.  The last ldd is in the
C  delay slot of the branch.
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	b		L(0_one_out)
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
Packit 5c3484
LDEF(two_or_more)
C  At least two leftover limbs.  Multiply the second limb while reloading the
C  first limb products from the tmp slots; each ldd reads a slot just before
C  the following fstd overwrites it with the new product.
Packit 5c3484
	fldd		0(up), %fr4
Packit 5c3484
	ldo		8(up), up
Packit 5c3484
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr24
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr25
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
C  Decrement the leftover count; fall through to "two" when it reaches zero.
	addib,<>	-1, %r5, L(three_or_more)
Packit 5c3484
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
LDEF(two)
C  Exactly two leftover limbs.  Combine the mid products of the first limb:
C  m032 is their sum and m096 its carry.  ma000 gets the low half of m032
C  moved to the upper 32 bits, ma064 gets the upper half of m032 with m096
C  deposited above it.  ma064 shares %r20 with m032, so ma000 is formed first.
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	add,dc		%r0, %r0, m096
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
C  The depd below executes in the delay slot of the branch.
	b		L(0_two_out)
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
Packit 5c3484
LDEF(three_or_more)
C  Three leftover limbs (%r5 holds n mod 4, so at most three).  Load the third
C  limb and combine the mid products of the first limb as in "two".
Packit 5c3484
	fldd		0(up), %fr4
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	add,dc		%r0, %r0, m096
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
C	addib,=		-1, %r5, L(0_out)
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
LDEF(loop0)
C  The whole body of loop0 below is commented out, as is the addib above, so
C  control falls straight through this label into 0_out.
Packit 5c3484
C	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
C	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
C	ldd		-0x78(%r30), p032a1
Packit 5c3484
C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
C
Packit 5c3484
C	xmpyu		%fr8R, %fr4R, %fr24
Packit 5c3484
C	xmpyu		%fr8L, %fr4L, %fr25
Packit 5c3484
C	ldd		-0x70(%r30), p032a2
Packit 5c3484
C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
C
Packit 5c3484
C	ldo		8(rp), rp
Packit 5c3484
C	add		climb, p000a, s000
Packit 5c3484
C	ldd		-0x80(%r30), p000a
Packit 5c3484
C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
C
Packit 5c3484
C	add,dc		p064a, %r0, climb
Packit 5c3484
C	ldo		8(up), up
Packit 5c3484
C	ldd		-0x68(%r30), p064a
Packit 5c3484
C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
C
Packit 5c3484
C	add		ma000, s000, s000
Packit 5c3484
C	add,dc		ma064, climb, climb
Packit 5c3484
C	fldd		0(up), %fr4
Packit 5c3484
C
Packit 5c3484
C	sub		r000, s000, s000
Packit 5c3484
C	sub,db		%r0, climb, climb
Packit 5c3484
C	sub		%r0, climb, climb
Packit 5c3484
C	std		s000, -8(rp)
Packit 5c3484
C
Packit 5c3484
C	add		p032a1, p032a2, m032
Packit 5c3484
C	add,dc		%r0, %r0, m096
Packit 5c3484
C
Packit 5c3484
C	depd,z		m032, 31, 32, ma000
Packit 5c3484
C	extrd,u		m032, 31, 32, ma064
Packit 5c3484
C	ldd		0(rp), r000
Packit 5c3484
C	addib,<>	-1, %r5, L(loop0)
Packit 5c3484
C	depd		m096, 31, 32, ma064
Packit 5c3484
LDEF(0_out)
C  Wind-down with two limbs still in flight behind the one in %fr4.
C  Multiplies the limb in %fr4, finishes and stores the oldest limb, and
C  prepares the mid product sum of the next one.
Packit 5c3484
	ldo		8(up), up
Packit 5c3484
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr24
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr25
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
	ldo		8(rp), rp
Packit 5c3484
C  s000 = climb + low product, climb = high product + carry.  The ldd after
C  each add reloads the same register with the product of the following limb.
	add		climb, p000a, s000
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
	add,dc		p064a, %r0, climb
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
	add		ma000, s000, s000
Packit 5c3484
	add,dc		ma064, climb, climb
Packit 5c3484
C  Subtract the product limb from the rp limb.  The sub,db/sub pair negates
C  climb together with the borrow and negates it back, which adds the borrow
C  into climb.  rp was already advanced, hence the -8 store offset.
	sub		r000, s000, s000
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s000, -8(rp)
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	add,dc		%r0, %r0, m096
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
LDEF(0_two_out)
C  Wind-down with one limb left behind the current one.  No more multiplies:
C  reload the last limb products from the tmp slots while finishing and
C  storing the current limb.
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	ldo		8(rp), rp
Packit 5c3484
	add		climb, p000a, s000
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	add,dc		p064a, %r0, climb
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
	add		ma000, s000, s000
Packit 5c3484
	add,dc		ma064, climb, climb
Packit 5c3484
C  Same subtract and borrow-into-climb sequence as in 0_out; rp was already
C  advanced, hence the -8 store offset.
	sub		r000, s000, s000
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s000, -8(rp)
Packit 5c3484
LDEF(0_one_out)
C  Last of the n mod 4 leftover limbs.  Combine its mid products, add low and
C  high products and climb, subtract from the rp limb and store the result.
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	add,dc		%r0, %r0, m096
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
Packit 5c3484
	add		climb, p000a, s000
Packit 5c3484
	add,dc		p064a, %r0, climb
Packit 5c3484
	add		ma000, s000, s000
Packit 5c3484
	add,dc		ma064, climb, climb
Packit 5c3484
	sub		r000, s000, s000
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s000, 0(rp)
Packit 5c3484
Packit 5c3484
C  Branch to done when 4 >= n, i.e. no group of four limbs remains.
C  Otherwise fall into BIG.  The rp advance is in the delay slot and is
C  executed on both paths.
	cmpib,>=	4, n, L(done)
Packit 5c3484
	ldo		8(rp), rp
Packit 5c3484
Packit 5c3484
C 4-way unrolled code.
Packit 5c3484
Packit 5c3484
LDEF(BIG)
C  Setup for the 4-way unrolled code: redefine the register names, save
C  %r6-%r13 and turn n into a count of 4-limb groups.
C
C  Naming used below, as far as the code shows: the letter a..d selects one of
C  the four limbs of a group and the number is the bit weight within the
C  group.  pNNN are partial products reloaded from the tmp slots, mNNN are
C  sums of the two mid products of a limb, maNNN are those sums realigned to
C  64-bit limb boundaries, sNNN are the result limbs being accumulated and
C  rNNN are limbs loaded from rp.  Several names share a register, so the
C  instruction order in the loop matters.
Packit 5c3484
Packit 5c3484
define(`p032a1',`%r1')	C
Packit 5c3484
define(`p032a2',`%r19')	C
Packit 5c3484
define(`p096b1',`%r20')	C
Packit 5c3484
define(`p096b2',`%r21')	C
Packit 5c3484
define(`p160c1',`%r22')	C
Packit 5c3484
define(`p160c2',`%r29')	C
Packit 5c3484
define(`p224d1',`%r31')	C
Packit 5c3484
define(`p224d2',`%r3')	C
Packit 5c3484
			C
Packit 5c3484
define(`m032',`%r4')	C
Packit 5c3484
define(`m096',`%r5')	C
Packit 5c3484
define(`m160',`%r6')	C
Packit 5c3484
define(`m224',`%r7')	C
Packit 5c3484
define(`m288',`%r8')	C
Packit 5c3484
			C
Packit 5c3484
define(`p000a',`%r1')	C
Packit 5c3484
define(`p064a',`%r19')	C
Packit 5c3484
define(`p064b',`%r20')	C
Packit 5c3484
define(`p128b',`%r21')	C
Packit 5c3484
define(`p128c',`%r22')	C
Packit 5c3484
define(`p192c',`%r29')	C
Packit 5c3484
define(`p192d',`%r31')	C
Packit 5c3484
define(`p256d',`%r3')	C
Packit 5c3484
			C
Packit 5c3484
define(`s000',`%r10')	C
Packit 5c3484
define(`s064',`%r11')	C
Packit 5c3484
define(`s128',`%r12')	C
Packit 5c3484
define(`s192',`%r13')	C
Packit 5c3484
			C
Packit 5c3484
define(`ma000',`%r9')	C
Packit 5c3484
define(`ma064',`%r4')	C
Packit 5c3484
define(`ma128',`%r5')	C
Packit 5c3484
define(`ma192',`%r6')	C
Packit 5c3484
define(`ma256',`%r7')	C
Packit 5c3484
			C
Packit 5c3484
define(`r000',`%r1')	C
Packit 5c3484
define(`r064',`%r19')	C
Packit 5c3484
define(`r128',`%r20')	C
Packit 5c3484
define(`r192',`%r21')	C
Packit 5c3484
Packit 5c3484
C  Save the additional registers used by the unrolled code.  They are
C  reloaded from the same slots just before the done label.
	std		%r6, -0xe8(%r30)
Packit 5c3484
	std		%r7, -0xe0(%r30)
Packit 5c3484
	std		%r8, -0xd8(%r30)
Packit 5c3484
	std		%r9, -0xd0(%r30)
Packit 5c3484
	std		%r10, -0xc8(%r30)
Packit 5c3484
	std		%r11, -0xc0(%r30)
Packit 5c3484
	std		%r12, -0xb8(%r30)
Packit 5c3484
	std		%r13, -0xb0(%r30)
Packit 5c3484
Packit 5c3484
C  n = n / 4, the number of 4-limb groups still to process.
ifdef(`HAVE_ABI_2_0w',
Packit 5c3484
`	extrd,u		n, 61, 62, n		C right shift 2
Packit 5c3484
',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
LDEF(4_or_more)
C  Feed-in for the first group of four limbs: load them, form all sixteen
C  32x32 partial products and start spilling them to the tmp slots.
C  %fr22-%fr27 are reused for low and high products once their mid products
C  have been stored.
Packit 5c3484
	fldd		0(up), %fr4
Packit 5c3484
	fldd		8(up), %fr5
Packit 5c3484
	fldd		16(up), %fr6
Packit 5c3484
	fldd		24(up), %fr7
Packit 5c3484
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	xmpyu		%fr8R, %fr5L, %fr24
Packit 5c3484
	xmpyu		%fr8L, %fr5R, %fr25
Packit 5c3484
	xmpyu		%fr8R, %fr6L, %fr26
Packit 5c3484
	xmpyu		%fr8L, %fr6R, %fr27
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
	xmpyu		%fr8R, %fr7L, %fr28
Packit 5c3484
	xmpyu		%fr8L, %fr7R, %fr29
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr30
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr31
Packit 5c3484
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
Packit 5c3484
	xmpyu		%fr8R, %fr5R, %fr22
Packit 5c3484
	xmpyu		%fr8L, %fr5L, %fr23
Packit 5c3484
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
Packit 5c3484
	xmpyu		%fr8R, %fr6R, %fr24
Packit 5c3484
	xmpyu		%fr8L, %fr6L, %fr25
Packit 5c3484
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
Packit 5c3484
	xmpyu		%fr8R, %fr7R, %fr26
Packit 5c3484
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
Packit 5c3484
C  Decrement the group count and branch if more groups follow.  The last
C  xmpyu is in the delay slot of the branch.
	addib,<>	-1, n, L(8_or_more)
Packit 5c3484
	xmpyu		%fr8L, %fr7L, %fr27
Packit 5c3484
C  Only one group: spill the remaining products, reload the mid products into
C  integer registers and go directly to the final wind-down block end1.
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
Packit 5c3484
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
Packit 5c3484
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
Packit 5c3484
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
Packit 5c3484
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
Packit 5c3484
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
Packit 5c3484
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
Packit 5c3484
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	ldd		-0x38(%r30), p096b1
Packit 5c3484
	ldd		-0x30(%r30), p096b2
Packit 5c3484
	ldd		-0x58(%r30), p160c1
Packit 5c3484
	ldd		-0x50(%r30), p160c2
Packit 5c3484
	ldd		-0x18(%r30), p224d1
Packit 5c3484
	ldd		-0x10(%r30), p224d2
Packit 5c3484
	b		L(end1)
Packit 5c3484
	nop
Packit 5c3484
Packit 5c3484
LDEF(8_or_more)
C  Feed-in for the second group.  Spill the rest of the first group products,
C  advance up, then multiply the next four limbs while reloading the first
C  group mid products into integer registers.  Each ldd of a mid slot comes
C  before the fstd that overwrites that slot with the new product.
Packit 5c3484
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
Packit 5c3484
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
Packit 5c3484
	ldo		32(up), up
Packit 5c3484
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
Packit 5c3484
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
Packit 5c3484
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
Packit 5c3484
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
Packit 5c3484
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
Packit 5c3484
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
Packit 5c3484
	fldd		0(up), %fr4
Packit 5c3484
	fldd		8(up), %fr5
Packit 5c3484
	fldd		16(up), %fr6
Packit 5c3484
	fldd		24(up), %fr7
Packit 5c3484
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	xmpyu		%fr8R, %fr5L, %fr24
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	xmpyu		%fr8L, %fr5R, %fr25
Packit 5c3484
	xmpyu		%fr8R, %fr6L, %fr26
Packit 5c3484
	ldd		-0x38(%r30), p096b1
Packit 5c3484
	xmpyu		%fr8L, %fr6R, %fr27
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
	xmpyu		%fr8R, %fr7L, %fr28
Packit 5c3484
	ldd		-0x30(%r30), p096b2
Packit 5c3484
	xmpyu		%fr8L, %fr7R, %fr29
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr30
Packit 5c3484
	ldd		-0x58(%r30), p160c1
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr31
Packit 5c3484
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
Packit 5c3484
	xmpyu		%fr8R, %fr5R, %fr22
Packit 5c3484
	ldd		-0x50(%r30), p160c2
Packit 5c3484
	xmpyu		%fr8L, %fr5L, %fr23
Packit 5c3484
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
Packit 5c3484
	xmpyu		%fr8R, %fr6R, %fr24
Packit 5c3484
	ldd		-0x18(%r30), p224d1
Packit 5c3484
	xmpyu		%fr8L, %fr6L, %fr25
Packit 5c3484
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
Packit 5c3484
	xmpyu		%fr8R, %fr7R, %fr26
Packit 5c3484
	ldd		-0x10(%r30), p224d2
Packit 5c3484
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
Packit 5c3484
C  Decrement the group count; with exactly two groups skip the loop and go to
C  end2.  The last xmpyu is in the delay slot of the branch.
	addib,=		-1, n, L(end2)
Packit 5c3484
	xmpyu		%fr8L, %fr7L, %fr27
Packit 5c3484
LDEF(loop)
C  Main 4-way unrolled loop.  Each pass finishes and stores the four result
C  limbs of one group, spills the products of the following group to the tmp
C  slots, and loads and multiplies the group after that.
C
C  Sum the two mid products of each limb.  add,dc carries from one sum into
C  the next and m288 receives the final carry.
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	add,dc		p096b1, p096b2, m096
Packit 5c3484
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
Packit 5c3484
Packit 5c3484
	add,dc		p160c1, p160c2, m160
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
	add,dc		p224d1, p224d2, m224
Packit 5c3484
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
Packit 5c3484
Packit 5c3484
	add,dc		%r0, %r0, m288
Packit 5c3484
	ldd		-0x40(%r30), p064b
Packit 5c3484
	ldo		32(up), up
Packit 5c3484
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
Packit 5c3484
C  Realign the mid sums to limb boundaries.  Each maNNN takes the upper half
C  of one sum in its low 32 bits and the low half of the next sum in its
C  upper 32 bits.  maNNN registers alias the mNNN registers, so each sum is
C  consumed in this order before being overwritten.
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	ldd		-0x28(%r30), p128b
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
	ldd		-0x60(%r30), p128c
Packit 5c3484
	extrd,u		m096, 31, 32, ma128
Packit 5c3484
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
Packit 5c3484
Packit 5c3484
	depd		m160, 31, 32, ma128
Packit 5c3484
	ldd		-0x48(%r30), p192c
Packit 5c3484
	extrd,u		m160, 31, 32, ma192
Packit 5c3484
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
Packit 5c3484
Packit 5c3484
	depd		m224, 31, 32, ma192
Packit 5c3484
	ldd		-0x20(%r30), p192d
Packit 5c3484
	extrd,u		m224, 31, 32, ma256
Packit 5c3484
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
Packit 5c3484
Packit 5c3484
C  Add the incoming climb and the low and high products with one carry
C  chain; the new climb starts as the top high product plus carry.
	depd		m288, 31, 32, ma256
Packit 5c3484
	ldd		-0x88(%r30), p256d
Packit 5c3484
	add		climb, p000a, s000
Packit 5c3484
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
Packit 5c3484
Packit 5c3484
	add,dc		p064a, p064b, s064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
	add,dc		p128b, p128c, s128
Packit 5c3484
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
Packit 5c3484
Packit 5c3484
	add,dc		p192c, p192d, s192
Packit 5c3484
	ldd		8(rp), r064
Packit 5c3484
	add,dc		p256d, %r0, climb
Packit 5c3484
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
Packit 5c3484
Packit 5c3484
	ldd		16(rp), r128
Packit 5c3484
	add		ma000, s000, s000	C accum mid 0
Packit 5c3484
	ldd		24(rp), r192
Packit 5c3484
	add,dc		ma064, s064, s064	C accum mid 1
Packit 5c3484
Packit 5c3484
	add,dc		ma128, s128, s128	C accum mid 2
Packit 5c3484
	fldd		0(up), %fr4
Packit 5c3484
	add,dc		ma192, s192, s192	C accum mid 3
Packit 5c3484
	fldd		8(up), %fr5
Packit 5c3484
Packit 5c3484
	add,dc		ma256, climb, climb
Packit 5c3484
	fldd		16(up), %fr6
Packit 5c3484
	sub		r000, s000, s000	C accum rlimb 0
Packit 5c3484
	fldd		24(up), %fr7
Packit 5c3484
Packit 5c3484
	sub,db		r064, s064, s064	C accum rlimb 1
Packit 5c3484
	sub,db		r128, s128, s128	C accum rlimb 2
Packit 5c3484
	std		s000, 0(rp)
Packit 5c3484
Packit 5c3484
C  The sub,db/sub pair on climb negates it together with the borrow and
C  negates it back, which adds the final borrow into climb.
	sub,db		r192, s192, s192	C accum rlimb 3
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s064, 8(rp)
Packit 5c3484
Packit 5c3484
C  Multiply the four limbs just loaded while reloading the mid products that
C  were spilled earlier; each slot is read before it is overwritten.
	xmpyu		%fr8R, %fr4L, %fr22
Packit 5c3484
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	xmpyu		%fr8L, %fr4R, %fr23
Packit 5c3484
	std		s128, 16(rp)
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr5L, %fr24
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	xmpyu		%fr8L, %fr5R, %fr25
Packit 5c3484
	std		s192, 24(rp)
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr6L, %fr26
Packit 5c3484
	ldd		-0x38(%r30), p096b1
Packit 5c3484
	xmpyu		%fr8L, %fr6R, %fr27
Packit 5c3484
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr7L, %fr28
Packit 5c3484
	ldd		-0x30(%r30), p096b2
Packit 5c3484
	xmpyu		%fr8L, %fr7R, %fr29
Packit 5c3484
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr4R, %fr30
Packit 5c3484
	ldd		-0x58(%r30), p160c1
Packit 5c3484
	xmpyu		%fr8L, %fr4L, %fr31
Packit 5c3484
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr5R, %fr22
Packit 5c3484
	ldd		-0x50(%r30), p160c2
Packit 5c3484
	xmpyu		%fr8L, %fr5L, %fr23
Packit 5c3484
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr6R, %fr24
Packit 5c3484
	ldd		-0x18(%r30), p224d1
Packit 5c3484
	xmpyu		%fr8L, %fr6L, %fr25
Packit 5c3484
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
Packit 5c3484
Packit 5c3484
	xmpyu		%fr8R, %fr7R, %fr26
Packit 5c3484
	ldd		-0x10(%r30), p224d2
Packit 5c3484
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
Packit 5c3484
	xmpyu		%fr8L, %fr7L, %fr27
Packit 5c3484
Packit 5c3484
C  Loop while groups remain; the rp advance is in the delay slot and is
C  executed on both paths.
	addib,<>	-1, n, L(loop)
Packit 5c3484
	ldo		32(rp), rp
Packit 5c3484
Packit 5c3484
LDEF(end2)
C  First wind-down block.  Same accumulate, subtract and store sequence as the
C  loop body for the group whose mid products are already in registers, but
C  with no further loads from up and no multiplies.  The products of the last
C  group are spilled and its mid products reloaded for end1.
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	add,dc		p096b1, p096b2, m096
Packit 5c3484
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
Packit 5c3484
	add,dc		p160c1, p160c2, m160
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
	add,dc		p224d1, p224d2, m224
Packit 5c3484
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
Packit 5c3484
	add,dc		%r0, %r0, m288
Packit 5c3484
	ldd		-0x40(%r30), p064b
Packit 5c3484
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	ldd		-0x28(%r30), p128b
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
	ldd		-0x60(%r30), p128c
Packit 5c3484
	extrd,u		m096, 31, 32, ma128
Packit 5c3484
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
Packit 5c3484
	depd		m160, 31, 32, ma128
Packit 5c3484
	ldd		-0x48(%r30), p192c
Packit 5c3484
	extrd,u		m160, 31, 32, ma192
Packit 5c3484
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
Packit 5c3484
	depd		m224, 31, 32, ma192
Packit 5c3484
	ldd		-0x20(%r30), p192d
Packit 5c3484
	extrd,u		m224, 31, 32, ma256
Packit 5c3484
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
Packit 5c3484
	depd		m288, 31, 32, ma256
Packit 5c3484
	ldd		-0x88(%r30), p256d
Packit 5c3484
	add		climb, p000a, s000
Packit 5c3484
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
Packit 5c3484
	add,dc		p064a, p064b, s064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
	add,dc		p128b, p128c, s128
Packit 5c3484
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
Packit 5c3484
	add,dc		p192c, p192d, s192
Packit 5c3484
	ldd		8(rp), r064
Packit 5c3484
	add,dc		p256d, %r0, climb
Packit 5c3484
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
Packit 5c3484
	ldd		16(rp), r128
Packit 5c3484
	add		ma000, s000, s000	C accum mid 0
Packit 5c3484
	ldd		24(rp), r192
Packit 5c3484
	add,dc		ma064, s064, s064	C accum mid 1
Packit 5c3484
	add,dc		ma128, s128, s128	C accum mid 2
Packit 5c3484
	add,dc		ma192, s192, s192	C accum mid 3
Packit 5c3484
	add,dc		ma256, climb, climb
Packit 5c3484
	sub		r000, s000, s000	C accum rlimb 0
Packit 5c3484
	sub,db		r064, s064, s064	C accum rlimb 1
Packit 5c3484
	sub,db		r128, s128, s128	C accum rlimb 2
Packit 5c3484
	std		s000, 0(rp)
Packit 5c3484
	sub,db		r192, s192, s192	C accum rlimb 3
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s064, 8(rp)
Packit 5c3484
C  Reload the mid products of the last group, interleaved with the remaining
C  stores, then advance rp to that group.
	ldd		-0x78(%r30), p032a1
Packit 5c3484
	std		s128, 16(rp)
Packit 5c3484
	ldd		-0x70(%r30), p032a2
Packit 5c3484
	std		s192, 24(rp)
Packit 5c3484
	ldd		-0x38(%r30), p096b1
Packit 5c3484
	ldd		-0x30(%r30), p096b2
Packit 5c3484
	ldd		-0x58(%r30), p160c1
Packit 5c3484
	ldd		-0x50(%r30), p160c2
Packit 5c3484
	ldd		-0x18(%r30), p224d1
Packit 5c3484
	ldd		-0x10(%r30), p224d2
Packit 5c3484
	ldo		32(rp), rp
Packit 5c3484
Packit 5c3484
LDEF(end1)
C  Final wind-down block.  Finishes the last group of four limbs from the
C  products already in the tmp slots and registers, stores the results and
C  then restores %r6-%r13.
Packit 5c3484
	add		p032a1, p032a2, m032
Packit 5c3484
	ldd		-0x80(%r30), p000a
Packit 5c3484
	add,dc		p096b1, p096b2, m096
Packit 5c3484
	add,dc		p160c1, p160c2, m160
Packit 5c3484
	ldd		-0x68(%r30), p064a
Packit 5c3484
	add,dc		p224d1, p224d2, m224
Packit 5c3484
	add,dc		%r0, %r0, m288
Packit 5c3484
	ldd		-0x40(%r30), p064b
Packit 5c3484
	depd,z		m032, 31, 32, ma000
Packit 5c3484
	ldd		-0x28(%r30), p128b
Packit 5c3484
	extrd,u		m032, 31, 32, ma064
Packit 5c3484
	depd		m096, 31, 32, ma064
Packit 5c3484
	ldd		-0x60(%r30), p128c
Packit 5c3484
	extrd,u		m096, 31, 32, ma128
Packit 5c3484
	depd		m160, 31, 32, ma128
Packit 5c3484
	ldd		-0x48(%r30), p192c
Packit 5c3484
	extrd,u		m160, 31, 32, ma192
Packit 5c3484
	depd		m224, 31, 32, ma192
Packit 5c3484
	ldd		-0x20(%r30), p192d
Packit 5c3484
	extrd,u		m224, 31, 32, ma256
Packit 5c3484
	depd		m288, 31, 32, ma256
Packit 5c3484
	ldd		-0x88(%r30), p256d
Packit 5c3484
	add		climb, p000a, s000
Packit 5c3484
	add,dc		p064a, p064b, s064
Packit 5c3484
	ldd		0(rp), r000
Packit 5c3484
	add,dc		p128b, p128c, s128
Packit 5c3484
	add,dc		p192c, p192d, s192
Packit 5c3484
	ldd		8(rp), r064
Packit 5c3484
	add,dc		p256d, %r0, climb
Packit 5c3484
	ldd		16(rp), r128
Packit 5c3484
	add		ma000, s000, s000	C accum mid 0
Packit 5c3484
	ldd		24(rp), r192
Packit 5c3484
	add,dc		ma064, s064, s064	C accum mid 1
Packit 5c3484
	add,dc		ma128, s128, s128	C accum mid 2
Packit 5c3484
	add,dc		ma192, s192, s192	C accum mid 3
Packit 5c3484
	add,dc		ma256, climb, climb
Packit 5c3484
	sub		r000, s000, s000	C accum rlimb 0
Packit 5c3484
	sub,db		r064, s064, s064	C accum rlimb 1
Packit 5c3484
	sub,db		r128, s128, s128	C accum rlimb 2
Packit 5c3484
	std		s000, 0(rp)
Packit 5c3484
	sub,db		r192, s192, s192	C accum rlimb 3
Packit 5c3484
	sub,db		%r0, climb, climb
Packit 5c3484
	sub		%r0, climb, climb
Packit 5c3484
	std		s064, 8(rp)
Packit 5c3484
	std		s128, 16(rp)
Packit 5c3484
	std		s192, 24(rp)
Packit 5c3484
Packit 5c3484
C  Restore the registers saved at BIG.  The short path that branches to done
C  from 0_one_out never saved them and skips these loads.
	ldd		-0xb0(%r30), %r13
Packit 5c3484
	ldd		-0xb8(%r30), %r12
Packit 5c3484
	ldd		-0xc0(%r30), %r11
Packit 5c3484
	ldd		-0xc8(%r30), %r10
Packit 5c3484
	ldd		-0xd0(%r30), %r9
Packit 5c3484
	ldd		-0xd8(%r30), %r8
Packit 5c3484
	ldd		-0xe0(%r30), %r7
Packit 5c3484
	ldd		-0xe8(%r30), %r6
Packit 5c3484
LDEF(done)
C  Common exit.  Moves the final climb into the return register or registers,
C  restores %r3-%r5 and returns through %r2.  With HAVE_ABI_2_0w climb is
C  copied to %r28; otherwise its low 32 bits go to %r29 and its upper 32 bits
C  to %r28.
Packit 5c3484
ifdef(`HAVE_ABI_2_0w',
Packit 5c3484
`	copy		climb, %r28
Packit 5c3484
',`	extrd,u		climb, 63, 32, %r29
Packit 5c3484
	extrd,u		climb, 31, 32, %r28
Packit 5c3484
')
Packit 5c3484
	ldd		-0xf0(%r30), %r5
Packit 5c3484
	ldd		-0xf8(%r30), %r4
Packit 5c3484
C  The ldd,mb in the delay slot first moves %r30 back by 0x100 and then
C  reloads %r3 from there, undoing the std,ma at entry.
	bve		(%r2)
Packit 5c3484
	ldd,mb		-0x100(%r30), %r3
Packit 5c3484
EPILOGUE(mpn_submul_1)