dnl  Alpha mpn_add_n, mpn_add_nc -- Add two limb vectors of the same length > 0
dnl  and store sum in a third limb vector; mpn_add_nc also adds a carry-in.

dnl  Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     ?
C EV5:     4.75
C EV6:     3

dnl  INPUT PARAMETERS
dnl  res_ptr	r16
dnl  s1_ptr	r17
dnl  s2_ptr	r18
dnl  size	r19
dnl  carry-in	r20	(mpn_add_nc only)
dnl
dnl  The carry-out is returned in r0.

ASM_START()
C mpn_add_nc: entry point that starts the addition from the carry passed
C in r20 rather than from zero.  It has no body of its own: it seeds the
C carry register r25 and branches into mpn_add_n at L(com).
PROLOGUE(mpn_add_nc)
	bis	r20,r31,r25		C r25 = carry-in (r31 reads as zero)
	br	L(com)			C join mpn_add_n after its carry clear
EPILOGUE()
C mpn_add_n: add the r19-limb vectors at r17 (s1_ptr) and r18 (s2_ptr),
C store the sums at r16 (res_ptr) and return the final carry in r0.
C
C Register roles in the code below:
C   r25      running carry passed from limb to limb
C   r0-r3    limbs loaded from s2_ptr
C   r4-r7    limbs loaded from s1_ptr
C   r28      s2 limb + s1 limb, before the carry is added ("main add")
C   r8       carry out of the main add
C   r20-r23  finished sum limbs (main add + carry, the "carry add")
C   r19      limb count, kept biased by -4 while the 4-limb loop runs
C
C Every limb needs two carry tests, one per add; the two results are merged
C with bis.  The cmpult for an add is frequently scheduled after a later
C add, so each comment names the add it belongs to.
PROLOGUE(mpn_add_n)
	bis	r31,r31,r25		C start with carry = 0
L(com):	subq	r19,4,r19		C size -= 4; mpn_add_nc joins here with r25 set
	blt	r19,$Lend2		C if less than 4 limbs, goto 2nd loop
C Start software pipeline for 1st loop: load the first group of 4 limb
C pairs, finish limb 1 and start limb 2.
	ldq	r0,0(r18)
	ldq	r4,0(r17)
	ldq	r1,8(r18)
	ldq	r5,8(r17)
	addq	r17,32,r17		C update s1_ptr (limbs 3,4 are now at -16,-8)
	addq	r0,r4,r28		C 1st main add
	ldq	r2,16(r18)
	addq	r25,r28,r20		C 1st carry add
	ldq	r3,24(r18)
	cmpult	r28,r4,r8		C cy from 1st main add
	ldq	r6,-16(r17)
	cmpult	r20,r28,r25		C cy from 1st carry add
	ldq	r7,-8(r17)
	bis	r8,r25,r25		C combine cy from the two adds
	subq	r19,4,r19		C decr loop cnt
	addq	r1,r5,r28		C 2nd main add
	addq	r18,32,r18		C update s2_ptr
	addq	r28,r25,r21		C 2nd carry add
	cmpult	r28,r5,r8		C cy from 2nd main add
	blt	r19,$Lend1		C if less than 4 limbs remain, jump
C 1st loop handles groups of 4 limbs in a software pipeline.  Each pass
C finishes limbs 2-4 of the current group, stores the group, and computes
C limb 1 and the start of limb 2 of the next group.
	ALIGN(16)
$Loop:	cmpult	r21,r28,r25		C cy from 2nd carry add
	ldq	r0,0(r18)
	bis	r8,r25,r25		C combine cy from the two adds
	ldq	r1,8(r18)
	addq	r2,r6,r28		C 3rd main add
	ldq	r4,0(r17)
	addq	r28,r25,r22		C 3rd carry add
	ldq	r5,8(r17)
	cmpult	r28,r6,r8		C cy from 3rd main add
	cmpult	r22,r28,r25		C cy from 3rd carry add
	stq	r20,0(r16)
	bis	r8,r25,r25		C combine cy from the two adds
	stq	r21,8(r16)
	addq	r3,r7,r28		C 4th main add
	addq	r28,r25,r23		C 4th carry add
	cmpult	r28,r7,r8		C cy from 4th main add
	cmpult	r23,r28,r25		C cy from 4th carry add
		addq	r17,32,r17		C update s1_ptr
	bis	r8,r25,r25		C combine cy from the two adds
		addq	r16,32,r16		C update res_ptr
	addq	r0,r4,r28		C 1st main add
	ldq	r2,16(r18)
	addq	r25,r28,r20		C 1st carry add
	ldq	r3,24(r18)
	cmpult	r28,r4,r8		C cy from 1st main add
	ldq	r6,-16(r17)
	cmpult	r20,r28,r25		C cy from 1st carry add
	ldq	r7,-8(r17)
	bis	r8,r25,r25		C combine cy from the two adds
	subq	r19,4,r19		C decr loop cnt
	stq	r22,-16(r16)
	addq	r1,r5,r28		C 2nd main add
	stq	r23,-8(r16)
	addq	r25,r28,r21		C 2nd carry add
		addq	r18,32,r18		C update s2_ptr
	cmpult	r28,r5,r8		C cy from 2nd main add
	bge	r19,$Loop
C Finish software pipeline for 1st loop: complete and store the group that
C is still in flight, without loading any further limbs.
$Lend1:	cmpult	r21,r28,r25		C cy from 2nd carry add
	bis	r8,r25,r25		C combine cy from the two adds
	addq	r2,r6,r28		C 3rd main add
	addq	r28,r25,r22		C 3rd carry add
	cmpult	r28,r6,r8		C cy from 3rd main add
	cmpult	r22,r28,r25		C cy from 3rd carry add
	stq	r20,0(r16)
	bis	r8,r25,r25		C combine cy from the two adds
	stq	r21,8(r16)
	addq	r3,r7,r28		C 4th main add
	addq	r28,r25,r23		C 4th carry add
	cmpult	r28,r7,r8		C cy from 4th main add
	cmpult	r23,r28,r25		C cy from 4th carry add
	bis	r8,r25,r25		C combine cy from the two adds
	addq	r16,32,r16		C update res_ptr
	stq	r22,-16(r16)
	stq	r23,-8(r16)
$Lend2:	addq	r19,4,r19		C undo the -4 bias: r19 = limbs still to do
	beq	r19,$Lret		C nothing left, return the carry
C Start software pipeline for 2nd loop
	ldq	r0,0(r18)
	ldq	r4,0(r17)
	subq	r19,1,r19
	beq	r19,$Lend0		C exactly one limb left, skip the loop
C 2nd loop handles the remaining limbs one per pass, loading the next limb
C pair while the current one is added; the last limb is done at Lend0.
	ALIGN(16)
$Loop0:	addq	r0,r4,r28		C main add
	ldq	r0,8(r18)
	cmpult	r28,r4,r8		C cy from main add (before r4 is reloaded)
	ldq	r4,8(r17)
	addq	r28,r25,r20		C carry add
	addq	r18,8,r18
	addq	r17,8,r17
	stq	r20,0(r16)
	cmpult	r20,r28,r25		C cy from carry add
	subq	r19,1,r19		C decr loop cnt
	bis	r8,r25,r25		C combine cy from the two adds
	addq	r16,8,r16
	bne	r19,$Loop0
C Final limb: its operands are already in r0 and r4.
$Lend0:	addq	r0,r4,r28		C main add
	addq	r28,r25,r20		C carry add
	cmpult	r28,r4,r8		C cy from main add
	cmpult	r20,r28,r25		C cy from carry add
	stq	r20,0(r16)
	bis	r8,r25,r25		C combine cy from the two adds

$Lret:	bis	r25,r31,r0		C return cy
	ret	r31,(r26),1
EPILOGUE()
ASM_END()
ASM_END()