Blame mpn/x86/k7/addlsh1_n.asm

Packit 5c3484
dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
Packit 5c3484
Packit 5c3484
dnl  Copyright 2011 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
Packit 5c3484
C The innerloop is 2*3-way unrolled, which is the best we can do with the available
Packit 5c3484
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
Packit 5c3484
C cannot feed carry between operations there.
Packit 5c3484
Packit 5c3484
C			    cycles/limb
Packit 5c3484
C P5
Packit 5c3484
C P6 model 0-8,10-12
Packit 5c3484
C P6 model 9  (Banias)
Packit 5c3484
C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
Packit 5c3484
C P4 model 0  (Willamette)
Packit 5c3484
C P4 model 1  (?)
Packit 5c3484
C P4 model 2  (Northwood)
Packit 5c3484
C P4 model 3  (Prescott)
Packit 5c3484
C P4 model 4  (Nocona)
Packit 5c3484
C Intel Atom			 6
Packit 5c3484
C AMD K6			 ?
Packit 5c3484
C AMD K7			 2.5
Packit 5c3484
C AMD K8
Packit 5c3484
Packit 5c3484
C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
Packit 5c3484
C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
Packit 5c3484
C that means we need an initial magic multiply.
Packit 5c3484
C
Packit 5c3484
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
Packit 5c3484
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
Packit 5c3484
C add/subtract blocks, which is right for addition but reversed for
Packit 5c3484
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
Packit 5c3484
C without losing any time, since we are limited not by issue rate but by carry
Packit 5c3484
C recurrency latency.
Packit 5c3484
C
Packit 5c3484
C Breaking carry recurrency might be a good idea.  We would then need separate
Packit 5c3484
C registers for the shift carry and add/subtract carry, which in turn would
Packit 5c3484
C force us to 2*2-way unrolling.
Packit 5c3484
Packit 5c3484
defframe(PARAM_SIZE,	16)
Packit 5c3484
defframe(PARAM_DBLD,	12)
Packit 5c3484
defframe(PARAM_SRC,	 8)
Packit 5c3484
defframe(PARAM_DST,	 4)
Packit 5c3484
Packit 5c3484
dnl  re-use parameter space
Packit 5c3484
define(VAR_COUNT,`PARAM_DST')
Packit 5c3484
define(VAR_TMP,`PARAM_DBLD')
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
	TEXT
Packit 5c3484
	ALIGN(8)
Packit 5c3484
PROLOGUE(mpn_addlsh1_n)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
define(`rp',  `%edi')
Packit 5c3484
define(`up',  `%esi')
Packit 5c3484
define(`vp',  `%ebp')
Packit 5c3484
Packit 5c3484
	mov	$0x2aaaaaab, %eax
Packit 5c3484
Packit 5c3484
	push	%ebx			FRAME_pushl()
Packit 5c3484
	mov	PARAM_SIZE, %ebx	C size
Packit 5c3484
Packit 5c3484
	push	rp			FRAME_pushl()
Packit 5c3484
	mov	PARAM_DST, rp
Packit 5c3484
Packit 5c3484
	mul	%ebx
Packit 5c3484
Packit 5c3484
	push	up			FRAME_pushl()
Packit 5c3484
	mov	PARAM_SRC, up
Packit 5c3484
Packit 5c3484
	not	%edx			C count = -(size\8)-1
Packit 5c3484
	mov	%edx, VAR_COUNT
Packit 5c3484
Packit 5c3484
	push	vp			FRAME_pushl()
Packit 5c3484
	mov	PARAM_DBLD, vp
Packit 5c3484
Packit 5c3484
	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
Packit 5c3484
	xor	%edx, %edx
Packit 5c3484
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
Packit 5c3484
	or	%ebx, %ebx
Packit 5c3484
	jz	L(exact)
Packit 5c3484
Packit 5c3484
L(oop):
Packit 5c3484
ifdef(`CPU_P6',`
Packit 5c3484
	shr	%edx ')			C restore 2nd saved carry bit
Packit 5c3484
	mov	(vp), %eax
Packit 5c3484
	adc	%eax, %eax
Packit 5c3484
	rcr	%edx			C restore 1st saved carry bit
Packit 5c3484
	lea	4(vp), vp
Packit 5c3484
	adc	(up), %eax
Packit 5c3484
	lea	4(up), up
Packit 5c3484
	adc	%edx, %edx		C save a carry bit in edx
Packit 5c3484
ifdef(`CPU_P6',`
Packit 5c3484
	adc	%edx, %edx ')		C save another carry bit in edx
Packit 5c3484
	dec	%ebx
Packit 5c3484
	mov	%eax, (rp)
Packit 5c3484
	lea	4(rp), rp
Packit 5c3484
	jnz	L(oop)
Packit 5c3484
	mov	vp, VAR_TMP
Packit 5c3484
L(exact):
Packit 5c3484
	incl	VAR_COUNT
Packit 5c3484
	jz	L(end)
Packit 5c3484
Packit 5c3484
	ALIGN(16)
Packit 5c3484
L(top):
Packit 5c3484
ifdef(`CPU_P6',`
Packit 5c3484
	shr	%edx ')			C restore 2nd saved carry bit
Packit 5c3484
	mov	(vp), %eax
Packit 5c3484
	adc	%eax, %eax
Packit 5c3484
	mov	4(vp), %ebx
Packit 5c3484
	adc	%ebx, %ebx
Packit 5c3484
	mov	8(vp), %ecx
Packit 5c3484
	adc	%ecx, %ecx
Packit 5c3484
Packit 5c3484
	rcr	%edx			C restore 1st saved carry bit
Packit 5c3484
Packit 5c3484
	adc	(up), %eax
Packit 5c3484
	mov	%eax, (rp)
Packit 5c3484
	adc	4(up), %ebx
Packit 5c3484
	mov	%ebx, 4(rp)
Packit 5c3484
	adc	8(up), %ecx
Packit 5c3484
	mov	%ecx, 8(rp)
Packit 5c3484
Packit 5c3484
	mov	12(vp), %eax
Packit 5c3484
	adc	%eax, %eax
Packit 5c3484
	mov	16(vp), %ebx
Packit 5c3484
	adc	%ebx, %ebx
Packit 5c3484
	mov	20(vp), %ecx
Packit 5c3484
	adc	%ecx, %ecx
Packit 5c3484
Packit 5c3484
	lea	24(vp), vp
Packit 5c3484
	adc	%edx, %edx		C save a carry bit in edx
Packit 5c3484
Packit 5c3484
	adc	12(up), %eax
Packit 5c3484
	mov	%eax, 12(rp)
Packit 5c3484
	adc	16(up), %ebx
Packit 5c3484
	mov	%ebx, 16(rp)
Packit 5c3484
	adc	20(up), %ecx
Packit 5c3484
Packit 5c3484
	lea	24(up), up
Packit 5c3484
Packit 5c3484
ifdef(`CPU_P6',`
Packit 5c3484
	adc	%edx, %edx ')		C save another carry bit in edx
Packit 5c3484
	mov	%ecx, 20(rp)
Packit 5c3484
	incl	VAR_COUNT
Packit 5c3484
	lea	24(rp), rp
Packit 5c3484
	jne	L(top)
Packit 5c3484
Packit 5c3484
L(end):
Packit 5c3484
	pop	vp			FRAME_popl()
Packit 5c3484
	pop	up			FRAME_popl()
Packit 5c3484
Packit 5c3484
ifdef(`CPU_P6',`
Packit 5c3484
	xor	%eax, %eax
Packit 5c3484
	shr	$1, %edx
Packit 5c3484
	adc	%edx, %eax
Packit 5c3484
',`
Packit 5c3484
	adc	$0, %edx
Packit 5c3484
	mov	%edx, %eax
Packit 5c3484
')
Packit 5c3484
	pop	rp			FRAME_popl()
Packit 5c3484
	pop	%ebx			FRAME_popl()
Packit 5c3484
	ret
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()