dnl  File: mpn/x86/p6/aorsmul_1.asm

dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Build-time m4 configuration.  The macros used below that are not defined
dnl  in this file (PROLOGUE, EPILOGUE, defframe, deflit, forloop, Zdisp, ...)
dnl  presumably come from here or from files it includes -- confirm.
include(`../config.m4')


C			    cycles/limb
C P5
C P6 model 0-8,10-12		 6.44
C P6 model 9  (Banias)		 6.15
C P6 model 13 (Dothan)		 6.11
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6
C AMD K7
C AMD K8


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.

dnl  Number of limbs handled per pass of the unrolled loop.  UNROLL_LOG2,
dnl  UNROLL_MASK and UNROLL_BYTES used further down are not defined in this
dnl  file; presumably they are derived from UNROLL_COUNT elsewhere -- confirm.
deflit(UNROLL_COUNT, 16)


dnl  Select the operation this file is being built for.  Exactly one of
dnl  OPERATION_addmul_1 or OPERATION_submul_1 is expected to be defined:
dnl
dnl    M4_inst         the read-modify-write instruction applied to dst
dnl    M4_function_1   name of the entry point without a carry-in argument
dnl    M4_function_1c  name of the entry point with a carry-in argument
dnl    M4_description  wording used in the function description comment
dnl    M4_desc_retval  wording used in the function description comment
dnl
dnl  If neither symbol is defined, m4_error is invoked with a message.
ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

dnl  Lists the four function names this one source file can provide.
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                            mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                             mp_limb_t mult, mp_limb_t carry);
C
C Calculate src,size multiplied by mult and M4_description dst,size.
C Return the M4_desc_retval limb from the top of the result.
C
C This code is pretty much the same as the K6 code.  The unrolled loop is
C the same, but there are just a few scheduling tweaks in the setups and the
C simple loop.
C
C A number of variations have been tried for the unrolled loop, with one or
C two carries, and with loads scheduled earlier, but nothing faster than 6
C cycles/limb has been found.

dnl  Operands of UNROLL_THRESHOLD limbs or more are handled by the unrolled
dnl  loop, smaller ones by the simple loop (see the cmpl/jae after the
dnl  parameter loads).  The PIC and non-PIC builds each get their own setting,
dnl  presumably so they can be tuned separately; at present both are 5.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack argument slots, 4 bytes apart, in the order of the C prototype:
dnl  dst, src, size, mult and (for the carry-in entry point only) carry.
dnl  defframe is defined elsewhere; presumably the resulting PARAM_ names
dnl  take the current FRAME value into account, which is why FRAME is
dnl  redefined after every push in the code below -- confirm.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

Packit 5c3484
	TEXT
Packit 5c3484
	ALIGN(32)
Packit 5c3484
Packit 5c3484
PROLOGUE(M4_function_1c)
Packit 5c3484
	pushl	%ebx
Packit 5c3484
deflit(`FRAME',4)
Packit 5c3484
	movl	PARAM_CARRY, %ebx
Packit 5c3484
	jmp	L(start_nc)
Packit 5c3484
EPILOGUE()
Packit 5c3484
C Entry point without a carry argument, plus the body shared by both entry
C points.
C
C Layout:
C   start_nc  common parameter loading and register saves
C   simple    one limb per iteration, used when size < UNROLL_THRESHOLD
C   unroll    setup for the unrolled loop, ending in a computed jump
C   pic_calc  PIC-only helper computing the jump target
C   top/entry the unrolled loop, UNROLL_COUNT limbs per pass
C
C The saved registers ebx, esi, edi and ebp are restored on both exit
C paths, and the carry (or borrow) limb is returned in eax.
PROLOGUE(M4_function_1)
	pushl	%ebx
deflit(`FRAME',4)
	xorl	%ebx, %ebx	C initial carry

L(start_nc):
	C The carry-in entry point jumps here with ebx already pushed and
	C holding its carry argument.  Register saves are interleaved with
	C the parameter loads, and FRAME is redefined after each push.
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD


	C simple loop
	C this is offset 0x22, so close enough to aligned
L(simple):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	movl	(%esi), %eax		C next src limb
	addl	$4, %edi		C dst advanced early, hence -4 below

	mull	%ebp			C edx:eax = src limb * multiplier

	addl	%ebx, %eax		C add the incoming carry limb
	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)		C apply the low half to the dst limb
	movl	%edx, %ebx

	adcl	$0, %ebx		C carry out = high half + CF from M4_inst
	decl	%ecx			C sets ZF for the jnz below

	leal	4(%esi), %esi		C advance src without touching flags
	jnz	L(simple)


	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax		C return the carry limb

	popl	%ebx
	ret



C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there's not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	C this is offset 0x43, so close enough to aligned
L(unroll):
	C eax
	C ebx	initial carry
	C ecx	size
	C edx
	C esi	src
	C edi	dst
	C ebp

	C The first src limb is multiplied here in the setup, leaving size-1
	C limbs for the unrolled loop.  The first pass is entered part way
	C through the unrolled code so that the total comes out right:
	C
	C   VAR_COUNTER = (size-2) >> UNROLL_LOG2   passes remaining after
	C                                           the first one
	C   n           = (-(size-1)) & UNROLL_MASK limbs skipped in the
	C                                           first pass
	C
	C The jump target is entry + 16*n - n = entry + 15*n, relying on
	C each limb in the unrolled code occupying 15 code bytes.

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx
	negl	%ecx

	shrl	$UNROLL_LOG2, %edx
	andl	$UNROLL_MASK, %ecx

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	C 15 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	C Either way ecx is now -n and edx is the jump target.  src and dst
	C are moved back by n limbs to match the displacements used at the
	C entry point; src is additionally one limb ahead of dst because the
	C low limb is fetched here.  When UNROLL_BYTES is 256 both pointers
	C are biased by 128, matching the -128 applied to the displacements
	C in the loop.

	movl	(%esi), %eax		C src low limb

	movl	%edx, VAR_JUMP
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	C initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	C high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	C An odd n enters at the second limb of a two-limb chunk, where ebx
	C and ecx have the opposite roles, so swap them in that case.  The
	C movl between the testl and the cmovnz pair does not change flags.
	movl	VAR_JUMP, %edx
	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	cmovnz(	%ebx, %ecx)	C high,low carry other way around
	cmovnz(	%eax, %ebx)

	jmp	*%edx


ifdef(`PIC',`
L(pic_calc):
	C Same calculation as the non-PIC case but relative to the return
	C address, which the call left at the top of the stack and which is
	C the address of the label here.
	shll	$4, %edx
	negl	%ecx

	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx

	addl	(%esp), %edx

	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	carry hi
	C ecx	carry lo
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C VAR_COUNTER	loop counter
	C
	C 15 code bytes per limb

	addl	$UNROLL_BYTES, %edi

L(entry):
	C Each chunk handles two limbs.  For each limb: multiply the next
	C src limb, apply the pending low carry limb to dst, then fold the
	C carry flag from that and the new low half into the pending high
	C limb, which becomes the low carry for the following limb.  The
	C roles of ebx and ecx alternate between the two halves of a chunk.
	C
	C Zdisp is defined elsewhere; presumably it keeps the instruction
	C the same length when disp0 is zero, so that every limb stays at 15
	C code bytes for the computed jump -- confirm.
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	decl	VAR_COUNTER		C sets SF for the jns below
	leal	UNROLL_BYTES(%esi), %esi	C advance src, flags untouched

	jns	L(top)			C loop until the counter goes negative


deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	C Apply the last pending low limb.  The movl and popl instructions
	C do not change flags, so the adcl below still sees the carry or
	C borrow produced by this M4_inst.
	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax		C return high limb plus final carry

	ret

EPILOGUE()