Blame mpn/x86/sqr_basecase.asm

Packit 5c3484
dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
Packit 5c3484
Packit 5c3484
dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C     cycles/crossproduct  cycles/triangleproduct
Packit 5c3484
C P5
Packit 5c3484
C P6
Packit 5c3484
C K6
Packit 5c3484
C K7
Packit 5c3484
C P4
Packit 5c3484
Packit 5c3484
Packit 5c3484
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
Packit 5c3484
C
Packit 5c3484
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
Packit 5c3484
C lot of function call overheads are avoided, especially when the size is
Packit 5c3484
C small.
Packit 5c3484
C
Packit 5c3484
C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
Packit 5c3484
C code size to do so here.
Packit 5c3484
C
Packit 5c3484
C Enhancements:
Packit 5c3484
C
Packit 5c3484
C The addmul loop here is also not unrolled like aorsmul_1.asm and
Packit 5c3484
C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
Packit 5c3484
C complexity, but if it's worth doing in the other places then it should be
Packit 5c3484
C worthwhile here.
Packit 5c3484
C
Packit 5c3484
C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
Packit 5c3484
C might be worth considering.  That'd add quite a bit to the code size, but
Packit 5c3484
C only as much as is used would be dragged into L1 cache.
Packit 5c3484
Packit 5c3484
defframe(PARAM_SIZE,12)
Packit 5c3484
defframe(PARAM_SRC, 8)
Packit 5c3484
defframe(PARAM_DST, 4)
Packit 5c3484
Packit 5c3484
	TEXT
Packit 5c3484
	ALIGN(8)
Packit 5c3484
PROLOGUE(mpn_sqr_basecase)
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %edx
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %eax
Packit 5c3484
Packit 5c3484
	cmpl	$2, %edx
Packit 5c3484
	movl	PARAM_DST, %ecx
Packit 5c3484
Packit 5c3484
	je	L(two_limbs)
Packit 5c3484
	ja	L(three_or_more)
Packit 5c3484
Packit 5c3484
Packit 5c3484
C -----------------------------------------------------------------------------
Packit 5c3484
C one limb only
Packit 5c3484
	C eax	src
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx	dst
Packit 5c3484
	C edx
Packit 5c3484
Packit 5c3484
	movl	(%eax), %eax
Packit 5c3484
	mull	%eax
Packit 5c3484
	movl	%eax, (%ecx)
Packit 5c3484
	movl	%edx, 4(%ecx)
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
Packit 5c3484
C -----------------------------------------------------------------------------
Packit 5c3484
	ALIGN(8)
Packit 5c3484
L(two_limbs):
Packit 5c3484
	C eax	src
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx	dst
Packit 5c3484
	C edx
Packit 5c3484
Packit 5c3484
	pushl	%ebx
Packit 5c3484
	pushl	%ebp
Packit 5c3484
Packit 5c3484
	movl	%eax, %ebx
Packit 5c3484
	movl	(%eax), %eax
Packit 5c3484
Packit 5c3484
	mull	%eax		C src[0]^2
Packit 5c3484
Packit 5c3484
	pushl	%esi
Packit 5c3484
	pushl	%edi
Packit 5c3484
Packit 5c3484
	movl	%edx, %esi	C dst[1]
Packit 5c3484
	movl	%eax, (%ecx)	C dst[0]
Packit 5c3484
Packit 5c3484
	movl	4(%ebx), %eax
Packit 5c3484
	mull	%eax		C src[1]^2
Packit 5c3484
Packit 5c3484
	movl	%eax, %edi	C dst[2]
Packit 5c3484
	movl	%edx, %ebp	C dst[3]
Packit 5c3484
Packit 5c3484
	movl	(%ebx), %eax
Packit 5c3484
	mull	4(%ebx)		C src[0]*src[1]
Packit 5c3484
Packit 5c3484
	addl	%eax, %esi
Packit 5c3484
Packit 5c3484
	adcl	%edx, %edi
Packit 5c3484
Packit 5c3484
	adcl	$0, %ebp
Packit 5c3484
	addl	%esi, %eax
Packit 5c3484
Packit 5c3484
	adcl	%edi, %edx
Packit 5c3484
	movl	%eax, 4(%ecx)
Packit 5c3484
Packit 5c3484
	adcl	$0, %ebp
Packit 5c3484
Packit 5c3484
	movl	%edx, 8(%ecx)
Packit 5c3484
	movl	%ebp, 12(%ecx)
Packit 5c3484
Packit 5c3484
	popl	%edi
Packit 5c3484
	popl	%esi
Packit 5c3484
Packit 5c3484
	popl	%ebp
Packit 5c3484
	popl	%ebx
Packit 5c3484
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
Packit 5c3484
C -----------------------------------------------------------------------------
Packit 5c3484
	ALIGN(8)
Packit 5c3484
L(three_or_more):
Packit 5c3484
deflit(`FRAME',0)
Packit 5c3484
	C eax	src
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx	dst
Packit 5c3484
	C edx	size
Packit 5c3484
Packit 5c3484
	pushl	%ebx	FRAME_pushl()
Packit 5c3484
	pushl	%edi	FRAME_pushl()
Packit 5c3484
Packit 5c3484
	pushl	%esi	FRAME_pushl()
Packit 5c3484
	pushl	%ebp	FRAME_pushl()
Packit 5c3484
Packit 5c3484
	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
Packit 5c3484
	leal	(%eax,%edx,4), %esi	C &src[size]
Packit 5c3484
Packit 5c3484
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
Packit 5c3484
Packit 5c3484
	movl	(%eax), %ebp		C src[0], multiplier
Packit 5c3484
	movl	%edx, %ecx
Packit 5c3484
Packit 5c3484
	negl	%ecx			C -size
Packit 5c3484
	xorl	%ebx, %ebx		C clear carry limb
Packit 5c3484
Packit 5c3484
	incl	%ecx			C -(size-1)
Packit 5c3484
Packit 5c3484
L(mul1):
Packit 5c3484
	C eax	scratch
Packit 5c3484
	C ebx	carry
Packit 5c3484
	C ecx	counter, limbs, negative
Packit 5c3484
	C edx	scratch
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	&dst[size]
Packit 5c3484
	C ebp	multiplier
Packit 5c3484
Packit 5c3484
	movl	(%esi,%ecx,4), %eax
Packit 5c3484
	mull	%ebp
Packit 5c3484
	addl	%eax, %ebx
Packit 5c3484
	adcl	$0, %edx
Packit 5c3484
	movl	%ebx, (%edi,%ecx,4)
Packit 5c3484
	movl	%edx, %ebx
Packit 5c3484
	incl	%ecx
Packit 5c3484
	jnz	L(mul1)
Packit 5c3484
Packit 5c3484
	movl	%ebx, (%edi)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
Packit 5c3484
	C n=1..size-2.
Packit 5c3484
	C
Packit 5c3484
	C The last products src[size-2]*src[size-1], which is the end corner
Packit 5c3484
	C of the product triangle, is handled separately at the end to save
Packit 5c3484
	C looping overhead.  If size is 3 then it's only this that needs to
Packit 5c3484
	C be done.
Packit 5c3484
	C
Packit 5c3484
	C In the outer loop %esi is a constant, and %edi just advances by 1
Packit 5c3484
	C limb each time.  The size of the operation decreases by 1 limb
Packit 5c3484
	C each time.
Packit 5c3484
Packit 5c3484
	C eax
Packit 5c3484
	C ebx	carry (needing carry flag added)
Packit 5c3484
	C ecx
Packit 5c3484
	C edx
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	&dst[size]
Packit 5c3484
	C ebp
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	subl	$3, %ecx
Packit 5c3484
	jz	L(corner)
Packit 5c3484
Packit 5c3484
	negl	%ecx
Packit 5c3484
Packit 5c3484
dnl  re-use parameter space
Packit 5c3484
define(VAR_OUTER,`PARAM_DST')
Packit 5c3484
Packit 5c3484
L(outer):
Packit 5c3484
	C eax
Packit 5c3484
	C ebx
Packit 5c3484
	C ecx
Packit 5c3484
	C edx	outer loop counter, -(size-3) to -1
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	dst, pointing at stored carry limb of previous loop
Packit 5c3484
	C ebp
Packit 5c3484
Packit 5c3484
	movl	%ecx, VAR_OUTER
Packit 5c3484
	addl	$4, %edi		C advance dst end
Packit 5c3484
Packit 5c3484
	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
Packit 5c3484
	subl	$1, %ecx
Packit 5c3484
Packit 5c3484
	xorl	%ebx, %ebx		C initial carry limb
Packit 5c3484
Packit 5c3484
L(inner):
Packit 5c3484
	C eax	scratch
Packit 5c3484
	C ebx	carry (needing carry flag added)
Packit 5c3484
	C ecx	counter, -n-1 to -1
Packit 5c3484
	C edx	scratch
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	dst end of this addmul
Packit 5c3484
	C ebp	multiplier
Packit 5c3484
Packit 5c3484
	movl	(%esi,%ecx,4), %eax
Packit 5c3484
	mull	%ebp
Packit 5c3484
	addl	%ebx, %eax
Packit 5c3484
	adcl	$0, %edx
Packit 5c3484
	addl	%eax, (%edi,%ecx,4)
Packit 5c3484
	adcl	$0, %edx
Packit 5c3484
	movl	%edx, %ebx
Packit 5c3484
	addl	$1, %ecx
Packit 5c3484
	jl	L(inner)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	movl	%ebx, (%edi)
Packit 5c3484
	movl	VAR_OUTER, %ecx
Packit 5c3484
	incl	%ecx
Packit 5c3484
	jnz	L(outer)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(corner):
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	&dst[2*size-3]
Packit 5c3484
Packit 5c3484
	movl	-4(%esi), %eax
Packit 5c3484
	mull	-8(%esi)		C src[size-1]*src[size-2]
Packit 5c3484
	addl	%eax, 0(%edi)
Packit 5c3484
	adcl	$0, %edx
Packit 5c3484
	movl	%edx, 4(%edi)		C dst high limb
Packit 5c3484
Packit 5c3484
Packit 5c3484
C -----------------------------------------------------------------------------
Packit 5c3484
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %eax
Packit 5c3484
	negl	%eax
Packit 5c3484
	addl	$1, %eax		C -(size-1) and clear carry
Packit 5c3484
Packit 5c3484
L(lshift):
Packit 5c3484
	C eax	counter, negative
Packit 5c3484
	C ebx	next limb
Packit 5c3484
	C ecx
Packit 5c3484
	C edx
Packit 5c3484
	C esi
Packit 5c3484
	C edi	&dst[2*size-4]
Packit 5c3484
	C ebp
Packit 5c3484
Packit 5c3484
	rcll	8(%edi,%eax,8)
Packit 5c3484
	rcll	12(%edi,%eax,8)
Packit 5c3484
	incl	%eax
Packit 5c3484
	jnz	L(lshift)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	adcl	%eax, %eax		C high bit out
Packit 5c3484
	movl	%eax, 8(%edi)		C dst most significant limb
Packit 5c3484
Packit 5c3484
Packit 5c3484
C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
Packit 5c3484
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
Packit 5c3484
C low limb of src[0]^2.
Packit 5c3484
Packit 5c3484
	movl	PARAM_SRC, %esi
Packit 5c3484
	movl	(%esi), %eax		C src[0]
Packit 5c3484
	mull	%eax			C src[0]^2
Packit 5c3484
Packit 5c3484
	movl	PARAM_SIZE, %ecx
Packit 5c3484
	leal	(%esi,%ecx,4), %esi	C src end
Packit 5c3484
Packit 5c3484
	negl	%ecx			C -size
Packit 5c3484
	movl	%edx, %ebx		C initial carry
Packit 5c3484
Packit 5c3484
	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
Packit 5c3484
	incl	%ecx			C -(size-1)
Packit 5c3484
Packit 5c3484
L(diag):
Packit 5c3484
	C eax	scratch (low product)
Packit 5c3484
	C ebx	carry limb
Packit 5c3484
	C ecx	counter, -(size-1) to -1
Packit 5c3484
	C edx	scratch (high product)
Packit 5c3484
	C esi	&src[size]
Packit 5c3484
	C edi	&dst[2*size-3]
Packit 5c3484
	C ebp	scratch (fetched dst limbs)
Packit 5c3484
Packit 5c3484
	movl	(%esi,%ecx,4), %eax
Packit 5c3484
	mull	%eax
Packit 5c3484
Packit 5c3484
	addl	%ebx, 8(%edi,%ecx,8)
Packit 5c3484
	movl	%edx, %ebx
Packit 5c3484
Packit 5c3484
	adcl	%eax, 12(%edi,%ecx,8)
Packit 5c3484
	adcl	$0, %ebx
Packit 5c3484
Packit 5c3484
	incl	%ecx
Packit 5c3484
	jnz	L(diag)
Packit 5c3484
Packit 5c3484
Packit 5c3484
	addl	%ebx, 8(%edi)		C dst most significant limb
Packit 5c3484
Packit 5c3484
	popl	%ebp
Packit 5c3484
	popl	%esi
Packit 5c3484
Packit 5c3484
	popl	%edi
Packit 5c3484
	popl	%ebx
Packit 5c3484
Packit 5c3484
	ret
Packit 5c3484
Packit 5c3484
EPILOGUE()