|
Packit |
5c3484 |
dnl x86 generic mpn_sqr_basecase -- square an mpn number.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C cycles/crossproduct cycles/triangleproduct
|
|
Packit |
5c3484 |
C P5
|
|
Packit |
5c3484 |
C P6
|
|
Packit |
5c3484 |
C K6
|
|
Packit |
5c3484 |
C K7
|
|
Packit |
5c3484 |
C P4
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
|
|
Packit |
5c3484 |
C lot of function call overheads are avoided, especially when the size is
|
|
Packit |
5c3484 |
C small.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
|
|
Packit |
5c3484 |
C code size to do so here.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C Enhancements:
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The addmul loop here is also not unrolled like aorsmul_1.asm and
|
|
Packit |
5c3484 |
C mul_basecase.asm are. Perhaps it should be done. It'd add to the
|
|
Packit |
5c3484 |
C complexity, but if it's worth doing in the other places then it should be
|
|
Packit |
5c3484 |
C worthwhile here.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
|
|
Packit |
5c3484 |
C might be worth considering. That'd add quite a bit to the code size, but
|
|
Packit |
5c3484 |
C only as much as is used would be dragged into L1 cache.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
defframe(PARAM_SIZE,12)
|
|
Packit |
5c3484 |
defframe(PARAM_SRC, 8)
|
|
Packit |
5c3484 |
defframe(PARAM_DST, 4)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
TEXT
|
|
Packit |
5c3484 |
ALIGN(8)
|
|
Packit |
5c3484 |
PROLOGUE(mpn_sqr_basecase)
|
|
Packit |
5c3484 |
deflit(`FRAME',0)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SIZE, %edx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SRC, %eax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
cmpl $2, %edx
|
|
Packit |
5c3484 |
movl PARAM_DST, %ecx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
je L(two_limbs)
|
|
Packit |
5c3484 |
ja L(three_or_more)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C -----------------------------------------------------------------------------
|
|
Packit |
5c3484 |
C one limb only
|
|
Packit |
5c3484 |
C eax src
|
|
Packit |
5c3484 |
C ebx
|
|
Packit |
5c3484 |
C ecx dst
|
|
Packit |
5c3484 |
C edx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%eax), %eax
|
|
Packit |
5c3484 |
mull %eax
|
|
Packit |
5c3484 |
movl %eax, (%ecx)
|
|
Packit |
5c3484 |
movl %edx, 4(%ecx)
|
|
Packit |
5c3484 |
ret
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C -----------------------------------------------------------------------------
|
|
Packit |
5c3484 |
ALIGN(8)
|
|
Packit |
5c3484 |
L(two_limbs):
|
|
Packit |
5c3484 |
C eax src
|
|
Packit |
5c3484 |
C ebx
|
|
Packit |
5c3484 |
C ecx dst
|
|
Packit |
5c3484 |
C edx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
pushl %ebx
|
|
Packit |
5c3484 |
pushl %ebp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %eax, %ebx
|
|
Packit |
5c3484 |
movl (%eax), %eax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
mull %eax C src[0]^2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
pushl %esi
|
|
Packit |
5c3484 |
pushl %edi
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %edx, %esi C dst[1]
|
|
Packit |
5c3484 |
movl %eax, (%ecx) C dst[0]
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl 4(%ebx), %eax
|
|
Packit |
5c3484 |
mull %eax C src[1]^2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %eax, %edi C dst[2]
|
|
Packit |
5c3484 |
movl %edx, %ebp C dst[3]
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%ebx), %eax
|
|
Packit |
5c3484 |
mull 4(%ebx) C src[0]*src[1]
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addl %eax, %esi
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl %edx, %edi
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl $0, %ebp
|
|
Packit |
5c3484 |
addl %esi, %eax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl %edi, %edx
|
|
Packit |
5c3484 |
movl %eax, 4(%ecx)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl $0, %ebp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %edx, 8(%ecx)
|
|
Packit |
5c3484 |
movl %ebp, 12(%ecx)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
popl %edi
|
|
Packit |
5c3484 |
popl %esi
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
popl %ebp
|
|
Packit |
5c3484 |
popl %ebx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ret
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C -----------------------------------------------------------------------------
|
|
Packit |
5c3484 |
ALIGN(8)
|
|
Packit |
5c3484 |
L(three_or_more):
|
|
Packit |
5c3484 |
deflit(`FRAME',0)
|
|
Packit |
5c3484 |
C eax src
|
|
Packit |
5c3484 |
C ebx
|
|
Packit |
5c3484 |
C ecx dst
|
|
Packit |
5c3484 |
C edx size
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
pushl %ebx FRAME_pushl()
|
|
Packit |
5c3484 |
pushl %edi FRAME_pushl()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
pushl %esi FRAME_pushl()
|
|
Packit |
5c3484 |
pushl %ebp FRAME_pushl()
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1
|
|
Packit |
5c3484 |
leal (%eax,%edx,4), %esi C &src[size]
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%eax), %ebp C src[0], multiplier
|
|
Packit |
5c3484 |
movl %edx, %ecx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
negl %ecx C -size
|
|
Packit |
5c3484 |
xorl %ebx, %ebx C clear carry limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
incl %ecx C -(size-1)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(mul1):
|
|
Packit |
5c3484 |
C eax scratch
|
|
Packit |
5c3484 |
C ebx carry
|
|
Packit |
5c3484 |
C ecx counter, limbs, negative
|
|
Packit |
5c3484 |
C edx scratch
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi &dst[size]
|
|
Packit |
5c3484 |
C ebp multiplier
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%esi,%ecx,4), %eax
|
|
Packit |
5c3484 |
mull %ebp
|
|
Packit |
5c3484 |
addl %eax, %ebx
|
|
Packit |
5c3484 |
adcl $0, %edx
|
|
Packit |
5c3484 |
movl %ebx, (%edi,%ecx,4)
|
|
Packit |
5c3484 |
movl %edx, %ebx
|
|
Packit |
5c3484 |
incl %ecx
|
|
Packit |
5c3484 |
jnz L(mul1)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %ebx, (%edi)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
|
|
Packit |
5c3484 |
C n=1..size-2.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The last products src[size-2]*src[size-1], which is the end corner
|
|
Packit |
5c3484 |
C of the product triangle, is handled separately at the end to save
|
|
Packit |
5c3484 |
C looping overhead. If size is 3 then it's only this that needs to
|
|
Packit |
5c3484 |
C be done.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C In the outer loop %esi is a constant, and %edi just advances by 1
|
|
Packit |
5c3484 |
C limb each time. The size of the operation decreases by 1 limb
|
|
Packit |
5c3484 |
C each time.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C eax
|
|
Packit |
5c3484 |
C ebx carry (needing carry flag added)
|
|
Packit |
5c3484 |
C ecx
|
|
Packit |
5c3484 |
C edx
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi &dst[size]
|
|
Packit |
5c3484 |
C ebp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SIZE, %ecx
|
|
Packit |
5c3484 |
subl $3, %ecx
|
|
Packit |
5c3484 |
jz L(corner)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
negl %ecx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl re-use parameter space
|
|
Packit |
5c3484 |
define(VAR_OUTER,`PARAM_DST')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(outer):
|
|
Packit |
5c3484 |
C eax
|
|
Packit |
5c3484 |
C ebx
|
|
Packit |
5c3484 |
C ecx
|
|
Packit |
5c3484 |
C edx outer loop counter, -(size-3) to -1
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi dst, pointing at stored carry limb of previous loop
|
|
Packit |
5c3484 |
C ebp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %ecx, VAR_OUTER
|
|
Packit |
5c3484 |
addl $4, %edi C advance dst end
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl -8(%esi,%ecx,4), %ebp C next multiplier
|
|
Packit |
5c3484 |
subl $1, %ecx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
xorl %ebx, %ebx C initial carry limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(inner):
|
|
Packit |
5c3484 |
C eax scratch
|
|
Packit |
5c3484 |
C ebx carry (needing carry flag added)
|
|
Packit |
5c3484 |
C ecx counter, -n-1 to -1
|
|
Packit |
5c3484 |
C edx scratch
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi dst end of this addmul
|
|
Packit |
5c3484 |
C ebp multiplier
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%esi,%ecx,4), %eax
|
|
Packit |
5c3484 |
mull %ebp
|
|
Packit |
5c3484 |
addl %ebx, %eax
|
|
Packit |
5c3484 |
adcl $0, %edx
|
|
Packit |
5c3484 |
addl %eax, (%edi,%ecx,4)
|
|
Packit |
5c3484 |
adcl $0, %edx
|
|
Packit |
5c3484 |
movl %edx, %ebx
|
|
Packit |
5c3484 |
addl $1, %ecx
|
|
Packit |
5c3484 |
jl L(inner)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %ebx, (%edi)
|
|
Packit |
5c3484 |
movl VAR_OUTER, %ecx
|
|
Packit |
5c3484 |
incl %ecx
|
|
Packit |
5c3484 |
jnz L(outer)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(corner):
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi &dst[2*size-3]
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl -4(%esi), %eax
|
|
Packit |
5c3484 |
mull -8(%esi) C src[size-1]*src[size-2]
|
|
Packit |
5c3484 |
addl %eax, 0(%edi)
|
|
Packit |
5c3484 |
adcl $0, %edx
|
|
Packit |
5c3484 |
movl %edx, 4(%edi) C dst high limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C -----------------------------------------------------------------------------
|
|
Packit |
5c3484 |
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SIZE, %eax
|
|
Packit |
5c3484 |
negl %eax
|
|
Packit |
5c3484 |
addl $1, %eax C -(size-1) and clear carry
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(lshift):
|
|
Packit |
5c3484 |
C eax counter, negative
|
|
Packit |
5c3484 |
C ebx next limb
|
|
Packit |
5c3484 |
C ecx
|
|
Packit |
5c3484 |
C edx
|
|
Packit |
5c3484 |
C esi
|
|
Packit |
5c3484 |
C edi &dst[2*size-4]
|
|
Packit |
5c3484 |
C ebp
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
rcll 8(%edi,%eax,8)
|
|
Packit |
5c3484 |
rcll 12(%edi,%eax,8)
|
|
Packit |
5c3484 |
incl %eax
|
|
Packit |
5c3484 |
jnz L(lshift)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl %eax, %eax C high bit out
|
|
Packit |
5c3484 |
movl %eax, 8(%edi) C dst most significant limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
|
|
Packit |
5c3484 |
C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
|
|
Packit |
5c3484 |
C low limb of src[0]^2.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SRC, %esi
|
|
Packit |
5c3484 |
movl (%esi), %eax C src[0]
|
|
Packit |
5c3484 |
mull %eax C src[0]^2
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl PARAM_SIZE, %ecx
|
|
Packit |
5c3484 |
leal (%esi,%ecx,4), %esi C src end
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
negl %ecx C -size
|
|
Packit |
5c3484 |
movl %edx, %ebx C initial carry
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl %eax, 12(%edi,%ecx,8) C dst[0]
|
|
Packit |
5c3484 |
incl %ecx C -(size-1)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
L(diag):
|
|
Packit |
5c3484 |
C eax scratch (low product)
|
|
Packit |
5c3484 |
C ebx carry limb
|
|
Packit |
5c3484 |
C ecx counter, -(size-1) to -1
|
|
Packit |
5c3484 |
C edx scratch (high product)
|
|
Packit |
5c3484 |
C esi &src[size]
|
|
Packit |
5c3484 |
C edi &dst[2*size-3]
|
|
Packit |
5c3484 |
C ebp scratch (fetched dst limbs)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
movl (%esi,%ecx,4), %eax
|
|
Packit |
5c3484 |
mull %eax
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addl %ebx, 8(%edi,%ecx,8)
|
|
Packit |
5c3484 |
movl %edx, %ebx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
adcl %eax, 12(%edi,%ecx,8)
|
|
Packit |
5c3484 |
adcl $0, %ebx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
incl %ecx
|
|
Packit |
5c3484 |
jnz L(diag)
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
addl %ebx, 8(%edi) C dst most significant limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
popl %ebp
|
|
Packit |
5c3484 |
popl %esi
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
popl %edi
|
|
Packit |
5c3484 |
popl %ebx
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ret
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
EPILOGUE()
|