|
Packit |
5c3484 |
dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C K6-2: 1.0 cycles/limb
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
|
|
Packit |
5c3484 |
C cycle startup time, which amounts for instance to a 2x speedup at 15
|
|
Packit |
5c3484 |
C limbs.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
|
|
Packit |
5c3484 |
C processing one limb separately to make it aligned. This and a final odd
|
|
Packit |
5c3484 |
C limb are handled in a branch-free fashion, ending up re-copying if the
|
|
Packit |
5c3484 |
C special case isn't needed.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C Alternatives:
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C There used to be a big unrolled version of this, running at 0.56 c/l if
|
|
Packit |
5c3484 |
C the destination was aligned, but that seemed rather excessive for the
|
|
Packit |
5c3484 |
C relative importance of copyd.
|
|
Packit |
5c3484 |
C
|
|
Packit |
5c3484 |
C If the destination alignment is ignored and just left to run at 1.17 c/l
|
|
Packit |
5c3484 |
C some code size and a fixed few cycles can be saved. Considering how few
|
|
Packit |
5c3484 |
C uses copyd finds perhaps that should be favoured. The current code has
|
|
Packit |
5c3484 |
C the attraction of being no slower than a basic rep movsl though.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C Stack slots of the three arguments, matching the C prototype
C (dst, src, size) at offsets 4, 8 and 12.  NOTE(review): offsets are
C presumably relative to the stack pointer at function entry; confirm
C against the defframe macro, which is defined outside this file.
defframe(PARAM_SIZE,12)
|
|
Packit |
5c3484 |
defframe(PARAM_SRC, 8)
|
|
Packit |
5c3484 |
defframe(PARAM_DST, 4)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
dnl re-using parameter space
|
|
Packit |
5c3484 |
C The size argument is read into ecx before ebx is stored, so its slot
C can then hold the caller's ebx without needing a push/pop.
define(SAVE_EBX,`PARAM_SIZE')
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
TEXT
|
|
Packit |
5c3484 |
ALIGN(16)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C mpn_copyd: copy size limbs from src to dst, working from the highest
C limb down to the lowest.
C
C Outline:
C   size == 0  return immediately
C   size == 1  copy limb 0 only
C   size == 2  copy limb 1, then limb 0
C   size >= 3  copy the high limb singly, copy the rest in 8-byte pairs
C              (shifted down one limb if needed so the stores are
C              aligned), then always copy limb 0 singly.  Some limbs
C              may be copied twice, which is harmless.
PROLOGUE(mpn_copyd)
|
|
Packit |
5c3484 |
deflit(`FRAME',0)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C ecx = size; must be fetched before its slot is overwritten below.
movl PARAM_SIZE, %ecx
|
|
Packit |
5c3484 |
C Save ebx in the now-free size slot.
movl %ebx, SAVE_EBX
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
movl PARAM_SRC, %eax
|
|
Packit |
5c3484 |
movl PARAM_DST, %edx
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C ecx = size-1.  The flags from this subtract feed both branches below:
C borrow means size was 0, zero means size was 1.
subl $1, %ecx C better code alignment than decl
|
|
Packit |
5c3484 |
C size == 0: nothing to copy.  ebx is still unmodified, so skipping the
C restore at the end is safe.
jb L(zero)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C size == 1: only limb 0 needs copying.
jz L(one_more)
|
|
Packit |
5c3484 |
C ebx = dst + 4*size, the address just past the top of dst; only its
C alignment (bit 2) is used below.
leal 4(%edx,%ecx,4), %ebx
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Copy src[size-1] to dst[size-1] on its own.
Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb
|
|
Packit |
5c3484 |
Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C size == 2: high limb is done, only limb 0 remains.
cmpl $1, %ecx
|
|
Packit |
5c3484 |
je L(one_more)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Extract bit 2 of the address in ebx.  &dst[size] and &dst[size-2]
C differ by 8 bytes, so they have the same alignment modulo 8.
shrl $2, %ebx
|
|
Packit |
5c3484 |
andl $1, %ebx C 1 if dst[size-2] unaligned
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C If unaligned, start the pair loop one limb lower so each 8-byte store
C lands on an 8-byte boundary; the high limb was already copied above.
C If aligned, the first pair simply re-copies the high limb.
subl %ebx, %ecx
|
|
Packit |
5c3484 |
nop C code alignment
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Pair loop: each pass copies limbs ecx-1 and ecx (ecx as on entry to
C the pass) and then steps ecx down by 2.
L(top):
|
|
Packit |
5c3484 |
C eax src
|
|
Packit |
5c3484 |
C ebx (alignment flag, not referenced inside the loop)
|
|
Packit |
5c3484 |
C ecx counter
|
|
Packit |
5c3484 |
C edx dst
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Load src[ecx-1] and src[ecx] as one 8-byte quantity.
movq -4(%eax,%ecx,4), %mm0
|
|
Packit |
5c3484 |
subl $2, %ecx
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C ecx has already been decremented, so the +4 displacement addresses
C the same two limbs in dst that were just loaded from src.
movq %mm0, 4(%edx,%ecx,4)
|
|
Packit |
5c3484 |
C Uses the flags from the subl above: continue while the counter is
C still above zero (no borrow and not zero).
ja L(top)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Copy limb 0 unconditionally.  When the loop exited on a borrow it has
C already copied limb 0, so this is a harmless repeat that avoids a
C branch for the odd final limb.
L(one_more):
|
|
Packit |
5c3484 |
movd (%eax), %mm0
|
|
Packit |
5c3484 |
movd %mm0, (%edx)
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
C Restore the caller's ebx from the reused size slot.
movl SAVE_EBX, %ebx
|
|
Packit |
5c3484 |
C NOTE(review): presumably expands to emms or femms to leave MMX state
C before returning; the macro is defined outside this file -- confirm.
emms_or_femms
|
|
Packit |
5c3484 |
L(zero):
|
|
Packit |
5c3484 |
ret
|
|
Packit |
5c3484 |

|
Packit |
5c3484 |
EPILOGUE()
|