|
Packit |
5c3484 |
dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl Copyright 2010-2012 Free Software Foundation, Inc.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
dnl This file is part of the GNU MP Library.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
Packit |
5c3484 |
dnl it under the terms of either:
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU Lesser General Public License as published by the Free
|
|
Packit |
5c3484 |
dnl Software Foundation; either version 3 of the License, or (at your
|
|
Packit |
5c3484 |
dnl option) any later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl * the GNU General Public License as published by the Free Software
|
|
Packit |
5c3484 |
dnl Foundation; either version 2 of the License, or (at your option) any
|
|
Packit |
5c3484 |
dnl later version.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl or both in parallel, as here.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
Packit |
5c3484 |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
Packit |
5c3484 |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
Packit |
5c3484 |
dnl for more details.
|
|
Packit |
5c3484 |
dnl
|
|
Packit |
5c3484 |
dnl You should have received copies of the GNU General Public License and the
|
|
Packit |
5c3484 |
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
|
|
Packit |
5c3484 |
dnl see https://www.gnu.org/licenses/.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
include(`../config.m4')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C                  cycles/limb          cycles/limb        good
C                16-byte aligned     16-byte unaligned   for cpu?
C AMD K8,K9       ?                    ?
C AMD K10         1.85  (1.635)        1.9   (1.67)          Y
C AMD bd1         1.82  (1.75)         1.82  (1.75)          Y
C AMD bobcat      4.5                  4.5
C Intel P4        3.6   (3.125)        3.6   (3.125)         Y
C Intel core2     2.05  (1.67)         2.55  (1.75)
C Intel NHM       2.05  (1.875)        2.6   (2.25)
C Intel SBR       1.55  (1.44)         2     (1.57)          Y
C Intel atom      ?                    ?
C VIA nano        2.5   (2.5)          2.5   (2.5)           Y
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  All other writes use
C 16-byte operations; reads use both 8-byte and 16-byte operations.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
|
|
Packit |
5c3484 |
C not true. The aligned case reads 16+8 bytes, the unaligned case reads
|
|
Packit |
5c3484 |
C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C This is not yet great code:
|
|
Packit |
5c3484 |
C (1) The unaligned case makes too many reads.
|
|
Packit |
5c3484 |
C (2) We should do some unrolling, at least 2-way.
|
|
Packit |
5c3484 |
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
|
|
Packit |
5c3484 |
C Nano.
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
C INPUT PARAMETERS
C m4 aliases for the registers that hold the four arguments on entry.
C   rp   destination; the routine stores the result limbs through it
C   ap   source; the routine loads the operand limbs through it
C   n    number of 8-byte limbs; used as a scaled index and counted down
C   cnt  shift count in bits; the code below names %rcx directly instead of
C        using this alias
define(`rp', `%rdi')
define(`ap', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
|
|
Packit |
5c3484 |
|
|
Packit |
5c3484 |
ASM_START()
	TEXT
	ALIGN(16)
C mpn_lshiftc
C
C Shift the n-limb operand at ap left by cnt bits, complement every result
C bit, and store the n limbs at rp.  Limbs are 8 bytes and are processed
C from the most significant one downwards.  On return rax holds the bits
C shifted out of the top limb, i.e. ap[n-1] >> (64 - cnt), not complemented.
C
C NOTE(review): the code presumably expects 1 <= cnt <= 63 and n >= 1, as
C for the other mpn shift routines; neither is checked here -- confirm
C against the callers.
C
C The 16-byte memory accesses (movdqa, and punpcklqdq with a memory operand)
C are only issued after bit 3 of the relevant address has been tested, so
C the code relies on rp and ap being 8-byte aligned.
C
C Register usage:
C   xmm4  cnt, the psllq count
C   xmm5  64 - cnt, the psrlq count
C   xmm7  all ones, xored into each result to complement it
C   rax   return value, not written again after the initial shr
C   rcx   scratch for the alignment tests once cnt has been copied out
C
C In the register comments B denotes 2^64, and limb indices use the value
C that n has at that instruction.
PROLOGUE(mpn_lshiftc)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64 - cnt

	neg	R32(%rcx)		C shr takes cl mod 64, i.e. 64 - cnt
	mov	-8(ap,n,8), %rax	C rax = ap[n-1], the top limb
	shr	R8(%rcx), %rax		C return value: bits shifted out

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	cmp	$2, n
	jle	L(le2)			C n <= 2 is handled without the loops

	lea	(rp,n,8), R32(%rcx)	C only the low address bits are used
	test	$8, R8(%rcx)
	je	L(rp_aligned)		C bit 3 of rp + 8n is already clear

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0	C ap[n-1]
	movq	-16(ap,n,8), %xmm1	C ap[n-2]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movq	%xmm0, -8(rp,n,8)	C rp[n-1]
	dec	n

L(rp_aligned):
C Pick the loop according to bit 3 of ap + 8n.
	lea	(ap,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	je	L(aent)
	jmp	L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).
C Each pass first lowers n by 2 at L(uent) and then produces rp[n], rp[n+1].

	ALIGN(16)
L(utop):movq	(ap,n,8), %xmm1		C xmm1 = ap[n]
	punpcklqdq  8(ap,n,8), %xmm1	C xmm1 = B*ap[n+1] + ap[n]
	movdqa	-8(ap,n,8), %xmm0	C xmm0 = B*ap[n] + ap[n-1]
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(uent):sub	$2, n
	ja	L(utop)			C more than 2 limbs were left

	jne	L(end8)			C not exactly 2 left: finish at end8

C Exactly 2 limbs were left: produce rp[1] and rp[0], shifting in zeros.
	movq	(ap), %xmm1		C xmm1 = ap[0]
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0	C xmm0 = B*ap[0] + 0
	punpcklqdq  8(ap), %xmm1	C xmm1 = B*ap[1] + ap[0]
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).
C Each pass first lowers n by 2 at L(aent) and then produces rp[n], rp[n+1].

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n+1] + ap[n]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-1]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n] + ap[n-1]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(aent):sub	$2, n
	ja	L(atop)			C more than 2 limbs were left

	jne	L(end8)			C not exactly 2 left: finish at end8

C Exactly 2 limbs were left: produce rp[1] and rp[0], shifting in zeros.
	movdqa	(ap), %xmm0		C xmm0 = B*ap[1] + ap[0]
	pxor	%xmm1, %xmm1
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[0] + 0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	ret
C *****************************************************************************

	ALIGN(16)
L(le2):	jne	L(end8)			C flags are still those of cmp $2, n

C n = 2: produce rp[1] here, then fall through for rp[0].
	movq	8(ap), %xmm0		C ap[1]
	movq	(ap), %xmm1		C ap[0]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movq	%xmm0, 8(rp)

C Lowest limb: only zeros are shifted in from below.
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm7, %xmm0
	movq	%xmm0, (rp)
	ret
EPILOGUE()
C NOTE(review): CF_PROT is not defined in this file; presumably it comes
C from the m4 includes -- confirm before relying on it.
CF_PROT
|