Blame mpn/ia64/lorrshift.asm

Packit 5c3484
dnl  IA-64 mpn_lshift/mpn_rshift.
Packit 5c3484
Packit 5c3484
dnl  Contributed to the GNU project by Torbjorn Granlund.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2000-2005 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C           cycles/limb
Packit 5c3484
C Itanium:      2
Packit 5c3484
C Itanium 2:    1
Packit 5c3484
Packit 5c3484
C This code is scheduled deeply since the plain shift instructions shr and shl
Packit 5c3484
C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
Packit 5c3484
C these instructions causes a 10 cycle replay trap on Itanium.
Packit 5c3484
Packit 5c3484
C The ld8 scheduling should probably be decreased to make the function smaller.
Packit 5c3484
C Good lfetch will make sure we never stall anyway.
Packit 5c3484
Packit 5c3484
C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
Packit 5c3484
C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
Packit 5c3484
C in the prologue.
Packit 5c3484
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS
C The four arguments are named after the registers r32-r35:
C   rp   pointer the result limbs are stored through (st8 [rp])
C   up   pointer the source limbs are loaded through (ld8 [up])
C   n    limb count; overwritten during setup with the value put in ar.lc
C   cnt  shift count in bits
Packit 5c3484
define(`rp', `r32')
Packit 5c3484
define(`up', `r33')
Packit 5c3484
define(`n',  `r34')
Packit 5c3484
define(`cnt',`r35')
Packit 5c3484
Packit 5c3484
C Scratch register that holds the complementary shift count, 64 - cnt.
define(`tnc',`r9')
Packit 5c3484
Packit 5c3484
C Direction-specific settings, chosen by which OPERATION_xxx symbol is
C defined when this file is processed:
C   FSH   shift applied with cnt to each limb (the direction of the operation)
C   BSH   opposite shift, applied with tnc to the limb loaded next
C   UPD   post-increment in bytes for up and rp: -8 for lshift, 8 for rshift
C   POFF  initial byte offset of the lfetch pointer r11 relative to up
C   PUPD  post-increment in bytes applied to r11 by each lfetch
C   func  name under which the routine is emitted
ifdef(`OPERATION_lshift',`
Packit 5c3484
	define(`FSH',`shl')
Packit 5c3484
	define(`BSH',`shr.u')
Packit 5c3484
	define(`UPD',`-8')
Packit 5c3484
	define(`POFF',`-512')
Packit 5c3484
	define(`PUPD',`-32')
Packit 5c3484
	define(`func',`mpn_lshift')
Packit 5c3484
')
Packit 5c3484
ifdef(`OPERATION_rshift',`
Packit 5c3484
	define(`FSH',`shr.u')
Packit 5c3484
	define(`BSH',`shl')
Packit 5c3484
	define(`UPD',`8')
Packit 5c3484
	define(`POFF',`512')
Packit 5c3484
	define(`PUPD',`32')
Packit 5c3484
	define(`func',`mpn_rshift')
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
C declares which functions this multi-function source can provide - confirm.
MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
C func (rp, up, n, cnt)
C
C Shifts the n-limb operand at up by cnt bits in the FSH direction and stores
C n result limbs at rp.  Every stored limb except the last is
C FSH(limb, cnt) OR-ed with BSH(next limb, tnc), where next means the limb
C loaded after it; the last stored limb is FSH(last limb, cnt) alone.
C r8 receives BSH(first limb, tnc), the bits shifted out of the operand.
C ar.lc is saved in r2 on entry and restored before returning.
C
C NOTE(review): no path tests n for zero, and tnc = 64 - cnt is used as a
C shift count unchecked, so callers presumably guarantee n >= 1 and
C 1 <= cnt <= 63 - confirm against the mpn interface documentation.
PROLOGUE(func)
Packit 5c3484
	.prologue
Packit 5c3484
C Unwind annotation only: the copy of ar.lc into r2 is done by the mov.i in
C the first bundle below.
	.save	ar.lc, r2
Packit 5c3484
	.body
Packit 5c3484
C With HAVE_ABI_32 defined, rp and up are converted with addp4, n is
C sign-extended and cnt zero-extended from 32 bits before anything else.
ifdef(`HAVE_ABI_32',
Packit 5c3484
`	addp4	rp = 0, rp		C			M I
Packit 5c3484
	addp4	up = 0, up		C		M I
Packit 5c3484
	sxt4	n = n			C		M I
Packit 5c3484
	nop.m		0
Packit 5c3484
	nop.m		0
Packit 5c3484
	zxt4	cnt = cnt		C		I
Packit 5c3484
	;;
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C Setup, first group: p14 = (n > 4), p15 = (n <= 4); r14 = n mod 4;
C r2 = caller value of ar.lc; r15 = n - 1; tnc = 64 - cnt; r16 = n - 5.
 {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
Packit 5c3484
	and	r14 = 3, n		C		M I
Packit 5c3484
	mov.i	r2 = ar.lc		C		I0
Packit 5c3484
}{.mmi;	add	r15 = -1, n		C		M I
Packit 5c3484
	sub	tnc = 64, cnt		C		M I
Packit 5c3484
	add	r16 = -5, n
Packit 5c3484
	;;
Packit 5c3484
C Second group: p6/p7/p8 flag n mod 4 = 1/2/3.  n is replaced by
C (n - 5) >> 2 (unsigned shift), the value written to ar.lc below.  For
C n <= 4 that value is meaningless, but those paths never execute br.cloop.
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
Packit 5c3484
	cmp.eq	p7, p0 = 2, r14		C		M I
Packit 5c3484
	shr.u	n = r16, 2		C		I0
Packit 5c3484
}{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
Packit 5c3484
C For lshift only: advance up and rp by 8*(n-1) bytes so they address the
C last limb; UPD is negative there, so the operand is walked downwards.
ifdef(`OPERATION_lshift',
Packit 5c3484
`	shladd	up = r15, 3, up		C		M I
Packit 5c3484
	shladd	rp = r15, 3, rp')	C		M I
Packit 5c3484
	;;
Packit 5c3484
C Third group: r11 = lfetch pointer (up + POFF); r10 = first limb, with up
C post-incremented by UPD; ar.lc = loop count computed above.
}{.mmi;	add	r11 = POFF, up		C		M I
Packit 5c3484
	ld8	r10 = [up], UPD		C		M01
Packit 5c3484
	mov.i	ar.lc = n		C		I0
Packit 5c3484
C Dispatch on n mod 4; falling through all three branches means n mod 4 = 0.
}{.bbb;
Packit 5c3484
   (p6)	br.dptk	.Lb01
Packit 5c3484
   (p7)	br.dptk	.Lb10
Packit 5c3484
   (p8)	br.dptk	.Lb11
Packit 5c3484
	;; }
Packit 5c3484
Packit 5c3484
C Feed-in for n mod 4 = 0.  r10 already holds the first limb; three more are
C loaded into r19, r16 and r17.  r8 gets the return value.  With n > 4 (p14)
C control continues at .grt4; otherwise the remaining shifts for the
C four-limb case are issued here and the stores are done from .Lr4 onwards.
.Lb00:	ld8	r19 = [up], UPD
Packit 5c3484
	;;
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	;;
Packit 5c3484
	ld8	r17 = [up], UPD
Packit 5c3484
	BSH	r8 = r10, tnc		C function return value
Packit 5c3484
	;;
Packit 5c3484
	FSH	r24 = r10, cnt
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
  (p14)	br.cond.dptk	.grt4
Packit 5c3484
	;;
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	or	r14 = r25, r24
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
C NOTE(review): r23 is not read anywhere from .Lr4 to the return, so the BSH
C below looks redundant on this path - confirm before removing.
	BSH	r23 = r10, tnc
Packit 5c3484
	br	.Lr4
Packit 5c3484
Packit 5c3484
C n >= 8 with n mod 4 = 0: four more limbs are loaded (r18, r19, r16, r17)
C while the next shift pairs are issued.  br.cloop enters the main loop at
C .Ltop when ar.lc is non-zero; otherwise (n = 8) control goes to .Lbot.
.grt4:	ld8	r18 = [up], UPD
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	BSH	r23 = r18, tnc
Packit 5c3484
	;;
Packit 5c3484
	or	r14 = r25, r24
Packit 5c3484
	ld8	r17 = [up], UPD
Packit 5c3484
	br.cloop.dpnt	.Ltop
Packit 5c3484
	br	.Lbot
Packit 5c3484
Packit 5c3484
C Feed-in for n mod 4 = 1.  With n <= 4 (p15) this is n = 1: the return value
C and the single result limb are both computed from r10, and .Lr1 stores the
C limb and returns.
.Lb01:
Packit 5c3484
  (p15)	BSH	r8 = r10, tnc		C function return value	I
Packit 5c3484
  (p15)	FSH	r22 = r10, cnt		C		I
Packit 5c3484
  (p15)	br.cond.dptk	.Lr1		C return	B
Packit 5c3484
Packit 5c3484
C n >= 5: four more limbs are loaded (r18, r19, r16, r17).  If br.cloop finds
C ar.lc non-zero, .grt5 takes over; otherwise (n = 5) the last shift pairs
C are issued here and the five stores are done from .Lr5 onwards.
.grt1:	ld8	r18 = [up], UPD
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	BSH	r8 = r10, tnc		C function return value
Packit 5c3484
	;;
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	FSH	r22 = r10, cnt
Packit 5c3484
	BSH	r23 = r18, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r17 = [up], UPD
Packit 5c3484
	FSH	r24 = r18, cnt
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
	br.cloop.dpnt	.grt5
Packit 5c3484
	;;
Packit 5c3484
	or	r15 = r23, r22
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	br	.Lr5
Packit 5c3484
Packit 5c3484
C n >= 9: three more limbs are loaded and r15 is prepared as the first store
C value, then the main loop is entered at its last group, .LL01.
.grt5:	ld8	r18 = [up], UPD
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	or	r15 = r23, r22
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	br	.LL01
Packit 5c3484
Packit 5c3484
Packit 5c3484
C Feed-in for n mod 4 = 2.  The second limb is loaded into r17.  With n > 4
C (p14) control continues at .grt2; otherwise n = 2: r14 gets the first
C result limb, r22 the last one, and .Lr2 stores both and returns.
.Lb10:	ld8	r17 = [up], UPD
Packit 5c3484
  (p14)	br.cond.dptk	.grt2
Packit 5c3484
Packit 5c3484
	BSH	r8 = r10, tnc		C function return value
Packit 5c3484
	;;
Packit 5c3484
	FSH	r20 = r10, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	or	r14 = r21, r20
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	br	.Lr2			C return
Packit 5c3484
Packit 5c3484
C n >= 6: four more limbs are loaded (r18, r19, r16, r17) while shift pairs
C are issued.  If br.cloop finds ar.lc non-zero, .grt6 takes over; otherwise
C (n = 6) one more shift pair is issued and the six stores start at .Lr6.
.grt2:	ld8	r18 = [up], UPD
Packit 5c3484
	BSH	r8 = r10, tnc		C function return value
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	FSH	r20 = r10, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	BSH	r23 = r18, tnc
Packit 5c3484
	;;
Packit 5c3484
 {.mmi;	ld8	r17 = [up], UPD
Packit 5c3484
	or	r14 = r21, r20
Packit 5c3484
	FSH	r24 = r18, cnt
Packit 5c3484
}{.mib;	nop	0
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
	br.cloop.dpnt	.grt6
Packit 5c3484
	;; }
Packit 5c3484
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	br	.Lr6
Packit 5c3484
Packit 5c3484
C n >= 10: two more limbs are loaded, then the main loop is entered at its
C third group, .LL10.
.grt6:	ld8	r18 = [up], UPD
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	br	.LL10
Packit 5c3484
Packit 5c3484
Packit 5c3484
C Feed-in for n mod 4 = 3.  The second and third limbs are loaded into r16
C and r17 and r8 gets the return value.  With n > 4 (p14) control continues
C at .grt3; otherwise n = 3 and the three stores are done from .Lr3 onwards.
.Lb11:	ld8	r16 = [up], UPD
Packit 5c3484
	;;
Packit 5c3484
	ld8	r17 = [up], UPD
Packit 5c3484
	BSH	r8 = r10, tnc		C function return value
Packit 5c3484
  (p14)	br.cond.dptk	.grt3
Packit 5c3484
	;;
Packit 5c3484
Packit 5c3484
	FSH	r26 = r10, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	or	r15 = r27, r26
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	br	.Lr3			C return
Packit 5c3484
Packit 5c3484
C n >= 7: four more limbs are loaded (r18, r19, r16, r17) while shift pairs
C are issued.  If br.cloop finds ar.lc non-zero, .grt7 takes over; otherwise
C (n = 7) the code after the br.cloop prepares r15 and one more shift pair
C and the seven stores start at .Lr7.
.grt3:	ld8	r18 = [up], UPD
Packit 5c3484
	FSH	r26 = r10, cnt
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r19 = [up], UPD
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r16 = [up], UPD
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	BSH	r23 = r18, tnc
Packit 5c3484
	;;
Packit 5c3484
	ld8	r17 = [up], UPD
Packit 5c3484
	br.cloop.dpnt	.grt7
Packit 5c3484
Packit 5c3484
	or	r15 = r27, r26
Packit 5c3484
	FSH	r24 = r18, cnt
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
	br	.Lr7
Packit 5c3484
Packit 5c3484
C n >= 11: same three operations as the n = 7 case plus one more load, then
C the main loop is entered at its second group, .LL11.
.grt7:	or	r15 = r27, r26
Packit 5c3484
	FSH	r24 = r18, cnt
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
	ld8	r18 = [up], UPD
Packit 5c3484
	br	.LL11
Packit 5c3484
Packit 5c3484
C *** MAIN LOOP START ***
C One full pass stores four result limbs and loads four source limbs, in
C four instruction groups.  Each group has its own entry label (.Ltop, .LL11,
C .LL10, .LL01) so the feed-in code can join at the group matching n mod 4.
C Per group: store the limb OR-ed together in the previous group, OR a shift
C pair issued three groups earlier, issue a new FSH/BSH pair, load one limb.
C r14 and r15 alternate as store data; (r24,r25), (r26,r27), (r20,r21) and
C (r22,r23) hold the FSH/BSH pairs; r18, r19, r16 and r17 receive the loads.
C r11 is the lfetch pointer and moves by PUPD once per pass.  br.cloop repeats
C the pass while ar.lc is non-zero.
Packit 5c3484
	ALIGN(32)
Packit 5c3484
.Ltop:
Packit 5c3484
 {.mmi;	st8	[rp] = r14, UPD		C M2
Packit 5c3484
	or	r15 = r27, r26		C M3
Packit 5c3484
	FSH	r24 = r18, cnt		C I0
Packit 5c3484
}{.mmi;	ld8	r18 = [up], UPD		C M1
Packit 5c3484
	lfetch	[r11], PUPD
Packit 5c3484
	BSH	r25 = r19, tnc		C I1
Packit 5c3484
	;; }
Packit 5c3484
.LL11:
Packit 5c3484
 {.mmi;	st8	[rp] = r15, UPD
Packit 5c3484
	or	r14 = r21, r20
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
}{.mmi;	ld8	r19 = [up], UPD
Packit 5c3484
	nop.m	0
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	;; }
Packit 5c3484
.LL10:
Packit 5c3484
 {.mmi;	st8	[rp] = r14, UPD
Packit 5c3484
	or	r15 = r23, r22
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
}{.mmi;	ld8	r16 = [up], UPD
Packit 5c3484
	nop.m	0
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	;; }
Packit 5c3484
.LL01:
Packit 5c3484
 {.mmi;	st8	[rp] = r15, UPD
Packit 5c3484
	or	r14 = r25, r24
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
}{.mib;	ld8	r17 = [up], UPD
Packit 5c3484
	BSH	r23 = r18, tnc
Packit 5c3484
	br.cloop.dptk	.Ltop
Packit 5c3484
	;; }
Packit 5c3484
C *** MAIN LOOP END ***
Packit 5c3484
Packit 5c3484
C Wind-down.  No further loads are issued; the groups below repeat the
C store / or / shift pattern of the main loop with the loads removed, each
C group doing less as the pipeline empties.  A label .LrK is entered with K
C result limbs still to be stored (.Lbot with eight), which is how the short
C operand cases from the feed-in code join here.
.Lbot:
Packit 5c3484
 {.mmi;	st8	[rp] = r14, UPD
Packit 5c3484
	or	r15 = r27, r26
Packit 5c3484
	FSH	r24 = r18, cnt
Packit 5c3484
}{.mib;	nop	0
Packit 5c3484
	BSH	r25 = r19, tnc
Packit 5c3484
	nop	0
Packit 5c3484
	;; }
Packit 5c3484
.Lr7:
Packit 5c3484
 {.mmi;	st8	[rp] = r15, UPD
Packit 5c3484
	or	r14 = r21, r20
Packit 5c3484
	FSH	r26 = r19, cnt
Packit 5c3484
}{.mib;	nop	0
Packit 5c3484
	BSH	r27 = r16, tnc
Packit 5c3484
	nop	0
Packit 5c3484
	;; }
Packit 5c3484
.Lr6:
Packit 5c3484
 {.mmi;	st8	[rp] = r14, UPD
Packit 5c3484
	or	r15 = r23, r22
Packit 5c3484
	FSH	r20 = r16, cnt
Packit 5c3484
}{.mib;	nop	0
Packit 5c3484
	BSH	r21 = r17, tnc
Packit 5c3484
	nop	0
Packit 5c3484
	;; }
Packit 5c3484
C From here on r17 is the last source limb; its FSH result in r22 has no
C BSH partner and is stored unmodified at .Lr1.
.Lr5:	st8	[rp] = r15, UPD
Packit 5c3484
	or	r14 = r25, r24
Packit 5c3484
	FSH	r22 = r17, cnt
Packit 5c3484
	;;
Packit 5c3484
.Lr4:	st8	[rp] = r14, UPD
Packit 5c3484
	or	r15 = r27, r26
Packit 5c3484
	;;
Packit 5c3484
.Lr3:	st8	[rp] = r15, UPD
Packit 5c3484
	or	r14 = r21, r20
Packit 5c3484
	;;
Packit 5c3484
.Lr2:	st8	[rp] = r14, UPD
Packit 5c3484
	;;
Packit 5c3484
C Final store, then restore the ar.lc value saved in r2 at entry and return
C through b0 with the result in r8.
.Lr1:	st8	[rp] = r22, UPD		C		M23
Packit 5c3484
	mov	ar.lc = r2		C		I0
Packit 5c3484
	br.ret.sptk.many b0		C		B
Packit 5c3484
EPILOGUE(func)
Packit 5c3484
ASM_END()