Blame mpn/ia64/mod_34lsub1.asm

Packit 5c3484
dnl  IA-64 mpn_mod_34lsub1
Packit 5c3484
Packit 5c3484
dnl  Contributed to the GNU project by Torbjorn Granlund.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
C           cycles/limb
Packit 5c3484
C Itanium:      ?
Packit 5c3484
C Itanium 2:    1
Packit 5c3484
Packit 5c3484
Packit 5c3484
C INPUT PARAMETERS
C   up  (r32)  address of the first source limb; read with ld8 and
C              post-incremented by 8 after every load
C   n   (r33)  number of limbs at up
Packit 5c3484
define(`up', `r32')
Packit 5c3484
define(`n',  `r33')
Packit 5c3484
Packit 5c3484
C Some useful aliases for registers we use
C   u0-u2  the limbs most recently loaded from [up]
C   a0-a2  three accumulators; the limbs are subtracted from them in rotation
C   c0-c2  counters for the borrows produced by those subtractions
Packit 5c3484
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
Packit 5c3484
define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
Packit 5c3484
define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
Packit 5c3484
Packit 5c3484
C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
Packit 5c3484
C with a more sophisticated implementation.  If we're really crazy, we could
Packit 5c3484
C super-unroll, storing carries just in predicate registers, then copy them to
Packit 5c3484
C a general register, and population count them from there.  That'd bring us
Packit 5c3484
C close to 3 insn/limb, for nearly 0.5 c/l.
Packit 5c3484
Packit 5c3484
C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
Packit 5c3484
C We therefore use a plain while-style loop:
Packit 5c3484
C	add		n = -3, n
Packit 5c3484
C	cmp.le		p9, p0 = 3, n
Packit 5c3484
C  (p9)	br.cond		.Loop
Packit 5c3484
C Alternatively, we could table n/3 for, say, n < 256, and predicate the
Packit 5c3484
C 16-cycle code.
Packit 5c3484
Packit 5c3484
C The summing-up code at the end was written quickly, and could surely be
Packit 5c3484
C vastly improved.
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
C mpn_mod_34lsub1 (up, n)
C
C In:   up (r32) = address of the limbs, n (r33) = number of limbs.
C Out:  r8.
C
C The limbs are dealt out in rotation to three accumulators a0, a1, a2.  Each
C limb is SUBTRACTED from its accumulator, and every borrow is counted in a
C separate counter: borrows out of a0 go to c1, out of a1 to c2, and out of
C a2 to c0.  At L(com) the accumulators and the counters are cut into pieces
C on 48-bit boundaries, and the result is formed as
C
C	0xffffffffffff0 - (sum of accumulator pieces) + (sum of counter pieces)
C
C Because 2^64 = 2^16 and 2^128 = 2^32 modulo 2^48-1, that value is congruent
C to {up,n} modulo 2^48-1.  It is not reduced to the range 0 .. 2^48-2.
C
C NOTE(review): n = 0 is not handled.  The first limb is loaded
C unconditionally, and any n other than 1 is treated as n >= 2 (n = 0 ends up
C in the n == 2 tail at L(2)).  Presumably callers guarantee n >= 1; confirm.
PROLOGUE(mpn_mod_34lsub1)
Packit 5c3484
C NOTE(review): .save names r2 as the place where ar.lc is kept, but no
C instruction in this routine copies ar.lc to r2, and ar.lc is only written
C inside the disabled ifelse block below.
	.prologue
Packit 5c3484
	.save	ar.lc, r2
Packit 5c3484
	.body
Packit 5c3484
C Only when HAVE_ABI_32 is defined: widen the 32-bit pointer with addp4 and
C zero-extend the 32-bit count with zxt4.
ifdef(`HAVE_ABI_32',`
Packit 5c3484
	addp4		up = 0, up		C			M I
Packit 5c3484
	nop.m		0
Packit 5c3484
	zxt4		n = n			C			I
Packit 5c3484
	;;
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C Disabled code (0 never equals 1).  It would take the high half of
C n * 0xAAAAAAAAAAAAAAAB, shift it right by one, and put the result in ar.lc
C as a loop count.  This looks like the n/3 computation that the comments
C above the routine reject as too slow to start up.
ifelse(0,1,`
Packit 5c3484
	movl		r14 = 0xAAAAAAAAAAAAAAAB
Packit 5c3484
	;;
Packit 5c3484
	setf.sig	f6 = r14
Packit 5c3484
	setf.sig	f7 = r33
Packit 5c3484
	;;
Packit 5c3484
	xmpy.hu		f6 = f6, f7
Packit 5c3484
	;;
Packit 5c3484
	getf.sig	r8 = f6
Packit 5c3484
	;;
Packit 5c3484
	shr.u		r8 = r8, 1		C Loop count
Packit 5c3484
	;;
Packit 5c3484
	mov.i		ar.lc = r8
Packit 5c3484
')
Packit 5c3484
Packit 5c3484
C Load the first limb and branch to L(gt1) unless n is exactly 1.
	ld8	u0 = [up], 8
Packit 5c3484
	cmp.ne	p9, p0 = 1, n
Packit 5c3484
  (p9)	br	L(gt1)
Packit 5c3484
	;;
Packit 5c3484
C n == 1: return (u0 >> 48) + (low 48 bits of u0).
	shr.u	r8 = u0, 48
Packit 5c3484
	dep.z	r27 = u0, 0, 48
Packit 5c3484
	;;
Packit 5c3484
	add	r8 = r8, r27
Packit 5c3484
	br.ret.sptk.many b0
Packit 5c3484
Packit 5c3484
Packit 5c3484
C More than one limb.  Zero the accumulators and the borrow counters, load
C the second limb into u1, and subtract 2 from n so that n counts the limbs
C not yet loaded.  p6 is cleared because the first conditional add on c0 runs
C before any compare has set it.  If fewer than 3 limbs remain to be loaded,
C skip the loop.
L(gt1):
Packit 5c3484
 {.mmi;	nop.m	0
Packit 5c3484
	mov	a0 = 0
Packit 5c3484
	add	n = -2, n
Packit 5c3484
}{.mmi;	mov	c0 = 0
Packit 5c3484
	mov	c1 = 0
Packit 5c3484
	mov	c2 = 0
Packit 5c3484
	;;
Packit 5c3484
}{.mmi;	ld8	u1 = [up], 8
Packit 5c3484
	mov	a1 = 0
Packit 5c3484
	cmp.ltu	p6, p0 = r0, r0		C clear p6
Packit 5c3484
}{.mmb;	cmp.gt	p9, p0 = 3, n
Packit 5c3484
	mov	a2 = 0
Packit 5c3484
  (p9)	br.cond.dptk	L(end)
Packit 5c3484
	;;
Packit 5c3484
}
Packit 5c3484
	ALIGN(32)
Packit 5c3484
C Main loop: three instruction groups per pass, three limbs per pass.
C   group 1: load u2;  c0 += p6;  p7 = (a0 < u0);  a0 -= u0;  n -= 3
C   group 2: load u0;  c1 += p7;  p8 = (a1 < u1);  a1 -= u1;  p9 = (n >= 3)
C   group 3: load u1;  c2 += p8;  p6 = (a2 < u2);  a2 -= u2;  loop while p9
C Each unsigned compare sees the accumulator before the subtraction in the
C same group, so its predicate is the borrow of that subtraction.  The
C matching counter is incremented one group later; p6 is consumed by group 1
C of the next pass, or by the tail code after the loop.
C The u0 and u1 loaded in groups 2 and 3 are the limbs for the next pass or
C for the tail code.
L(top):
Packit 5c3484
 {.mmi;	ld8	u2 = [up], 8
Packit 5c3484
  (p6)	add	c0 = 1, c0
Packit 5c3484
	cmp.ltu	p7, p0 = a0, u0
Packit 5c3484
}{.mmb;	sub	a0 = a0, u0
Packit 5c3484
	add	n = -3, n
Packit 5c3484
	nop.b	0
Packit 5c3484
	;;
Packit 5c3484
}{.mmi;	ld8	u0 = [up], 8
Packit 5c3484
  (p7)	add	c1 = 1, c1
Packit 5c3484
	cmp.ltu	p8, p0 = a1, u1
Packit 5c3484
}{.mmb;	sub	a1 = a1, u1
Packit 5c3484
	cmp.le	p9, p0 = 3, n
Packit 5c3484
	nop.b	0
Packit 5c3484
	;;
Packit 5c3484
}{.mmi;	ld8	u1 = [up], 8
Packit 5c3484
  (p8)	add	c2 = 1, c2
Packit 5c3484
	cmp.ltu	p6, p0 = a2, u2
Packit 5c3484
}{.mmb;	sub	a2 = a2, u2
Packit 5c3484
	nop.m	0
Packit 5c3484
dnl	br.cloop.dptk	L(top)
Packit 5c3484
  (p9)	br.cond.dptk	L(top)
Packit 5c3484
	;;
Packit 5c3484
}
Packit 5c3484
C Tail.  On arrival u0 and u1 are loaded but not yet subtracted, p6 holds the
C pending borrow out of a2 (or is clear if the loop never ran), and n is the
C number of limbs still to be loaded.  n == 0 goes to L(0); the code below
C handles the remaining cases as n == 1 or n == 2.
L(end):
Packit 5c3484
	cmp.eq	p10, p0 = 0, n
Packit 5c3484
	cmp.eq	p11, p0 = 1, n
Packit 5c3484
  (p10)	br	L(0)
Packit 5c3484
Packit 5c3484
C n == 1 or n == 2.  Shared first step: load one more limb into u2, add the
C pending p6 to c0, and do a0 -= u0.  When n == 1, branch to L(1).
L(2):
Packit 5c3484
 {.mmi;	ld8	u2 = [up], 8
Packit 5c3484
  (p6)	add	c0 = 1, c0
Packit 5c3484
	cmp.ltu	p7, p0 = a0, u0
Packit 5c3484
}{.mmb;	sub	a0 = a0, u0
Packit 5c3484
	nop.m	0
Packit 5c3484
  (p11)	br	L(1)
Packit 5c3484
	;;
Packit 5c3484
C n == 2: load the last limb into u0, then finish with a1 -= u1, a2 -= u2
C and a0 -= u0, counting each borrow in the group after its compare.
}	ld8	u0 = [up], 8
Packit 5c3484
  (p7)	add	c1 = 1, c1
Packit 5c3484
	cmp.ltu	p8, p0 = a1, u1
Packit 5c3484
	sub	a1 = a1, u1
Packit 5c3484
	;;
Packit 5c3484
  (p8)	add	c2 = 1, c2
Packit 5c3484
	cmp.ltu	p6, p0 = a2, u2
Packit 5c3484
	sub	a2 = a2, u2
Packit 5c3484
	;;
Packit 5c3484
  (p6)	add	c0 = 1, c0
Packit 5c3484
	cmp.ltu	p7, p0 = a0, u0
Packit 5c3484
	sub	a0 = a0, u0
Packit 5c3484
	;;
Packit 5c3484
  (p7)	add	c1 = 1, c1
Packit 5c3484
	br	L(com)
Packit 5c3484
Packit 5c3484
Packit 5c3484
C n == 1, entered from L(2) with a0 -= u0 already done and its borrow in p7:
C finish with a1 -= u1 and a2 -= u2.
L(1):
Packit 5c3484
  (p7)	add	c1 = 1, c1
Packit 5c3484
	cmp.ltu	p8, p0 = a1, u1
Packit 5c3484
	sub	a1 = a1, u1
Packit 5c3484
	;;
Packit 5c3484
  (p8)	add	c2 = 1, c2
Packit 5c3484
	cmp.ltu	p6, p0 = a2, u2
Packit 5c3484
	sub	a2 = a2, u2
Packit 5c3484
	;;
Packit 5c3484
  (p6)	add	c0 = 1, c0
Packit 5c3484
	br	L(com)
Packit 5c3484
Packit 5c3484
Packit 5c3484
C n == 0: only the already loaded u0 and u1 are left.  Add the pending p6 to
C c0, then do a0 -= u0 and a1 -= u1, and fall through to L(com).
L(0):
Packit 5c3484
  (p6)	add	c0 = 1, c0
Packit 5c3484
	cmp.ltu	p7, p0 = a0, u0
Packit 5c3484
	sub	a0 = a0, u0
Packit 5c3484
	;;
Packit 5c3484
  (p7)	add	c1 = 1, c1
Packit 5c3484
	cmp.ltu	p8, p0 = a1, u1
Packit 5c3484
	sub	a1 = a1, u1
Packit 5c3484
	;;
Packit 5c3484
  (p8)	add	c2 = 1, c2
Packit 5c3484
Packit 5c3484
C Final combination.  View a2:a1:a0 as one 192-bit value and cut it at every
C 48-bit boundary (upper row of the diagram = 64-bit limbs, lower row =
C 48-bit pieces).  The shr.u instructions extract the part of each register
C above a cut; the dep.z instructions extract the part below it, already
C shifted to its position within the 48-bit piece.  c0, c1, c2 are cut the
C same way as a0, a1, a2 respectively.
L(com):
Packit 5c3484
C |     a2    |     a1    |     a0    |
Packit 5c3484
C |        |        |        |        |
Packit 5c3484
	shr.u	r24 = a0, 48		C 16 bits
Packit 5c3484
	shr.u	r25 = a1, 32		C 32 bits
Packit 5c3484
	shr.u	r26 = a2, 16		C 48 bits
Packit 5c3484
	;;
Packit 5c3484
	shr.u	r10 = c0, 48		C 16 bits, always zero
Packit 5c3484
	shr.u	r11 = c1, 32		C 32 bits
Packit 5c3484
	shr.u	r30 = c2, 16		C 48 bits
Packit 5c3484
	;;
Packit 5c3484
	dep.z	r27 = a0,  0, 48	C 48 bits
Packit 5c3484
	dep.z	r28 = a1, 16, 32	C 48 bits
Packit 5c3484
	dep.z	r29 = a2, 32, 16	C 48 bits
Packit 5c3484
	dep.z	r31 = c0,  0, 48	C 48 bits
Packit 5c3484
	dep.z	r14 = c1, 16, 32	C 48 bits
Packit 5c3484
	dep.z	r15 = c2, 32, 16	C 48 bits
Packit 5c3484
	;;
Packit 5c3484
C Add up the six accumulator pieces into r24 and the six counter pieces into
C r10, pairwise.
 {.mmi;	add	r24 = r24, r25
Packit 5c3484
	add	r26 = r26, r27
Packit 5c3484
	add	r28 = r28, r29
Packit 5c3484
}{.mmi;	add	r10 = r10, r11
Packit 5c3484
	add	r30 = r30, r31
Packit 5c3484
	add	r14 = r14, r15
Packit 5c3484
	;;
Packit 5c3484
}
Packit 5c3484
C 0xffffffffffff0 is 16 * (2^48-1), a multiple of the modulus.  The
C accumulators hold negated sums, so their pieces are subtracted from this
C constant and the borrow counts are added back.
C NOTE(review): the constant is presumably sized so that the subtraction of
C r24 cannot go below zero; confirm.
	movl	r8 = 0xffffffffffff0
Packit 5c3484
	add	r24 = r24, r26
Packit 5c3484
	add	r10 = r10, r30
Packit 5c3484
	;;
Packit 5c3484
	add	r24 = r24, r28
Packit 5c3484
	add	r10 = r10, r14
Packit 5c3484
	;;
Packit 5c3484
	sub	r8 = r8, r24
Packit 5c3484
	;;
Packit 5c3484
	add	r8 = r8, r10
Packit 5c3484
	br.ret.sptk.many b0
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()