Blame mpn/alpha/com.asm

Packit 5c3484
dnl  Alpha mpn_com -- mpn one's complement.
Packit 5c3484
Packit 5c3484
dnl  Copyright 2003 Free Software Foundation, Inc.
Packit 5c3484
Packit 5c3484
dnl  This file is part of the GNU MP Library.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
Packit 5c3484
dnl  it under the terms of either:
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU Lesser General Public License as published by the Free
Packit 5c3484
dnl      Software Foundation; either version 3 of the License, or (at your
Packit 5c3484
dnl      option) any later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or
Packit 5c3484
dnl
Packit 5c3484
dnl    * the GNU General Public License as published by the Free Software
Packit 5c3484
dnl      Foundation; either version 2 of the License, or (at your option) any
Packit 5c3484
dnl      later version.
Packit 5c3484
dnl
Packit 5c3484
dnl  or both in parallel, as here.
Packit 5c3484
dnl
Packit 5c3484
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
Packit 5c3484
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
Packit 5c3484
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
Packit 5c3484
dnl  for more details.
Packit 5c3484
dnl
Packit 5c3484
dnl  You should have received copies of the GNU General Public License and the
Packit 5c3484
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
Packit 5c3484
dnl  see https://www.gnu.org/licenses/.
Packit 5c3484
Packit 5c3484
include(`../config.m4')
Packit 5c3484
Packit 5c3484
Packit 5c3484
C      cycles/limb
Packit 5c3484
C EV4:    4.75
Packit 5c3484
C EV5:    2.0
Packit 5c3484
C EV6:    1.5
Packit 5c3484
Packit 5c3484
Packit 5c3484
C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
Packit 5c3484
C
Packit 5c3484
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
Packit 5c3484
C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
Packit 5c3484
C will be 1.5+2/N c/l.
Packit 5c3484
C
Packit 5c3484
C 2 cycles of loop control are unavoidable, for pointer updates and the
Packit 5c3484
C taken branch bubble, but also since ldq cannot issue two cycles after stq
Packit 5c3484
C (and with a run of stqs that means neither of two cycles at the end of the
Packit 5c3484
C loop).
Packit 5c3484
C
Packit 5c3484
C The fbeq is forced into the second cycle of the loop using unops, since
Packit 5c3484
C the first time through it must wait for the cvtqt result.  Once that
Packit 5c3484
C result is ready (a 1 cycle stall) then both the branch and following loads
Packit 5c3484
C can issue together.
Packit 5c3484
C
Packit 5c3484
C The main loop handles an odd count of limbs, being two limbs loaded before
Packit 5c3484
C each size test, plus one pipelined around from the previous iteration (or
Packit 5c3484
C setup in the entry sequence).
Packit 5c3484
C
Packit 5c3484
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
Packit 5c3484
C entry sequence, and an increment of the pointers.  For an odd size there's
Packit 5c3484
C no increment and the first store in the loop (r24) is a repeat of dst[0].
Packit 5c3484
C
Packit 5c3484
C Note that the load for r24 after the possible pointer increment is done
Packit 5c3484
C before the explicit store to dst[0], in case src==dst.
Packit 5c3484
Packit 5c3484
Packit 5c3484
ASM_START()
Packit 5c3484
Packit 5c3484
C L(dat) is the floating-point constant 2.0.  mpn_com loads it into f1 and
C subtracts it from the floating-point loop counter f0 once per loop pass.
FLOAT64(L(dat), 2.0)
Packit 5c3484
Packit 5c3484
	C Presumably aligns the function entry so that the "16-byte alignment
	C here" note before L(top) holds -- TODO confirm against the expansion
	C of PROLOGUE and LEA.
	ALIGN(16)
Packit 5c3484
Packit 5c3484
PROLOGUE(mpn_com,gp)
	C Store the one's complement of each of the size limbs at src into dst.
	C
	C src[0] is loaded and dst[0] is stored unconditionally, so size >= 1
	C is presumably required -- TODO confirm against callers.
	C
	C The loop count (size-3)/2 is kept as a float in f0 and stepped down by
	C 2.0 (f1) per pass.  It is moved from the integer side to f0 through a
	C temporary stack slot (stq r6 / ldt f0) and then converted by cvtqt.
	C
	C NOTE(review): nothing below writes r0, so no return value appears to
	C be produced.
Packit 5c3484
Packit 5c3484
	C r16	dst
Packit 5c3484
	C r17	src
Packit 5c3484
	C r18	size
Packit 5c3484
Packit 5c3484
	lda	r30, -16(r30)		C temporary stack space
Packit 5c3484
	lda	r7, -3(r18)		C size - 3
Packit 5c3484
Packit 5c3484
	ldq	r20, 0(r17)		C src[0]
Packit 5c3484
	srl	r7, 1, r6		C (size-3)/2
Packit 5c3484
Packit 5c3484
	C The count stored here is only consumed inside the loop.  When
	C size-3 is negative the blt below leaves before the loop is entered.
	stq	r6, 8(r30)		C (size-3)/2
Packit 5c3484
	and	r7, 1, r5		C 1 if size even
Packit 5c3484
Packit 5c3484
	LEA(	r8, L(dat))
Packit 5c3484
	s8addq	r5, r17, r17		C skip src[0] if even
Packit 5c3484
Packit 5c3484
	ornot	r31, r20, r20		C ~src[0]
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	ldt	f0, 8(r30)		C (size-3)/2
Packit 5c3484
	C This load must stay ahead of the dst[0] store below: with src==dst
	C and an odd size it reads src[0], which that store overwrites.
	ldq	r24, 0(r17)		C src[0 or 1]
Packit 5c3484
Packit 5c3484
	stq	r20, 0(r16)		C dst[0]
Packit 5c3484
	C r19 becomes the working dst pointer.  r16 is not used again.
	s8addq	r5, r16, r19		C skip dst[0] if even
Packit 5c3484
Packit 5c3484
	ldt	f1, 0(r8)		C data 2.0
Packit 5c3484
	lda	r30, 16(r30)		C restore stack
Packit 5c3484
	unop
Packit 5c3484
	cvtqt	f0, f0			C (size-3)/2 as float
Packit 5c3484
Packit 5c3484
	ornot	r31, r24, r24
Packit 5c3484
	C size is 1 or 2 on the taken path: only the limb in r24 is left to
	C store, and done_1 does not look at f0.
	blt	r7, L(done_1)		C if size<=2
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
Packit 5c3484
	C 16-byte alignment here
Packit 5c3484
L(top):
Packit 5c3484
	C r17	src, incrementing
Packit 5c3484
	C r19	dst, incrementing
Packit 5c3484
	C r24	dst[i] result, ready to store
Packit 5c3484
	C f0	(size-3)/2, decrementing
Packit 5c3484
	C f1	2.0
Packit 5c3484
	C
	C A full pass stores four limbs, dst[i] to dst[i+3], and leaves the
	C complement of src[i+4] in r24 for the next pass or for done_1.
Packit 5c3484
Packit 5c3484
	ldq	r20, 8(r17)		C src[i+1]
Packit 5c3484
	ldq	r21, 16(r17)		C src[i+2]
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	C f0 == 0 means three limbs remain: the one in r24 plus the two just
	C loaded into r20 and r21.  done_2 complements and stores them.
	fbeq	f0, L(done_2)
Packit 5c3484
	unop
Packit 5c3484
	ldq	r22, 24(r17)		C src[i+3]
Packit 5c3484
	ldq	r23, 32(r17)		C src[i+4]
Packit 5c3484
Packit 5c3484
	stq	r24, 0(r19)		C dst[i]
Packit 5c3484
	ornot	r31, r20, r20
Packit 5c3484
	subt	f0, f1, f0		C count -= 2
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	stq	r20, 8(r19)		C dst[i+1]
Packit 5c3484
	ornot	r31, r21, r21
Packit 5c3484
	unop
Packit 5c3484
	unop
Packit 5c3484
Packit 5c3484
	stq	r21, 16(r19)		C dst[i+2]
Packit 5c3484
	ornot	r31, r22, r22
Packit 5c3484
Packit 5c3484
	stq	r22, 24(r19)		C dst[i+3]
Packit 5c3484
	ornot	r31, r23, r24
Packit 5c3484
Packit 5c3484
	lda	r17, 32(r17)		C src += 4
Packit 5c3484
	lda	r19, 32(r19)		C dst += 4
Packit 5c3484
	unop
	C Falls through to done_1 once f0 has gone negative, at which point
	C the limb carried in r24 is the only one left to store.
Packit 5c3484
	fbge	f0, L(top)
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done_1):
Packit 5c3484
	C r19	&dst[size-1]
Packit 5c3484
	C r24	result for dst[size-1]
Packit 5c3484
	C
	C Reached by falling out of the loop, or directly from the entry
	C sequence when size<=2.
Packit 5c3484
Packit 5c3484
	stq	r24, 0(r19)		C dst[size-1]
Packit 5c3484
	ret	r31, (r26), 1
Packit 5c3484
Packit 5c3484
Packit 5c3484
L(done_2):
Packit 5c3484
	C r19	&dst[size-3]
Packit 5c3484
	C r20	src[size-2]
Packit 5c3484
	C r21	src[size-1]
Packit 5c3484
	C r24	result for dst[size-3]
Packit 5c3484
	C
	C Reached only from the fbeq in the loop.  r20 and r21 still hold
	C uncomplemented source limbs at this point.
Packit 5c3484
Packit 5c3484
	stq	r24, 0(r19)		C dst[size-3]
Packit 5c3484
	ornot	r31, r20, r20
Packit 5c3484
Packit 5c3484
	stq	r20, 8(r19)		C dst[size-2]
Packit 5c3484
	ornot	r31, r21, r21
Packit 5c3484
Packit 5c3484
	stq	r21, 16(r19)		C dst[size-1]
Packit 5c3484
	ret	r31, (r26), 1
Packit 5c3484
Packit 5c3484
EPILOGUE()
Packit 5c3484
ASM_END()