Blame sysdeps/ia64/memmove.S

Packit 6c4009
/* Optimized version of the standard memmove() function.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
   Copyright (C) 2000-2018 Free Software Foundation, Inc.
Packit 6c4009
   Contributed by Dan Pop <Dan.Pop@cern.ch>.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
/* Return: dest
Packit 6c4009
Packit 6c4009
   Inputs:
Packit 6c4009
        in0:    dest
Packit 6c4009
        in1:    src
Packit 6c4009
        in2:    byte count
Packit 6c4009
Packit 6c4009
   The core of the function is the memcpy implementation used in memcpy.S.
Packit 6c4009
   When bytes have to be copied backwards, only the easy case, when
Packit 6c4009
   all arguments are multiples of 8, is optimised.
Packit 6c4009
Packit 6c4009
   In this form, it assumes little endian mode.  For big endian mode,
Packit 6c4009
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
Packit 6c4009
   or the UM.be bit should be cleared at the beginning and set at the end.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#undef ret
Packit 6c4009
Packit 6c4009
/* OP_T_THRES: on the forward path taken when dest, src or len is not a
   multiple of 8, copies of at most this many bytes are done byte by byte.
   OPSIZ: the word size in bytes; in the code shown here it is referenced
   only from comments (the instructions use the literals 8 and -8).  */
#define OP_T_THRES 	16
Packit 6c4009
#define OPSIZ 		 8
Packit 6c4009
Packit 6c4009
/* Symbolic names for the static general registers used by memmove:
     adest              second store pointer (dest + 8) in the 16-byte loop
     saved_pr, saved_lc copies of pr and ar.lc, restored before returning
     dest, src, len     working copies of in0, in1 and in2
     asrc               second load pointer (src + 8) in the 16-byte loop,
                        or src rounded down to a multiple of 8 in the
                        shifted loops
     tmp2, tmp3, tmp4   temporaries
     ptable, ploop56    addresses of .table and .loop56
     loopaddr           computed address of the shifted loop to branch to
     sh1                src % 8, later multiplied by 8
     loopcnt            value that gets written to ar.lc
     value              the datum in flight in the non-rotating copies and
                        in the shifted loops.  */
#define adest		r15
Packit 6c4009
#define saved_pr	r17
Packit 6c4009
#define saved_lc	r18
Packit 6c4009
#define dest		r19
Packit 6c4009
#define src		r20
Packit 6c4009
#define len		r21
Packit 6c4009
#define asrc		r22
Packit 6c4009
#define tmp2		r23
Packit 6c4009
#define tmp3		r24
Packit 6c4009
#define	tmp4		r25
Packit 6c4009
#define ptable		r26
Packit 6c4009
#define ploop56		r27
Packit 6c4009
#define	loopaddr	r28
Packit 6c4009
#define	sh1		r29
Packit 6c4009
#define loopcnt		r30
Packit 6c4009
#define	value		r31
Packit 6c4009
Packit 6c4009
/* ALIGN(n): normally expands to a `.align n' directive.  When
   GAS_ALIGN_BREAKS_UNWIND_INFO is defined the argument is ignored and the
   macro expands to `{ nop 0 }' instead, so no .align directive is emitted.  */
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
Packit 6c4009
# define ALIGN(n)	{ nop 0 }
Packit 6c4009
#else
Packit 6c4009
# define ALIGN(n)	.align n
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* LOOP(shift) expands to the pipelined loop labelled .loop<shift>, used
   when dest is 8-byte aligned but src is not:
     stage p[0]         loads the next 8-byte word through asrc (which is
                        src rounded down to a multiple of 8);
     stage p[MEMLAT]    uses shrp to combine the two neighbouring words
                        r[MEMLAT] and r[MEMLAT+1], shifted right by `shift'
                        bits, into `value';
     stage p[MEMLAT+1]  stores `value' to dest.
   The loop is driven by br.ctop (ar.lc / ar.ec are set by the caller of
   the macro); once it falls out, control branches to .cpyfew, which copies
   the remaining len bytes.  */
#define LOOP(shift)							\
Packit 6c4009
		ALIGN(32);						\
Packit 6c4009
.loop##shift##:								\
Packit 6c4009
(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
Packit 6c4009
(p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
Packit 6c4009
(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
Packit 6c4009
		nop.b	0 ;						\
Packit 6c4009
		nop.b	0 ;						\
Packit 6c4009
		br.ctop.sptk .loop##shift ;				\
Packit 6c4009
		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
Packit 6c4009
Packit 6c4009
/* MEMLAT: distance, in pipeline stages, between the load (predicate p[0])
   and the matching store (predicate p[MEMLAT]) in the rotating loops; the
   epilog counter is set to MEMLAT + 1 (MEMLAT + 2 for the shifted loops).
   Nrot: 2*MEMLAT + 3 rounded up to a multiple of 8.  2*MEMLAT + 3 is the
   number of rotating registers named by `.rotr r[MEMLAT + 2], q[MEMLAT + 1]';
   Nrot is passed to alloc as the size of the rotating region.  */
#define MEMLAT	21
Packit 6c4009
#define Nrot	(((2*MEMLAT+3) + 7) & ~7)
Packit 6c4009
Packit 6c4009
/* memmove: copy in2 bytes from in1 (src) to in0 (dest); dest is returned
   in ret0.

   Strategy:
     - len == 0: return immediately.
     - dest <= src, or dest >= src + len: copy forward (.forward).
         * dest, src and len all multiples of 8: pipelined loop .l0 moving
           16 bytes per iteration, preceded by one extra word when len is
           not a multiple of 16.
         * otherwise (.next): at most OP_T_THRES bytes are copied byte by
           byte (.cpyfew); longer copies first byte-copy until dest is
           8-byte aligned (.l1), then copy words with either .l3 (src also
           aligned) or one of the LOOP(shift) loops (src misaligned), and
           finish the len % 8 tail in .cpyfew.
     - src < dest < src + len: copy backward (.backward), by words when
       dest, src and len are all multiples of 8, else by bytes (.bytecopy).

   pr and ar.lc are saved on entry and restored on every return path.  */
ENTRY(memmove)
Packit 6c4009
	.prologue
Packit 6c4009
	// Register frame: 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating.
	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
Packit 6c4009
	// Names for the rotating registers/predicates used by the pipelined loops.
	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
Packit 6c4009
	.rotp	p[MEMLAT + 2]
Packit 6c4009
	mov	ret0 = in0		// return value = dest
Packit 6c4009
	.save pr, saved_pr
Packit 6c4009
	mov	saved_pr = pr		// save the predicate registers
Packit 6c4009
	.save ar.lc, saved_lc
Packit 6c4009
        mov 	saved_lc = ar.lc	// save the loop counter
Packit 6c4009
	.body
Packit 6c4009
	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
Packit 6c4009
	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
Packit 6c4009
	mov 	dest = in0		// dest
Packit 6c4009
	mov 	src = in1		// src
Packit 6c4009
	mov	len = in2		// len
Packit 6c4009
	sub	tmp2 = r0, in0		// tmp2 = -dest
Packit 6c4009
	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
Packit 6c4009
(p6)	br.cond.spnt .restore_and_exit;;// 	return dest;
Packit 6c4009
	// tmp4 == 0 iff dest, src and len are all multiples of 8; it is tested
	// later on both the forward and the backward path.
	and	tmp4 = 7, tmp3 		// tmp4 = (dest | src | len) & 7
Packit 6c4009
	// NOTE(review): cmp.le / cmp.lt below compare the addresses as signed
	// 64-bit values (the unsigned forms would be cmp.leu / cmp.ltu) --
	// confirm this is intended.
	cmp.le	p6, p0 = dest, src	// if dest <= src it's always safe
Packit 6c4009
(p6)	br.cond.spnt .forward		// to copy forward
Packit 6c4009
	add	tmp3 = src, len;;
Packit 6c4009
	cmp.lt	p6, p0 = dest, tmp3	// if dest > src && dest < src + len
Packit 6c4009
(p6)	br.cond.spnt .backward		// we have to copy backward
Packit 6c4009
Packit 6c4009
// Forward copy.  Also reached by fall-through when dest >= src + len.
.forward:
Packit 6c4009
	shr.u	loopcnt = len, 4 ;;	// loopcnt = len / 16
Packit 6c4009
	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
Packit 6c4009
(p6)	br.cond.sptk .next		//	goto next;
Packit 6c4009
Packit 6c4009
// The optimal case, when dest, src and len are all multiples of 8
Packit 6c4009
Packit 6c4009
	// tmp3 = len % 16: non-zero here means len is an odd multiple of 8, so
	// one word is copied separately before the 16-bytes-per-iteration loop.
	and	tmp3 = 0xf, len
Packit 6c4009
	mov	pr.rot = 1 << 16	// set rotating predicates
Packit 6c4009
	mov	ar.ec = MEMLAT + 1 ;;	// set the epilog counter
Packit 6c4009
	cmp.ne	p6, p0 = tmp3, r0	// do we have to copy an extra word?
Packit 6c4009
	adds	loopcnt = -1, loopcnt;;	// --loopcnt
Packit 6c4009
(p6)	ld8	value = [src], 8;;
Packit 6c4009
(p6)	st8	[dest] = value, 8	// copy the "odd" word
Packit 6c4009
	mov	ar.lc = loopcnt 	// set the loop counter
Packit 6c4009
	// len == 8: the single word was copied above and loopcnt is -1, so
	// skip the loop; .restore_and_exit puts the saved ar.lc back.
	cmp.eq	p6, p0 = 8, len
Packit 6c4009
(p6)	br.cond.spnt .restore_and_exit;;// the one-word special case
Packit 6c4009
	adds	adest = 8, dest		// set adest one word ahead of dest
Packit 6c4009
	adds	asrc = 8, src ;;	// set asrc one word ahead of src
Packit 6c4009
	nop.b	0			// get the "golden" alignment for
Packit 6c4009
	nop.b	0			// the next loop
Packit 6c4009
// Pipelined loop: two interleaved 8-byte streams (src -> dest and
// asrc -> adest), each stepping by 16, i.e. 16 bytes per iteration.
.l0:
Packit 6c4009
(p[0])		ld8	r[0] = [src], 16
Packit 6c4009
(p[0])		ld8	q[0] = [asrc], 16
Packit 6c4009
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
Packit 6c4009
(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
Packit 6c4009
		br.ctop.dptk .l0 ;;
Packit 6c4009
Packit 6c4009
	// Aligned forward copy finished: same restore/return sequence as
	// .restore_and_exit.
	mov	pr = saved_pr, -1	// restore the predicate registers
Packit 6c4009
	mov	ar.lc = saved_lc	// restore the loop counter
Packit 6c4009
	br.ret.sptk.many b0
Packit 6c4009
// Forward copy when dest, src or len is not a multiple of 8.
.next:
Packit 6c4009
	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
Packit 6c4009
	and	loopcnt = 7, tmp2 		// loopcnt = -dest % 8
Packit 6c4009
(p6)	br.cond.spnt	.cpyfew			// copy byte by byte
Packit 6c4009
	;;
Packit 6c4009
	// loopcnt = number of bytes needed to bring dest up to a multiple of 8;
	// zero means dest is already aligned.
	cmp.eq	p6, p0 = loopcnt, r0
Packit 6c4009
(p6)	br.cond.sptk	.dest_aligned
Packit 6c4009
	sub	len = len, loopcnt	// len -= -dest % 8
Packit 6c4009
	adds	loopcnt = -1, loopcnt	// --loopcnt
Packit 6c4009
	;;
Packit 6c4009
	mov	ar.lc = loopcnt
Packit 6c4009
.l1:					// copy -dest % 8 bytes
Packit 6c4009
	ld1	value = [src], 1	// value = *src++
Packit 6c4009
	;;
Packit 6c4009
	st1	[dest] = value, 1	// *dest++ = value
Packit 6c4009
	br.cloop.dptk .l1
Packit 6c4009
// dest is a multiple of 8 from here on.  Split len into whole words
// (loopcnt = len / 8 - 1 for ar.lc) and a tail of len % 8 bytes that is
// left in len for .cpyfew.
.dest_aligned:
Packit 6c4009
	and	sh1 = 7, src 		// sh1 = src % 8
Packit 6c4009
	and	tmp2 = -8, len   	// tmp2 = len & -OPSIZ
Packit 6c4009
	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
Packit 6c4009
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
Packit 6c4009
	and	len = 7, len;;		// len = len % 8
Packit 6c4009
	adds	loopcnt = -1, loopcnt	// --loopcnt
Packit 6c4009
	// Linkage-table slots holding the addresses of .table and .loop56;
	// they are dereferenced below only when src is misaligned.
	addl	tmp4 = @ltoff(.table), gp
Packit 6c4009
	addl	tmp3 = @ltoff(.loop56), gp
Packit 6c4009
	mov     ar.ec = MEMLAT + 1	// set EC
Packit 6c4009
	mov     pr.rot = 1 << 16;;	// set rotating predicates
Packit 6c4009
	mov	ar.lc = loopcnt		// set LC
Packit 6c4009
	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
Packit 6c4009
(p6)    br.cond.sptk .src_aligned
Packit 6c4009
	// src is misaligned.  The shifted loops read through asrc, so src is
	// advanced past the whole words here; .cpyfew then continues from src
	// for the tail bytes.
	add	src = src, tmp2		// src += len & -OPSIZ
Packit 6c4009
	// sh1 * 8 is used as the byte offset of the 8-byte .table entry with
	// index src % 8.
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
Packit 6c4009
	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
Packit 6c4009
	ld8	ptable = [tmp4];;	// ptable = &table
Packit 6c4009
	add	tmp3 = ptable, sh1;;	// tmp3 = &table + sh1
Packit 6c4009
	mov	ar.ec = MEMLAT + 1 + 1 // one more pass needed
Packit 6c4009
	ld8	tmp4 = [tmp3];;		// tmp4 = loop offset
Packit 6c4009
	sub	loopaddr = ploop56,tmp4	// loopaddr = &loop56 - loop offset
Packit 6c4009
	// Preload the first aligned source word before entering the loop.
	ld8	r[1] = [asrc], 8;;	// w0
Packit 6c4009
	mov	b6 = loopaddr;;
Packit 6c4009
	br	b6			// jump to the appropriate loop
Packit 6c4009
Packit 6c4009
	// One shifted-copy loop per src misalignment of 1..7 bytes
	// (shift = 8 * (src % 8) bits).  Each one ends by branching to .cpyfew.
	LOOP(8)
Packit 6c4009
	LOOP(16)
Packit 6c4009
	LOOP(24)
Packit 6c4009
	LOOP(32)
Packit 6c4009
	LOOP(40)
Packit 6c4009
	LOOP(48)
Packit 6c4009
	LOOP(56)
Packit 6c4009
Packit 6c4009
// src and dest are both multiples of 8: pipelined copy of one word per
// iteration, then fall through into .cpyfew for the len % 8 tail.
.src_aligned:
Packit 6c4009
.l3:
Packit 6c4009
(p[0])		ld8	r[0] = [src], 8
Packit 6c4009
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
Packit 6c4009
		br.ctop.dptk .l3
Packit 6c4009
// Copy the remaining len bytes (possibly none) forward, one at a time.
.cpyfew:
Packit 6c4009
	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
Packit 6c4009
	adds	len = -1, len		// --len;
Packit 6c4009
(p6)	br.cond.spnt	.restore_and_exit ;;
Packit 6c4009
	mov	ar.lc = len
Packit 6c4009
.l4:
Packit 6c4009
	ld1	value = [src], 1
Packit 6c4009
	;;
Packit 6c4009
	st1	[dest] = value, 1
Packit 6c4009
	br.cloop.dptk	.l4 ;;
Packit 6c4009
// Common exit: put back the caller's pr and ar.lc and return (ret0 was set
// to dest on entry).
.restore_and_exit:
Packit 6c4009
	mov     pr = saved_pr, -1    	// restore the predicate registers
Packit 6c4009
	mov 	ar.lc = saved_lc	// restore the loop counter
Packit 6c4009
	br.ret.sptk.many b0
Packit 6c4009
Packit 6c4009
// In the case of a backward copy, optimise only the case when everything
Packit 6c4009
// is a multiple of 8, otherwise copy byte by byte.  The backward copy is
Packit 6c4009
// used only when the blocks are overlapping and dest > src.
Packit 6c4009
Packit 6c4009
.backward:
Packit 6c4009
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
Packit 6c4009
	add	src = src, len		// src points one byte past the end
Packit 6c4009
	add	dest = dest, len ;; 	// dest points one byte past the end
Packit 6c4009
	// ar.ec and pr.rot are set up here for both .l5 and .l6.
	mov	ar.ec = MEMLAT + 1	// set the epilog counter
Packit 6c4009
	mov	pr.rot = 1 << 16	// set rotating predicates
Packit 6c4009
	adds	loopcnt = -1, loopcnt	// --loopcnt
Packit 6c4009
	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
Packit 6c4009
(p6)	br.cond.sptk .bytecopy ;;	// copy byte by byte backward
Packit 6c4009
	adds	src = -8, src		// src points to the last word
Packit 6c4009
	adds	dest = -8, dest 	// dest points to the last word
Packit 6c4009
	mov	ar.lc = loopcnt;;	// set the loop counter
Packit 6c4009
// Pipelined word copy from the last word down to the first.
.l5:
Packit 6c4009
(p[0])		ld8	r[0] = [src], -8
Packit 6c4009
(p[MEMLAT])	st8	[dest] = r[MEMLAT], -8
Packit 6c4009
		br.ctop.dptk .l5
Packit 6c4009
		br.cond.sptk .restore_and_exit
Packit 6c4009
// Backward byte copy.  loopcnt is recomputed as len - 1 here, replacing
// the len / 8 - 1 value computed at .backward.
.bytecopy:
Packit 6c4009
	adds	src = -1, src		// src points to the last byte
Packit 6c4009
	adds	dest = -1, dest		// dest points to the last byte
Packit 6c4009
	adds	loopcnt = -1, len;;	// loopcnt = len - 1
Packit 6c4009
	mov	ar.lc = loopcnt;;	// set the loop counter
Packit 6c4009
// Pipelined byte copy from the last byte down to the first.
.l6:
Packit 6c4009
(p[0])		ld1	r[0] = [src], -1
Packit 6c4009
(p[MEMLAT])	st1	[dest] = r[MEMLAT], -1
Packit 6c4009
		br.ctop.dptk .l6
Packit 6c4009
		br.cond.sptk .restore_and_exit
Packit 6c4009
END(memmove)
Packit 6c4009
Packit 6c4009
	.rodata
Packit 6c4009
	.align 8
Packit 6c4009
// Offsets used by memmove to pick a shifted-copy loop.  Entry k (k = 1..7)
// is the distance from .loop(8*k) to .loop56; memmove reads the entry at
// byte offset 8 * (src % 8) and subtracts it from the address of .loop56
// to get the branch target.  Entry 0 is never read: when src % 8 == 0
// memmove branches to .src_aligned before the table is consulted.
.table:
Packit 6c4009
	data8	0			// dummy entry
Packit 6c4009
	data8 	.loop56 - .loop8
Packit 6c4009
	data8 	.loop56 - .loop16
Packit 6c4009
	data8 	.loop56 - .loop24
Packit 6c4009
	data8	.loop56 - .loop32
Packit 6c4009
	data8	.loop56 - .loop40
Packit 6c4009
	data8	.loop56 - .loop48
Packit 6c4009
	data8	.loop56 - .loop56
Packit 6c4009
Packit 6c4009
// NOTE(review): libc_hidden_builtin_def is not defined in this file;
// presumably it emits the library-internal (hidden) definition alias for
// memmove -- confirm against the header that provides it.
libc_hidden_builtin_def (memmove)