/* Optimized memmove implementation for PowerPC64/POWER7.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This optimization checks if memory 'dest' overlaps with 'src'.  If it
   does not, an optimized memcpy is used (similar to memcpy for POWER7,
   embedded here to gain some cycles).
   If source and destination overlap, an optimized backwards memcpy is
   used instead.  */

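/* For reference, a minimal C sketch of the dispatch performed below by
   'subf r9,r4,r3; cmpld cr7,r9,r5; blt cr7,L(memmove_bwd)'.  The helper
   name 'memmove_ref' is illustrative only and is not part of this file:

     #include <stddef.h>
     #include <stdint.h>

     static void *memmove_ref (void *dest, const void *src, size_t len)
     {
       unsigned char *d = dest;
       const unsigned char *s = src;
       // Unsigned wrap-around makes (dest - src) < len true exactly when
       // dest points into [src, src + len), i.e. a forward copy would
       // overwrite source bytes before reading them.
       if ((uintptr_t) d - (uintptr_t) s < len)
         while (len--)                       // copy backwards, last byte first
           d[len] = s[len];
       else
         for (size_t i = 0; i < len; i++)    // plain forward copy
           d[i] = s[i];
       return dest;
     }
*/
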
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power7
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	subf    r9,r4,r3
	cmpld   cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	cmpldi	cr1,r5,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	r11,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
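/* A hedged C sketch of the CR-driven prologue below: the count of bytes
   needed to align DST is moved into the CR with mtocrf, and 'bf 31',
   'bf 30', 'bf 29' and 'bf 28' copy 1, 2, 4 and 8 bytes respectively.
   The same trick handles the tails after the main loops, where higher
   bits of the length select 64/32/16/8-byte chunks.  The helper below is
   hypothetical and only models the idea (16-byte target alignment
   assumed in this sketch):

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static size_t align_dest_sketch (unsigned char **dst,
                                      const unsigned char **src)
     {
       // Bytes to move until *dst reaches the target alignment; the
       // 'neg' + 'clrldi' sequence computes the same value in r0.
       size_t pad = (size_t) (-(uintptr_t) *dst) & 15;
       if (pad & 1) { memcpy (*dst, *src, 1); *dst += 1; *src += 1; }
       if (pad & 2) { memcpy (*dst, *src, 2); *dst += 2; *src += 2; }
       if (pad & 4) { memcpy (*dst, *src, 4); *dst += 4; *src += 4; }
       if (pad & 8) { memcpy (*dst, *src, 8); *dst += 8; *src += 8; }
       return pad;   // the caller subtracts this from the remaining length
     }
*/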
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,16f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop. Copies 128 bytes at a time. */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,r5
	srdi	12,r5,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,r4
	lvx	7,r4,6
	mtctr	12
	b	L(aligned_128loop)

	.align  4
L(aligned_128head):
	/* For the 2nd and later iterations of this loop.  */
	lvx	6,0,r4
	lvx	7,r4,6
L(aligned_128loop):
	lvx	8,r4,7
	lvx	9,r4,8
	stvx	6,0,r11
	addi	r4,r4,64
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r11,r11,64
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
	bdnz	L(aligned_128head)

L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	6,0,r4
	lvx	7,r4,6
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
32:
	bf	26,16f
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	7,r11,6
	addi	r11,r11,32
16:
	bf	27,8f
	lvx	6,0,r4
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
8:
	bf	28,4f
	ld	6,0(r4)
	addi	r4,r4,8
	std     6,0(r11)
	addi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw     6,0(r11)
	bf      30,L(tail5)
	lhz     7,4(r4)
	sth     7,4(r11)
	bflr	31
	lbz     8,6(r4)
	stb     8,6(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	r11,3
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	lwz	8,8(r4)
	stw	7,4(r11)
	lwz	6,12(r4)
	addi	r4,r4,16
	stw	8,8(r11)
	stw	6,12(r11)
	addi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(r4)
	sth	6,0(r11)
	bflr	31
	lbz	7,2(r4)
	stb	7,2(r11)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(r4)
	stb	6,4(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(r4)
	stb	6,0(r11)
	/* Return original DST pointer.  */
	blr

/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	slow for unaligned cases.  */

	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	stw	7,4(r11)
	blr

/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
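/* A scalar model of the realignment trick used below (and in the backward
   variant): each aligned 16-byte store gathers its bytes from two
   consecutive aligned source quadwords, starting 'shift' bytes into the
   first one, where shift = SRC & 15.  On big-endian this is what the
   lvsl + vperm pair computes; the little-endian path gets the same effect
   with lvsr and swapped vperm operands.  The helper name is hypothetical:

     static void merge_quadwords (unsigned char out[16],
                                  const unsigned char prev[16],
                                  const unsigned char next[16],
                                  unsigned shift)   // shift = SRC & 15
     {
       for (unsigned i = 0; i < 16; i++)
         out[i] = (shift + i < 16) ? prev[shift + i] : next[shift + i - 16];
     }
*/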
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	r5,0,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,0f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
0:
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5	      /* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,r4
#else
	lvsl	5,0,r4
#endif
	lvx	3,0,r4
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
	vor	3,4,4
	clrrdi	0,r4,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,r4,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	10,r11,6
	addi	r11,r11,32
	bdnz	L(unaligned_loop)

	clrrdi	0,r4,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	/* Start of the backward memcpy implementation: the algorithm first
	   checks whether src and dest share the same alignment; if they do,
	   it aligns both to 16 bytes and copies using VSX instructions.
	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
	   instructions to read two 16-byte chunks at a time, shift/permute
	   the bytes read, and write them aligned to dest.  */
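/* A small C sketch of the classification done below, using hypothetical
   helper names: the copy works from the end, so it is the end pointers
   that must line up.

     #include <stddef.h>
     #include <stdint.h>

     enum bwd_path { BWD_ALIGNED, BWD_UNALIGNED };

     static enum bwd_path classify_bwd (void *dest, const void *src,
                                        size_t len)
     {
       uintptr_t d = (uintptr_t) dest + len;  // one past the last byte written
       uintptr_t s = (uintptr_t) src + len;   // one past the last byte read
       // Same offset within a 16-byte block: both pointers can be brought
       // to 16-byte alignment and copied with aligned vector accesses
       // (the L(aligned_copy_bwd) path); otherwise only dest is aligned
       // and the source data is realigned with lvsr/lvsl + vperm
       // (the L(copy_GE_32_unaligned_bwd) path).
       return ((d & 15) == (s & 15)) ? BWD_ALIGNED : BWD_UNALIGNED;
     }
*/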
L(memmove_bwd):
	cmpldi	cr1,r5,31
	/* Copy is done backwards: update the pointers and check alignment.  */
	add	r11,r3,r5
	add	r4,r4,r5
	mr	r0,r11
	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
				           code.  */

	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */

	bne     cr6,L(copy_GE_32_unaligned_bwd)
	beq     L(aligned_copy_bwd)

	mtocrf	0x01,r0
	clrldi	r0,r0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,16f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop. Copies 128 bytes at a time. */
L(aligned_copy_bwd):
	li	r6,-16
	li	r7,-32
	li	r8,-48
	li	r9,-64
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail_bwd)
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	mtctr	12
	b	L(aligned_128loop_bwd)

	.align  4
L(aligned_128head_bwd):
	/* For the 2nd and later iterations of this loop.  */
	lvx	v6,r4,r6
	lvx	v7,r4,r7
L(aligned_128loop_bwd):
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	stvx	v6,r11,r6
	subi	r4,r4,64
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	lvx	v6,r4,r6
	lvx	v7,r4,7
	subi	r11,r11,64
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
	bdnz	L(aligned_128head_bwd)

L(aligned_tail_bwd):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
32:
	bf	26,16f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	subi	r11,r11,32
16:
	bf	27,8f
	lvx	v6,r4,r6
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
8:
	bf	28,4f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std     r6,-8(r11)
	subi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw     r6,-4(r11)
	bf      30,L(tail5_bwd)
	lhz     r7,-6(r4)
	sth     r7,-6(r11)
	bflr	31
	lbz     r8,-7(r4)
	stb     r8,-7(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32_bwd):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8_bwd)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned_bwd)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment_bwd)
	lbz	6,-1(r4)
	subi	r4,r4,1
	stb	6,-1(r11)
	subi	r11,r11,1

	.align	4
L(end_4bytes_alignment_bwd):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned_bwd):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	stw	r6,-4(r11)
	lwz	r8,-12(r4)
	stw	r7,-8(r11)
	lwz	r6,-16(r4)
	subi	r4,r4,16
	stw	r8,-12(r11)
	stw	r6,-16(r11)
	subi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4_bwd)
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4_bwd):
	bf	29,L(tail2_bwd)
	lwz	6,-4(r4)
	stw	6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	7,-6(r4)
	sth	7,-6(r11)
	bflr	31
	lbz	8,-7(r4)
	stb	8,-7(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2_bwd):
	bf	30,1f
	lhz	6,-2(r4)
	sth	6,-2(r11)
	bflr	31
	lbz	7,-3(r4)
	stb	7,-3(r11)
	blr

	.align	4
L(tail5_bwd):
	bflr	31
	lbz	6,-5(r4)
	stb	6,-5(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,-1(r4)
	stb	6,-1(r11)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8_bwd):
	bne	cr6,L(tail4_bwd)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */
	lwz	6,-8(r4)
	lwz	7,-4(r4)
	stw	6,-8(r11)
	stw	7,-4(r11)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned_bwd):
	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes.  */
	srdi	r9,r5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont_bwd)

	/* DST is not quadword aligned and r10 holds the address masked to
           compare alignments.  */
	mtocrf	0x01,r10
	subf	r5,r10,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,0f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
0:
	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont_bwd):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16	      /* Index for 16-bytes offsets.  */
	li	r7,-32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	r8,r5,5	      /* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,r9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	v5,r0,r4
#else
	lvsl	v5,r0,r4
#endif
	lvx	v3,0,r4
	li	r0,0
	bf	31,L(setup_unaligned_loop_bwd)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
	vor	v3,v4,v4
	clrrdi	r0,r4,60

L(setup_unaligned_loop_bwd):
	mtctr	r8
	ble	cr6,L(end_unaligned_loop_bwd)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop_bwd):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	lvx	v3,r4,r7
#ifdef __LITTLE_ENDIAN__
	vperm	v10,v4,v3,v5
#else
	vperm	v10,v3,v4,v5
#endif
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v10,r11,r7
	subi	r11,r11,32
	bdnz	L(unaligned_loop_bwd)

	clrrdi	r0,r4,60

	.align	4
L(end_unaligned_loop_bwd):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr
END_GEN_TB (MEMMOVE, TB_TOCLESS)
libc_hidden_builtin_def (memmove)


/* void bcopy (const void *src [r3], void *dest [r4], size_t n [r5])
   Implemented in this file to avoid the linker creating a stub function
   call for the branch to '_memmove'.  */
ENTRY_TOCLESS (__bcopy)
	mr	r6,r3
	mr	r3,r4
	mr	r4,r6
	b	L(_memmove)
END (__bcopy)
weak_alias (__bcopy, bcopy)