/* sysdeps/powerpc/powerpc32/power7/mempcpy.S  */

/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
	Returns 'dst' + 'len'.

   Register usage, as established by the code below:
     r3       Running DST pointer for the head and tail copies;
	      overwritten with the return value on every exit.
     r4       SRC as passed in; never modified.
     r5       LEN as passed in; never modified, so every exit computes
	      the result as r30 + r5.
     r12      Running SRC pointer.
     r30      Copy of the original DST (spilled to 20(r1) on entry).
     r31      Bytes still to copy after the head has been aligned
	      (spilled to 24(r1) on entry).  It is only written on the
	      32+ byte paths, so only those exits reload it.
     r10/r11  DST/SRC cursors of the two 32-bytes-per-iteration loops.
     r8, r9   Block counts (r8 feeds CTR).
     r0, r6, r7  Scratch.
     f0, f6-f8   8-byte copy temporaries (lfd/stfd).
     v3-v6, v10  Vector temporaries of the unaligned loop.
     cr7      Low four bits of a byte or block count, loaded with
	      mtcrf/mtocrf 0x01 and tested with 'bf 28..31' to select the
	      8/4/2/1 sized steps.

   EALIGN, END, CALL_MCOUNT, L() and the cfi_* macros are not defined
   in this file; they are expected to come from <sysdep.h>.  */

	.machine  power7
EALIGN (__mempcpy, 5, 0)
	CALL_MCOUNT

	/* Allocate a 32-byte frame and spill r30/r31.  */
	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3	      /* Keep the original DST for the return value.  */
	cmplwi	cr1,5,31
	neg	0,3	      /* r0 = -DST; its low bits are the distance to
				 the next alignment boundary of DST.  */
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
					code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */
	clrlwi	10,4,29	      /* Check alignment of SRC.  */
	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3	      /* Number of full doublewords remaining.  */

	/* cr0 still holds the result of the 'andi.' above: skip the head
	   copy when DST is already doubleword-aligned.  */
	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29	      /* r0 = bytes (1~7) up to the next doubleword
				 boundary of DST.  */
	mtcrf	0x01,0
	subf	31,0,5	      /* r31 = LEN minus the head bytes.  */

	/* Get DST aligned to 8 bytes.  SRC has the same alignment on this
	   path, so it becomes doubleword-aligned as well.  */

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29      /* Check alignment of SRC again.  (r10 is
				 reloaded below before it is read.)  */
	srwi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29      /* Tail bytes (0~7) left after the
				 doublewords.  */
	mtcrf	0x01,9	      /* cr7 = low bits of the doubleword count.  */

	srwi	8,31,5	      /* Number of 32-byte blocks for the loop.  */
	cmplwi	cr1,9,4	      /* Fewer than 4 doublewords in total?  */
	cmplwi	cr6,11,0      /* Any tail bytes?  Tested after the loop.  */
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f	      /* Only 3 doublewords in total: no loop.  */
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop. Copies 32-bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */

	clrrwi	0,31,3	      /* r0 = bytes copied as whole doublewords.  */
	mtcrf	0x01,31	      /* cr7 = low bits of the remaining count.  */
	beq	cr6,0f

	/* .L9 is not referenced anywhere in this file.  */
.L9:
	/* r3/r12 were left at the end of the head copy; step them over
	   the doublewords copied through r10/r11.  */
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 1-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  r31 is not written on any of the
	   paths below, so their exits only reload r30.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5	      /* cr7 = low four bits of LEN.  */
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2	      /* (r11 is not read on the short paths.)  */
	andi.	0,8,3	      /* r0 = bytes (0~3) up to the next word
				 boundary of SRC.  */
	cmplwi	cr1,5,16
	mr	10,5	      /* r10 = bytes remaining.  */
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf  0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	/* Redo the size test and the cr7 bits for the reduced count.  */
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  Also entered from the 0~8 byte path below
	   (via '2b') when LEN is 0~3.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  cr6 still holds the comparison of
	   LEN against 8 and cr7 the low four bits of LEN.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Exactly 8 bytes.  Though we could've used lfd/stfd here, they
	are still slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  With fewer than 4 bytes, reuse the 2-3 byte
	   tail of the 9~31 byte path; r12 and r3 still equal SRC and
	   DST here.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where SRC and DST differ in their
	alignment within a doubleword.  DST is first brought to a quadword
	boundary; the bulk is then moved with aligned quadword loads from
	SRC, shifted to realign the data, allowing for aligned DST
	stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15	      /* Check alignment of DST.  */
	clrlwi	0,0,28	      /* Number of bytes until the 1st
				 quadword of DST.  */
	srwi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5	      /* r31 = LEN minus the head bytes.  */

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28      /* Check alignment of SRC.  (r10 is reloaded
				 below before it is read.)  */
	srwi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28      /* Tail bytes (0~15) left after the
				 quadwords.  */
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmplwi	cr1,11,0      /* Any tail bytes?  Tested after the loop.  */
	srwi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9	      /* cr7 = low bits of the quadword count.  */
	cmplwi	cr6,9,1	      /* At most one quadword: no loop needed.  */
	/* vr5 = permute control vector built from the low bits of the SRC
	   address; the operand order of the vperm instructions below is
	   swapped to match on little-endian.  */
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Odd number of quadwords: copy another 16 bytes so that the loop
	   is left with a multiple of 32 bytes.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4	      /* vr3 = vr4, the quadword the loop continues
				 from.  */

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4	      /* r0 = bytes copied as whole quadwords.  */
	mtcrf	0x01,31	      /* cr7 = low bits of the remaining count.  */
	beq	cr1,0f

	/* r3/r12 were left at the end of the head copy; step them over
	   the quadwords copied through r10/r11.  */
	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (__mempcpy)
/* Symbol bookkeeping for the routine above.  These macros are defined
   outside this file (presumably by glibc's internal headers -- confirm
   there): going by their names, they provide the hidden internal
   definition of __mempcpy, publish mempcpy as a weak alias of it, and
   provide the hidden definition used for builtin calls to mempcpy.  */
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)