/* Optimized memset implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Register usage in this routine:
     r3   's'.  No instruction in this file writes it, so it still holds
	  the return value at every blr.
     r4   Fill byte, replicated to a word and (for n >= 32) to a
	  doubleword.
     r5   Number of bytes still to be stored.
     r10  Current store address.
     r12  Original 'n' (read back after the dcbz loop); the 32-byte store
	  loop reuses it as a second store address.
     r0, r8, r9, r11  Scratch.

   "mtocrf 0x01,rX" is used throughout to drop the low four bits of a
   count into CR bits 28-31 (values 8, 4, 2 and 1 respectively), which
   the "bf" instructions then test one at a time.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power7
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,5,31	/* cr7: n against 31.  */
	cmpldi	cr6,5,8		/* cr6: n against 8.  */
	mr	10,3		/* Work on a copy so r3 stays intact.  */

	/* Replicate byte to word.  */
	insrdi	4,4,8,48
	insrdi	4,4,16,32
	ble	cr6,L(small)	/* If length <= 8, use the short store code.  */

	neg	0,3		/* Low bits of -s = distance to the next boundary.  */
	ble	cr7,L(medium)	/* If length < 32, use the medium store code.  */

	andi.	11,10,7		/* Check alignment of DST.  */
	insrdi	4,4,32,0	/* Replicate word to double word.  */

	mr	12,5		/* Keep the original length for the dcbz path.  */
	beq	L(big_aligned)

	clrldi	0,0,61		/* r0 = (-s) & 7: bytes up to 8-byte alignment.  */
	mtocrf	0x01,0
	subf	5,0,5		/* Those bytes are no longer "remaining".  */

	/* Get DST aligned to 8 bytes: CR bits 31, 30 and 29 select a 1-,
	   2- and 4-byte store.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	/* r10 is doubleword-aligned from here on and at least 25 bytes
	   remain (n >= 32, at most 7 consumed above).  */
	.align	4
L(big_aligned):

	cmpldi	cr5,5,255	/* cr5: remaining length against 255.  */
	/* NOTE(review): r0 appears to be rewritten on every path before it
	   is read again (dcbtst/dcbz below use a literal 0 base, not r0),
	   so this load looks dead -- confirm before removing.  */
	li	0,32
	dcbtst	0,10		/* Touch-for-store hint for the block at r10.  */
	cmpldi	cr6,4,0		/* cr6.eq: the replicated fill value is zero.  */
	srdi	9,5,3	/* Number of full doublewords remaining.  */
	crand	27,26,21	/* CR bit 27 = cr6.eq AND cr5.gt.  */
	mtocrf	0x01,9
	bt	27,L(huge)	/* Zero fill of more than 255 bytes.  */

	/* From this point on the fill value is non-zero or at most 255
	   bytes remain, so dcbz is not used.  */

	srdi	8,5,5		/* r8 = number of 32-byte groups.  */
	clrldi	11,5,61		/* r11 = remaining & 7 (trailing bytes).  */
	cmpldi	cr6,11,0	/* cr6.eq: no trailing bytes; read in L(tail_bytes).  */
	cmpldi	cr1,9,4		/* cr1.lt: fewer than 4 doublewords in total.  */
	mtctr	8

	/* Store 1~3 doublewords so that what is left for the main loop is
	   a multiple of 32 bytes.  */

	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	bf	31,L(big_loop)

	std	4,0(10)
	addi	10,10,8
	mr	12,10		/* L(tail_bytes) stores through r12.  */
	blt	cr1,L(tail_bytes)	/* Only 3 doublewords: CTR is 0, skip the loop.  */
	b	L(big_loop)

	.align	4
1:	/* Store 1 doubleword.  */
	bf	31,L(big_loop)

	std	4,0(10)
	addi	10,10,8

	/* Main aligned store loop.  Stores 32 bytes at a time and
	   ping-pongs through r10 and r12 to avoid AGEN delays.  CTR counts
	   32-byte groups, so each pass through the loop body consumes up
	   to two counts.  */
	.align	4
L(big_loop):
	addi	12,10,32
	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	bdz	L(tail_bytes)	/* r12 already points past the last store.  */

	addi	10,10,64
	std	4,0(12)
	std	4,8(12)
	std	4,16(12)
	std	4,24(12)
	bdnz	L(big_loop)

	mr	12,10
	b	L(tail_bytes)

	.align	4
L(tail_bytes):

	/* Entered from the doubleword store code with r12 = address of
	   the first byte not yet stored, r5 = byte count whose low three
	   bits give the tail size, and cr6 = that tail size compared
	   against 0.  */

	/* Check for tail bytes.  */
	beqlr	cr6

	clrldi	0,5,61
	mtocrf	0x01,0

	/* At this point we have a tail of 1-7 bytes and we know that the
	   destination is doubleword-aligned.  CR bits 29, 30 and 31
	   select the 4-, 2- and 1-byte stores.  */
4:	/* Store 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Store 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Store 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.

	   Entered from L(big_aligned) with r10 doubleword-aligned,
	   r4 = 0, r5 = bytes remaining (more than 255) and r12 = the
	   original 'n'.  */
	.align	4
L(huge):
	andi.	11,10,127	/* cr0.eq: already 128-byte aligned.  */
	neg	0,10
	beq	L(huge_aligned)

	clrldi	0,0,57		/* r0 = (-r10) & 127: bytes up to the boundary.  */
	subf	5,0,5		/* Take them off the remaining length.  */
	srdi	0,0,3		/* Same distance in doublewords (1~15).  */
	mtocrf	0x01,0		/* CR bits 28/29/30/31 = 8/4/2/1 doublewords.  */

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	std	4,32(10)
	std	4,40(10)
	std	4,48(10)
	std	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	std	4,0(10)
	addi	10,10,8


	/* More than 255 bytes were remaining on entry and at most 120
	   were used for alignment, so r5 >= 136 here and the count loaded
	   into CTR is never 0.  */
L(huge_aligned):
	srdi	8,5,7		/* r8 = number of whole 128-byte blocks.  */
	clrldi	11,5,57		/* r11 = remaining & 127.  */
	cmpldi	cr6,11,0	/* cr6.eq: nothing left after the dcbz loop.  */
	mtctr	8

	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* Check how many bytes are still left.  */
	beqlr	cr6

	subf	9,3,10		/* r9 = r10 - s: bytes stored so far.  */
	subf	5,9,12		/* r5 = n - r9: bytes still to store.  */
	srdi	8,5,3		/* r8 = whole doublewords in the tail.  */
	cmpldi	cr6,8,0
	mtocrf	0x01,8

	/* We have a tail of 1~127 bytes.  Store up to 15 doublewords for
	   speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	std	4,32(10)
	std	4,40(10)
	std	4,48(10)
	std	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	std	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here: the low three bits of
	   r5 select the 4-, 2- and 1-byte stores.  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr

	/* Expanded tree to store tail bytes without increments.

	   Entered from L(small) with r10 = s and CR bits 29, 30 and 31
	   holding the 4, 2 and 1 bits of a length below 8.  r10 is never
	   advanced; each leaf uses the displacement implied by the
	   stores already done.  In the label names the first letter is
	   the outcome of the 4-byte test and the second that of the
	   2-byte test (T = stored, F = skipped); X marks a bit still to
	   be tested.  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)
	bf	30,L(TFX)

	sth	4,4(10)
	bflr	31

	stb	4,6(10)
	blr

	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)
	bflr	31

	stb	4,2(10)
	blr

	.align	4
L(TFX):	bflr	31

	stb	4,4(10)
	blr

	.align	4
L(FFX):	bflr	31

	stb	4,0(10)
	blr

	/* Handle lengths of 9~31 bytes.

	   Entered with r0 = -s, r4 = fill byte replicated through the low
	   word, r5 = n and r10 = s.  Only byte, halfword and word stores
	   are used on this path.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3		/* cr0.eq: DST already word-aligned.  */
	clrldi	0,0,62		/* r0 = (-s) & 3: bytes up to word alignment.  */
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Store 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Store 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  The 16-byte
	   step is chosen by the compare; the 8/4/2/1 steps by CR bits
	   28-31 loaded from the low four bits of r5.  */
	cmpldi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Store 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Store 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Store 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Store 2 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Store 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles lengths of 0~8 bytes.

	   cr6 still holds the entry compare of n against 8.  Exactly 8
	   bytes are written with two word stores; anything shorter goes
	   through L(copy_tail), which stores nothing when n is 0 because
	   all three tested CR bits are clear.  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)

	stw	4,0(10)
	stw	4,4(10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)

/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.

   Takes the destination in r3 and the length in r4.  The length is
   moved to r5 and r4 is loaded with a zero fill value, which is the
   register layout expected at L(_memset); control then continues in
   the memset body above.  */
ENTRY_TOCLESS (__bzero)
	CALL_MCOUNT 3
	mr	r5,r4		/* Length becomes memset's third argument.  */
	li	r4,0		/* Fill value 0.  */
	b	L(_memset)
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif