/* Optimized memset implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

	.machine  power7
EALIGN (memset, 5, 0)
	CALL_MCOUNT

	/* Register usage in this routine:
	     r3   destination pointer 's'; never written here, so it is
		  still the return value at every blr.
	     r4   fill value, replicated into all four bytes of the word.
	     r5   length; reduced by the bytes stored to align DST and
		  recomputed for the tail of the dcbz path.
	     r7   copy of r1 taken on entry.  r1 is only changed on the
		  32+ byte path and is restored from r7 before returning.
	     r10  current store address.
	     r12  copy of the original length on the 32+ byte path, until
		  it is reused as a second store address.
	   "mtocrf 0x01,rX" copies the low four bits of rX into cr7, so CR
	   bits 28, 29, 30 and 31 mirror the 8, 4, 2 and 1 bits of rX and
	   are tested with bf/bflr to pick the stores to perform.  */

	.align	4
L(_memset):
	cmplwi	cr7,5,31
	cmplwi	cr6,5,8
	mr	10,3		/* Save original argument for later.  */
	mr	7,1		/* Save original r1 for later.  */
	cfi_offset(31,-8)

	/* Replicate byte to word.  */
	insrwi	4,4,8,16
	insrwi	4,4,16,0

	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */

	neg	0,3		/* Low bits of -DST = bytes up to alignment.  */
	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */

	/* Save our word twice to create a doubleword that we will later
	   copy to a FPR.  */
	stwu	1,-32(1)
	andi.	11,10,7		/* Check alignment of DST.  */
	mr	12,5		/* Keep the original length for L(huge).  */
	stw	4,24(1)
	stw	4,28(1)
	beq	L(big_aligned)

	clrlwi	0,0,29		/* r0 = bytes needed to align DST to 8.  */
	mtocrf	0x01,0
	subf	5,0,5

	/* Get DST aligned to 8 bytes.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	.align	4
L(big_aligned):
	cmplwi	cr5,5,255
	li	0,32
	cmplwi	cr1,5,160
	dcbtst	0,10
	cmplwi	cr6,4,0
	srwi	9,5,3		/* Number of full doublewords remaining.  */
	crand	27,26,21	/* Bit 27 = (value == 0) && (length > 255).  */
	mtocrf	0x01,9
	bt	27,L(huge)

	/* From this point on we store with regular store instructions:
	   either the value isn't 0 or the length is <= 255, so dcbz is
	   not used.  */

	srwi	8,5,5		/* Number of 32-byte chunks, used as CTR.  */
	clrlwi	11,5,29
	cmplwi	cr6,11,0	/* cr6[eq]: no tail bytes (length % 8 == 0).  */
	cmplwi	cr1,9,4		/* cr1[lt]: fewer than 4 doublewords left.  */
	mtctr	8

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f

	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
	mr	12,10
	blt	cr1,L(tail_bytes)	/* No full 32-byte chunk: skip the loop.  */

	b	L(big_loop)

	.align	4
1:	/* Copy 1 doubleword.  */
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8

	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
	   to the lfd we will do next.  Also, ping-pong through r10 and r12
	   to avoid AGEN delays.  */
	.align	4
L(big_loop):
	addi	12,10,32
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	stw	4,16(10)
	stw	4,20(10)
	stw	4,24(10)
	stw	4,28(10)
	bdz	L(tail_bytes)

	addi	10,10,64
	stw	4,0(12)
	stw	4,4(12)
	stw	4,8(12)
	stw	4,12(12)
	stw	4,16(12)
	stw	4,20(12)
	stw	4,24(12)
	stw	4,28(12)
	bdnz	L(big_loop_fast_setup)

	mr	12,10
	b	L(tail_bytes)

	/* Now that we're probably past the LHS window, use the VSX to
	   speed up the loop.  */
L(big_loop_fast_setup):
	li	11,24
	li	6,16
	lxvdsx	4,1,11		/* Splat the doubleword saved at 24(r1).  */

	.align	4
L(big_loop_fast):
	addi	12,10,32
	stxvd2x	4,0,10
	stxvd2x	4,10,6
	bdz	L(tail_bytes)

	addi	10,10,64
	stxvd2x	4,0,12
	stxvd2x	4,12,6
	bdnz	L(big_loop_fast)

	mr	12,10

	.align	4
L(tail_bytes):

	/* Check for tail bytes.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	clrlwi	0,5,29
	mtocrf	0x01,0

	/*  At this point we have a tail of 1-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr


	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.  */
	.align	4
L(huge):
	lfd	4,24(1)		/* f4 = the doubleword saved on the stack.  */
	andi.	11,10,127
	neg	0,10
	beq	L(huge_aligned)

	clrlwi	0,0,25		/* r0 = bytes needed to align DST to 128.  */
	subf	5,0,5
	srwi	0,0,3		/* Same distance, in doublewords.  */
	mtocrf  0x01,0

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	stfd	4,0(10)
	addi	10,10,8

L(huge_aligned):
	srwi	8,5,7		/* Number of 128-byte blocks, used as CTR.  */
	clrlwi	11,5,25
	cmplwi	cr6,11,0	/* cr6[eq]: nothing left after the blocks.  */
	mtctr	8

	/* Copies 128-bytes at a time.  */
	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* We have a tail of 0~127 bytes to handle.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	subf	9,3,10		/* r9 = bytes stored so far.  */
	subf	5,9,12		/* r5 = original length - r9 = bytes left.  */
	srwi	8,5,3
	cmplwi	cr6,8,0
	mtocrf	0x01,8

	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
	speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	stfd	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here.  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr


	/* Expanded tree to copy tail bytes without increments.  cr7 holds
	   the low bits of the length (bit 29 = 4, bit 30 = 2, bit 31 = 1).
	   The letters in the label names give the state (T/F) of bits 29
	   and 30 that have already been tested; X is not tested yet.  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)
	bf	30,L(TFX)

	sth	4,4(10)
	bflr	31

	stb	4,6(10)
	blr

	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)
	bflr	31

	stb	4,2(10)
	blr

	.align	4
L(TFX):	bflr	31

	stb	4,4(10)
	blr

	.align	4
L(FFX):	bflr	31

	stb	4,0(10)
	blr

	/* Handle copies of 9~31 bytes.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3
	clrlwi	0,0,30		/* r0 = bytes needed to align DST to 4.  */
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Copy 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Copy 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  */
	cmplwi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Copy 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)	/* cr6 still holds length compared to 8.  */

	/* Length is exactly 8.  */
	stw	4,0(10)
	stw	4,4(10)
	blr

END (memset)
libc_hidden_builtin_def (memset)