/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* TOC entry .LC0 holds the address of the variable __cache_line_size.
   The dcbz path of memset loads that address with
   "ld rCLS,.LC0@toc(r2)" and then reads the 32-bit value it points to.  */
	.section	".toc","aw"
.LC0:
	.tc __cache_line_size[TC],__cache_line_size
/* Back to the code section for the function bodies.  */
	.section	".text"
	.align 2

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's' (r3 is never written by this routine).

   Stores are issued at the widest granularity that the current alignment
   and remaining length allow: byte, halfword, word (32 bits), doubleword
   (64 bits) and 32-byte blocks made of four doubleword stores.  When the
   fill value is zero and __cache_line_size is non-zero, whole cache lines
   are cleared with the dcbz instruction instead.  */

#ifndef MEMSET
# define MEMSET memset
#endif

ENTRY (MEMSET, 5)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
/* rMEMP2, rNEG64 and rCLS all name r8; each is used in a different phase
   of the routine, so the uses do not overlap.  */
#define rMEMP2	r8	/* Store cursor while aligning to 32 bytes.  */

#define rNEG64	r8	/* Constant -64, index for the dcbtst prefetch.  */
#define rCLS	r8	/* Cache line size read from __cache_line_size.  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */

/* Common entry point: __bzero branches here with rCHR = 0 and the length
   already moved into rLEN.  */
L(_memset):
/* Lengths of 8 bytes or less are stored byte by byte in L(small).  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7	/* cr0.eq <=> s is doubleword aligned.  */
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to doubleword boundary.  cr5 records whether the original length
   is <= 31; it is tested later at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)		/* Already aligned (cr0 from andi.).  */
	mtcrf	0x01, rMEMP0		/* cr7 (CR bits 28-31) = low 4 bits of s.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes up to the next doubleword.  */
	cror	28,30,31		/* Bit 28 = s is not word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = next doubleword boundary.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)		/* Bit 29 = (s & 4): start is in the odd word.  */
/* Process the even word of doubleword: (s & 7) is 1, 2 or 3, so the fill
   is an optional byte, an optional halfword, then the whole odd word.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword: (s & 7) is 4, 5, 6 or 7.  */
L(g4):
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)
/* Falls through into L(aligned2); repeating the halfword-to-word
   replication there leaves rCHR unchanged.  */

/* Doubleword aligned from here on.  Lengths whose original value was
   <= 31 (cr5) are finished in L(medium).  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of remaining length.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  rALIGN becomes 32 - (rMEMP & 0x18); when
   the masked address is zero the block is already aligned and the
   branch to L(caligned) is taken before rALIGN is used.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
	beq	L(caligned)		/* cr0 still holds the andi. result.  */
	mtcrf	0x01, rALIGN		/* Bit 28 = (rALIGN & 8).  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP		/* Fill backwards from the boundary.  */
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)	/* One doubleword if rALIGN is 8 or 24.  */
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)	/* Two more if rALIGN is 16 or 24.  */
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  rALIGN = rLEN rounded down to a
   multiple of 32 (cr0.eq if that is zero); cr7 is reloaded with the low
   four bits of rLEN for the tail code.  */
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* Generic 32-byte block fill.  Expects rALIGN = number of bytes to fill
   in whole 32-byte blocks, with cr0 reflecting rALIGN == 0.  */
L(nondcbz):
	srdi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrldi.	rLEN, rLEN, 59		/* rLEN = tail bytes (< 32); cr0.eq if none.  */
	add	rMEMP, rMEMP, rALIGN	/* Point just past the last full block.  */
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

/* Fill blocks from the top down, hinting the line at rMEMP - 64 for
   store ahead of the stores that will reach it.  */
L(c3):	dcbtst	rNEG64, rMEMP
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	bdnz	L(c3)
/* Last (lowest) block; cr1 is prepared for the tail code meanwhile.  */
L(cloopdone):
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	cmpldi	cr1, rLEN, 16
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	beqlr				/* No tail bytes (cr0 from clrldi.).  */
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the filled blocks.  */
	b	L(medium_tail2)

	.align 5
/* Clear memory one cache line at a time with dcbz.  Reached only when
   the replicated fill value is zero.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  */
	beq	L(medium)
	ld	rCLS,.LC0@toc(r2)
	lwz	rCLS,0(rCLS)
/* If the cache line size was not set just go to L(nondcbz), which is
   safe for any cache line size.  */
	cmpldi	cr1,rCLS,0
	beq	cr1,L(nondcbz)


/* Now we know the cache line size is non-zero, but we may not yet be
   aligned to the cache line.  May have a partial line to fill, so touch
   it first.  */
	dcbt	0,rMEMP
	addi	rCLM,rCLS,-1
/* Store 32-byte blocks until rMEMP is cache-line aligned or fewer than
   32 bytes remain.  */
L(getCacheAligned):
	cmpldi	cr1,rLEN,32
	and.	rTMP,rCLM,rMEMP
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,-32(rMEMP)
	std	rCHR,-24(rMEMP)
	std	rCHR,-16(rMEMP)
	std	rCHR,-8(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  Clear one
   line per iteration while at least a full line remains.  */
L(cacheAligned):
	cmpld	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the cache line size was set and the remainder
   (rLEN) is less than 32 bytes or less than the cache line size.
   So set up the preconditions for L(nondcbz) (rALIGN and cr0) and go
   there.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 8 bytes or less.  For more than 4 bytes, store the first
   four and continue with the remaining 1-4 bytes.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
/* 0-4 bytes left: cr5 compares rLEN with 1, cr1 compares it with 3.  */
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5			/* rLEN == 0.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* rLEN == 1.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* rLEN == 2.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* rLEN == 3.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes at a doubleword-aligned rMEMP.  Expects cr7 to
   hold the low four bits of rLEN.  The region is filled backwards from
   its end: 1, 2 and 4 byte pieces according to CR bits 31, 30 and 29,
   then 16 bytes if rLEN >= 16 (cr1), then 8 bytes if bit 28 is set.  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the last byte.  */
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)

#ifndef NO_BZERO_IMPL
/* __bzero: zero a region by tail-branching into memset.
   Register use on entry, as implied by the code below: r3 = start
   address (left untouched), r4 = byte count.

   Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */
ENTRY (__bzero)
	CALL_MCOUNT 3
	mr	r5,r4		/* Byte count goes where memset expects its length.  */
	li	r4,0		/* Fill value is zero.  */
	b	L(_memset)	/* Join memset after its own CALL_MCOUNT.  */
END_GEN_TB (__bzero,TB_TOCLESS)

weak_alias (__bzero, bzero)
#endif