Blame sysdeps/powerpc/powerpc32/power4/memset.S

Packit 6c4009
/* Optimized memset implementation for 32-bit PowerPC on POWER4.
Packit 6c4009
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
Packit 6c4009
   Returns 's'.
Packit 6c4009
Packit 6c4009
   The memset is done in three sizes: byte (8 bits), word (32 bits),
Packit 6c4009
   cache line (1024 bits). There is a special case for setting cache lines
Packit 6c4009
   to 0, to take advantage of the dcbz instruction.  */
Packit 6c4009
Packit 6c4009
	.machine power4
Packit 6c4009
/* Function entry.  EALIGN and CALL_MCOUNT are macros that are not defined
   in this file (presumably they come from <sysdep.h>); their expansion is
   not visible here.  */
EALIGN (memset, 5, 0)
Packit 6c4009
	CALL_MCOUNT
Packit 6c4009
Packit 6c4009
/* Register roles.  No instruction below writes r3, so the function returns
   its first argument unchanged.  */
#define rTMP	r0	/* Scratch: block count for CTR, discarded andi. result.  */
Packit 6c4009
#define rRTN	r3	/* Initial value of 1st argument.  */
Packit 6c4009
#define rMEMP0	r3	/* Original value of 1st arg.  */
Packit 6c4009
#define rCHR	r4	/* Char to set in each byte.  */
Packit 6c4009
#define rLEN	r5	/* Length of region to set.  */
Packit 6c4009
#define rMEMP	r6	/* Address at which we are storing.  */
Packit 6c4009
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
Packit 6c4009
#define rMEMP2	r8	/* Backward store pointer while aligning to 32 bytes.  */
Packit 6c4009
Packit 6c4009
/* rNEG64 and rCLS reuse r8 (as does rMEMP2 above); each is loaded only
   after the previous use of r8 has finished.  rCLM is defined but is not
   referenced by any instruction visible in this file.  */
#define rNEG64	r8	/* Constant -64: dcbtst offset in the L(c3) store loop.  */
Packit 6c4009
#define rCLS	r8	/* Cache line size (known to be 128).  */
Packit 6c4009
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
Packit 6c4009
L(_memset):
Packit 6c4009
/* Take care of case for size <= 4.  */
Packit 6c4009
	cmplwi	cr1, rLEN, 4
Packit 6c4009
	andi.	rALIGN, rMEMP0, 3	/* rALIGN = s & 3; cr0.eq if s is word aligned.  */
Packit 6c4009
	mr	rMEMP, rMEMP0		/* Work on a copy so r3 stays the return value.  */
Packit 6c4009
	ble-	cr1, L(small)
Packit 6c4009
Packit 6c4009
/* Align to word boundary.  */
Packit 6c4009
	cmplwi	cr5, rLEN, 31		/* cr5 is tested at L(aligned); it reflects the
					   length before the adjustment below.  */
Packit 6c4009
	insrwi	rCHR, rCHR, 8, 16     /* Replicate byte to halfword.  */
Packit 6c4009
	beq+	L(aligned)		/* cr0 from the andi. above: already aligned.  */
Packit 6c4009
	mtcrf	0x01, rMEMP0		/* CR field 7 = low 4 bits of the address.  */
Packit 6c4009
	subfic	rALIGN, rALIGN, 4	/* rALIGN = 4 - (s & 3) = bytes up to alignment.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first word-aligned address.  */
Packit 6c4009
	sub	rLEN, rLEN, rALIGN
Packit 6c4009
	bf+	31, L(g0)		/* Address even: only a halfword is needed.  */
Packit 6c4009
	stb	rCHR, 0(rMEMP0)		/* Address odd: store the first byte.  */
Packit 6c4009
	bt	30, L(aligned)		/* (s & 3) == 3: that byte completed alignment.  */
Packit 6c4009
L(g0):
Packit 6c4009
	sth	rCHR, -2(rMEMP)		/* Halfword just below the aligned address.  */
Packit 6c4009
Packit 6c4009
/* Handle the case of size <= 31 (cr5 holds the comparison of the length
   with 31 made before the word-alignment adjustment).  rMEMP is word
   aligned from here on.  */
Packit 6c4009
L(aligned):
Packit 6c4009
	mtcrf	0x01, rLEN		/* CR field 7 = low 4 bits of the remaining length.  */
Packit 6c4009
	insrwi	rCHR, rCHR, 16, 0    /* Replicate halfword to word.  */
Packit 6c4009
	ble	cr5, L(medium)
Packit 6c4009
/* Align to 32-byte boundary.  */
Packit 6c4009
	andi.	rALIGN, rMEMP, 0x1C	/* Offset within a 32-byte block; cr0.eq if zero.  */
Packit 6c4009
	subfic	rALIGN, rALIGN, 0x20	/* rALIGN = 32 - offset; cr0 is not modified.  */
Packit 6c4009
	beq	L(caligned)		/* cr0 from the andi.: already 32-byte aligned.  */
Packit 6c4009
	mtcrf	0x01, rALIGN		/* CR field 7 = low 4 bits of rALIGN (bit 28 = 8,
					   bit 29 = 4).  This replaces the length bits
					   loaded above; L(caligned) reloads them.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = next 32-byte boundary.  */
Packit 6c4009
	sub	rLEN, rLEN, rALIGN
Packit 6c4009
	cmplwi	cr1, rALIGN, 0x10	/* cr1: does rALIGN include a 16-byte piece?  */
Packit 6c4009
	mr	rMEMP2, rMEMP		/* Fill backward from the boundary via rMEMP2.  */
Packit 6c4009
	bf	28, L(a1)		/* Skip unless the 8's bit of rALIGN is set.  */
Packit 6c4009
        stw     rCHR, -4(rMEMP2)
Packit 6c4009
	stwu	rCHR, -8(rMEMP2)	/* Update form: rMEMP2 -= 8.  */
Packit 6c4009
L(a1):	blt	cr1, L(a2)		/* Skip unless rALIGN >= 16.  */
Packit 6c4009
        stw     rCHR, -4(rMEMP2)
Packit 6c4009
	stw	rCHR, -8(rMEMP2)
Packit 6c4009
	stw	rCHR, -12(rMEMP2)
Packit 6c4009
	stwu	rCHR, -16(rMEMP2)	/* Update form: rMEMP2 -= 16.  */
Packit 6c4009
L(a2):  bf      29, L(caligned)		/* Skip unless the 4's bit of rALIGN is set.  */
Packit 6c4009
        stw     rCHR, -4(rMEMP2)
Packit 6c4009
Packit 6c4009
/* Now aligned to a 32 byte boundary.  */
Packit 6c4009
L(caligned):
Packit 6c4009
	cmplwi	cr1, rCHR, 0		/* cr1.eq if the replicated fill word is zero.  */
Packit 6c4009
	clrrwi.	rALIGN, rLEN, 5		/* rALIGN = rLEN & ~31 (bytes in whole 32-byte
					   blocks); cr0.eq if there are none.  */
Packit 6c4009
	mtcrf	0x01, rLEN		/* CR field 7 = low 4 bits of the remaining length,
					   consumed by the L(medium) tail code.  */
Packit 6c4009
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
Packit 6c4009
/* Store whole 32-byte blocks with word stores.  On entry rALIGN holds the
   byte count of whole blocks and cr0 is the result of the clrrwi. that
   computed it (here above, or in L(handletail32)).  */
L(nondcbz):
Packit 6c4009
	srwi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
Packit 6c4009
	mtctr	rTMP
Packit 6c4009
	beq	L(medium)	/* We may not actually get to do a full line.  */
Packit 6c4009
	clrlwi.	rLEN, rLEN, 27		/* rLEN &= 31 (tail bytes); cr0.eq if no tail.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* Point past the last block; stores run backward.  */
Packit 6c4009
	li	rNEG64, -0x40
Packit 6c4009
	bdz	L(cloopdone)		/* Decrement CTR; the final block is always
					   stored by L(cloopdone).  */
Packit 6c4009
Packit 6c4009
        .align 4
Packit 6c4009
L(c3): 	dcbtst	rNEG64, rMEMP		/* Touch-for-store the address rMEMP - 64.  */
Packit 6c4009
        stw     rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
        stw     rCHR, -12(rMEMP)
Packit 6c4009
	stw	rCHR, -16(rMEMP)
Packit 6c4009
        stw     rCHR, -20(rMEMP)
Packit 6c4009
	stw	rCHR, -24(rMEMP)
Packit 6c4009
        stw     rCHR, -28(rMEMP)
Packit 6c4009
	stwu	rCHR, -32(rMEMP)	/* Update form: rMEMP -= 32.  */
Packit 6c4009
	bdnz	L(c3)
Packit 6c4009
/* Store the final 32-byte block, then return or hand the tail to
   L(medium_tail2).  */
L(cloopdone):
Packit 6c4009
        stw     rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
        stw     rCHR, -12(rMEMP)
Packit 6c4009
	stw	rCHR, -16(rMEMP)
Packit 6c4009
	cmplwi	cr1, rLEN, 16		/* cr1 for the tail code, set here because the
					   branch below skips the compare at L(medium).  */
Packit 6c4009
        stw     rCHR, -20(rMEMP)
Packit 6c4009
	stw	rCHR, -24(rMEMP)
Packit 6c4009
        stw     rCHR, -28(rMEMP)
Packit 6c4009
	stwu	rCHR, -32(rMEMP)
Packit 6c4009
	beqlr				/* cr0 from the clrlwi. above: no tail, done.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* The stwu's walked rMEMP back down by rALIGN;
					   move it to the end of the blocks again.  */
Packit 6c4009
	b	L(medium_tail2)
Packit 6c4009
Packit 6c4009
	.align 5
Packit 6c4009
/* Clear lines of memory in 128-byte chunks.  Reached only from
   L(caligned) when the fill word is zero, so the rCHR stores below
   write zeros.  */
Packit 6c4009
L(zloopstart):
Packit 6c4009
/* If the remaining length is less than 32 bytes, don't bother getting
Packit 6c4009
	 the cache line size.  */
Packit 6c4009
	beq	L(medium)		/* cr0 from the clrrwi. in L(caligned).  */
Packit 6c4009
	li      rCLS,128  /* cache line size is 128 */
Packit 6c4009
	dcbt	0,rMEMP
Packit 6c4009
/* Store 32 bytes at a time until rMEMP is 128-byte aligned or fewer than
   32 bytes remain.  */
L(getCacheAligned):
Packit 6c4009
	cmplwi	cr1,rLEN,32
Packit 6c4009
	andi.	rTMP,rMEMP,127		/* cr0.eq if rMEMP is 128-byte aligned.  */
Packit 6c4009
	blt	cr1,L(handletail32)
Packit 6c4009
	beq	L(cacheAligned)
Packit 6c4009
	addi	rMEMP,rMEMP,32		/* Advance first; the stores use negative offsets.  */
Packit 6c4009
	addi	rLEN,rLEN,-32
Packit 6c4009
	stw	rCHR,-32(rMEMP)
Packit 6c4009
        stw     rCHR,-28(rMEMP)
Packit 6c4009
	stw	rCHR,-24(rMEMP)
Packit 6c4009
	stw     rCHR,-20(rMEMP)
Packit 6c4009
	stw	rCHR,-16(rMEMP)
Packit 6c4009
        stw     rCHR,-12(rMEMP)
Packit 6c4009
	stw	rCHR,-8(rMEMP)
Packit 6c4009
        stw     rCHR,-4(rMEMP)
Packit 6c4009
	b	L(getCacheAligned)
Packit 6c4009
Packit 6c4009
/* Now we are aligned to the cache line and can use dcbz.  */
Packit 6c4009
        .align 4
Packit 6c4009
L(cacheAligned):
Packit 6c4009
	cmplw	cr1,rLEN,rCLS
Packit 6c4009
	blt	cr1,L(handletail32)	/* Less than one full line remains.  */
Packit 6c4009
	dcbz	0,rMEMP			/* Zero the cache block at rMEMP (taken to be
					   rCLS bytes).  */
Packit 6c4009
	subf	rLEN,rCLS,rLEN		/* rLEN -= rCLS.  */
Packit 6c4009
	add	rMEMP,rMEMP,rCLS
Packit 6c4009
	b	L(cacheAligned)
Packit 6c4009
Packit 6c4009
/* We are here because the cache line size was set and the remainder
Packit 6c4009
  (rLEN) is less than the actual cache line size.
Packit 6c4009
   So set up the preconditions for L(nondcbz) and go there.
   Only multiples of 32 were subtracted from rLEN on this path, so the
   low length bits loaded into CR field 7 at L(caligned) still match.  */
Packit 6c4009
L(handletail32):
Packit 6c4009
	clrrwi.	rALIGN, rLEN, 5		/* rALIGN = rLEN & ~31; cr0.eq if no whole block.  */
Packit 6c4009
	b		L(nondcbz)
Packit 6c4009
Packit 6c4009
	.align 5
Packit 6c4009
L(small):
Packit 6c4009
/* Memset of 4 bytes or less.  Reached before rCHR is replicated; stb
   stores only its low byte.  rMEMP equals the original pointer here.  */
Packit 6c4009
	cmplwi	cr5, rLEN, 1
Packit 6c4009
	cmplwi	cr1, rLEN, 3
Packit 6c4009
	bltlr	cr5			/* n == 0: nothing to store.  */
Packit 6c4009
	stb	rCHR, 0(rMEMP)
Packit 6c4009
	beqlr	cr5			/* n == 1.  */
Packit 6c4009
	stb	rCHR, 1(rMEMP)
Packit 6c4009
	bltlr	cr1			/* n == 2.  */
Packit 6c4009
	stb	rCHR, 2(rMEMP)
Packit 6c4009
	beqlr	cr1			/* n == 3.  */
Packit 6c4009
	stb	rCHR, 3(rMEMP)		/* n == 4.  */
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Memset of 0-31 bytes.
   On entry rMEMP points at the start of the remaining region, rLEN is the
   remaining length and CR field 7 holds its low 4 bits (bit 31 = 1,
   bit 30 = 2, bit 29 = 4, bit 28 = 8).  The region is filled backward
   from its end: byte, halfword, word, 16 bytes, then 8 bytes, each only
   if the matching length bit is set.  */
Packit 6c4009
	.align 5
Packit 6c4009
L(medium):
Packit 6c4009
	cmplwi	cr1, rLEN, 16		/* cr1: is a 16-byte piece needed?  */
Packit 6c4009
/* Entry used by L(cloopdone), which has already set cr1.  */
L(medium_tail2):
Packit 6c4009
	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the end of the region.  */
Packit 6c4009
L(medium_tail):
Packit 6c4009
	bt-	31, L(medium_31t)	/* Odd length: one byte.  */
Packit 6c4009
	bt-	30, L(medium_30t)	/* 2's bit: one halfword.  */
Packit 6c4009
L(medium_30f):
Packit 6c4009
	bt-	29, L(medium_29t)	/* 4's bit: one word.  */
Packit 6c4009
L(medium_29f):
Packit 6c4009
	bge-	cr1, L(medium_27t)	/* rLEN >= 16: sixteen bytes.  */
Packit 6c4009
	bflr-	28			/* Return unless the 8's bit is set.  */
Packit 6c4009
        stw     rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
L(medium_31t):
Packit 6c4009
	stbu	rCHR, -1(rMEMP)		/* Update form: rMEMP -= 1.  */
Packit 6c4009
	bf-	30, L(medium_30f)
Packit 6c4009
L(medium_30t):
Packit 6c4009
	sthu	rCHR, -2(rMEMP)		/* Update form: rMEMP -= 2.  */
Packit 6c4009
	bf-	29, L(medium_29f)
Packit 6c4009
L(medium_29t):
Packit 6c4009
	stwu	rCHR, -4(rMEMP)		/* Update form: rMEMP -= 4.  */
Packit 6c4009
	blt-	cr1, L(medium_27f)	/* rLEN < 16: skip the 16-byte piece.  */
Packit 6c4009
L(medium_27t):
Packit 6c4009
        stw     rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
        stw     rCHR, -12(rMEMP)
Packit 6c4009
	stwu	rCHR, -16(rMEMP)	/* Update form: rMEMP -= 16.  */
Packit 6c4009
L(medium_27f):
Packit 6c4009
	bflr-	28			/* Return unless the 8's bit is set.  */
Packit 6c4009
L(medium_28t):
Packit 6c4009
        stw     rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
END (memset)
Packit 6c4009
libc_hidden_builtin_def (memset)