Blame sysdeps/powerpc/powerpc64/power6/memset.S

Packit 6c4009
/* Optimized 64-bit memset implementation for POWER6.
Packit 6c4009
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
Packit 6c4009
   Returns 's'.
Packit 6c4009
Packit 6c4009
   The memset is done in several sizes: byte (8 bits), halfword (16 bits),
Packit 6c4009
   word (32 bits), doubleword (64 bits) and 32-byte sector (256 bits).  There
Packit 6c4009
   is a special case for setting 128-byte cache lines to 0, to take advantage of the dcbz instruction.  */
Packit 6c4009
Packit 6c4009
#ifndef MEMSET
Packit 6c4009
# define MEMSET memset
Packit 6c4009
#endif
Packit 6c4009
	.machine power6
Packit 6c4009
/* Function entry for MEMSET, followed by the symbolic register names used
   throughout the routine.
   NOTE(review): the second ENTRY_TOCLESS argument is presumably the log2
   alignment of the entry point (2^7 = 128 bytes) -- confirm in sysdep.h.
   NOTE(review): CALL_MCOUNT is presumably the profiling hook and 3 the
   argument count -- confirm in sysdep.h.  */
ENTRY_TOCLESS (MEMSET, 7)
Packit 6c4009
	CALL_MCOUNT 3
Packit 6c4009
Packit 6c4009
#define rTMP	r0	/* Scratch: only written by andi. whose result is discarded (CR0 is what matters).  */
Packit 6c4009
#define rRTN	r3	/* Initial value of 1st argument.  */
Packit 6c4009
#define rMEMP0	r3	/* Original value of 1st arg.  */
Packit 6c4009
#define rCHR	r4	/* Char to set in each byte.  */
Packit 6c4009
#define rLEN	r5	/* Length of region to set.  */
Packit 6c4009
#define rMEMP	r6	/* Address at which we are storing.  */
Packit 6c4009
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
Packit 6c4009
#define rMEMP2	r8	/* Backward store pointer while aligning to 32 bytes; later holds the constant 128 used as a dcbz index.  */
Packit 6c4009
#define rMEMP3	r9	/* Alt mem pointer.  */
Packit 6c4009
/* Common entry point; __bzero also branches here after loading r4/r5.
   On entry rMEMP0 = destination, rCHR = fill value, rLEN = byte count.
   This part dispatches short requests to L(small) and otherwise stores
   the 1-7 leading bytes needed to reach a doubleword boundary.  */
L(_memset):
Packit 6c4009
/* Take care of case for size <= 8 (handled byte by byte at L(small)).  */
Packit 6c4009
	cmpldi	cr1, rLEN, 8
Packit 6c4009
	andi.	rALIGN, rMEMP0, 7	/* rALIGN = s & 7; CR0.eq if s is doubleword aligned.  */
Packit 6c4009
	mr	rMEMP, rMEMP0	/* Work on a copy; r3 keeps 's' for the return value.  */
Packit 6c4009
	ble	cr1, L(small)
Packit 6c4009
Packit 6c4009
/* Align to doubleword boundary.  */
Packit 6c4009
	cmpldi	cr5, rLEN, 31	/* Consumed at L(aligned); reflects the length before alignment bytes are subtracted.  */
Packit 6c4009
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
Packit 6c4009
	beq+	L(aligned2)	/* CR0 from the andi. above: nothing to align.  */
Packit 6c4009
	mtcrf	0x01, rMEMP0	/* CR7 (bits 28-31) = low 4 bits of the address.  */
Packit 6c4009
	subfic	rALIGN, rALIGN, 8	/* rALIGN = bytes up to the next doubleword boundary (1-7).  */
Packit 6c4009
	cror	28,30,31		/* Bit 28 = bit 30 | bit 31: set when s is not word aligned.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first doubleword boundary above s.  */
Packit 6c4009
	sub	rLEN, rLEN, rALIGN	/* The alignment bytes are stored below.  */
Packit 6c4009
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
Packit 6c4009
	bt	29, L(g4)	/* s & 4 set: s lies in the odd (second) word of the doubleword.  */
Packit 6c4009
/* Process the even word of doubleword.  Here s & 7 is 1, 2 or 3.  */
Packit 6c4009
	bf+	31, L(g2)	/* s even (s & 7 == 2): no leading byte.  */
Packit 6c4009
	stb	rCHR, 0(rMEMP0)
Packit 6c4009
	bt	30, L(g4x)	/* s & 7 == 3: that byte reached the word boundary.  */
Packit 6c4009
L(g2):
Packit 6c4009
	sth	rCHR, -6(rMEMP)	/* Halfword at offset 2 of the doubleword.  */
Packit 6c4009
L(g4x):
Packit 6c4009
	stw	rCHR, -4(rMEMP)	/* Odd word, ending at the doubleword boundary.  */
Packit 6c4009
	b	L(aligned)
Packit 6c4009
/* Process the odd word of doubleword.  Here s & 7 is 4, 5, 6 or 7.  */
Packit 6c4009
L(g4):
Packit 6c4009
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
Packit 6c4009
	bf+	31, L(g0)	/* s even (s & 7 == 6): only a halfword is needed.  */
Packit 6c4009
	stb	rCHR, 0(rMEMP0)
Packit 6c4009
	bt	30, L(aligned)	/* s & 7 == 7: that single byte completed the doubleword.  */
Packit 6c4009
L(g0):
Packit 6c4009
	sth	rCHR, -2(rMEMP)	/* Last halfword before the boundary; execution then falls through to L(aligned2).  */
Packit 6c4009
Packit 6c4009
/* Handle the case of size <= 31.  cr5 compared the length with 31 before
   any alignment bytes were subtracted from rLEN.  Longer requests are
   advanced to a 32-byte boundary with up to three doubleword stores.  */
Packit 6c4009
L(aligned2):
Packit 6c4009
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
Packit 6c4009
L(aligned):
Packit 6c4009
	mtcrf	0x01, rLEN	/* CR7 = low 4 bits of the remaining length; L(medium) tests these bits.  */
Packit 6c4009
	ble	cr5, L(medium)
Packit 6c4009
/* Align to 32-byte boundary.  */
Packit 6c4009
	andi.	rALIGN, rMEMP, 0x18	/* Offset within the 32-byte block: 0, 8, 16 or 24.  */
Packit 6c4009
	subfic	rALIGN, rALIGN, 0x20	/* Bytes up to the next 32-byte boundary.  */
Packit 6c4009
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
Packit 6c4009
	beq	L(caligned)	/* CR0 from the andi.: already 32-byte aligned.  */
Packit 6c4009
	mtcrf	0x01, rALIGN	/* Bit 28 = the 8s bit of rALIGN (set for 8 and 24).  This overwrites the rLEN bits in CR7.  */
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = the 32-byte boundary.  */
Packit 6c4009
	sub	rLEN, rLEN, rALIGN
Packit 6c4009
	cmplwi	cr1, rALIGN, 0x10
Packit 6c4009
	mr	rMEMP2, rMEMP	/* Stores run backwards from the boundary through rMEMP2.  */
Packit 6c4009
	bf	28, L(a1)
Packit 6c4009
	stdu	rCHR, -8(rMEMP2)	/* One doubleword; rMEMP2 moves down by 8.  */
Packit 6c4009
L(a1):	blt	cr1, L(a2)	/* rALIGN < 16: nothing further to store.  */
Packit 6c4009
	std	rCHR, -8(rMEMP2)
Packit 6c4009
	stdu	rCHR, -16(rMEMP2)
Packit 6c4009
L(a2):
Packit 6c4009
Packit 6c4009
/* Now aligned to a 32 byte boundary.  Choose between the zero-fill path
   (dcbz), the non-zero path, and the short tail.  */
Packit 6c4009
        .align 4
Packit 6c4009
L(caligned):
Packit 6c4009
	cmpldi	cr1, rCHR, 0	/* cr1.eq: the replicated fill pattern is zero.  */
Packit 6c4009
	clrrdi.	rALIGN, rLEN, 5	/* rALIGN = rLEN with the low 5 bits cleared; CR0.eq if no whole 32-byte block remains.  */
Packit 6c4009
	mtcrf	0x01, rLEN	/* CR7 = low 4 bits of rLEN (the 32-byte alignment code may have replaced them with rALIGN).  */
Packit 6c4009
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
Packit 6c4009
	beq	L(medium)	/* We may not actually get to do a full line.  */
Packit 6c4009
	.align 4
Packit 6c4009
/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
Packit 6c4009
   boundary but may not be at cache line (128-byte) boundary.  */
Packit 6c4009
L(nzloopstart):
Packit 6c4009
/* memset in 32-byte chunks until we get to a cache line boundary.
Packit 6c4009
   If fewer than 128 bytes remain, use the
Packit 6c4009
   cacheAligned1 code to finish the tail.  */
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
Packit 6c4009
	andi.	rTMP,rMEMP,127	/* CR0.eq if rMEMP is already 128-byte aligned.  */
Packit 6c4009
	blt	cr1,L(cacheAligned1)
Packit 6c4009
	addi	rMEMP3,rMEMP,32	/* rMEMP3 = start of the second 32-byte chunk; it stays fixed below.  */
Packit 6c4009
	beq	L(nzCacheAligned)
Packit 6c4009
/* First 32-byte chunk.  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP)
Packit 6c4009
	std	rCHR,8(rMEMP)
Packit 6c4009
	std	rCHR,16(rMEMP)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	andi.	rTMP,rMEMP3,127	/* Is the next chunk on a cache-line boundary?  */
Packit 6c4009
	std	rCHR,-8(rMEMP3)	/* Offset 24 of the first chunk.  */
Packit 6c4009
Packit 6c4009
	beq	L(nzCacheAligned)
Packit 6c4009
/* Second 32-byte chunk.  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP3)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	std	rCHR,8(rMEMP3)
Packit 6c4009
	andi.	rTMP,rMEMP,127
Packit 6c4009
	std	rCHR,16(rMEMP3)
Packit 6c4009
	std	rCHR,24(rMEMP3)
Packit 6c4009
Packit 6c4009
	beq	L(nzCacheAligned)
Packit 6c4009
/* Third 32-byte chunk.  A 32-byte-aligned address is at most 96 bytes
   below a 128-byte boundary, so rMEMP is cache-line aligned after it.  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,32(rMEMP3)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	std	rCHR,40(rMEMP3)
Packit 6c4009
	cmpldi	cr6,rLEN,256	/* NOTE(review): cr6 and rMEMP2 do not appear to be read on the non-zero path.  */
Packit 6c4009
	li	rMEMP2,128
Packit 6c4009
	std	rCHR,48(rMEMP3)
Packit 6c4009
	std	rCHR,56(rMEMP3)
Packit 6c4009
	blt	cr1,L(cacheAligned1)	/* Less than one full cache line left.  */
Packit 6c4009
	b	L(nzCacheAligned128)
Packit 6c4009
Packit 6c4009
/* Now we are aligned to the cache line and can use dcbtst.
   L(nzCacheAligned) re-checks that a full 128-byte line remains;
   L(nzCacheAligned128) then stores one line (16 doublewords) per
   iteration.  */
Packit 6c4009
        .align 4
Packit 6c4009
L(nzCacheAligned):
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	blt	cr1,L(cacheAligned1)
Packit 6c4009
	b	L(nzCacheAligned128)
Packit 6c4009
        .align 5
Packit 6c4009
L(nzCacheAligned128):
Packit 6c4009
	cmpldi	cr1,rLEN,256	/* Tested at the loop bottom: rLEN >= 256 here means >= 128 after this line.  */
Packit 6c4009
	addi	rMEMP3,rMEMP,64	/* rMEMP3 = second half of the line.  */
Packit 6c4009
	std	rCHR,0(rMEMP)
Packit 6c4009
	std	rCHR,8(rMEMP)
Packit 6c4009
	std	rCHR,16(rMEMP)
Packit 6c4009
	std	rCHR,24(rMEMP)
Packit 6c4009
	std	rCHR,32(rMEMP)
Packit 6c4009
	std	rCHR,40(rMEMP)
Packit 6c4009
	std	rCHR,48(rMEMP)
Packit 6c4009
	std	rCHR,56(rMEMP)
Packit 6c4009
	addi	rMEMP,rMEMP3,64	/* rMEMP advances by 128 in total.  */
Packit 6c4009
	addi	rLEN,rLEN,-128
Packit 6c4009
	std	rCHR,0(rMEMP3)
Packit 6c4009
	std	rCHR,8(rMEMP3)
Packit 6c4009
	std	rCHR,16(rMEMP3)
Packit 6c4009
	std	rCHR,24(rMEMP3)
Packit 6c4009
	std	rCHR,32(rMEMP3)
Packit 6c4009
	std	rCHR,40(rMEMP3)
Packit 6c4009
	std	rCHR,48(rMEMP3)
Packit 6c4009
	std	rCHR,56(rMEMP3)
Packit 6c4009
	bge	cr1,L(nzCacheAligned128)
Packit 6c4009
	dcbtst	0,rMEMP	/* Touch-for-store hint on the line holding the tail.  */
Packit 6c4009
	b	L(cacheAligned1)
Packit 6c4009
	.align 5
Packit 6c4009
/* Storing a zero "c" value. We are aligned at a sector (32-byte)
Packit 6c4009
   boundary but may not be at cache line (128-byte) boundary.  If the
Packit 6c4009
   remaining length spans a full cache line we can use the Data cache
Packit 6c4009
   block zero instruction. */
Packit 6c4009
L(zloopstart):
Packit 6c4009
/* memset in 32-byte chunks until we get to a cache line boundary.
Packit 6c4009
   If fewer than 128 bytes remain, use the
Packit 6c4009
   cacheAligned1 code to finish the tail.  */
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	beq	L(medium)	/* CR0 still comes from the clrrdi. at L(caligned): no whole 32-byte block.  */
Packit 6c4009
L(getCacheAligned):
Packit 6c4009
	andi.	rTMP,rMEMP,127	/* CR0.eq if rMEMP is already 128-byte aligned.  */
Packit 6c4009
	nop
Packit 6c4009
	blt	cr1,L(cacheAligned1)
Packit 6c4009
	addi	rMEMP3,rMEMP,32	/* rMEMP3 = start of the second 32-byte chunk; it stays fixed below.  */
Packit 6c4009
	beq	L(cacheAligned)
Packit 6c4009
/* First 32-byte chunk (rCHR is zero on this path).  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP)
Packit 6c4009
	std	rCHR,8(rMEMP)
Packit 6c4009
	std	rCHR,16(rMEMP)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	andi.	rTMP,rMEMP3,127
Packit 6c4009
	std	rCHR,-8(rMEMP3)	/* Offset 24 of the first chunk.  */
Packit 6c4009
L(getCacheAligned2):
Packit 6c4009
	beq	L(cacheAligned)
Packit 6c4009
/* Second 32-byte chunk.  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP3)
Packit 6c4009
	std	rCHR,8(rMEMP3)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	andi.	rTMP,rMEMP,127
Packit 6c4009
	std	rCHR,16(rMEMP3)
Packit 6c4009
	std	rCHR,24(rMEMP3)
Packit 6c4009
L(getCacheAligned3):
Packit 6c4009
	beq	L(cacheAligned)
Packit 6c4009
/* Third 32-byte chunk; rMEMP is cache-line aligned afterwards.  The
   compares and rMEMP2 = 128 set up the state L(cacheAlignedx) expects.  */
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,32(rMEMP3)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	std	rCHR,40(rMEMP3)
Packit 6c4009
	cmpldi	cr6,rLEN,256
Packit 6c4009
	li	rMEMP2,128
Packit 6c4009
	std	rCHR,48(rMEMP3)
Packit 6c4009
	std	rCHR,56(rMEMP3)
Packit 6c4009
	blt	cr1,L(cacheAligned1)	/* Less than one full line left.  */
Packit 6c4009
	blt	cr6,L(cacheAligned128)	/* Exactly one full line fits.  */
Packit 6c4009
	b	L(cacheAlignedx)
Packit 6c4009
Packit 6c4009
/* Now we are aligned to the cache line and can use dcbz.
   Dispatch on the remaining length:
     < 128        -> L(cacheAligned1)   (32-byte stores)
     128 .. 255   -> L(cacheAligned128) (one dcbz)
     256 .. 640   -> two dcbz here, then continue by length
     > 640        -> L(cacheAligned512) (simple one-line loop)
   L(cacheAlignedx) expects cr6 = rLEN compared with 256 and rMEMP2 = 128.  */
Packit 6c4009
        .align 5
Packit 6c4009
L(cacheAligned):
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	cmpldi	cr6,rLEN,256
Packit 6c4009
	blt	cr1,L(cacheAligned1)
Packit 6c4009
	li	rMEMP2,128
Packit 6c4009
L(cacheAlignedx):
Packit 6c4009
	cmpldi	cr5,rLEN,640
Packit 6c4009
	blt	cr6,L(cacheAligned128)
Packit 6c4009
	bgt	cr5,L(cacheAligned512)
Packit 6c4009
	cmpldi	cr6,rLEN,512	/* Both compares use rLEN before the 256 is subtracted.  */
Packit 6c4009
	dcbz	0,rMEMP	/* Zero the line at rMEMP.  */
Packit 6c4009
	cmpldi	cr1,rLEN,384
Packit 6c4009
	dcbz	rMEMP2,rMEMP	/* Zero the line at rMEMP + 128.  */
Packit 6c4009
	addi	rMEMP,rMEMP,256
Packit 6c4009
	addi	rLEN,rLEN,-256
Packit 6c4009
	blt	cr1,L(cacheAligned1)	/* Was < 384: fewer than 128 bytes left.  */
Packit 6c4009
	blt	cr6,L(cacheAligned128)	/* Was < 512: 128-255 bytes left.  */
Packit 6c4009
	b	L(cacheAligned256)
Packit 6c4009
	.align 5
Packit 6c4009
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
Packit 6c4009
   the branch mispredictions to exactly 1 at loop exit.  */
Packit 6c4009
L(cacheAligned512):
Packit 6c4009
	cmpldi	cr1,rLEN,128
Packit 6c4009
	blt	cr1,L(cacheAligned1)	/* Loop exit: less than one full line left.  */
Packit 6c4009
	dcbz	0,rMEMP	/* Zero one 128-byte line.  */
Packit 6c4009
	addi	rLEN,rLEN,-128
Packit 6c4009
	addi	rMEMP,rMEMP,128
Packit 6c4009
	b	L(cacheAligned512)
Packit 6c4009
        .align 5
Packit 6c4009
/* Zero two cache lines (256 bytes) per iteration while at least 256
   bytes remain afterwards, then fall into L(cacheAligned128) for one
   more line if at least 128 bytes are left.  rMEMP2 holds 128.  */
L(cacheAligned256):
Packit 6c4009
Packit 6c4009
	cmpldi	cr6,rLEN,512	/* Uses rLEN before the subtraction: >= 512 means >= 256 afterwards.  */
Packit 6c4009
Packit 6c4009
	dcbz	0,rMEMP
Packit 6c4009
	cmpldi	cr1,rLEN,384	/* < 384 before means < 128 afterwards.  */
Packit 6c4009
	dcbz	rMEMP2,rMEMP	/* Line at rMEMP + 128.  */
Packit 6c4009
	addi	rMEMP,rMEMP,256
Packit 6c4009
	addi	rLEN,rLEN,-256
Packit 6c4009
Packit 6c4009
	bge	cr6,L(cacheAligned256)
Packit 6c4009
Packit 6c4009
	blt	cr1,L(cacheAligned1)
Packit 6c4009
        .align 4
Packit 6c4009
/* Zero exactly one cache line, then fall through to L(cacheAligned1).  */
L(cacheAligned128):
Packit 6c4009
	dcbz	0,rMEMP
Packit 6c4009
	addi	rMEMP,rMEMP,128
Packit 6c4009
	addi	rLEN,rLEN,-128
Packit 6c4009
        nop
Packit 6c4009
/* Store up to three 32-byte chunks with doubleword stores, leaving the
   loop as soon as fewer than 32 bytes remain.  Used for both the zero and
   the non-zero fill; rCHR holds the replicated pattern.  After the third
   chunk execution falls through to L(handletail32).  */
L(cacheAligned1):
Packit 6c4009
	cmpldi	cr1,rLEN,32
Packit 6c4009
	blt	cr1,L(handletail32)
Packit 6c4009
	addi	rMEMP3,rMEMP,32	/* rMEMP3 = start of the second chunk; it stays fixed below.  */
Packit 6c4009
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP)
Packit 6c4009
	std	rCHR,8(rMEMP)
Packit 6c4009
	std	rCHR,16(rMEMP)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	cmpldi	cr1,rLEN,32
Packit 6c4009
	std	rCHR,-8(rMEMP3)	/* Offset 24 of the first chunk.  */
Packit 6c4009
L(cacheAligned2):
Packit 6c4009
	blt	cr1,L(handletail32)
Packit 6c4009
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,0(rMEMP3)
Packit 6c4009
	std	rCHR,8(rMEMP3)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	cmpldi	cr1,rLEN,32
Packit 6c4009
	std	rCHR,16(rMEMP3)
Packit 6c4009
	std	rCHR,24(rMEMP3)
Packit 6c4009
	nop
Packit 6c4009
L(cacheAligned3):
Packit 6c4009
	blt	cr1,L(handletail32)
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	addi	rLEN,rLEN,-32
Packit 6c4009
	std	rCHR,32(rMEMP3)
Packit 6c4009
	std	rCHR,40(rMEMP3)
Packit 6c4009
	std	rCHR,48(rMEMP3)
Packit 6c4009
	std	rCHR,56(rMEMP3)
Packit 6c4009
Packit 6c4009
/* We are here because the length or remainder (rLEN) is less than the
Packit 6c4009
   cache line/sector size and does not justify aggressive loop unrolling.
Packit 6c4009
   So set up the preconditions for L(medium) and go there.  Returns
   immediately when nothing is left to store.  */
Packit 6c4009
        .align 3
Packit 6c4009
L(handletail32):
Packit 6c4009
	cmpldi	cr1,rLEN,0
Packit 6c4009
	beqlr   cr1	/* Nothing left: return with 's' still in r3.  */
Packit 6c4009
	b	L(medium)
Packit 6c4009
Packit 6c4009
	.align 5
Packit 6c4009
L(small):
Packit 6c4009
/* Memset of 8 bytes or less, done entirely with byte stores.  Lengths
   5-8 store four bytes first and then share the 0-4 byte code at L(le4).  */
Packit 6c4009
	cmpldi	cr6, rLEN, 4
Packit 6c4009
	cmpldi	cr5, rLEN, 1
Packit 6c4009
	ble	cr6,L(le4)
Packit 6c4009
	subi	rLEN, rLEN, 4	/* 1-4 bytes remain after the four stores below.  */
Packit 6c4009
	stb	rCHR,0(rMEMP)
Packit 6c4009
	stb	rCHR,1(rMEMP)
Packit 6c4009
	stb	rCHR,2(rMEMP)
Packit 6c4009
	stb	rCHR,3(rMEMP)
Packit 6c4009
	addi	rMEMP,rMEMP, 4
Packit 6c4009
	cmpldi	cr5, rLEN, 1	/* Redo the compare for the reduced length.  */
Packit 6c4009
L(le4):
Packit 6c4009
	cmpldi	cr1, rLEN, 3
Packit 6c4009
	bltlr	cr5	/* rLEN == 0.  */
Packit 6c4009
	stb	rCHR, 0(rMEMP)
Packit 6c4009
	beqlr	cr5	/* rLEN == 1.  */
Packit 6c4009
	stb	rCHR, 1(rMEMP)
Packit 6c4009
	bltlr	cr1	/* rLEN == 2.  */
Packit 6c4009
	stb	rCHR, 2(rMEMP)
Packit 6c4009
	beqlr	cr1	/* rLEN == 3.  */
Packit 6c4009
	stb	rCHR, 3(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Memset of 0-31 bytes.
   Preconditions: rMEMP = start of the region, rLEN = its length, rCHR
   replicated at least to a word, and CR7 (bits 28-31) = low 4 bits of
   rLEN.  rMEMP is first moved to the END of the region; the stores then
   run backwards: 1 byte if bit 31, 2 bytes if bit 30, 4 bytes if bit 29,
   16 bytes if rLEN >= 16, and 8 bytes if bit 28.  */
Packit 6c4009
	.align 5
Packit 6c4009
L(medium):
Packit 6c4009
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
Packit 6c4009
	cmpldi	cr1, rLEN, 16
Packit 6c4009
L(medium_tail2):
Packit 6c4009
	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the last byte to set.  */
Packit 6c4009
L(medium_tail):
Packit 6c4009
	bt-	31, L(medium_31t)
Packit 6c4009
	bt-	30, L(medium_30t)
Packit 6c4009
L(medium_30f):
Packit 6c4009
	bt	29, L(medium_29t)
Packit 6c4009
L(medium_29f):
Packit 6c4009
	bge	cr1, L(medium_27t)
Packit 6c4009
	bflr	28	/* No 8-byte piece: done.  */
Packit 6c4009
	std	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
L(medium_31t):
Packit 6c4009
	stbu	rCHR, -1(rMEMP)	/* Store one byte and move rMEMP down by 1.  */
Packit 6c4009
	bf-	30, L(medium_30f)
Packit 6c4009
L(medium_30t):
Packit 6c4009
	sthu	rCHR, -2(rMEMP)	/* Store a halfword and move rMEMP down by 2.  */
Packit 6c4009
	bf-	29, L(medium_29f)
Packit 6c4009
L(medium_29t):
Packit 6c4009
	stwu	rCHR, -4(rMEMP)	/* Store a word and move rMEMP down by 4.  */
Packit 6c4009
	blt	cr1, L(medium_27f)
Packit 6c4009
L(medium_27t):
Packit 6c4009
	std	rCHR, -8(rMEMP)
Packit 6c4009
	stdu	rCHR, -16(rMEMP)	/* Second doubleword of the 16-byte piece; rMEMP moves down by 16.  */
Packit 6c4009
L(medium_27f):
Packit 6c4009
	bflr	28	/* No 8-byte piece: done.  */
Packit 6c4009
L(medium_28t):
Packit 6c4009
	std	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
END_GEN_TB (MEMSET,TB_TOCLESS)
Packit 6c4009
libc_hidden_builtin_def (memset)
Packit 6c4009
Packit 6c4009
/* Copied from bzero.S to prevent the linker from inserting a stub
Packit 6c4009
   between bzero and memset.  */
Packit 6c4009
/* __bzero (s [r3], n [r4]): rearrange the arguments into the memset
   layout (s [r3], c = 0 [r4], n [r5]) and branch to the shared body at
   L(_memset).  The length must be copied to r5 before r4 is cleared.  */
ENTRY_TOCLESS (__bzero)
Packit 6c4009
	CALL_MCOUNT 3
Packit 6c4009
	mr	r5,r4	/* r5 = n.  */
Packit 6c4009
	li	r4,0	/* r4 = fill value 0.  */
Packit 6c4009
	b	L(_memset)
Packit 6c4009
END (__bzero)
Packit 6c4009
#ifndef __bzero
Packit 6c4009
weak_alias (__bzero, bzero)
Packit 6c4009
#endif