Blame sysdeps/powerpc/powerpc32/memset.S

Packit 6c4009
/* Optimized memset implementation for PowerPC.
Packit 6c4009
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
Packit 6c4009
   Returns 's'.
Packit 6c4009
Packit 6c4009
   The memset is done in four sizes: byte (8 bits), word (32 bits),
Packit 6c4009
   32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
Packit 6c4009
   There is a special case for setting whole cache lines to 0, which
Packit 6c4009
   takes advantage of the dcbz instruction.  */
Packit 6c4009
Packit 6c4009
	.section	".text"
Packit 6c4009
EALIGN (memset, 5, 1)
Packit 6c4009
Packit 6c4009
/* NOTE(review): EALIGN is not defined in this file (presumably it comes
   from sysdep.h and opens the function on a 2^5-byte boundary -- confirm).
   The "Nth instruction from .align" remarks further down appear to count
   instructions from that entry alignment.

   Register aliases.  r3 is never written by any instruction visible in
   this function, so it still holds 's' at every return.  Note that r7,
   r8 and r9 each carry several aliases: writing one alias destroys the
   value held under the other names of the same register.  */
#define rTMP	r0
Packit 6c4009
#define rRTN	r3	/* initial value of 1st argument */
Packit 6c4009
#define rMEMP0	r3	/* original value of 1st arg */
Packit 6c4009
#define rCHR	r4	/* char to set in each byte */
Packit 6c4009
#define rLEN	r5	/* length of region to set */
Packit 6c4009
#define rMEMP	r6	/* address at which we are storing */
Packit 6c4009
#define rALIGN	r7	/* number of bytes we are setting now (when aligning) */
Packit 6c4009
#define rMEMP2	r8
Packit 6c4009
Packit 6c4009
#define rPOS32	r7	/* constant +32 for clearing with dcbz */
Packit 6c4009
#define rNEG64	r8	/* constant -64 for clearing with dcbz */
Packit 6c4009
#define rNEG32	r9	/* constant -32 for clearing with dcbz */
Packit 6c4009
Packit 6c4009
#define rGOT	r9	/* Address of the Global Offset Table.  */
Packit 6c4009
#define rCLS	r8	/* Cache line size obtained from static.  */
Packit 6c4009
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
Packit 6c4009
Packit 6c4009
/* take care of case for size <= 4  */
Packit 6c4009
	cmplwi	cr1, rLEN, 4
Packit 6c4009
/* rALIGN = s & 3.  The record form sets cr0.eq when 's' is already word
   aligned; that result is consumed by the beq+ below (the instructions
   in between do not write cr0).  */
	andi.	rALIGN, rMEMP0, 3
Packit 6c4009
	mr	rMEMP, rMEMP0
Packit 6c4009
	ble-	cr1, L(small)
Packit 6c4009
/* align to word boundary  */
Packit 6c4009
/* cr5 records the original length compared with 31; it is tested by the
   ble at L(aligned).  */
	cmplwi	cr5, rLEN, 31
Packit 6c4009
/* Insert the low byte of rCHR into the byte above it, so the low
   halfword of rCHR holds two copies of the fill byte.  */
	rlwimi	rCHR, rCHR, 8, 16, 23
Packit 6c4009
	beq+	L(aligned)	/* 8th instruction from .align */
Packit 6c4009
/* Copy the low four bits of 's' into CR field 7: CR bit 31 is (s & 1),
   CR bit 30 is (s & 2).  */
	mtcrf	0x01, rMEMP0
Packit 6c4009
/* rALIGN = 4 - (s & 3): bytes needed to reach the next word boundary.  */
	subfic	rALIGN, rALIGN, 4
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN
Packit 6c4009
	sub	rLEN, rLEN, rALIGN
Packit 6c4009
/* Even start address: no single byte is needed, go store a halfword.  */
	bf+	31, L(g0)
Packit 6c4009
	stb	rCHR, 0(rMEMP0)
Packit 6c4009
/* Start address was 3 mod 4: the one byte just stored was enough.  */
	bt	30, L(aligned)
Packit 6c4009
/* Store the halfword that ends at the word boundary now in rMEMP.  */
L(g0):	sth	rCHR, -2(rMEMP)	/* 16th instruction from .align */
Packit 6c4009
/* take care of case for size <= 31 (cr5 was set from the original length
   by the cmplwi against 31 before this label)  */
Packit 6c4009
L(aligned):
Packit 6c4009
/* CR field 7 = low four bits of the remaining length; L(medium) decides
   which 1/2/4/8-byte stores to issue from these bits.  */
	mtcrf	0x01, rLEN
Packit 6c4009
/* Insert the low halfword of rCHR into the high halfword: rCHR now holds
   the fill byte in all four byte positions.  */
	rlwimi	rCHR, rCHR, 16, 0, 15
Packit 6c4009
	ble	cr5, L(medium)
Packit 6c4009
/* align to cache line boundary...  */
Packit 6c4009
/* The mask 0x1C selects the word offset of rMEMP inside a 32-byte block;
   cr0.eq (tested by the beq below, subfic does not write cr0) means rMEMP
   is already on a 32-byte boundary.  Otherwise rALIGN = 32 - offset, a
   multiple of 4 in the range 4..28.  */
	andi.	rALIGN, rMEMP, 0x1C
Packit 6c4009
	subfic	rALIGN, rALIGN, 0x20
Packit 6c4009
	beq	L(caligned)
Packit 6c4009
/* CR field 7 = low four bits of rALIGN: CR bit 28 is the 8s bit, CR bit
   29 the 4s bit.  The 16s bit is not part of that field, so it is tested
   through cr1 instead.  This overwrites the length bits loaded above;
   L(caligned) reloads them.  */
	mtcrf	0x01, rALIGN
Packit 6c4009
	add	rMEMP, rMEMP, rALIGN
Packit 6c4009
	sub	rLEN, rLEN, rALIGN
Packit 6c4009
	cmplwi	cr1, rALIGN, 0x10
Packit 6c4009
/* rMEMP2 starts at the 32-byte boundary and walks downwards as the
   stwu instructions below fill the gap; rMEMP itself stays put.  */
	mr	rMEMP2, rMEMP
Packit 6c4009
	bf	28, L(a1)
Packit 6c4009
	stw	rCHR, -4(rMEMP2)
Packit 6c4009
	stwu	rCHR, -8(rMEMP2)
Packit 6c4009
L(a1):	blt	cr1, L(a2)
Packit 6c4009
	stw	rCHR, -4(rMEMP2) /* 32nd instruction from .align */
Packit 6c4009
	stw	rCHR, -8(rMEMP2)
Packit 6c4009
	stw	rCHR, -12(rMEMP2)
Packit 6c4009
	stwu	rCHR, -16(rMEMP2)
Packit 6c4009
L(a2):	bf	29, L(caligned)
Packit 6c4009
	stw	rCHR, -4(rMEMP2)
Packit 6c4009
/* now aligned to a cache line.  */
Packit 6c4009
L(caligned):
Packit 6c4009
/* "Aligned" here means a 32-byte boundary (the alignment code before
   this label masks the address with 0x1C).
   cr1.eq  : the replicated fill word is zero.
   rALIGN  : rLEN rounded down to a multiple of 32, i.e. the bytes that
             can be written as whole 32-byte blocks.
   cr0.eq  : rALIGN is zero (fewer than 32 bytes remain); tested later by
             the beq at L(nondcbz) / L(checklinesize).
   CR field 7 : low four bits of rLEN, for the tail code at L(medium).  */
	cmplwi	cr1, rCHR, 0
Packit 6c4009
	clrrwi.	rALIGN, rLEN, 5
Packit 6c4009
	mtcrf	0x01, rLEN	/* 40th instruction from .align */
Packit 6c4009
Packit 6c4009
/* Check if we can use the special case for clearing memory using dcbz.
Packit 6c4009
   This requires that we know the correct cache line size for this
Packit 6c4009
   processor.  Getting the __cache_line_size may require establishing GOT
Packit 6c4009
   addressability, so branch out of line to set this up.  */
Packit 6c4009
	beq	cr1, L(checklinesize)
Packit 6c4009
Packit 6c4009
/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
Packit 6c4009
   Can't assume that rCHR is zero or that the cache line size is either
Packit 6c4009
   32-bytes or even known.  */
Packit 6c4009
/* Expected on entry (set up at L(caligned) or L(handletail32)):
   rALIGN = rLEN rounded down to a multiple of 32, cr0.eq set when that
   is zero, rMEMP = start of the region still to be filled.  */
L(nondcbz):
Packit 6c4009
/* CTR = number of whole 32-byte blocks.  */
	srwi	rTMP, rALIGN, 5
Packit 6c4009
	mtctr	rTMP
Packit 6c4009
	beq	L(medium)	/* we may not actually get to do a full line */
Packit 6c4009
/* rLEN = tail bytes left after the whole blocks (rLEN & 31); cr0.eq is
   set when there is no tail and is tested by the beqlr after the last
   block (none of the stores, dcbtst, bdz/bdnz or the cr1 compare in
   between write cr0).  */
	clrlwi.	rLEN, rLEN, 27
Packit 6c4009
/* Point rMEMP at the end of the block region; the blocks are filled from
   the top down, each stwu lowering rMEMP by 32.  */
	add	rMEMP, rMEMP, rALIGN
Packit 6c4009
	li	rNEG64, -0x40
Packit 6c4009
/* Exactly one block: skip the loop, L(cloopdone) stores it.  */
	bdz	L(cloopdone)	/* 48th instruction from .align */
Packit 6c4009
Packit 6c4009
/* We can't use dcbz here as we don't know the cache line size.  We can
Packit 6c4009
   use "data cache block touch for store", which is safe.  */
Packit 6c4009
/* The touch address is rMEMP - 64, i.e. the block below the one this
   iteration fills (rMEMP - 32 .. rMEMP - 1).  */
L(c3):	dcbtst	rNEG64, rMEMP
Packit 6c4009
	stw	rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	stw	rCHR, -12(rMEMP)
Packit 6c4009
	stw	rCHR, -16(rMEMP)
Packit 6c4009
	nop			/* let 601 fetch last 4 instructions of loop */
Packit 6c4009
	stw	rCHR, -20(rMEMP)
Packit 6c4009
	stw	rCHR, -24(rMEMP) /* 56th instruction from .align */
Packit 6c4009
	nop			/* let 601 fetch first 8 instructions of loop */
Packit 6c4009
	stw	rCHR, -28(rMEMP)
Packit 6c4009
	stwu	rCHR, -32(rMEMP)
Packit 6c4009
	bdnz	L(c3)
Packit 6c4009
/* Final (lowest) 32-byte block, stored without a preceding touch.  */
L(cloopdone):
Packit 6c4009
	stw	rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	stw	rCHR, -12(rMEMP)
Packit 6c4009
	stw	rCHR, -16(rMEMP) /* 64th instruction from .align */
Packit 6c4009
	stw	rCHR, -20(rMEMP)
Packit 6c4009
/* cr1 = tail length compared with 16, as L(medium_tail2) expects.  */
	cmplwi	cr1, rLEN, 16
Packit 6c4009
	stw	rCHR, -24(rMEMP)
Packit 6c4009
	stw	rCHR, -28(rMEMP)
Packit 6c4009
	stwu	rCHR, -32(rMEMP)
Packit 6c4009
/* No tail bytes (cr0.eq from the clrlwi. above): done.  */
	beqlr
Packit 6c4009
/* rMEMP has walked back down to the start of the block region; move it
   to the end again, which is where the tail begins.  */
	add	rMEMP, rMEMP, rALIGN
Packit 6c4009
	b	L(medium_tail2)	/* 72nd instruction from .align */
Packit 6c4009
Packit 6c4009
	.align	5
Packit 6c4009
/* This nop sits between the alignment directive and the label, and it
   follows an unconditional branch, so it is never executed; it places
   L(zloopstart) one instruction past the aligned boundary.  */
	nop
Packit 6c4009
/* Clear cache lines of memory in 128-byte chunks.
Packit 6c4009
   This code is optimized for processors with 32-byte cache lines.
Packit 6c4009
   It is further optimized for the 601 processor, which requires
Packit 6c4009
   some care in how the code is aligned in the i-cache.  */
Packit 6c4009
/* Reached from L(checklinesize) when the fill value is zero and the
   loaded cache line size equals 32.  rALIGN holds the number of bytes to
   clear in whole 32-byte lines; rMEMP is 32-byte aligned.  */
L(zloopstart):
Packit 6c4009
/* rLEN = tail bytes (rLEN & 31).  */
	clrlwi	rLEN, rLEN, 27
Packit 6c4009
/* CR field 6 = bits 24-27 of rALIGN: CR bit 26 is the 32s bit and CR bit
   25 the 64s bit.  CR field 7 is left alone and still describes the low
   four bits of the length for the tail code.  */
	mtcrf	0x02, rALIGN
Packit 6c4009
/* CTR = number of 128-byte chunks; cr0.eq (tested at the beq after
   L(z1)) is set when there are none.  */
	srwi.	rTMP, rALIGN, 7
Packit 6c4009
	mtctr	rTMP
Packit 6c4009
/* rPOS32 shares r7 with rALIGN: rALIGN is not read again on this path.  */
	li	rPOS32, 0x20
Packit 6c4009
	li	rNEG64, -0x40
Packit 6c4009
	cmplwi	cr1, rLEN, 16	/* 8 */
Packit 6c4009
/* Odd 32 bytes: clear one line.  */
	bf	26, L(z0)
Packit 6c4009
	dcbz	0, rMEMP
Packit 6c4009
	addi	rMEMP, rMEMP, 0x20
Packit 6c4009
L(z0):	li	rNEG32, -0x20
Packit 6c4009
/* Odd 64 bytes: clear two lines.  */
	bf	25, L(z1)
Packit 6c4009
	dcbz	0, rMEMP
Packit 6c4009
	dcbz	rPOS32, rMEMP
Packit 6c4009
	addi	rMEMP, rMEMP, 0x40 /* 16 */
Packit 6c4009
/* cr5.eq = no tail bytes; used by the beqlr after the loop.  */
L(z1):	cmplwi	cr5, rLEN, 0
Packit 6c4009
	beq	L(medium)
Packit 6c4009
/* Each iteration clears four consecutive 32-byte lines: two addressed
   upward from the old rMEMP, two addressed back from the advanced
   rMEMP.  */
L(zloop):
Packit 6c4009
	dcbz	0, rMEMP
Packit 6c4009
	dcbz	rPOS32, rMEMP
Packit 6c4009
	addi	rMEMP, rMEMP, 0x80
Packit 6c4009
	dcbz	rNEG64, rMEMP
Packit 6c4009
	dcbz	rNEG32, rMEMP
Packit 6c4009
	bdnz	L(zloop)
Packit 6c4009
	beqlr	cr5
Packit 6c4009
/* cr1 (tail length compared with 16) was set above, as
   L(medium_tail2) expects.  */
	b	L(medium_tail2)
Packit 6c4009
Packit 6c4009
	.align	5
Packit 6c4009
L(small):
Packit 6c4009
/* Memset of 4 bytes or less.
   Reached from the entry code when n <= 4, with rMEMP == s.  Bytes are
   stored one at a time with stb, which writes the low byte of rCHR, so
   no replication of the fill byte is needed on this path.
   cr5 = n compared with 1, cr1 = n compared with 3; each conditional
   return below leaves as soon as n bytes have been written.  */
Packit 6c4009
	cmplwi	cr5, rLEN, 1
Packit 6c4009
	cmplwi	cr1, rLEN, 3
Packit 6c4009
/* n == 0: nothing to store.  */
	bltlr	cr5
Packit 6c4009
	stb	rCHR, 0(rMEMP)
Packit 6c4009
/* n == 1.  */
	beqlr	cr5
Packit 6c4009
/* NOTE(review): the purpose of the nops in this block is not stated in
   the file; presumably scheduling or fetch-alignment padding.  */
	nop
Packit 6c4009
	stb	rCHR, 1(rMEMP)
Packit 6c4009
/* n == 2.  */
	bltlr	cr1
Packit 6c4009
	stb	rCHR, 2(rMEMP)
Packit 6c4009
/* n == 3.  */
	beqlr	cr1
Packit 6c4009
	nop
Packit 6c4009
	stb	rCHR, 3(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Memset of 0-31 bytes.  */
Packit 6c4009
/* Tail writer.  Expected on entry:
     rMEMP      = first byte still to be filled,
     rLEN       = number of bytes to fill (0..31),
     rCHR       = fill byte replicated across the word,
     CR field 7 = low four bits of rLEN (CR bit 31 = 1s, 30 = 2s,
                  29 = 4s, 28 = 8s),
     cr1        = rLEN compared with 16 (L(medium) computes it; callers
                  of L(medium_tail2) must have set it themselves).
   rMEMP is moved to the end of the region and the pieces are stored
   from the top down, smallest first: 1, 2, 4, 16, then 8 bytes.  The
   update-form stores lower rMEMP as they go.  The 16-byte step is keyed
   off cr1 because CR field 7 only carries four length bits.  */
	.align	5
Packit 6c4009
L(medium):
Packit 6c4009
	cmplwi	cr1, rLEN, 16
Packit 6c4009
L(medium_tail2):
Packit 6c4009
	add	rMEMP, rMEMP, rLEN
Packit 6c4009
/* L(medium_tail) is not the target of any branch visible in this file.  */
L(medium_tail):
Packit 6c4009
	bt-	31, L(medium_31t)
Packit 6c4009
	bt-	30, L(medium_30t)
Packit 6c4009
L(medium_30f):
Packit 6c4009
	bt-	29, L(medium_29t)
Packit 6c4009
L(medium_29f):
Packit 6c4009
	bge-	cr1, L(medium_27t)
Packit 6c4009
/* Fewer than 16 bytes left: return unless the 8s bit is set.  */
	bflr-	28
Packit 6c4009
	stw	rCHR, -4(rMEMP)	/* 8th instruction from .align */
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Out-of-line pieces: each stores its unit with an update-form store and
   then rejoins the chain above, or falls through to the next piece.  */
L(medium_31t):
Packit 6c4009
	stbu	rCHR, -1(rMEMP)
Packit 6c4009
	bf-	30, L(medium_30f)
Packit 6c4009
L(medium_30t):
Packit 6c4009
	sthu	rCHR, -2(rMEMP)
Packit 6c4009
	bf-	29, L(medium_29f)
Packit 6c4009
L(medium_29t):
Packit 6c4009
	stwu	rCHR, -4(rMEMP)
Packit 6c4009
	blt-	cr1, L(medium_27f) /* 16th instruction from .align */
Packit 6c4009
/* rLEN >= 16: store 16 bytes.  */
L(medium_27t):
Packit 6c4009
	stw	rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	stw	rCHR, -12(rMEMP)
Packit 6c4009
	stwu	rCHR, -16(rMEMP)
Packit 6c4009
L(medium_27f):
Packit 6c4009
	bflr-	28
Packit 6c4009
/* L(medium_28t) is only reached by falling through; no branch visible in
   this file targets it.  */
L(medium_28t):
Packit 6c4009
	stw	rCHR, -4(rMEMP)
Packit 6c4009
	stw	rCHR, -8(rMEMP)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
L(checklinesize):
Packit 6c4009
/* Reached from L(caligned) only when the replicated fill word is zero.
   cr0 still holds the result of the clrrwi. at L(caligned): cr0.eq means
   fewer than 32 bytes remain.  Loads __cache_line_size into rCLS and
   dispatches on it:
     0     -> L(nondcbz)    (size unknown, plain stores),
     32    -> L(zloopstart) (dcbz loop written for 32-byte lines),
     other -> falls through to the generic dcbz path that follows.  */
#ifdef SHARED
Packit 6c4009
/* Save LR in rTMP; it is restored by the mtlr below.
   NOTE(review): SETUP_GOT_ACCESS is defined outside this file and
   presumably overwrites LR -- confirm.  */
	mflr	rTMP
Packit 6c4009
/* If the remaining length is less than 32 bytes then don't bother getting
Packit 6c4009
   the cache line size.  */
Packit 6c4009
	beq	L(medium)
Packit 6c4009
/* Establishes GOT addressability so we can load __cache_line_size
Packit 6c4009
   from static. This value was set from the aux vector during startup.  */
Packit 6c4009
	SETUP_GOT_ACCESS(rGOT,got_label)
Packit 6c4009
/* Address __cache_line_size relative to got_label (high-adjusted part
   added here, low part used as the load displacement).  */
	addis	rGOT,rGOT,__cache_line_size-got_label@ha
Packit 6c4009
	lwz	rCLS,__cache_line_size-got_label@l(rGOT)
Packit 6c4009
	mtlr	rTMP
Packit 6c4009
#else
Packit 6c4009
/* Load __cache_line_size from static. This value was set from the
Packit 6c4009
   aux vector during startup.  */
Packit 6c4009
	lis	rCLS,__cache_line_size@ha
Packit 6c4009
/* If the remaining length is less than 32 bytes then don't bother getting
Packit 6c4009
   the cache line size.  */
Packit 6c4009
	beq	L(medium)
Packit 6c4009
	lwz	rCLS,__cache_line_size@l(rCLS)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* If the cache line size was not set then go to L(nondcbz), which is
Packit 6c4009
   safe for any cache line size.  */
Packit 6c4009
	cmplwi	cr1,rCLS,0
Packit 6c4009
	beq	cr1,L(nondcbz)
Packit 6c4009
Packit 6c4009
/* If the cache line size is 32 bytes then go to L(zloopstart),
Packit 6c4009
   which is coded specifically for 32-byte lines (and 601).  */
Packit 6c4009
	cmplwi	cr1,rCLS,32
Packit 6c4009
	beq	cr1,L(zloopstart)
Packit 6c4009
Packit 6c4009
/* Now we know the cache line size and it is not 32-bytes.  However
Packit 6c4009
   we may not yet be aligned to the cache line and may have a partial
Packit 6c4009
   line to fill.  Touch it 1st to fetch the cache line.  */
Packit 6c4009
	dcbtst	0,rMEMP
Packit 6c4009
Packit 6c4009
/* rCLM = line size - 1, used as an address mask below.
   NOTE(review): this is only a valid alignment mask if the line size is
   a power of two -- presumably guaranteed, confirm.  */
	addi	rCLM,rCLS,-1
Packit 6c4009
/* Loop: while at least 32 bytes remain and rMEMP is not on a cache line
   boundary, store one 32-byte block and re-test.  The fill word is zero
   on this path (it is only entered via L(checklinesize)).  */
L(getCacheAligned):
Packit 6c4009
	cmplwi	cr1,rLEN,32
Packit 6c4009
/* cr0.eq = rMEMP is cache line aligned.  */
	and.	rTMP,rCLM,rMEMP
Packit 6c4009
/* The length test comes first: with fewer than 32 bytes left, go to the
   tail handling whether or not rMEMP is aligned.  */
	blt	cr1,L(handletail32)
Packit 6c4009
	beq	L(cacheAligned)
Packit 6c4009
/* We are not aligned to start of a cache line yet.  Store 32-byte
Packit 6c4009
   of data and test again.  */
Packit 6c4009
	addi	rMEMP,rMEMP,32
Packit 6c4009
	addi	rLEN,rLEN,-32
Packit 6c4009
	stw	rCHR,-32(rMEMP)
Packit 6c4009
	stw	rCHR,-28(rMEMP)
Packit 6c4009
	stw	rCHR,-24(rMEMP)
Packit 6c4009
	stw	rCHR,-20(rMEMP)
Packit 6c4009
	stw	rCHR,-16(rMEMP)
Packit 6c4009
	stw	rCHR,-12(rMEMP)
Packit 6c4009
	stw	rCHR,-8(rMEMP)
Packit 6c4009
	stw	rCHR,-4(rMEMP)
Packit 6c4009
	b	L(getCacheAligned)
Packit 6c4009
Packit 6c4009
/* Now we are aligned to the cache line and can use dcbz.  */
Packit 6c4009
/* Loop: while at least one whole cache line (rCLS bytes) remains, clear
   the line at rMEMP with dcbz, then advance rMEMP and shrink rLEN by
   rCLS.  Leaves through L(handletail32) once rLEN < rCLS.  */
L(cacheAligned):
Packit 6c4009
	cmplw	cr1,rLEN,rCLS
Packit 6c4009
	blt	cr1,L(handletail32)
Packit 6c4009
	dcbz	0,rMEMP
Packit 6c4009
/* rLEN = rLEN - rCLS (subf subtracts its first source from its
   second).  */
	subf	rLEN,rCLS,rLEN
Packit 6c4009
	add	rMEMP,rMEMP,rCLS
Packit 6c4009
	b	L(cacheAligned)
Packit 6c4009
Packit 6c4009
/* We are here because the cache line size was set, it was not
Packit 6c4009
   32-bytes, and the remainder (rLEN) is now less than the actual cache
Packit 6c4009
   line size.  Set up the preconditions for L(nondcbz) and go there to
Packit 6c4009
   store the remaining bytes.  */
Packit 6c4009
/* This label is also reached from L(getCacheAligned) when fewer than 32
   bytes remain before a cache line boundary was found.
   The clrrwi. recomputes what L(nondcbz) reads: rALIGN = rLEN rounded
   down to a multiple of 32, and cr0.eq when that is zero.
   NOTE(review): the tail code reached through L(nondcbz) also uses CR
   field 7 as the low four bits of rLEN.  That field was loaded at
   L(caligned); it stays correct as long as every subtraction from rLEN
   since then (32, or rCLS) left the low four bits unchanged and
   SETUP_GOT_ACCESS does not write it -- presumably so, confirm.  */
L(handletail32):
Packit 6c4009
	clrrwi.	rALIGN, rLEN, 5
Packit 6c4009
	b	L(nondcbz)
Packit 6c4009
Packit 6c4009
END (memset)
Packit 6c4009
libc_hidden_builtin_def (memset)