Blame sysdeps/aarch64/memset.S

Packit 6c4009
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
Packit 6c4009
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include "memset-reg.h"
Packit 6c4009
Packit 6c4009
#ifndef MEMSET
Packit 6c4009
# define MEMSET memset
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Assumptions:
Packit 6c4009
 *
Packit 6c4009
 * ARMv8-a, AArch64, unaligned accesses
Packit 6c4009
 *
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/* MEMSET entry point: fill count bytes starting at dstin with the low
   byte of valw.  The register aliases used below (dstin, valw, val,
   count, dst, dstend, tmp1, tmp2, zva_len) and the ENTRY_ALIGN, DELOUSE
   and L macros are defined outside this file (memset-reg.h, sysdep.h).
   NOTE(review): no instruction visible in this file writes dstin, so the
   destination pointer is presumably also the return value -- confirm
   against the register mapping in memset-reg.h.

   Dispatch on size (unsigned compares):
     count > 96   -> L(set_long)
     count >= 16  -> L(set_medium)
     otherwise    -> the 0..15 byte code below.
   The short paths store from both ends of the buffer and let the stores
   overlap, so no per-byte loop is needed.  */
ENTRY_ALIGN (MEMSET, 6)
Packit 6c4009
Packit 6c4009
	/* NOTE(review): DELOUSE is defined elsewhere; presumably it
	   sanitizes argument registers 0 and 2 -- confirm in sysdep.h.  */
	DELOUSE (0)
Packit 6c4009
	DELOUSE (2)
Packit 6c4009
Packit 6c4009
	dup	v0.16B, valw	/* Replicate the fill byte into all 16 lanes of v0/q0.  */
Packit 6c4009
	add	dstend, dstin, count	/* dstend = one past the last byte to write.  */
Packit 6c4009
Packit 6c4009
	cmp	count, 96
Packit 6c4009
	b.hi	L(set_long)	/* count > 96.  */
Packit 6c4009
	cmp	count, 16
Packit 6c4009
	b.hs	L(set_medium)	/* 16 <= count <= 96.  */
Packit 6c4009
	mov	val, v0.D[0]	/* val = low 64 bits of v0, i.e. 8 copies of the byte.  */
Packit 6c4009
Packit 6c4009
	/* Set 0..15 bytes.  NOTE(review): valw is presumably the 32-bit
	   view of val, so it now holds 4 copies of the byte -- confirm.  */
Packit 6c4009
	tbz	count, 3, 1f	/* Bit 3 clear: count < 8.  */
Packit 6c4009
	str	val, [dstin]	/* count is 8..15: first 8 bytes ...  */
Packit 6c4009
	str	val, [dstend, -8]	/* ... and last 8 bytes (may overlap).  */
Packit 6c4009
	ret
Packit 6c4009
	nop	/* Padding only: follows a ret and nothing branches to it.  */
Packit 6c4009
1:	tbz	count, 2, 2f	/* Bit 2 clear: count < 4.  */
Packit 6c4009
	str	valw, [dstin]	/* count is 4..7: first 4 bytes ...  */
Packit 6c4009
	str	valw, [dstend, -4]	/* ... and last 4 bytes (may overlap).  */
Packit 6c4009
	ret
Packit 6c4009
2:	cbz	count, 3f	/* count == 0: nothing to store.  */
Packit 6c4009
	strb	valw, [dstin]	/* count is 1..3: first byte.  */
Packit 6c4009
	tbz	count, 1, 3f	/* Bit 1 clear: count == 1, done.  */
Packit 6c4009
	strh	valw, [dstend, -2]	/* count is 2 or 3: last 2 bytes.  */
Packit 6c4009
3:	ret
Packit 6c4009
Packit 6c4009
	/* Set 16..96 bytes (count == 16 also lands here, because the entry
	   code branches with b.hs after cmp count, 16).  The first 16 bytes
	   are always stored; bits 6 and 5 of count then select how many
	   further stores are needed.  */
Packit 6c4009
L(set_medium):
Packit 6c4009
	str	q0, [dstin]	/* Bytes 0..15.  */
Packit 6c4009
	tbnz	count, 6, L(set96)	/* Bit 6 set: count is 64..96.  */
Packit 6c4009
	str	q0, [dstend, -16]	/* Last 16 bytes; completes 16..31.  */
Packit 6c4009
	tbz	count, 5, 1f	/* Bit 5 clear: count is 16..31, done.  */
Packit 6c4009
	str	q0, [dstin, 16]	/* count is 32..63: bytes 16..31 ...  */
Packit 6c4009
	str	q0, [dstend, -32]	/* ... and the 16 bytes before the last 16.  */
Packit 6c4009
1:	ret
Packit 6c4009
Packit 6c4009
	.p2align 4	/* Start the block below on a 16-byte boundary.  */
Packit 6c4009
	/* Set 64..96 bytes.  Write 64 bytes from the start and
Packit 6c4009
	   32 bytes from the end.  */
Packit 6c4009
	/* Branched to from L(set_medium), which has already stored
	   bytes 0..15 at dstin.  */
L(set96):
Packit 6c4009
	str	q0, [dstin, 16]	/* Bytes 16..31.  */
Packit 6c4009
	stp	q0, q0, [dstin, 32]	/* Bytes 32..63.  */
Packit 6c4009
	stp	q0, q0, [dstend, -32]	/* Last 32 bytes (may overlap the above).  */
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	.p2align 3
Packit 6c4009
	nop	/* Padding only: follows a ret and nothing branches to it.  */
Packit 6c4009
	/* Set more than 96 bytes.  Store the first 16 bytes unaligned,
	   then either try DC ZVA (count >= 256 and a zero fill byte) or
	   fall into the 64-bytes-per-iteration store loop at L(no_zva).  */
L(set_long):
Packit 6c4009
	and	valw, valw, 255	/* Keep only the fill byte for the zero test below.  */
Packit 6c4009
	bic	dst, dstin, 15	/* dst = dstin rounded down to a 16-byte boundary.  */
Packit 6c4009
	str	q0, [dstin]	/* First 16 bytes, at the possibly unaligned start.  */
Packit 6c4009
	cmp	count, 256
Packit 6c4009
	ccmp	valw, 0, 0, cs	/* count >= 256: compare byte with 0; else flags = 0 (ne).  */
Packit 6c4009
	b.eq	L(try_zva)	/* Taken only for count >= 256 and a zero fill byte.  */
Packit 6c4009
	/* Plain store loop.  Also branched to from the DC ZVA code when
	   DC ZVA cannot or should not be used; dst is still the rounded
	   down start address in that case.  */
L(no_zva):
Packit 6c4009
	sub	count, dstend, dst	/* Count is 16 too large.  */
Packit 6c4009
	add	dst, dst, 16	/* Skip the 16-byte block covered by the store above.  */
Packit 6c4009
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
Packit 6c4009
1:	stp	q0, q0, [dst], 64	/* Store 32 bytes at dst, then dst += 64.  */
Packit 6c4009
	stp	q0, q0, [dst, -32]	/* Other 32 bytes of the same 64-byte chunk.  */
Packit 6c4009
	/* L(zva_other) branches here with dst and count set up for the
	   bytes it left unwritten.  */
L(tail64):
Packit 6c4009
	subs	count, count, 64
Packit 6c4009
	b.hi	1b	/* More than the final 64 bytes still to go.  */
Packit 6c4009
2:	stp	q0, q0, [dstend, -64]	/* Final 64 bytes, addressed from the end ...  */
Packit 6c4009
	stp	q0, q0, [dstend, -32]	/* ... so they may overlap the loop's stores.  */
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* Try to use DC ZVA.  Branched to from L(set_long); the b.eq there is
   taken only when count >= 256 and the fill byte is zero.  When
   ZVA_MACRO is defined the whole strategy comes from zva_macro, which is
   defined outside this file; otherwise DCZID_EL0 is read and decoded
   here and control goes to the variant matching the block size.  */
L(try_zva):
Packit 6c4009
#ifdef ZVA_MACRO
Packit 6c4009
	zva_macro
Packit 6c4009
#else
Packit 6c4009
	.p2align 3
Packit 6c4009
	mrs	tmp1, dczid_el0	/* Read the DC ZVA ID register.  */
Packit 6c4009
	tbnz	tmp1w, 4, L(no_zva)	/* Bit 4 (DZP) set: DC ZVA is prohibited.  */
Packit 6c4009
	and	tmp1w, tmp1w, 15	/* Low 4 bits: log2 of the block size in 4-byte words.  */
Packit 6c4009
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
Packit 6c4009
	b.ne	 L(zva_128)	/* Not 64 bytes; otherwise fall through to L(zva_64).  */
Packit 6c4009
	/* Write the first and last 64 byte aligned block using stp rather
Packit 6c4009
	   than using DC ZVA.  This is faster on some cores.
Packit 6c4009
	 */
Packit 6c4009
	/* On entry dst is dstin rounded down to 16 bytes, the 16 bytes at
	   dstin are already stored (both done in L(set_long)), count is at
	   least 256 and the fill byte is zero, so q0 is all zeroes.  */
L(zva_64):
Packit 6c4009
	str	q0, [dst, 16]	/* With the earlier store at dstin this ...  */
Packit 6c4009
	stp	q0, q0, [dst, 32]	/* ... covers everything up to dst + 64.  */
Packit 6c4009
	bic	dst, dst, 63	/* Round dst down to a 64-byte boundary.  */
Packit 6c4009
	stp	q0, q0, [dst, 64]	/* Aligned 64-byte block at dst + 64 ...  */
Packit 6c4009
	stp	q0, q0, [dst, 96]	/* ... written with stp instead of DC ZVA.  */
Packit 6c4009
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
Packit 6c4009
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
Packit 6c4009
	add	dst, dst, 128	/* First block to be zeroed by DC ZVA.  */
Packit 6c4009
	nop	/* Executed padding; presumably aligns the loop below -- confirm.  */
Packit 6c4009
1:	dc	zva, dst	/* Zero the 64-byte block at dst.  */
Packit 6c4009
	add	dst, dst, 64
Packit 6c4009
	subs	count, count, 64
Packit 6c4009
	b.hi	1b
Packit 6c4009
	stp	q0, q0, [dst, 0]	/* One more aligned 64-byte block, by stp ...  */
Packit 6c4009
	stp	q0, q0, [dst, 32]	/* ... (second half).  */
Packit 6c4009
	stp	q0, q0, [dstend, -64]	/* Final 64 bytes, addressed from the end ...  */
Packit 6c4009
	stp	q0, q0, [dstend, -32]	/* ... so they may overlap earlier stores.  */
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	.p2align 3
Packit 6c4009
	/* DC ZVA variant for a 128-byte block size.  Reached from
	   L(try_zva) with the 4-bit size field in tmp1w, dst equal to
	   dstin rounded down to 16 bytes, the 16 bytes at dstin already
	   stored, count at least 256 and q0 all zeroes.  */
L(zva_128):
Packit 6c4009
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
Packit 6c4009
	b.ne	L(zva_other)	/* Any other size: generic path.  */
Packit 6c4009
Packit 6c4009
	str	q0, [dst, 16]	/* These four stores plus the earlier one at ...  */
Packit 6c4009
	stp	q0, q0, [dst, 32]	/* ... dstin cover everything up to dst + 128 ...  */
Packit 6c4009
	stp	q0, q0, [dst, 64]	/* ... which reaches the next 128-byte ...  */
Packit 6c4009
	stp	q0, q0, [dst, 96]	/* ... boundary computed below.  */
Packit 6c4009
	bic	dst, dst, 127	/* Round dst down to a 128-byte boundary.  */
Packit 6c4009
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
Packit 6c4009
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
Packit 6c4009
	add	dst, dst, 128	/* First block to be zeroed by DC ZVA.  */
Packit 6c4009
1:	dc	zva, dst	/* Zero the 128-byte block at dst.  */
Packit 6c4009
	add	dst, dst, 128
Packit 6c4009
	subs	count, count, 128
Packit 6c4009
	b.hi	1b
Packit 6c4009
	stp	q0, q0, [dstend, -128]	/* Final 128 bytes, addressed from the ...  */
Packit 6c4009
	stp	q0, q0, [dstend, -96]	/* ... end of the buffer, so they may ...  */
Packit 6c4009
	stp	q0, q0, [dstend, -64]	/* ... overlap what the loop has ...  */
Packit 6c4009
	stp	q0, q0, [dstend, -32]	/* ... already zeroed.  */
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* Generic DC ZVA path for block sizes other than 64 or 128 bytes.
   On entry tmp1w holds the 4-bit size field decoded in L(try_zva), count
   is still the original length, dst is dstin rounded down to 16 bytes
   and the 16 bytes at dstin have already been stored.
   Steps: store with stp up to the first zva_len-aligned address, zero
   whole blocks with DC ZVA, then hand the remainder to L(tail64).
   NOTE(review): the alignment loop assumes zva_len >= 16.  With a
   smaller value the subs that computes the alignment bytes would go
   negative and the b.hi loop would keep running -- confirm that no
   supported CPU reports such a block size.  */
L(zva_other):
Packit 6c4009
	mov	tmp2w, 4
Packit 6c4009
	lsl	zva_lenw, tmp2w, tmp1w	/* zva_len = 4 << size field = block size in bytes.  */
Packit 6c4009
	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
Packit 6c4009
	cmp	count, tmp1
Packit 6c4009
	blo	L(no_zva)	/* count < zva_len + 64: use the plain store loop.  */
Packit 6c4009
Packit 6c4009
	sub	tmp2, zva_len, 1	/* Mask of the low address bits within a block.  */
Packit 6c4009
	add	tmp1, dst, zva_len
Packit 6c4009
	add	dst, dst, 16	/* Skip the 16-byte block covered by the store at dstin.  */
Packit 6c4009
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
Packit 6c4009
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
Packit 6c4009
	beq	2f	/* Tests the subs result; bic does not change the flags.  */
Packit 6c4009
1:	stp	q0, q0, [dst], 64	/* Store 32 bytes at dst, then dst += 64.  */
Packit 6c4009
	stp	q0, q0, [dst, -32]	/* Other 32 bytes of the same 64-byte chunk.  */
Packit 6c4009
	subs	count, count, 64
Packit 6c4009
	b.hi	1b
Packit 6c4009
2:	mov	dst, tmp1	/* Continue from the aligned address.  */
Packit 6c4009
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
Packit 6c4009
	subs	count, count, zva_len
Packit 6c4009
	b.lo	4f	/* Fewer than zva_len bytes remain: no DC ZVA at all.  */
Packit 6c4009
3:	dc	zva, dst	/* Zero one zva_len-byte block.  */
Packit 6c4009
	add	dst, dst, zva_len
Packit 6c4009
	subs	count, count, zva_len
Packit 6c4009
	b.hs	3b	/* Repeat while a whole block still fits.  */
Packit 6c4009
4:	add	count, count, zva_len	/* Undo the bias: count = bytes left from dst.  */
Packit 6c4009
	b	L(tail64)	/* Finish with the 64-byte store loop and end stores.  */
Packit 6c4009
/* The #endif below closes the #ifdef ZVA_MACRO / #else that starts under
   L(try_zva).  */
#endif
Packit 6c4009
Packit 6c4009
/* END pairs with ENTRY_ALIGN (MEMSET, 6) at the top of the function; both
   macros are defined outside this file.
   NOTE(review): libc_hidden_builtin_def is also defined elsewhere;
   presumably it provides the hidden definition used by internal callers
   -- confirm in the glibc headers.  */
END (MEMSET)
Packit 6c4009
libc_hidden_builtin_def (MEMSET)