/* strnlen - calculate the length of a string with limit.

   Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

/* Arguments and results.
   srcin and len are both x0: the string pointer arrives in x0 and the
   computed length is left in x0 for the caller.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.
   src       - read cursor: srcin rounded down to 16 bytes, advanced by
               16 after every load.
   data1/2   - the two 8-byte words fetched by each ldp.
   data2a    - data2 with its leading (pre-string) bytes forced to 0xff;
               only used while handling a misaligned start.
   has_nul1/2- NUL syndromes for data1/data2 (non-zero iff the word
               contains a zero byte).
   tmp1-tmp4 - scratch.
   zeroones  - holds REP8_01 for the syndrome computation.
   pos       - bit offset of the first NUL within the final word.
   limit_wd  - number of 16-byte blocks still to examine, minus one.  */
#define src		x2
#define data1		x3
#define data2		x4
#define data2a		x5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define pos		x13
#define limit_wd	x14

/* A single byte value replicated into all eight bytes of a 64-bit word.
   REP8_80 is not referenced by the code in this file.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* __strnlen: length of a string, examining at most limit bytes.

   In:       srcin (x0) = address of the first byte of the string
	     limit (x1) = maximum number of bytes to examine
   Out:      len   (x0) = index of the first NUL byte, or limit when no
			  NUL byte occurs in the first limit bytes
   Clobbers: x2-x14 and the NZCV flags.  The stack is not used.

   Memory is always read as whole 16-byte blocks at 16-byte aligned
   addresses.  Bytes in front of srcin (first block) and bytes past the
   NUL or the limit (last block) may therefore be loaded, but they never
   influence the result: the former are masked to 0xff, the latter are
   discarded by the final MIN (len, limit).  */
ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
	/* DELOUSE is provided by <sysdep.h>; presumably it sanitises the
	   numbered argument register on ILP32 -- confirm there.
	   NOTE(review): x2 is not an argument of this routine (it is src,
	   which is written below before it is ever read), so DELOUSE (2)
	   looks unnecessary.  */
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)
	cbz	limit, L(hit_limit)	/* limit == 0: result is 0.  */
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* Round the pointer down to 16.  */
	ands	tmp1, srcin, #15	/* tmp1 = offset inside that block.  */
	b.ne	L(misaligned)
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
L(loop):
	ldp	data1, data2, [src], #16
L(realigned):
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1	/* Goes negative on the last block.  */
	orr	tmp1, has_nul1, has_nul2
	/* If blocks remain (pl), test the combined syndrome against zero;
	   otherwise force "not equal" so that the loop terminates.  */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	L(loop)
	/* End of critical section -- keep to one 64Byte cache line.  */

	/* Either a NUL was seen or the block budget ran out (or both).  */
	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	/* src already points past the final block, so start from the byte
	   count up to its end and step back 8 bytes per word below.  */
	sub	len, src, srcin
	cbz	has_nul1, L(nul_in_data2)
	/* The NUL is in the first word: treat data1 as the word to decode
	   and step back over the second word.  */
#ifdef __AARCH64EB__
	mov	data2, data1
#endif
	sub	len, len, #8
	mov	has_nul2, has_nul1
L(nul_in_data2):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8		/* len = offset of the word's start.  */
	/* Byte-reverse so the lowest-addressed byte is most significant;
	   the leading-zero count is then 8 * (index of the first NUL).  */
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	RET

L(misaligned):
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	/* These flags are consumed by the csinv/csel at the end of this
	   block; no instruction in between modifies NZCV.  */
	cmp	tmp1, #8

	/* Split limit - 1 into its low 4 bits (tmp3) and its Qword count
	   (limit_wd) so that adding tmp1 cannot overflow.  */
	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1

#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#endif
	/* Carry from the low-bits sum gives one more block to examine.  */
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/* tmp1 <= 8: the string starts in data1, so keep the masked data1
	   and the untouched data2.
	   tmp1 >  8: the string starts in data2, so data1 becomes all
	   ones and the masked copy of data2 is used.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	L(realigned)

L(hit_limit):
	/* No NUL within the first limit bytes (or limit was zero).  */
	mov	len, limit
	RET
END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_def (strnlen)