Blame sysdeps/aarch64/strlen.S

Packit 6c4009
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
Packit 6c4009
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Assumptions:
Packit 6c4009
 *
Packit 6c4009
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/* To test the page crossing code path more thoroughly, compile with
Packit 6c4009
   -DTEST_PAGE_CROSS - this will force all calls through the slower
Packit 6c4009
   entry path.  This option is not intended for production use.  */
Packit 6c4009
Packit 6c4009
/* Arguments and results.  srcin and len both name x0: the register that
   carries the string address on entry is reused for the length once
   srcin is no longer needed.  */
Packit 6c4009
#define srcin		x0
Packit 6c4009
#define len		x0
Packit 6c4009
Packit 6c4009
/* Locals and temporaries.  has_nul1/tmp1 both name x4 and has_nul2/tmp2
   both name x5, so writing one name of a pair overwrites the other.  */
Packit 6c4009
#define src		x1
Packit 6c4009
#define data1		x2
Packit 6c4009
#define data2		x3
Packit 6c4009
#define has_nul1	x4
Packit 6c4009
#define has_nul2	x5
Packit 6c4009
#define tmp1		x4
Packit 6c4009
#define tmp2		x5
Packit 6c4009
#define tmp3		x6
Packit 6c4009
#define tmp4		x7
Packit 6c4009
#define zeroones	x8
Packit 6c4009
Packit 6c4009
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
Packit 6c4009
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
Packit 6c4009
	   can be done in parallel across the entire word. A faster check
Packit 6c4009
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
Packit 6c4009
	   false hits for characters 129..255.	*/
Packit 6c4009
Packit 6c4009
/* 64-bit constants holding the same byte value in all eight byte lanes.  */
#define REP8_01 0x0101010101010101
Packit 6c4009
#define REP8_7f 0x7f7f7f7f7f7f7f7f
Packit 6c4009
#define REP8_80 0x8080808080808080
Packit 6c4009
Packit 6c4009
/* With TEST_PAGE_CROSS the value 15 turns the entry check
   (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16 into
   (srcin & 14) > -1, which always holds, so every call is routed
   through L(page_cross).  */
#ifdef TEST_PAGE_CROSS
Packit 6c4009
# define MIN_PAGE_SIZE 15
Packit 6c4009
#else
Packit 6c4009
# define MIN_PAGE_SIZE 4096
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
	/* Since strings are short on average, we check the first 16 bytes
Packit 6c4009
	   of the string for a NUL character.  In order to do an unaligned ldp
Packit 6c4009
	   safely we have to do a page cross check first.  If there is a NUL
Packit 6c4009
	   byte we calculate the length from the 2 8-byte words using
Packit 6c4009
	   conditional select to reduce branch mispredictions (it is unlikely
Packit 6c4009
	   strlen will be repeatedly called on strings with the same length).
Packit 6c4009
Packit 6c4009
	   If the string is longer than 16 bytes, we align src so we don't need
Packit 6c4009
	   further page cross checks, and process 32 bytes per iteration
Packit 6c4009
	   using the fast NUL check.  If we encounter non-ASCII characters,
Packit 6c4009
	   fallback to a second loop using the full NUL check.
Packit 6c4009
Packit 6c4009
	   If the page cross check fails, we read 16 bytes from an aligned
Packit 6c4009
	   address, remove any characters before the string, and continue
Packit 6c4009
	   in the main loop using aligned loads.  Since strings crossing a
Packit 6c4009
	   page in the first 16 bytes are rare (probability of
Packit 6c4009
	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
Packit 6c4009
Packit 6c4009
	   AArch64 systems have a minimum page size of 4k.  We don't bother
Packit 6c4009
	   checking for larger page sizes - the cost of setting up the correct
Packit 6c4009
	   page size is just not worth the extra gain from a small reduction in
Packit 6c4009
	   the cases taking the slow path.  Note that we only care about
Packit 6c4009
	   whether the first fetch, which may be misaligned, crosses a page
Packit 6c4009
	   boundary.  */
Packit 6c4009
Packit 6c4009
/* __strlen: length of the NUL-terminated string at srcin.

   In:	    x0 = srcin, address of the first byte of the string.
   Out:	    x0 = len, number of bytes before the first NUL byte.
   Written: x1-x8 and the condition flags.  sp is never referenced.

   Paths: an unaligned 16-byte probe of the start of the string; then a
   16-byte-aligned loop that handles 32 bytes per iteration with the fast
   NUL check, falling back to L(nonascii_loop) with the exact check.
   L(page_cross) replaces the unaligned probe when that load could run
   into the next page.  */
ENTRY_ALIGN (__strlen, 6)
Packit 6c4009
	/* NOTE(review): DELOUSE is defined outside this file; presumably it
	   sanitizes argument register xN (e.g. for ILP32) - confirm.  x1 is
	   not an argument here and src (x1) is always written before it is
	   read below, so DELOUSE (1) looks redundant - confirm.  */
	DELOUSE (0)
Packit 6c4009
	DELOUSE (1)
Packit 6c4009
	/* With the default MIN_PAGE_SIZE of 4096 this is the offset of srcin
	   within its 4k page.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
Packit 6c4009
	mov	zeroones, REP8_01
Packit 6c4009
	/* A 16-byte load that starts past offset MIN_PAGE_SIZE - 16 would
	   run into the next page, so take the aligned path instead.  */
	cmp	tmp1, MIN_PAGE_SIZE - 16
Packit 6c4009
	b.gt	L(page_cross)
Packit 6c4009
	/* Unaligned load of the first 16 bytes of the string.  */
	ldp	data1, data2, [srcin]
Packit 6c4009
#ifdef __AARCH64EB__
Packit 6c4009
	/* For big-endian, carry propagation (if the final byte in the
Packit 6c4009
	   string is 0x01) means we cannot use has_nul1/2 directly.
Packit 6c4009
	   Since we expect strings to be small and early-exit,
Packit 6c4009
	   byte-swap the data now so has_nul1/2 will be correct.  */
Packit 6c4009
	rev	data1, data1
Packit 6c4009
	rev	data2, data2
Packit 6c4009
#endif
Packit 6c4009
	/* Exact NUL check on both words:
	   has_nul = (data - 0x01..01) & ~(data | 0x7f..7f).
	   bics sets Z when has_nul1 == 0; ccmp then compares has_nul2 with 0
	   only in that case and otherwise writes NZCV = 0.  Afterwards
	   Z = neither word contains a NUL, C = has_nul1 == 0.  */
	sub	tmp1, data1, zeroones
Packit 6c4009
	orr	tmp2, data1, REP8_7f
Packit 6c4009
	sub	tmp3, data2, zeroones
Packit 6c4009
	orr	tmp4, data2, REP8_7f
Packit 6c4009
	bics	has_nul1, tmp1, tmp2
Packit 6c4009
	bic	has_nul2, tmp3, tmp4
Packit 6c4009
	ccmp	has_nul2, 0, 0, eq
Packit 6c4009
	beq	L(main_loop_entry)
Packit 6c4009
Packit 6c4009
	/* Enter with C = has_nul1 == 0.  The NUL is within the first 16
	   bytes: pick the syndrome of the word that holds it (first word
	   when C is clear), byte-reverse it so the earliest string byte is
	   most significant, and let clz / 8 give the byte index inside that
	   word.  len aliases srcin (x0), which is not read again on this
	   path.  The word's base offset is 0 (first word) or 8 (second).  */
Packit 6c4009
	csel	has_nul1, has_nul1, has_nul2, cc
Packit 6c4009
	mov	len, 8
Packit 6c4009
	rev	has_nul1, has_nul1
Packit 6c4009
	clz	tmp1, has_nul1
Packit 6c4009
	csel	len, xzr, len, cc
Packit 6c4009
	add	len, len, tmp1, lsr 3
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	/* The inner loop processes 32 bytes per iteration and uses the fast
Packit 6c4009
	   NUL check.  If we encounter non-ASCII characters, use a second
Packit 6c4009
	   loop with the accurate NUL check.  */
Packit 6c4009
	.p2align 4
Packit 6c4009
L(main_loop_entry):
Packit 6c4009
	/* src = (srcin rounded down to 16) - 16, so that the pre-indexed
	   load below first reads the aligned block at
	   (srcin rounded down to 16) + 16.  That block starts no later than
	   srcin + 16, so no byte after the 16 already checked is skipped.  */
	bic	src, srcin, 15
Packit 6c4009
	sub	src, src, 16
Packit 6c4009
L(main_loop):
Packit 6c4009
	/* Pre-index with writeback: src += 32, then load 16 bytes at src.  */
	ldp	data1, data2, [src, 32]!
Packit 6c4009
L(page_cross_entry):
Packit 6c4009
	/* Fast check: test bit 7 of every byte of (data - 0x01..01) in both
	   words at once (zeroones << 7 is 0x80 in every byte).  Any hit is
	   resolved by the exact check at 1:.  */
	sub	tmp1, data1, zeroones
Packit 6c4009
	sub	tmp3, data2, zeroones
Packit 6c4009
	orr	tmp2, tmp1, tmp3
Packit 6c4009
	tst	tmp2, zeroones, lsl 7
Packit 6c4009
	bne	1f
Packit 6c4009
	/* Second 16 bytes of this iteration, loaded without writeback.  */
	ldp	data1, data2, [src, 16]
Packit 6c4009
	sub	tmp1, data1, zeroones
Packit 6c4009
	sub	tmp3, data2, zeroones
Packit 6c4009
	orr	tmp2, tmp1, tmp3
Packit 6c4009
	tst	tmp2, zeroones, lsl 7
Packit 6c4009
	beq	L(main_loop)
Packit 6c4009
	/* Make src point at the block currently held in data1/data2.  */
	add	src, src, 16
Packit 6c4009
1:
Packit 6c4009
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1/tmp3 still hold data1/data2 minus zeroones.  Flags end up as
	   after the entry check: Z = no NUL in this block,
	   C = has_nul1 == 0.  */
Packit 6c4009
	orr	tmp2, data1, REP8_7f
Packit 6c4009
	orr	tmp4, data2, REP8_7f
Packit 6c4009
	bics	has_nul1, tmp1, tmp2
Packit 6c4009
	bic	has_nul2, tmp3, tmp4
Packit 6c4009
	ccmp	has_nul2, 0, 0, eq
Packit 6c4009
	/* No NUL in this block, so the fast-check hit was a false one:
	   continue with the exact-check loop.  */
	beq	L(nonascii_loop)
Packit 6c4009
Packit 6c4009
	/* Enter with C = has_nul1 == 0.  src points at the 16-byte block
	   held in data1/data2, which contains the first NUL.  */
Packit 6c4009
L(tail):
Packit 6c4009
#ifdef __AARCH64EB__
Packit 6c4009
	/* For big-endian, carry propagation (if the final byte in the
Packit 6c4009
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
Packit 6c4009
	   easiest way to get the correct byte is to byte-swap the data
Packit 6c4009
	   and calculate the syndrome a second time.  */
Packit 6c4009
	csel	data1, data1, data2, cc
Packit 6c4009
	rev	data1, data1
Packit 6c4009
	sub	tmp1, data1, zeroones
Packit 6c4009
	orr	tmp2, data1, REP8_7f
Packit 6c4009
	bic	has_nul1, tmp1, tmp2
Packit 6c4009
#else
Packit 6c4009
	csel	has_nul1, has_nul1, has_nul2, cc
Packit 6c4009
#endif
Packit 6c4009
	/* len = offset of the current block from srcin.  On the first block
	   of the L(page_cross) path src <= srcin, so this can be <= 0; the
	   byte index added below makes the total non-negative because the
	   bytes before srcin were made non-zero there.  The csel adds 8
	   when the NUL is in the second word (C set).  */
	sub	len, src, srcin
Packit 6c4009
	rev	has_nul1, has_nul1
Packit 6c4009
	add	tmp2, len, 8
Packit 6c4009
	clz	tmp1, has_nul1
Packit 6c4009
	csel	len, len, tmp2, cc
Packit 6c4009
	add	len, len, tmp1, lsr 3
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	/* Exact-check loop: 16 bytes per load, unrolled twice.  There is no
	   branch from here back to L(main_loop).  */
L(nonascii_loop):
Packit 6c4009
	ldp	data1, data2, [src, 16]!
Packit 6c4009
	sub	tmp1, data1, zeroones
Packit 6c4009
	orr	tmp2, data1, REP8_7f
Packit 6c4009
	sub	tmp3, data2, zeroones
Packit 6c4009
	orr	tmp4, data2, REP8_7f
Packit 6c4009
	bics	has_nul1, tmp1, tmp2
Packit 6c4009
	bic	has_nul2, tmp3, tmp4
Packit 6c4009
	ccmp	has_nul2, 0, 0, eq
Packit 6c4009
	bne	L(tail)
Packit 6c4009
	ldp	data1, data2, [src, 16]!
Packit 6c4009
	sub	tmp1, data1, zeroones
Packit 6c4009
	orr	tmp2, data1, REP8_7f
Packit 6c4009
	sub	tmp3, data2, zeroones
Packit 6c4009
	orr	tmp4, data2, REP8_7f
Packit 6c4009
	bics	has_nul1, tmp1, tmp2
Packit 6c4009
	bic	has_nul2, tmp3, tmp4
Packit 6c4009
	ccmp	has_nul2, 0, 0, eq
Packit 6c4009
	beq	L(nonascii_loop)
Packit 6c4009
	b	L(tail)
Packit 6c4009
Packit 6c4009
	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
Packit 6c4009
	   srcin to 0x7f, so we ignore any NUL bytes before the string.
Packit 6c4009
	   Then continue in the aligned loop.  */
Packit 6c4009
L(page_cross):
Packit 6c4009
	bic	src, srcin, 15
Packit 6c4009
	ldp	data1, data2, [src]
Packit 6c4009
	/* Bit offset for the mask shift.  The variable shift below uses only
	   the low six bits of the count, i.e. (srcin & 7) * 8: the number
	   of bits that precede srcin within its 8-byte word.  */
	lsl	tmp1, srcin, 3
Packit 6c4009
	mov	tmp4, -1
Packit 6c4009
#ifdef __AARCH64EB__
Packit 6c4009
	/* Big-endian.	Early bytes are at MSB.	 */
Packit 6c4009
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
Packit 6c4009
#else
Packit 6c4009
	/* Little-endian.  Early bytes are at LSB.  */
Packit 6c4009
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
Packit 6c4009
#endif
Packit 6c4009
	/* tmp1 is now 0xff in the byte lanes at or after srcin and 0x00 in
	   the lanes before it.  After ORing in REP8_80, ~tmp1 is 0x7f in
	   the lanes before srcin and 0 elsewhere, so each orn below ORs
	   0x7f into the preceding bytes (making them non-zero) and leaves
	   the string bytes unchanged.  */
	orr	tmp1, tmp1, REP8_80
Packit 6c4009
	orn	data1, data1, tmp1
Packit 6c4009
	orn	tmp2, data2, tmp1
Packit 6c4009
	/* srcin & 8 tells which half of the 16-byte block holds srcin.
	   Clear: data1 is the masked first word and data2 is used as loaded.
	   Set: the whole first word precedes the string, so data1 becomes
	   all ones (tmp4) and data2 becomes the masked second word.  */
	tst	srcin, 8
Packit 6c4009
	csel	data1, data1, tmp4, eq
Packit 6c4009
	csel	data2, data2, tmp2, eq
Packit 6c4009
	b	L(page_cross_entry)
Packit 6c4009
END (__strlen)
Packit 6c4009
/* NOTE(review): weak_alias and libc_hidden_builtin_def are macros defined
   outside this file; presumably they publish __strlen under the name
   strlen - confirm.  */
weak_alias (__strlen, strlen)
Packit 6c4009
libc_hidden_builtin_def (strlen)