/* strrchr: find the last instance of a character in a string.

   Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Register aliases used by strrchr.  Only the names are defined here;
   the roles described below are those the routine in this file gives
   them.  */

/* Arguments and results.  */
#define srcin		x0	/* In: address of the string.  */
#define chrin		w1	/* In: character to look for.  */

#define result		x0	/* Out: address of the last match, or 0.  */

/* General-purpose working registers.  */
#define src		x2	/* Read cursor; advances 32 bytes per load.  */
#define tmp1		x3	/* Misalignment of srcin within its hunk.  */
#define wtmp2		w4	/* Holds the 0x40100401 lane constant.  */
#define tmp3		x5	/* Padding mask, later the clz result.  */
#define src_match	x6	/* Value of src saved for the latest hunk
				   that contained a match.  */
#define src_offset	x7	/* Character syndrome of that hunk; zero
				   while no match has been seen.  */
#define const_m1	x8	/* All-ones constant.  */
#define tmp4		x9	/* Mask of syndrome bits up to the NUL.  */
#define nul_match	x10	/* 64-bit NUL syndrome of a hunk.  */
#define chr_match	x11	/* 64-bit character syndrome of a hunk.  */

/* Vector working registers.  */
#define vrepchr		v0	/* chrin replicated into every byte lane.  */
#define vdata1		v1	/* First 16 bytes of the current hunk.  */
#define vdata2		v2	/* Second 16 bytes of the current hunk.  */
#define vhas_nul1	v3	/* Lanes of vdata1 equal to zero.  */
#define vhas_nul2	v4	/* Lanes of vdata2 equal to zero.  */
#define vhas_chr1	v5	/* Lanes of vdata1 equal to the character.  */
#define vhas_chr2	v6	/* Lanes of vdata2 equal to the character.  */
#define vrepmask_0	v7	/* Lane-to-bit mask for NUL (vrepmask_c
				   doubled).  */
#define vrepmask_c	v16	/* Lane-to-bit mask for the character.  */
#define vend1		v17	/* Cheap "any NUL in this hunk" reduction.  */
#define vend2		v18	/* Not referenced by the code in this file.  */

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit0 for the special case of looking for NUL).  Since the bits
   in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination, and why.  */

/* strrchr

   In:	srcin (x0)  address of a NUL-terminated string.
	chrin (w1)  character to find; only its low byte takes part in
		    the comparison (it is replicated with dup ... .16b).
   Out:	result (x0) address of the last byte, at or before the
		    terminating NUL, that equals the character; 0 if
		    there is none.

   Writes x2-x11, v0-v7, v16, v17 and the condition flags; does not
   touch the stack.

   Memory is always read as whole, 32-byte-aligned hunks, so bytes in
   front of the string and behind its terminator that share a hunk with
   it are loaded (and then ignored).  */

ENTRY(strrchr)
	/* DELOUSE comes from sysdep.h and is not visible here; presumably
	   it sanitises pointer argument 0 for ILP32 - confirm there.  */
	DELOUSE (0)
	/* The character is a 32-bit argument, so only w1 is defined by
	   the caller; the upper half of x1 may hold anything.  Test the
	   32-bit view so that a zero character reliably takes the
	   shortcut.  (The general path below also copes with a zero
	   character, so this only affects which path is taken.)  */
	cbz	chrin, L(null_search)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No match recorded yet.  */
	ands	tmp1, srcin, #31	/* tmp1 = offset of the string in its hunk.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Keep one distinct bit per matching lane so that the pairwise
	   adds below pack four lanes into each result byte.  */
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
	mov	nul_match, vhas_nul1.2d[0]
	/* tmp1 = -2 * offset.  A register shift uses the amount modulo
	   64, so the lsr below shifts by 64 - 2 * offset and leaves
	   exactly the 2 * offset low (padding) bits set in tmp3.  */
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	mov	chr_match, vhas_chr1.2d[0]
	lsr	tmp3, const_m1, tmp1

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

L(loop):
	/* The hunk just examined holds no NUL.  If it holds a match,
	   remember it: src has already moved past that hunk.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Inside the loop only "is there any NUL" is needed, so the NUL
	   lanes are reduced without masking; the exact NUL syndrome is
	   built after the loop.  */
	addp	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b	// 128->64
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
	mov	nul_match, vend1.2d[0]
	mov	chr_match, vhas_chr1.2d[0]
	cbz	nul_match, L(loop)

	/* A NUL is present in this hunk: compute its exact syndrome.  */
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.2d[0]

L(tail):
	/* Work out exactly where the string ends.  tmp4 gets every bit
	   up to and including the lowest set bit of nul_match, which
	   also covers the character bit of the terminator itself.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	/* Drop matches that lie beyond the terminator.  */
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.
	   (src_match may never have been written in that case; the value
	   computed from it is discarded here.)  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

L(null_search):
	/* Searching for the terminator: the answer is the address of the
	   NUL itself.  Tail-call __strchrnul with x0 and w1 (== 0) as
	   received; it is defined elsewhere and is relied on to return
	   that address.  */
	b	__strchrnul

END(strrchr)
weak_alias (strrchr, rindex)
libc_hidden_builtin_def (strrchr)