Blame sysdeps/aarch64/strchrnul.S

Packit 6c4009
/* strchrnul - find a character or nul in a string
Packit 6c4009
Packit 6c4009
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
Packit 6c4009
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Assumptions:
Packit 6c4009
 *
Packit 6c4009
 * ARMv8-a, AArch64
Packit 6c4009
 * Neon Available.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/* Arguments and results.
   srcin  - address of the string to search (x0, the first argument
            register).
   chrin  - character to search for (w1).  Only its low byte takes
            part in the comparison, because it is replicated into
            byte lanes with DUP .16b.
   result - address handed back to the caller.  It shares x0 with
            srcin; the code reads srcin for the last time before it
            writes result.  */
Packit 6c4009
#define srcin		x0
Packit 6c4009
#define chrin		w1
Packit 6c4009
Packit 6c4009
#define result		x0
Packit 6c4009
Packit 6c4009
/* Locals and temporaries.
   src    - cursor over 32-byte aligned hunks; every load
            post-increments it by 32.
   tmp1   - in turn: the misalignment of srcin within its hunk, the
            padding mask, and finally the syndrome / bit index.
   wtmp2  - scratch used to build the lane-identification constant.
   tmp3   - scratch: all-ones while the padding mask is built, then
            the unmasked syndrome of the first hunk.  */
Packit 6c4009
Packit 6c4009
#define src		x2
Packit 6c4009
#define tmp1		x3
Packit 6c4009
#define wtmp2		w4
Packit 6c4009
#define tmp3		x5
Packit 6c4009
Packit 6c4009
/* Vector registers.
   vrepchr      - chrin replicated into all 16 byte lanes.
   vdata1/2     - low and high 16 bytes of the current 32-byte hunk.
   vhas_nul1/2  - per-byte result of comparing vdata1/2 with zero.
   vhas_chr1/2  - per-byte result of comparing vdata1/2 with vrepchr;
                  the nul results are ORed in afterwards, so these end
                  up meaning "matched the character or nul".
   vrepmask     - the lane-identification constant replicated into
                  each 32-bit lane.
   vend1        - reduction of the match results down to 64 bits.  */
#define vrepchr		v0
Packit 6c4009
#define vdata1		v1
Packit 6c4009
#define vdata2		v2
Packit 6c4009
#define vhas_nul1	v3
Packit 6c4009
#define vhas_nul2	v4
Packit 6c4009
#define vhas_chr1	v5
Packit 6c4009
#define vhas_chr2	v6
Packit 6c4009
#define vrepmask	v7
Packit 6c4009
#define vend1		v16
Packit 6c4009
Packit 6c4009
/* Core algorithm.
Packit 6c4009
Packit 6c4009
   For each 32-byte hunk we calculate a 64-bit syndrome value, with
Packit 6c4009
   two bits per byte (LSB is always in bits 0 and 1, for both big
Packit 6c4009
   and little-endian systems).  For each tuple, bit 0 is set iff
Packit 6c4009
   the relevant byte matched the requested character or nul.  Since the
Packit 6c4009
   bits in the syndrome reflect exactly the order in which things occur
Packit 6c4009
   in the original string a count_trailing_zeros() operation will
Packit 6c4009
   identify exactly which byte is causing the termination.  */
Packit 6c4009
Packit 6c4009
/* __strchrnul
   In:    srcin (x0) = address of the string.
          chrin (w1) = character to find; only the low byte is compared.
   Out:   result (x0) = address of the first byte at or after srcin that
          equals the low byte of chrin or is nul, whichever comes first.
   Uses:  x2, x3, x4 (via w4), x5, v0-v7, v16 and the condition flags
          (set by ANDS).  The visible code does not touch sp and makes
          no calls; what DELOUSE expands to is not visible here.
   Note:  memory is always read in 32-byte aligned hunks, so bytes of
          the hunk that lie before srcin or after the matching byte are
          loaded as well; they never influence the result.  */
ENTRY (__strchrnul)
Packit 6c4009
	/* DELOUSE is not defined in this file (presumably it comes from
	   <sysdep.h>); it looks like it normalises pointer argument 0
	   (x0) for ILP32 builds -- TODO confirm against sysdep.h.  */
	DELOUSE (0)
Packit 6c4009
	/* Magic constant 0x40100401 to allow us to identify which lane
Packit 6c4009
	   matches the termination condition.  */
Packit 6c4009
	/* The 32-bit immediate is built in two 16-bit halves.  Once it is
	   replicated per 32-bit lane, the four bytes of every group carry
	   the distinct bits 0x01, 0x04, 0x10 and 0x40.  */
	mov	wtmp2, #0x0401
Packit 6c4009
	movk	wtmp2, #0x4010, lsl #16
Packit 6c4009
	/* Search byte in every byte lane.  */
	dup	vrepchr.16b, chrin
Packit 6c4009
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
Packit 6c4009
	dup	vrepmask.4s, wtmp2
Packit 6c4009
	/* tmp1 = number of bytes in the first hunk that precede srcin;
	   zero means srcin is already aligned, so skip the masked first
	   iteration.  */
	ands	tmp1, srcin, #31
Packit 6c4009
	b.eq	L(loop)
Packit 6c4009
Packit 6c4009
	/* Input string is not 32-byte aligned.  Rather than forcing
Packit 6c4009
	   the padding bytes to a safe value, we calculate the syndrome
Packit 6c4009
	   for all the bytes, but then mask off those bits of the
Packit 6c4009
	   syndrome that are related to the padding.  */
Packit 6c4009
	/* The scalar mask computation (neg/lsl/mov/lsr) is interleaved
	   with the vector work below; the two chains are independent
	   until the final BIC.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
Packit 6c4009
	/* tmp1 = -(padding byte count).  */
	neg	tmp1, tmp1
Packit 6c4009
	/* Each compare yields 0xff in a matching byte lane, else 0x00.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
Packit 6c4009
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
Packit 6c4009
	cmeq	vhas_nul2.16b, vdata2.16b, #0
Packit 6c4009
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
Packit 6c4009
	/* A byte terminates the search if it is the character or nul.  */
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
Packit 6c4009
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
Packit 6c4009
	/* Keep one position-specific bit per matching byte so that the
	   pairwise adds below pack four bytes into one without carries.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
Packit 6c4009
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
Packit 6c4009
	/* Two syndrome bits per byte: tmp1 = -2 * padding.  */
	lsl	tmp1, tmp1, #1
Packit 6c4009
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
Packit 6c4009
	mov	tmp3, #~0
Packit 6c4009
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
Packit 6c4009
	/* A register-specified shift uses the amount modulo 64, so this
	   shifts all-ones right by 64 - 2 * padding and leaves exactly
	   the low 2 * padding bits set.  */
	lsr	tmp1, tmp3, tmp1
Packit 6c4009
Packit 6c4009
	/* Low 64 bits of the reduction: byte k of the hunk corresponds to
	   bit 2k of the syndrome.  */
	mov	tmp3, vend1.2d[0]
Packit 6c4009
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
Packit 6c4009
	/* A match at or after srcin in the first hunk: src has already
	   been advanced by 32, which L(tail) undoes.  */
	cbnz	tmp1, L(tail)
Packit 6c4009
Packit 6c4009
	/* Main loop: one aligned 32-byte hunk per iteration.  */
L(loop):
Packit 6c4009
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
Packit 6c4009
	cmeq	vhas_nul1.16b, vdata1.16b, #0
Packit 6c4009
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
Packit 6c4009
	cmeq	vhas_nul2.16b, vdata2.16b, #0
Packit 6c4009
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
Packit 6c4009
	/* Use a fast check for the termination condition.  */
Packit 6c4009
	/* Only "did anything match?" is needed here, so the exact
	   position bits are not computed inside the loop.  Every byte
	   is 0x00 or 0xff, so adding the two 64-bit halves can only
	   give zero when both halves are zero.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
Packit 6c4009
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
Packit 6c4009
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
Packit 6c4009
	addp	vend1.2d, vend1.2d, vend1.2d
Packit 6c4009
	mov	tmp1, vend1.2d[0]
Packit 6c4009
	cbz	tmp1, L(loop)
Packit 6c4009
Packit 6c4009
	/* Termination condition found.  Now need to establish exactly why
Packit 6c4009
	   we terminated.  */
Packit 6c4009
	/* vhas_chr1/2 still hold the merged per-byte results from the
	   loop body; reduce them to the 64-bit syndrome exactly as in
	   the unaligned first-hunk path.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
Packit 6c4009
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
Packit 6c4009
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
Packit 6c4009
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
Packit 6c4009
Packit 6c4009
	mov	tmp1, vend1.2d[0]
Packit 6c4009
	/* Entered with tmp1 = non-zero syndrome and src = hunk base + 32.  */
L(tail):
Packit 6c4009
	/* Count the trailing zeros, by bit reversing...  */
Packit 6c4009
	rbit	tmp1, tmp1
Packit 6c4009
	/* Re-bias source.  */
Packit 6c4009
	/* Undo the post-increment of the last load so that src is the
	   base of the hunk containing the match.  */
	sub	src, src, #32
Packit 6c4009
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
Packit 6c4009
	/* tmp1 is twice the offset into the fragment.  */
Packit 6c4009
	add	result, src, tmp1, lsr #1
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
END(__strchrnul)
Packit 6c4009
/* Also provide the name strchrnul through the weak_alias macro, which
   is defined outside this file.  */
weak_alias (__strchrnul, strchrnul)