Blame sysdeps/aarch64/memchr.S

Packit 6c4009
/* memchr - find a character in a memory zone
Packit 6c4009
Packit 6c4009
   Copyright (C) 2015-2018 Free Software Foundation, Inc.
Packit 6c4009
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Assumptions:
Packit 6c4009
 *
Packit 6c4009
 * ARMv8-a, AArch64
Packit 6c4009
 * Neon Available.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/* Arguments and results.  */
/* srcin is the start address of the area to scan, chrin supplies the byte
   to look for and cntin the number of bytes to scan.  result names the same
   register as srcin (x0), so srcin is dead once result is written.  cntin
   is updated in place by __memchr as the count of bytes still to scan.  */
Packit 6c4009
#define srcin		x0
Packit 6c4009
#define chrin		w1
Packit 6c4009
#define cntin		x2
Packit 6c4009
Packit 6c4009
#define result		x0
Packit 6c4009
Packit 6c4009
/* General-purpose working registers of __memchr:
     src     32-byte aligned read pointer (srcin with the low five bits
             cleared), advanced by each post-indexed ld1.
     tmp     scratch for the count adjustment and for shift amounts.
     wtmp2   holds the 0x40100401 constant before it is replicated.
     synd    64-bit syndrome; inside the loop also the fast any-match value.
     soff    srcin & 31, the offset of srcin within its 32-byte block.
     cntrem  cntin & 31, taken before cntin is modified.  */
#define src		x3
Packit 6c4009
#define	tmp		x4
Packit 6c4009
#define wtmp2		w5
Packit 6c4009
#define synd		x6
Packit 6c4009
#define soff		x9
Packit 6c4009
#define cntrem		x10
Packit 6c4009
Packit 6c4009
/* Vector working registers of __memchr:
     vrepchr      the searched byte replicated into all 16 byte lanes.
     vdata1/2     the two 16-byte halves of the current 32-byte block.
     vhas_chr1/2  per-byte cmeq results for vdata1/2.
     vrepmask     wtmp2 replicated into every 32-bit lane.
     vend         reduction of vhas_chr1/2 from which synd is extracted.  */
#define vrepchr		v0
Packit 6c4009
#define vdata1		v1
Packit 6c4009
#define vdata2		v2
Packit 6c4009
#define vhas_chr1	v3
Packit 6c4009
#define vhas_chr2	v4
Packit 6c4009
#define vrepmask	v5
Packit 6c4009
#define vend		v6
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * Core algorithm:
Packit 6c4009
 *
Packit 6c4009
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
Packit 6c4009
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
Packit 6c4009
 * requested character and bit 1 is not used (faster than using a 32bit
Packit 6c4009
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
Packit 6c4009
 * things occur in the original string, counting trailing zeros allows us to
Packit 6c4009
 * identify exactly which byte has matched.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
/*
 * __memchr: srcin (x0) = start address, chrin (w1) = byte to find,
 * cntin (x2) = number of bytes to scan; the answer is left in result (x0).
 *
 * result is the address of the lowest-addressed byte in
 * [srcin, srcin + cntin) equal to the low 8 bits of chrin, or 0 when cntin
 * is zero or no byte in that range matches.
 *
 * Memory is only ever read as whole 32-byte blocks at 32-byte aligned
 * addresses.  The first block can therefore contain bytes below srcin and
 * the last block bytes at or beyond srcin + cntin; the syndrome bits that
 * belong to those bytes are cleared before the result is derived.
 *
 * The instructions below write x0, x2-x6, x9, x10, v0-v6 and the
 * condition flags.
 */
ENTRY (__memchr)
Packit 6c4009
	/* Do not dereference srcin if no bytes to compare.  */
Packit 6c4009
	cbz	cntin, L(zero_length)
Packit 6c4009
	/*
Packit 6c4009
	 * Magic constant 0x40100401 allows us to identify which lane matches
Packit 6c4009
	 * the requested byte.
	 *
	 * It is replicated into every 32-bit lane of vrepmask, so four
	 * neighbouring byte lanes hold 0x01, 0x04, 0x10 and 0x40: one
	 * distinct even-numbered bit each.  ANDing a cmeq result (0xff or
	 * 0x00 per byte) with it and folding twice with addp packs the 32
	 * per-byte results into a 64-bit syndrome, two bits per byte.
Packit 6c4009
	 */
Packit 6c4009
	mov	wtmp2, #0x0401
Packit 6c4009
	movk	wtmp2, #0x4010, lsl #16
Packit 6c4009
	/* Replicate the low 8 bits of chrin into all 16 byte lanes.  */
	dup	vrepchr.16b, chrin
Packit 6c4009
	/* Work with aligned 32-byte chunks */
Packit 6c4009
	bic	src, srcin, #31
Packit 6c4009
	dup	vrepmask.4s, wtmp2
Packit 6c4009
	/* soff = offset of srcin inside its 32-byte block.  The Z flag set
	   here is the one tested by the b.eq below.  */
	ands	soff, srcin, #31
Packit 6c4009
	/* cntrem = original cntin modulo 32, kept for L(masklast).  This
	   and does not write the flags.  */
	and	cntrem, cntin, #31
Packit 6c4009
	/* srcin already 32-byte aligned (soff == 0): no first-block fixup.  */
	b.eq	L(loop)
Packit 6c4009
Packit 6c4009
	/*
Packit 6c4009
	 * Input string is not 32-byte aligned. We calculate the syndrome
Packit 6c4009
	 * value for the aligned 32 bytes block containing the first bytes
Packit 6c4009
	 * and mask the irrelevant part.
Packit 6c4009
	 */
Packit 6c4009
Packit 6c4009
	/* Load the block that contains srcin; src then points at the next
	   block.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
Packit 6c4009
	/* tmp = soff - 32, i.e. minus the number of bytes of this block
	   that lie at or after srcin.  */
	sub	tmp, soff, #32
Packit 6c4009
	/* cntin = bytes left to scan after this block.  The flags set here
	   are consumed by the b.ls further down; none of the instructions
	   in between is a flag-setting form.  */
	adds	cntin, cntin, tmp
Packit 6c4009
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
Packit 6c4009
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
Packit 6c4009
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
Packit 6c4009
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
Packit 6c4009
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
Packit 6c4009
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
Packit 6c4009
	/* The low 64 bits of vend now hold the syndrome of the block.  */
	mov	synd, vend.2d[0]
Packit 6c4009
	/* Clear the soff*2 lower bits */
	/* These are the syndrome bits of the bytes that precede srcin.  */
Packit 6c4009
	lsl	tmp, soff, #1
Packit 6c4009
	lsr	synd, synd, tmp
Packit 6c4009
	lsl	synd, synd, tmp
Packit 6c4009
	/* The first block can also be the last */
	/* Taken when the original cntin was <= 32 - soff (unsigned), i.e.
	   the requested range ends inside this block.  */
Packit 6c4009
	b.ls	L(masklast)
Packit 6c4009
	/* Have we found something already? */
Packit 6c4009
	cbnz	synd, L(tail)
Packit 6c4009
Packit 6c4009
	/* Main loop: one aligned 32-byte block per iteration.  On entry
	   src points at the block to load and cntin is the number of bytes
	   still to scan starting at src.  */
L(loop):
Packit 6c4009
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
Packit 6c4009
	/* Flags from this subs drive the b.ls below and, when L(end) is
	   reached, the b.hi there as well.  */
	subs	cntin, cntin, #32
Packit 6c4009
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
Packit 6c4009
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
Packit 6c4009
	/* If we're out of data we finish regardless of the result */
	/* Taken when cntin was <= 32 before the subs (unsigned).  */
Packit 6c4009
	b.ls	L(end)
Packit 6c4009
	/* Use a fast check for the termination condition */
	/* OR the two compare results and add the two 64-bit halves; synd
	   is zero when no byte of the block matched.  */
Packit 6c4009
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
Packit 6c4009
	addp	vend.2d, vend.2d, vend.2d
Packit 6c4009
	mov	synd, vend.2d[0]
Packit 6c4009
	/* We're not out of data, loop if we haven't found the character */
Packit 6c4009
	cbz	synd, L(loop)
Packit 6c4009
Packit 6c4009
L(end):
Packit 6c4009
	/* Termination condition found, let's calculate the syndrome value */
Packit 6c4009
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
Packit 6c4009
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
Packit 6c4009
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
Packit 6c4009
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
Packit 6c4009
	mov	synd, vend.2d[0]
Packit 6c4009
	/* Only do the clear for the last possible block */
	/* The flags are still those of the subs in the loop: "hi" means
	   more data followed this block, so a match was found inside it
	   and no end-of-range masking is needed.  */
Packit 6c4009
	b.hi	L(tail)
Packit 6c4009
Packit 6c4009
L(masklast):
Packit 6c4009
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	/* These are the syndrome bits of the bytes at or beyond
	   srcin + cntin.  When (cntrem + soff) % 32 is 0 the computed
	   amount is 64; a register-specified shift uses the amount modulo
	   64, so nothing is cleared, which is right because the range then
	   ends exactly on the block boundary.  */
Packit 6c4009
	add	tmp, cntrem, soff
Packit 6c4009
	and	tmp, tmp, #31
Packit 6c4009
	sub	tmp, tmp, #32
Packit 6c4009
	/* tmp = -((tmp) << 1) = 2 * (32 - ((cntrem + soff) % 32)).  */
	neg	tmp, tmp, lsl #1
Packit 6c4009
	lsl	synd, synd, tmp
Packit 6c4009
	lsr	synd, synd, tmp
Packit 6c4009
Packit 6c4009
	/* Convert the syndrome into the result.  On entry src points 32
	   bytes past the block that synd describes.  */
L(tail):
Packit 6c4009
	/* Count the trailing zeros using bit reversing */
Packit 6c4009
	rbit	synd, synd
Packit 6c4009
	/* Compensate the last post-increment */
Packit 6c4009
	sub	src, src, #32
Packit 6c4009
	/* Check that we have found a character */
	/* Done on the bit-reversed value, which is zero exactly when the
	   syndrome was zero; the flags are used by the csel below.  */
Packit 6c4009
	cmp	synd, #0
Packit 6c4009
	/* And count the leading zeros */
Packit 6c4009
	clz	synd, synd
Packit 6c4009
	/* Compute the potential result */
	/* The syndrome has two bits per byte, hence the shift by one to
	   turn the bit index into a byte offset.  */
Packit 6c4009
	add	result, src, synd, lsr #1
Packit 6c4009
	/* Select result or NULL */
Packit 6c4009
	csel	result, xzr, result, eq
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	/* cntin was zero: return 0 without reading any memory.  */
L(zero_length):
Packit 6c4009
	mov	result, #0
Packit 6c4009
	ret
Packit 6c4009
END (__memchr)
Packit 6c4009
/* NOTE(review): weak_alias and libc_hidden_builtin_def are macros defined
   outside this file.  Presumably the first publishes __memchr under the
   name memchr as a weak symbol and the second sets up the hidden alias
   used for calls from inside the library; confirm against their
   definitions.  */
weak_alias (__memchr, memchr)
Packit 6c4009
libc_hidden_builtin_def (memchr)