/* memchr - find a character in a memory zone

   Copyright (C) 2015-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.  */

/* Incoming arguments: start address, byte to search for (only its low
   8 bits are replicated into the vector lanes) and byte count.  cntin
   is also used as the running count of bytes still to be examined.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

/* Returned pointer (shares x0 with srcin): address of the first match,
   or 0 when no match lies inside the requested range.  */
#define result		x0

/* General-purpose scratch registers:
     src     32-byte aligned chunk pointer, post-incremented by each load
     tmp     scratch for count adjustment and shift amounts
     wtmp2   holds the 0x40100401 lane-mask constant
     synd    64-bit syndrome of the current chunk (two bits per byte)
     soff    srcin & 31, offset of the start inside its aligned chunk
     cntrem  cntin & 31, sampled before cntin is modified  */
#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

/* Vector registers:
     vrepchr           searched byte replicated into all 16 byte lanes
     vdata1/vdata2     the two 16-byte halves of the current chunk
     vhas_chr1/2       per-byte comparison results for each half
     vrepmask          0x40100401 replicated into all four 32-bit lanes
     vend              pairwise-reduction result (syndrome / fast check)  */
#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */

/* __memchr (exported as memchr through the weak alias below).

   In:    srcin  (x0) = address of the first byte to examine
          chrin  (w1) = byte to look for; only its low 8 bits are used,
                        because it is replicated with DUP into .16b lanes
          cntin  (x2) = number of bytes to examine
   Out:   result (x0) = address of the first byte equal to chrin, or 0
                        when there is none in [srcin, srcin + cntin)
   Uses:  src, tmp, wtmp2, synd, soff, cntrem, vrepchr, vdata1, vdata2,
          vhas_chr1, vhas_chr2, vrepmask, vend and the condition flags;
          cntin is modified.  No stack is used and nothing is called.

   Memory is always read as whole 32-byte chunks aligned to 32 bytes, so
   bytes outside the requested range that share the first or the last
   chunk are loaded too.  Their syndrome bits are cleared before the
   syndrome is inspected.  */
ENTRY (__memchr)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.  Replicated as 32-bit lanes it gives the byte
	 * pattern 0x01, 0x04, 0x10, 0x40: ANDed with the 0x00/0xff compare
	 * results and reduced with two pairwise byte additions, four source
	 * bytes collapse into one syndrome byte with bits 0, 2, 4 and 6.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	/* Z is set here when srcin is already aligned; it is consumed by the
	   b.eq below (the plain AND in between leaves the flags alone).  */
	ands	soff, srcin, #31
	/* Sample cntin % 32 now, before cntin starts being decremented.  */
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* cntin -= (32 - soff), the number of requested bytes covered by this
	   first chunk.  The flags set here survive until the b.ls below: no
	   instruction in between writes them.  */
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits, i.e. the bytes located before srcin */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last: taken when the requested
	   range does not extend past this chunk (flags from the adds).  */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* Flags from this subs are consumed by the b.ls just below and, when
	   L(end) is reached, by the b.hi there as well.  */
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition: any non-zero
	   compare byte makes the folded 64-bit value non-zero.  */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block.  The flags are still
	   those of the subs in the loop: "hi" means data remains after this
	   chunk, so every byte of it belongs to the requested range.  */
	b.hi	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits, i.e. the
	   bytes of this chunk located past the end of the requested range.
	   When (cntrem + soff) % 32 is 0 the computed amount is 64; register
	   shifts use the amount modulo 64, so nothing is cleared, which is
	   right because the range then ends exactly on the chunk boundary.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character (a zero syndrome stays zero
	   after the bit reversal) */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result: two syndrome bits per byte, hence
	   the byte index is the zero count divided by two */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret
END (__memchr)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)