Blame sysdeps/powerpc/powerpc64/power8/memchr.S

Packit 6c4009
/* Optimized memchr implementation for POWER8.
Packit 6c4009
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */
Packit 6c4009
Packit 6c4009
/* TODO: change these to the actual instructions when the minimum required
Packit 6c4009
   binutils allows it.  */
Packit 6c4009
/* Hand-encoded instruction: emits one 32-bit word made of the base value
   0x7c000167 with V placed by a shift of 21 and R by a shift of 16.
   NOTE(review): presumably "mtvrd" (move GPR R into vector register V);
   confirm the opcode against the Power ISA.  */
#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
Packit 6c4009
/* Hand-encoded instruction: same field layout as MTVRD with base value
   0x7c000067.  NOTE(review): presumably "mfvrd" (move a doubleword of
   vector register V into GPR R); confirm against the Power ISA.  */
#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
Packit 6c4009
/* Hand-encoded three-operand vector instruction: base value 0x1000054c
   with T, A and B placed by shifts of 21, 16 and 11 respectively.
   NOTE(review): presumably "vbpermq"; confirm against the Power ISA.  */
#define VBPERMQ(t, a, b)  .long (0x1000054c \
Packit 6c4009
				| ((t)<<(32-11)) \
Packit 6c4009
				| ((a)<<(32-16)) \
Packit 6c4009
				| ((b)<<(32-21)) )
Packit 6c4009
Packit 6c4009
/* The symbol name of the routine can be chosen by defining MEMCHR before
   this point; it defaults to __memchr.  */
#ifndef MEMCHR
Packit 6c4009
# define MEMCHR __memchr
Packit 6c4009
#endif
Packit 6c4009
/* TODO: change this to .machine power8 when the minimum required binutils
Packit 6c4009
   allows it.  Until then the POWER8-only instructions are emitted through
   the hand-encoded .long macros defined above.  */
Packit 6c4009
	.machine  power7
Packit 6c4009
/* Entry point and scalar head.
   In:  r3 = s, r4 = c, r5 = n.
   Values set up here and used by the labels that follow:
     r4  = c replicated into all 8 bytes
     r8  = address of the doubleword being examined (starts at s & ~7)
     r10 = s + n, forced to all ones if the addition wrapped
     r7  = r10 - 1, later reduced to the address of the last doubleword
     r6  = offset of the last byte of the range inside that doubleword
     r9  = mask clearing cmpb results for the bytes that precede s
   n <= 32 goes to L(small_range).  Otherwise at most two doublewords are
   checked with cmpb until r8 is 16-byte aligned, and r5 is adjusted to
   the number of bytes remaining from r8.  */
ENTRY_TOCLESS (MEMCHR)
Packit 6c4009
	CALL_MCOUNT 3
Packit 6c4009
	dcbt	0, r3
Packit 6c4009
	clrrdi  r8, r3, 3	/* r8 = s rounded down to 8 bytes.  */
Packit 6c4009
	insrdi	r4, r4, 8, 48	/* c now fills the low 2 bytes of r4.  */
Packit 6c4009
Packit 6c4009
	/* Calculate the last acceptable address and check for possible
Packit 6c4009
	   addition overflow by using saturated math:
Packit 6c4009
	   r7 = r3 + r5
Packit 6c4009
	   r7 |= -(r7 < r3)  */
Packit 6c4009
	add     r7, r3, r5
Packit 6c4009
	subfc   r6, r3, r7	/* Carry is clear only if r7 < r3 (wrapped).  */
Packit 6c4009
	subfe   r9, r9, r9	/* r9 = 0 without wrap, -1 with wrap.  */
Packit 6c4009
	extsw   r6, r9
Packit 6c4009
	or      r7, r7, r6
Packit 6c4009
Packit 6c4009
	insrdi	r4, r4, 16, 32	/* c now fills the low 4 bytes of r4.  */
Packit 6c4009
	cmpldi	r5, 32		/* cr0 is consumed by the ble below.  */
Packit 6c4009
	li	r9, -1
Packit 6c4009
	rlwinm	r6, r3, 3, 26, 28 /* Calculate padding: r6 = (s & 7) * 8.  */
Packit 6c4009
	insrdi  r4, r4, 32, 0	/* c now fills all 8 bytes of r4.  */
Packit 6c4009
	mr	r10, r7		/* r10 = end of range (one past last byte).  */
Packit 6c4009
	addi	r7, r7, -1	/* r7 = address of the last byte in range.  */
Packit 6c4009
	/* Shift the all-ones mask so the bits belonging to the bytes in
	   front of s become zero.  */
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	sld	r9, r9, r6
Packit 6c4009
#else
Packit 6c4009
	srd	r9, r9, r6
Packit 6c4009
#endif
Packit 6c4009
	ble	L(small_range)	/* n <= 32.  */
Packit 6c4009
	andi.	r11, r3, 63
Packit 6c4009
	beq	cr0, L(align_qw) /* s is already 64-byte aligned.  */
Packit 6c4009
	clrldi	r11, r3, 61	/* r11 = s & 7.  */
Packit 6c4009
	ld	r12, 0(r8)     /* Load doubleword from memory.  */
Packit 6c4009
	cmpb	r3, r12, r4     /* Check for BYTEs in DWORD1.  */
Packit 6c4009
	and	r3, r3, r9	/* Ignore hits in front of s.  */
Packit 6c4009
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
Packit 6c4009
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
Packit 6c4009
	cmpldi	cr7, r3, 0      /* Does r3 indicate we got a hit?  */
Packit 6c4009
	bne	cr7, L(done)
Packit 6c4009
	addi	r8, r8, 8
Packit 6c4009
	/* r5 = n - 8 + (s & 7): bytes remaining counted from the new r8.  */
	addi	r5, r5, -8
Packit 6c4009
	add	r5, r5, r11
Packit 6c4009
Packit 6c4009
	/* Are we now aligned to a quadword boundary?  */
Packit 6c4009
	andi.	r11, r8, 15
Packit 6c4009
	beq	cr0, L(align_qw)
Packit 6c4009
Packit 6c4009
	/* Handle DWORD to make it QW aligned.  */
Packit 6c4009
	ld	r12, 0(r8)
Packit 6c4009
	cmpb	r3, r12, r4
Packit 6c4009
	cmpldi	cr7, r3, 0
Packit 6c4009
	bne	cr7, L(done)
Packit 6c4009
	addi	r5, r5, -8
Packit 6c4009
	addi	r8, r8, 8
Packit 6c4009
	/* At this point, r8 is 16B aligned.  */
Packit 6c4009
/* Vector path.  On entry r8 is 16-byte aligned, r5 holds the number of
   bytes remaining from r8 and r4 holds c in every byte.  Sets up:
     v0  = all zero bytes
     v1  = c in every byte
     v10 = bit indices 0, 8, ..., 120 used by VBPERMQ in L(found) and
	   L(found_16B)
   then examines up to three quadwords until r8 is 64-byte aligned.  */
L(align_qw):
Packit 6c4009
	vspltisb	v0, 0
Packit 6c4009
	/* Precompute vbpermq constant.  */
Packit 6c4009
	vspltisb	v10, 3		/* Shift count of 3 in each byte.  */
Packit 6c4009
	li	r0, 0
Packit 6c4009
	lvsl	v11, r0, r0	/* Zero address: v11 = bytes 0, 1, ..., 15.  */
Packit 6c4009
	vslb	v10, v11, v10	/* v10 = bytes 0, 8, ..., 120.  */
Packit 6c4009
	MTVRD(v1, r4)
Packit 6c4009
	vspltb	v1, v1, 7	/* Copy byte 7 of v1 into every byte.  */
Packit 6c4009
	cmpldi	r5, 64
Packit 6c4009
	ble	L(tail64)
Packit 6c4009
	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
Packit 6c4009
	   Note: aligning to 64-byte will necessarily slow down performance for
Packit 6c4009
	   strings around 64 bytes in length due to the extra comparisons
Packit 6c4009
	   required to check alignment for the vectorized loop.  This is a
Packit 6c4009
	   necessary tradeoff we are willing to take in order to speed up the
Packit 6c4009
	   calculation for larger strings.  */
Packit 6c4009
	andi.	r11, r8, 63
Packit 6c4009
	beq	cr0, L(preloop_64B)
Packit 6c4009
	/* In order to begin the 64B loop, it needs to be 64
Packit 6c4009
	   bytes aligned.  So read until it is 64B aligned.  */
Packit 6c4009
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4	/* v6 = 0xff where a byte equals c.  */
Packit 6c4009
	/* Compare v6 with zero; cr6 "not less than" means that not every
	   byte of v6 is zero, i.e. at least one byte matched.  */
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
Packit 6c4009
	/* Second quadword, same test as above.  */
	andi.	r11, r8, 63
Packit 6c4009
	beq	cr0, L(preloop_64B)
Packit 6c4009
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4
Packit 6c4009
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
Packit 6c4009
	/* Third quadword, same test as above.  */
	andi.	r11, r8, 63
Packit 6c4009
	beq	cr0, L(preloop_64B)
Packit 6c4009
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4
Packit 6c4009
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	/* At this point it should be 64B aligned.
Packit 6c4009
	   Prepare for the 64B loop.  */
Packit 6c4009
/* 64-byte main loop.  On entry r8 is the current address, r5 the number
   of bytes remaining from r8, r10 the end of the range and v0/v1 the
   zero and c-splat vectors.  r11, r9 and r7 are reused here as the
   constant offsets 16, 32 and 48.  Each iteration compares four
   quadwords against v1 and leaves the per-quadword results in v6-v9
   for L(found).  When the counter runs out, r5 is set to the low six
   bits of (r10 - r8 at loop entry) and control falls into L(tail64).  */
L(preloop_64B):
Packit 6c4009
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
Packit 6c4009
	ble	L(tail64)
Packit 6c4009
	sub	r6, r10, r8	/* r6 = end of range - current address.  */
Packit 6c4009
	srdi	r9, r6, 6	/* Number of loop iterations.  */
Packit 6c4009
	mtctr	r9		/* Setup the counter.  */
Packit 6c4009
	li	r11, 16		/* Load required offsets.  */
Packit 6c4009
	li	r9, 32
Packit 6c4009
	li	r7, 48
Packit 6c4009
Packit 6c4009
	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
Packit 6c4009
	.align 4
Packit 6c4009
L(loop):
Packit 6c4009
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
Packit 6c4009
	lvx	v3, r8, r11
Packit 6c4009
	/* The base address operand of lvx is a GPR, so it must be named
	   r8 (not v8) like the two loads above.  */
	lvx	v4, r8, r9
Packit 6c4009
	lvx	v5, r8, r7
Packit 6c4009
	vcmpequb	v6, v1, v2	/* 0xff in each byte equal to c.  */
Packit 6c4009
	vcmpequb	v7, v1, v3
Packit 6c4009
	vcmpequb	v8, v1, v4
Packit 6c4009
	vcmpequb	v9, v1, v5
Packit 6c4009
	vor	v11, v6, v7
Packit 6c4009
	vor	v12, v8, v9
Packit 6c4009
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
Packit 6c4009
	/* cr6 "not less than" means the merged result is not all zero,
	   i.e. some byte in this 64-byte block matched.  */
	vcmpequb.	v11, v0, v11
Packit 6c4009
	bnl	cr6, L(found)
Packit 6c4009
	addi	r8, r8, 64	/* Adjust address for the next iteration.  */
Packit 6c4009
	bdnz	L(loop)
Packit 6c4009
	clrldi	r5, r6, 58	/* r5 = r6 & 63: bytes left for the tail.  */
Packit 6c4009
Packit 6c4009
	/* Handle remainder of 64B loop or r5 <= 64.
	   On entry r8 is the 16-byte aligned current address and r5 the
	   number of bytes remaining from r8.  Up to four quadwords are
	   examined; after each miss the search stops with NULL once
	   r5 <= 16, otherwise r5 is reduced by 16.  A hit is handed to
	   L(found_16B), which checks the match offset against r5.  */
Packit 6c4009
	.align	4
Packit 6c4009
L(tail64):
Packit 6c4009
	cmpldi	r5, 0
Packit 6c4009
	beq	L(null)
Packit 6c4009
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4	/* 0xff in each byte equal to c.  */
Packit 6c4009
	vcmpequb.	v11, v0, v6	/* cr6: is v6 all zero?  */
Packit 6c4009
	bnl	cr6, L(found_16B)	/* Taken when some byte matched.  */
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	cmpldi	cr6, r5, 16
Packit 6c4009
	ble	cr6, L(null)	/* Nothing left beyond this quadword.  */
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
Packit 6c4009
	/* Second quadword.  */
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4
Packit 6c4009
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	cmpldi	cr6, r5, 16
Packit 6c4009
	ble	cr6, L(null)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
Packit 6c4009
	/* Third quadword.  */
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4
Packit 6c4009
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	addi	r8, r8, 16
Packit 6c4009
	cmpldi	cr6, r5, 16
Packit 6c4009
	ble	cr6, L(null)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
Packit 6c4009
	/* Fourth and last quadword; a miss here means no match at all.  */
	lvx	v4, 0, r8
Packit 6c4009
	vcmpequb	v6, v1, v4
Packit 6c4009
	vcmpequb.	v11, v0, v6
Packit 6c4009
	bnl	cr6, L(found_16B)
Packit 6c4009
	li	r3, 0
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	/* Found a match in 64B loop.
	   On entry r8 is the address of the 64-byte block, v6-v9 hold the
	   byte-compare results of its four quadwords and v10 holds the
	   VBPERMQ bit indices.  Returns the address of the first matching
	   byte in r3.  No range check is made here.  */
Packit 6c4009
	.align	4
Packit 6c4009
L(found):
Packit 6c4009
	/* Permute the first bit of each byte into bits 48-63.  */
Packit 6c4009
	VBPERMQ(v6, v6, v10)
Packit 6c4009
	VBPERMQ(v7, v7, v10)
Packit 6c4009
	VBPERMQ(v8, v8, v10)
Packit 6c4009
	VBPERMQ(v9, v9, v10)
Packit 6c4009
	/* Shift each component into its correct position for merging, so
	   that the four 16-bit masks do not overlap.  */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	vsldoi	v7, v7, v7, 2
Packit 6c4009
	vsldoi	v8, v8, v8, 4
Packit 6c4009
	vsldoi	v9, v9, v9, 6
Packit 6c4009
#else
Packit 6c4009
	vsldoi	v6, v6, v6, 6
Packit 6c4009
	vsldoi	v7, v7, v7, 4
Packit 6c4009
	vsldoi	v8, v8, v8, 2
Packit 6c4009
#endif
Packit 6c4009
	/* Merge the results and move to a GPR.  */
Packit 6c4009
	vor	v11, v6, v7
Packit 6c4009
	vor	v4, v9, v8
Packit 6c4009
	vor	v4, v11, v4
Packit 6c4009
	MFVRD(r5, v4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	/* (r5 - 1) & ~r5 has a one for every zero bit below the lowest
	   set bit of r5, so its population count is the number of
	   trailing zeros.  */
	addi	r6, r5, -1
Packit 6c4009
	andc	r6, r6, r5
Packit 6c4009
	popcntd	r6, r6
Packit 6c4009
#else
Packit 6c4009
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
Packit 6c4009
#endif
Packit 6c4009
	add	r3, r8, r6	/* Compute the address of the match.  */
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	/* Found a match in a single quadword.
	   On entry r8 is the address of the quadword, v6 holds its
	   byte-compare result, v10 the VBPERMQ bit indices and r5 the
	   number of bytes remaining from r8.  Returns the address of the
	   first matching byte, or NULL if that byte lies at or beyond
	   offset r5.  */
Packit 6c4009
	.align	4
Packit 6c4009
L(found_16B):
Packit 6c4009
	/* Permute the first bit of each byte into bits 48-63.  */
Packit 6c4009
	VBPERMQ(v6, v6, v10)
Packit 6c4009
	/* Move the 16-bit mask to a GPR and find the first set bit.  */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	MFVRD(r7, v6)
Packit 6c4009
	/* Population count of (r7 - 1) & ~r7 = number of trailing
	   zeros of r7.  */
	addi	r6, r7, -1
Packit 6c4009
	andc	r6, r6, r7
Packit 6c4009
	popcntd	r6, r6
Packit 6c4009
#else
Packit 6c4009
	vsldoi	v6, v6, v6, 6
Packit 6c4009
	MFVRD(r7, v6)
Packit 6c4009
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
Packit 6c4009
#endif
Packit 6c4009
	add	r3, r8, r6	/* Compute the address of the match.  */
Packit 6c4009
	cmpld	r6, r5		/* Is the match inside the range?  */
Packit 6c4009
	bltlr			/* Yes: return its address.  */
Packit 6c4009
	li	r3, 0		/* No: the match is past the end.  */
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
	/* r3 has the output of the cmpb instruction, that is, it contains
Packit 6c4009
	   0xff in the same position as BYTE in the original
Packit 6c4009
	   doubleword from the string.  Use that to calculate the pointer.
Packit 6c4009
	   We need to make sure BYTE is *before* the end of the range.
	   Other inputs: r8 = address of the doubleword that was compared,
	   r7 = address of the last doubleword of the range, r6 = offset
	   of the last byte of the range inside that doubleword.  */
Packit 6c4009
L(done):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	addi	r0, r3, -1
Packit 6c4009
	andc	r0, r0, r3
Packit 6c4009
	popcntd	r0, r0	      /* Count trailing zeros.  */
Packit 6c4009
#else
Packit 6c4009
	cntlzd	r0, r3	      /* Count leading zeros before the match.  */
Packit 6c4009
#endif
Packit 6c4009
	cmpld	r8, r7         /* Are we on the last dword?  */
Packit 6c4009
	srdi	r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
Packit 6c4009
	add	r3, r8, r0	/* Candidate result: address of the match.  */
Packit 6c4009
	cmpld	cr7, r0, r6     /* If on the last dword, check byte offset.  */
Packit 6c4009
	bnelr			/* Not the last dword: match is in range.  */
Packit 6c4009
	blelr	cr7		/* Last dword and offset <= r6: in range.  */
Packit 6c4009
	li	r3, 0		/* Match lies past the end of the range.  */
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	.align	4
Packit 6c4009
	/* Common "not found" exit: return a null pointer in r3.  */
L(null):
Packit 6c4009
	li	r3, 0
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Deals with size <= 32.
   On entry r8 = s rounded down to 8 bytes, r4 = c in every byte,
   r9 = mask clearing the bytes in front of s, r7 = address of the last
   byte of the range and r5 = n.  Up to five doublewords are compared
   with cmpb.  A hit goes to L(done); reaching the last doubleword
   without a hit returns r3, which is zero at that point because it
   holds the empty cmpb result.  */
Packit 6c4009
	.align	4
Packit 6c4009
L(small_range):
Packit 6c4009
	cmpldi	r5, 0
Packit 6c4009
	beq	L(null)
Packit 6c4009
	ld	r12, 0(r8)     /* Load doubleword from memory.  */
Packit 6c4009
	cmpb	r3, r12, r4     /* Check for BYTE in DWORD1.  */
Packit 6c4009
	and	r3, r3, r9	/* Ignore hits in front of s.  */
Packit 6c4009
	cmpldi	cr7, r3, 0
Packit 6c4009
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
Packit 6c4009
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
Packit 6c4009
	cmpld	r8, r7         /* Are we done already?  */
Packit 6c4009
	bne	cr7, L(done)
Packit 6c4009
	beqlr			/* Last dword, no hit: r3 is 0.  */
Packit 6c4009
Packit 6c4009
	ldu	r12, 8(r8)	/* Advance r8 by 8 and load.  */
Packit 6c4009
	cmpb	r3, r12, r4
Packit 6c4009
	cmpldi	cr6, r3, 0
Packit 6c4009
	cmpld	r8, r7
Packit 6c4009
	bne	cr6, L(done)   /* Found something.  */
Packit 6c4009
	beqlr		      /* Hit end of string (length).  */
Packit 6c4009
Packit 6c4009
	ldu	r12, 8(r8)
Packit 6c4009
	cmpb	r3, r12, r4
Packit 6c4009
	cmpldi	cr6, r3, 0
Packit 6c4009
	cmpld	r8, r7
Packit 6c4009
	bne	cr6, L(done)
Packit 6c4009
	beqlr
Packit 6c4009
Packit 6c4009
	ldu	r12, 8(r8)
Packit 6c4009
	cmpb	r3, r12, r4
Packit 6c4009
	cmpldi	cr6, r3, 0
Packit 6c4009
	cmpld	r8, r7
Packit 6c4009
	bne	cr6, L(done)
Packit 6c4009
	beqlr
Packit 6c4009
Packit 6c4009
	/* Fifth doubleword: no end check is made after it, so a miss
	   returns the zero left in r3 by cmpb.  */
	ldu	r12, 8(r8)
Packit 6c4009
	cmpb	r3, r12, r4
Packit 6c4009
	cmpldi	cr6, r3, 0
Packit 6c4009
	bne	cr6, L(done)
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Close the function opened by ENTRY_TOCLESS (MEMCHR).  */
END (MEMCHR)
Packit 6c4009
/* NOTE(review): the alias names __memchr directly instead of MEMCHR;
   presumably any file that overrides MEMCHR also redefines the two
   macros below -- confirm against the includers.  */
weak_alias (__memchr, memchr)
Packit 6c4009
libc_hidden_builtin_def (memchr)