/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* If the includer has not already chosen a symbol name through STRLEN,
   build the function as __strlen and set DEFINE_STRLEN_HIDDEN_DEF, which
   gates the strlen alias definitions at the end of this file.  */
#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.

   Register usage:
     r3       s on entry, the length on return.
     r4       Cursor: address of the data currently being tested.
     r5, r9   Alignment scratch; r5 also seeds CTR for L(loop).
     r0, r10  Scratch for the final offset computation.
     v18      All bytes 0x00, the byte being searched for.
     v19      All bytes 0xff, filler for bytes that must not match.
     v0-v7, v10, v11  Data and comparison scratch.
   CR0, CR6 and CTR are modified as well.  */

.machine power9
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 2

	/* v18 = 16 x 0x00 (search byte), v19 = 16 x 0xff (filler).  */
	vspltisb  v18,0
	vspltisb  v19,-1

	neg	  r5,r3
	rldicl	  r9,r5,0,60   /* How many bytes to get source 16B aligned?  */

	/* Align data and fill bytes not loaded with non matching char.  */
	lvx	  v0,0,r3
	lvsr	  v1,0,r3
	vperm	  v0,v19,v0,v1

	/* The record form sets CR6; beq cr6 is taken when no byte of v0
	   compared equal to zero.  */
	vcmpequb. v6,v0,v18
	beq	  cr6,L(aligned)

	/* A null byte was found in the first (possibly partial) quadword:
	   the number of bytes preceding it is returned as the length.  */
	vctzlsbb  r3,v6
	blr

	/* Test 64B 16B at a time.  The 64B vector loop is optimized for
	   longer strings.  Likewise, we check a multiple of 64B to avoid
	   breaking the alignment calculation below.  */
L(aligned):
	/* r4 = s + r9, i.e. s rounded up to a 16B boundary.  */
	add	  r4,r3,r9

	/* Determine the number of 48B loops needed for alignment to 64B:
	   r5 = (r4 >> 4) & 3.  And test for zero.  The CR0 result is only
	   consumed by the beq in front of L(loop); no instruction in between
	   writes CR0 (the vector compares below record into CR6).  */
	rldicl.	  r5,r4,60,62

	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	lxv	  v0+32,48(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail4)
	addi	  r4,r4,64

	/* Speculatively generate a fake 16B aligned address to generate the
	   vector byte constant 0,1,..,15 using lvsl during reduction.  */
	li	  r0,0

	/* Skip the alignment if already 64B aligned.  */
	beq	  L(loop_64b)
	mtctr	  r5

	/* Test 48B per iteration until 64B aligned.  Advancing by 48B is the
	   same as stepping back 16B modulo 64B, so r5 iterations clear the
	   (r4 >> 4) & 3 remainder computed above.  */
	.p2align  5
L(loop):
	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	addi	  r4,r4,48
	bdnz	  L(loop)

	/* Main loop: 64B per step, body repeated three times.  r4 is
	   advanced before each exit branch, so on arrival at L(vmx_zero) it
	   points 64B past the block held in v1-v4.  */
	.p2align  5
L(loop_64b):
	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	beq	  cr6,L(loop_64b)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb  v1,v1,v18
	vcmpequb  v2,v2,v18
	vcmpequb  v3,v3,v18
	vcmpequb  v4,v4,v18

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  r0 is still 0 here, so lvsl yields the bytes
	   0,1,..,15; shifting each left by 3 gives 0,8,..,120, the index of
	   the first bit of every byte.  */
	vspltisb  v10,3
	lvsl	  v11,0,r0
	vslb	  v10,v11,v10

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	  v1,v1,v10
	vbpermq	  v2,v2,v10
	vbpermq	  v3,v3,v10
	vbpermq	  v4,v4,v10

	/* Shift each component into its correct position for merging.  */
	vsldoi	  v2,v2,v2,2
	vsldoi	  v3,v3,v3,4
	vsldoi	  v4,v4,v4,6

	/* Merge the results and move to a GPR.  */
	vor	  v1,v2,v1
	vor	  v2,v3,v4
	vor	  v4,v1,v2
	mfvrd	  r10,v4

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	  r4,r4,-64

	cnttzd	  r0,r10           /* Count trailing zeros before the match.  */
	subf	  r5,r3,r4         /* r5 = block start - s.  */
	add	  r3,r5,r0         /* Compute final length.  */
	blr

	/* Exits for the 16B-at-a-time tests.  v6 holds the compare result for
	   the quadword at r4 + 0/16/32/48; the length is the address of the
	   matching byte minus s.  */
L(tail1):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	subf	  r3,r3,r4
	blr

L(tail2):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,16
	subf	  r3,r3,r4
	blr

L(tail3):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,32
	subf	  r3,r3,r4
	blr

L(tail4):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,48
	subf	  r3,r3,r4
	blr

END (STRLEN)

/* Only when this file picked the default __strlen name itself: make
   strlen a weak alias of __strlen and invoke libc_hidden_builtin_def
   for it.  */
#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif