/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* A file that includes this one may pre-define STRLEN to choose the
   entry-point name.  Otherwise the routine is built as __strlen and
   DEFINE_STRLEN_HIDDEN_DEF is set, which enables the alias block that
   follows the routine.  */
#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   Register roles, as used by the code below:

     r3		in: s; out: length of the string.
     r4		address of the 16B quadword / 64B block being examined.
     r9		bytes from s up to the next 16B boundary (0..15).
     r5		number of 48B passes needed to reach 64B alignment;
		later scratch for the final subtraction.
     r0		index of the terminator inside a quadword/block; holds 0
		on entry to the 64B loop so lvsl can build 0,1,..,15.
     r10	64-bit match mask for one 64B block.
     v18	sixteen 0x00 bytes (the byte searched for).
     v19	sixteen 0xff bytes (padding that can never match).
     v0-v7, v10, v11   scratch.
     cr0, cr6, ctr     written.

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */

.machine power9
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 2

	vspltisb  v18,0
	vspltisb  v19,-1

	neg	  r5,r3
	rldicl	  r9,r5,0,60   /* How many bytes to get source 16B aligned?  */

	/* Align data and fill bytes not loaded with non matching char.
	   lvx ignores the low four address bits, so it reads the 16B-aligned
	   quadword that contains s.  lvsr + vperm then drop the bytes in
	   front of s and pad with 0xff from v19, so any zero byte seen here
	   belongs to the string and vctzlsbb yields its index directly.  */
	lvx	  v0,0,r3
	lvsr	  v1,0,r3
	vperm	  v0,v19,v0,v1

	vcmpequb. v6,v0,v18
	beq	  cr6,L(aligned)   /* No byte compared equal to zero.  */

	vctzlsbb  r3,v6		   /* Index of the terminator == length.  */
	blr

	/* Test 64B 16B at a time.  The 64B vector loop is optimized for
	   longer strings.  Likewise, we check a multiple of 64B to avoid
	   breaking the alignment calculation below.

	   r4 = s + r9 is the first 16B-aligned address at or after s.  When
	   s is already 16B aligned r9 is 0 and the first quadword is simply
	   examined a second time.  */
L(aligned):
	add	  r4,r3,r9
	rldicl.	  r5,r4,60,62  /* Determine the number of 48B loops needed for
				  alignment to 64B.  And test for zero.
				  r5 = (r4 >> 4) & 3.  The CR0 result is
				  consumed by the beq after the 64B check;
				  the vector compares in between only write
				  CR6.  */

	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	lxv	  v0+32,48(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail4)
	addi	  r4,r4,64

	/* Speculatively generate a fake 16B aligned address to generate the
	   vector byte constant 0,1,..,15 using lvsl during reduction.  */
	li	  r0,0

	/* Skip the alignment if already 64B aligned.  */
	beq	  L(loop_64b)
	mtctr	  r5

	/* Test 48B per iteration until 64B aligned.  r4 is 16 * r5 bytes
	   past a 64B boundary (adding 64 above did not change that), so r5
	   passes of 48B bring the total to 64 * r5.  */
	.p2align  5
L(loop):
	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	addi	  r4,r4,48
	bdnz	  L(loop)

	/* Main loop: one 64B block per step, unrolled three times.  The
	   byte-wise unsigned minimum of the four quadwords contains a zero
	   byte exactly when one of them does, so a single compare covers the
	   whole block.  r4 is advanced before each branch, so on arrival at
	   L(vmx_zero) it points one block past the data held in v1-v4.  */
	.p2align  5
L(loop_64b):
	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	beq	  cr6,L(loop_64b)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb  v1,v1,v18
	vcmpequb  v2,v2,v18
	vcmpequb  v3,v3,v18
	vcmpequb  v4,v4,v18

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  r0 is still 0 here, so lvsl yields the bytes
	   0,1,..,15; shifting each left by 3 gives the bit indices
	   0,8,..,120, i.e. the first bit of every byte.  */
	vspltisb  v10,3
	lvsl	  v11,0,r0
	vslb	  v10,v11,v10

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	  v1,v1,v10
	vbpermq	  v2,v2,v10
	vbpermq	  v3,v3,v10
	vbpermq	  v4,v4,v10

	/* Shift each component into its correct position for merging: the
	   16-bit masks of quadwords 1, 2 and 3 move up by 2, 4 and 6 bytes
	   so the four masks occupy disjoint halfwords of one doubleword.  */
	vsldoi	  v2,v2,v2,2
	vsldoi	  v3,v3,v3,4
	vsldoi	  v4,v4,v4,6

	/* Merge the results and move to a GPR.  */
	vor	  v1,v2,v1
	vor	  v2,v3,v4
	vor	  v4,v1,v2
	mfvrd	  r10,v4

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	  r4,r4,-64

	cnttzd	  r0,r10           /* Count trailing zeros before the match.  */
	subf	  r5,r3,r4         /* r5 = block address - s.  */
	add	  r3,r5,r0         /* Compute final length.  */
	blr

	/* L(tailN): the terminator is in the Nth quadword starting at r4 and
	   v6 holds that quadword's compare result.  The length is
	   r4 + 16 * (N - 1) + index of the match - s.  */
L(tail1):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	subf	  r3,r3,r4
	blr

L(tail2):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,16
	subf	  r3,r3,r4
	blr

L(tail3):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,32
	subf	  r3,r3,r4
	blr

L(tail4):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,48
	subf	  r3,r3,r4
	blr

END (STRLEN)

/* Emitted only when this file selected the default name __strlen (which
   is when DEFINE_STRLEN_HIDDEN_DEF gets defined): expose the routine
   under the name strlen.  weak_alias and libc_hidden_builtin_def are
   macros defined outside this file -- presumably supplied through
   <sysdep.h>; confirm before relying on their exact expansion.  */
#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif