/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* When nothing has defined STRLEN before this point, build the routine
   under the name __strlen and set DEFINE_STRLEN_HIDDEN_DEF so that the
   strlen alias and hidden definition at the end of this file are
   emitted as well.  */
#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */

.machine power9
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 2

	/* Register usage:
	     r3   s on entry; the length on return.
	     r4   address of the quadword/block being examined (16B aligned).
	     r5   scratch: -s, then the 48B-loop trip count, then r4 - s.
	     r9   distance from s to the next 16B boundary (0..15).
	     r0   zero (address operand for lvsl), then the match offset.
	     r10  one bit per byte of the 64B block that holds the null.
	     v18  all bytes 0x00 (the value searched for).
	     v19  all bytes 0xff (filler that can never match).
	   Every vcmpequb. sets CR6: "beq cr6" is taken when no byte matched,
	   "bne cr6" when at least one byte matched.  */

	vspltisb  v18,0
	vspltisb  v19,-1

	neg	  r5,r3
	rldicl	  r9,r5,0,60   /* How many bytes to get source 16B aligned?  */

	/* lvx ignores the low four address bits, so it fetches the aligned
	   quadword that contains s.  The lvsr/vperm pair then shifts that
	   data so s[0] sits in the lowest byte lane and fills the vacated
	   lanes from v19, i.e. with a value that cannot match.  */
	lvx	  v0,0,r3
	lvsr	  v1,0,r3
	vperm	  v0,v19,v0,v1

	vcmpequb. v6,v0,v18
	beq	  cr6,L(aligned)

	/* The null is inside the first quadword; its lane index is the
	   length.  */
	vctzlsbb  r3,v6
	blr

	/* Test 64B 16B at a time.  The 64B vector loop is optimized for
	   longer strings.  Likewise, we check a multiple of 64B to avoid
	   breaking the alignment calculation below.  */
L(aligned):
	add	  r4,r3,r9     /* r4 = first 16B boundary at or after s.  */

	/* r5 = (r4 >> 4) & 3, the position of r4's quadword inside its 64B
	   line.  That is also the number of 48B iterations needed to reach
	   a 64B boundary, because each 48B step moves the address back by
	   one quadword modulo 64.  The record form sets CR0, which is only
	   consumed by the plain "beq" further down.  */
	rldicl.	  r5,r4,60,62

	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	lxv	  v0+32,48(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail4)
	addi	  r4,r4,64

	/* Speculatively generate a fake 16B aligned address to generate the
	   vector byte constant 0,1,..,15 using lvsl during reduction.  */
	li	  r0,0

	/* Skip the alignment if already 64B aligned (CR0 from rldicl.
	   above: r5 == 0).  */
	beq	  L(loop_64b)
	mtctr	  r5

	/* Test 48B per iteration until 64B aligned.  */
	.p2align  5
L(loop):
	lxv	  v0+32,0(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail1)

	lxv	  v0+32,16(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail2)

	lxv	  v0+32,32(r4)
	vcmpequb. v6,v0,v18
	bne	  cr6,L(tail3)

	addi	  r4,r4,48
	bdnz	  L(loop)

	/* Main loop: r4 is 64B aligned from here on.  Each of the three
	   unrolled copies loads one 64B block and takes the unsigned byte
	   minimum across its four quadwords; a lane of that minimum is zero
	   exactly when one of the four source bytes in that lane is zero,
	   so a single compare covers the whole block.  r4 is advanced
	   before the branch, which L(vmx_zero) undoes.  */
	.p2align  5
L(loop_64b):
	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
	lxv	  v2+32,16(r4)
	lxv	  v3+32,32(r4)
	lxv	  v4+32,48(r4)
	vminub	  v5,v1,v2        /* Merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
	beq	  cr6,L(loop_64b)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb  v1,v1,v18
	vcmpequb  v2,v2,v18
	vcmpequb  v3,v3,v18
	vcmpequb  v4,v4,v18

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  lvsl on address 0 (r0 is still zero here)
	   yields the bytes 0,1,..,15; shifting each left by 3 turns them into
	   the bit indices 0,8,..,120, one per byte of the source VR.  */
	vspltisb  v10,3
	lvsl	  v11,0,r0
	vslb	  v10,v11,v10

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	  v1,v1,v10
	vbpermq	  v2,v2,v10
	vbpermq	  v3,v3,v10
	vbpermq	  v4,v4,v10

	/* Shift each component into its correct position for merging: the
	   16-bit masks of quadwords 1, 2 and 3 move up by 2, 4 and 6 bytes so
	   the four masks occupy distinct halfwords.  */
	vsldoi	  v2,v2,v2,2
	vsldoi	  v3,v3,v3,4
	vsldoi	  v4,v4,v4,6

	/* Merge the results and move to a GPR.  */
	vor	  v1,v2,v1
	vor	  v2,v3,v4
	vor	  v4,v1,v2
	mfvrd	  r10,v4

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	  r4,r4,-64

	cnttzd	  r0,r10           /* Count trailing zeros before the match.  */
	subf	  r5,r3,r4         /* r5 = block start - s.  */
	add	  r3,r5,r0         /* Compute final length.  */
	blr

	/* Tails for the 16B-at-a-time checks.  v6 holds the compare result
	   for the quadword at r4 + 0/16/32/48; vctzlsbb gives the index of
	   the first null inside it, and the length is that byte's address
	   minus s.  */
L(tail1):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	subf	  r3,r3,r4
	blr

L(tail2):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,16
	subf	  r3,r3,r4
	blr

L(tail3):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,32
	subf	  r3,r3,r4
	blr

L(tail4):
	vctzlsbb  r0,v6
	add	  r4,r4,r0
	addi	  r4,r4,48
	subf	  r3,r3,r4
	blr

END (STRLEN)

/* Emitted only when this file picked the default name itself (see the
   #ifndef STRLEN block near the top).  weak_alias and
   libc_hidden_builtin_def are macros from headers not shown here;
   presumably they publish strlen as a weak alias of __strlen and provide
   the hidden definition used for calls inside libc.  */
#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif