Blame sysdeps/powerpc/powerpc64/power8/strncmp.S

/* Optimized strncmp implementation for PowerPC64/POWER8.
   Copyright (C) 2015-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Name of the entry point defined below.  A file that defines STRNCMP
   before this point gets that name instead of plain strncmp.  */
#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K as default, the page cross handling assumes minimum page size of
   4k.

   Register usage inside the function:

     r3, r4   current s1 / s2 pointers
     r10      number of bytes that may still be compared
     r7, r9   data loaded from s1 / s2 (doubleword or single byte);
	      r9 also carries the return value into L(ret1)
     r6       cmpb result: 0xff in every byte where s1 and s2 agree
     r8       0xff in every byte where s1 holds a NUL or the sources
	      differ; also the s1 byte cursor in L(loop_ne_align_byte)
     r5       scratch; holds zero inside the doubleword loops
     r11      scratch (little-endian result extraction only)

   The body writes only r3-r11, cr0, cr5, cr7 and CTR and does not touch
   the stack.  The r<N> and cr<N> operand names are expected to be
   provided by <sysdep.h>.  */

	.machine  power7
ENTRY_TOCLESS (STRNCMP, 4)
	/* Check if size is 0.  r10 becomes the working copy of n.  */
	mr.	r10,r5
	beq	cr0,L(ret0)

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  r8 keeps the
	   page offset of s1 because L(pagecross) consumes it.  */
	rldicl	r8,r3,0,52
	cmpldi	cr7,r8,4096-16
	bgt	cr7,L(pagecross)
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4096-16
	bgt	cr7,L(pagecross)

	/* For short string up to 16 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */
	ld	r7,0(r3)
	ld	r9,0(r4)
	li	r8,0
	cmpb	r8,r7,r8	/* 0xff where s1 has a NUL byte.  */
	cmpb	r6,r7,r9	/* 0xff where s1 and s2 bytes are equal.  */
	orc.	r8,r8,r6	/* NUL in s1, or bytes differ.  */
	bne	cr0,L(different1)

	/* If the string compared are equal, but size is less or equal
	   to 8, return 0.  */
	cmpldi	cr7,r10,8
	li	r9,0
	ble	cr7,L(ret1)
	addi	r5,r10,-8	/* Size left for the second doubleword.  */

	/* r8 is zero here (the orc. above produced zero), so it can be
	   used again as the NUL pattern for cmpb.  */
	ld	r7,8(r3)
	ld	r9,8(r4)
	cmpb	r8,r7,r8
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different0)

	/* Both doublewords matched; r8 is zero and becomes the return
	   value if no more than 16 bytes were requested.  */
	cmpldi	cr7,r5,8
	mr	r9,r8
	ble	cr7,L(ret1)

	/* Update pointers and size.  */
	addi	r10,r10,-16
	addi	r3,r3,16
	addi	r4,r4,16

	/* Now it has checked for first 16 bytes, align source1 to doubleword
	   and adjust source2 address.  Both pointers step back by the
	   misalignment of source1 (r5) and the size grows by the same
	   amount.  */
L(align_8b):
	rldicl	r5,r3,0,61
	rldicr	r3,r3,0,60
	subf	r4,r5,r4
	add	r10,r10,r5

	/* At this point, source1 alignment is 0 and source2 alignment is
	   between 0 and 7.  Check if source2 alignment is 0, meaning both
	   sources have the same alignment.  */
	andi.	r8,r4,0x7
	beq	cr0,L(loop_eq_align_0)

	li	r5,0		/* NUL pattern for cmpb in the loop.  */
	b	L(loop_ne_align_1)

	/* If source2 is unaligned to doubleword, the code needs to check
	   on each iteration if the unaligned doubleword access will cross
	   a 4k page boundary.  */
	.align 4
L(loop_ne_align_0):
	ld	r7,0(r3)
	ld	r9,0(r4)
	cmpb	r8,r7,r5
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different1)

	cmpldi	cr7,r10,8
	ble	cr7,L(ret0)
	addi	r10,r10,-8
	addi	r3,r3,8
	addi	r4,r4,8
L(loop_ne_align_1):
	/* A doubleword load from source2 is used only while its page
	   offset is at most 4088.  */
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4088
	ble	cr7,L(loop_ne_align_0)
	cmpdi	cr7,r10,0
	beq	cr7,L(ret0)

	/* Compare the first byte of this doubleword on its own.  */
	lbz	r9,0(r3)
	lbz	r8,0(r4)
	cmplw	cr7,r9,r8
	bne	cr7,L(byte_ne_4)
	cmpdi	cr7,r9,0
	beq	cr7,L(size_reached_0)

	/* Set up the remaining 7 byte compares: CTR = 7, r8 walks
	   source1 byte by byte while r3 already points to the next
	   doubleword.  */
	li	r9,7
	addi	r8,r3,1
	mtctr	r9
	addi	r4,r4,1
	addi	r10,r10,-1
	addi	r3,r3,8

	/* The unaligned read of source2 will cross a 4K page boundary,
	   and the different byte or NULL may be in the remaining page
	   bytes.  Since it can not use the unaligned load the algorithm
	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
	.align 4
L(loop_ne_align_byte):
	/* cr7 records whether the size was already exhausted before r10
	   is decremented for the byte about to be read.  */
	cmpdi	cr7,r10,0
	addi	r10,r10,-1
	beq	cr7,L(ret0)
	lbz	r9,0(r8)
	lbz	r7,0(r4)
	addi	r8,r8,1
	addi	r4,r4,1
	cmplw	cr7,r9,r7
	cmpdi	cr5,r9,0
	bne	cr7,L(size_reached_2)
	beq	cr5,L(size_reached_0)
	bdnz	L(loop_ne_align_byte)

	/* All 8 bytes matched; resume with doubleword loads unless the
	   size is exhausted, in which case fall through and return 0.  */
	cmpdi	cr7,r10,0
	bne+	cr7,L(loop_ne_align_0)

	.align 4
L(ret0):
	li	r9,0
	/* Common exit: the result is expected in r9.  */
L(ret1):
	mr	r3,r9
	blr

	/* A doubleword pair differs or source1 contains a NUL.  On entry
	   to L(different1):

	     r7  (r1 below) = doubleword from source1
	     r9  (r2 below) = doubleword from source2
	     r8  (z1 below) = 0xff in each byte with a NUL or a mismatch
	     r10 (n below)  = size left, counted from the first byte of
			      this doubleword

	   L(different0) first copies that size from r5.  The code picks
	   the first flagged byte, limited to the last byte allowed by the
	   size, and returns the difference of that byte pair:

	#ifdef __LITTLE_ENDIAN__
	  leadzero = (__builtin_ffsl (z1) - 1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> leadzero) & 0xFFUL;
	  r2 = (r2 >> leadzero) & 0xFFUL;
	#else
	  leadzero = __builtin_clzl (z1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
	#endif
	  return r1 - r2;  */

	.align 4
L(different0):
	mr	r10,r5
#ifdef __LITTLE_ENDIAN__
L(different1):
	neg	r11,r8
	sldi	r10,r10,3
	and	r8,r11,r8	/* Isolate the lowest set bit of the mask.  */
	addi	r10,r10,-8	/* r10 = (n-1)*8.  */
	cntlzd	r8,r8
	subfic	r8,r8,63	/* Bit index of that lowest set bit.  */
	extsw 	r8,r8
	cmpld	cr7,r8,r10
	ble	cr7,L(different2)
	mr	r8,r10		/* Clamp to the last byte within the size.  */
L(different2):
	extsw	r8,r8
#else
L(different1):
	addi	r10,r10,-1
	cntlzd	r8,r8
	sldi	r10,r10,3	/* r10 = (n-1)*8.  */
	cmpld	cr7,r8,r10
	blt	cr7,L(different2)
	mr	r8,r10		/* Clamp to the last byte within the size.  */
L(different2):
	subfic	r8,r8,56
#endif
	/* Shift the selected byte of each source down to bits 0-7 and
	   return source1 byte minus source2 byte.  */
	srd	r7,r7,r8
	srd	r9,r9,r8
	rldicl	r3,r7,0,56
	rldicl	r9,r9,0,56
	subf	r9,r9,r3
	extsw	r9,r9
	mr	r3,r9
	blr

	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
	   a simple byte by byte comparison until the page alignment for s1
	   is reached.  On entry r8 holds the 4K page offset of s1.  */
	.align 4
L(pagecross):
	lbz	r7,0(r3)
	lbz	r9,0(r4)
	subfic	r8,r8,4095	/* Bytes after this one in the s1 page.  */
	cmplw	cr7,r9,r7
	bne	cr7,L(byte_ne_3)
	cmpdi	cr7,r9,0
	beq	cr7,L(byte_ne_0)
	addi	r10,r10,-1
	/* r7 = size left after the first byte minus r8; it is tested for
	   zero once the loop below finishes.  r9 = r10 - r7 equals r8, so
	   CTR is loaded with r8 + 1.  */
	subf	r7,r8,r10
	subf	r9,r7,r10
	addi	r9,r9,1
	mtctr	r9
	b	L(pagecross_loop1)

	.align 4
L(pagecross_loop0):
	/* cr7 was set from r10 in L(pagecross_loop1): stop with 0 once
	   the size is exhausted.  */
	beq	cr7,L(ret0)
	lbz	r9,0(r3)
	lbz	r8,0(r4)
	addi	r10,r10,-1
	cmplw	cr7,r9,r8
	cmpdi	cr5,r9,0
	bne	cr7,L(byte_ne_2)
	beq	cr5,L(byte_ne_0)
L(pagecross_loop1):
	cmpdi	cr7,r10,0
	addi	r3,r3,1
	addi	r4,r4,1
	bdnz	L(pagecross_loop0)
	/* CTR ran out.  Continue with the doubleword code unless r7 is
	   zero, in which case return 0.  */
	cmpdi	cr7,r7,0
	li	r9,0
	bne+	cr7,L(align_8b)
	b	L(ret1)

	/* If both source1 and source2 are doubleword aligned, there is no
	   need for page boundary cross checks.  r8 is zero on entry (result
	   of the andi. in L(align_8b)) and serves as the NUL pattern for
	   the first cmpb.  */
	.align 4
L(loop_eq_align_0):
	ld	r7,0(r3)
	ld	r9,0(r4)
	cmpb	r8,r7,r8
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different1)

	cmpldi	cr7,r10,8
	ble	cr7,L(ret0)
	addi	r9,r10,-9

	/* CTR = ((r10 - 9) >> 3) + 1: number of further doublewords that
	   still contain bytes within the size.  */
	li	r5,0
	srdi	r9,r9,3
	addi	r9,r9,1
	mtctr	r9
	b	L(loop_eq_align_2)

	.align 4
L(loop_eq_align_1):
	bdz	L(ret0)
L(loop_eq_align_2):
	/* ldu advances r3 / r4 by 8 before loading.  */
	ldu	r7,8(r3)
	addi	r10,r10,-8
	ldu	r9,8(r4)
	cmpb	r8,r7,r5
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	beq	cr0,L(loop_eq_align_1)
	b	L(different1)

	/* Byte-wise exits.  L(byte_ne_1) returns r7 - r9 and
	   L(size_reached_1) returns r10 - r9; the stubs in front of them
	   move the source1 byte into r7 / r10 and the source2 byte into
	   r9.  */
	.align 4
L(byte_ne_0):
	li	r7,0
L(byte_ne_1):
	subf	r9,r9,r7
	extsw	r9,r9
	b	L(ret1)

	.align 4
L(byte_ne_2):
	extsw	r7,r9
	mr	r9,r8
	b	L(byte_ne_1)
L(size_reached_0):
	li	r10,0
L(size_reached_1):
	subf	r9,r9,r10
	extsw	r9,r9
	b	L(ret1)
L(size_reached_2):
	extsw	r10,r9
	mr	r9,r7
	b	L(size_reached_1)
L(byte_ne_3):
	extsw	r7,r7
	b	L(byte_ne_1)
L(byte_ne_4):
	extsw	r10,r9
	mr	r9,r8
	b	L(size_reached_1)
END(STRNCMP)
libc_hidden_builtin_def(strncmp)