Blame sysdeps/powerpc/powerpc64/le/power9/strncmp.S

Packit 6c4009
/* Optimized strncmp implementation for PowerPC64/POWER9.
Packit 6c4009
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Implements the function
Packit 6c4009
Packit 6c4009
   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
Packit 6c4009
Packit 6c4009
   The implementation uses unaligned doubleword access to avoid specialized
Packit 6c4009
   code paths depending on data alignment for first 32 bytes and uses
Packit 6c4009
   vectorised loops after that.  */
Packit 6c4009
Packit 6c4009
#ifndef STRNCMP
Packit 6c4009
# define STRNCMP strncmp
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* TODO: Change this to actual instructions when minimum binutils is upgraded
Packit Service e01443
   to 2.27.  Macros are defined below for these newer instructions in order
Packit 6c4009
   to maintain compatibility.  */
/* Each macro below emits one raw 32-bit instruction word with .long: a
   fixed base value OR-ed with the operands shifted into place (first
   operand to bit 21, the others to bit 16 and/or bit 11, counting from
   the least significant bit).
   NOTE(review): the call sites pass rN/vN names, so those names must
   expand to plain register numbers for the shifts to assemble -- confirm
   against sysdep.h.  */
/* VCTZLSBB(r,v): used as VCTZLSBB(r6, v7) in L(different) to turn the
   compare result in v7 into the byte index kept in r6 (presumably the
   ISA 3.0 vctzlsbb instruction -- TODO confirm the encoding).  */
Packit Service e01443
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))
Packit 6c4009
/* VEXTUBRX(t,a,b): used in L(different) to fetch, from vector B, the byte
   selected by the index held in GPR A into GPR T (presumably the ISA 3.0
   vextubrx instruction -- TODO confirm the encoding).  */
Packit Service e01443
#define VEXTUBRX(t,a,b) .long (0x1000070d \
Packit 6c4009
				| ((t)<<(32-11))  \
Packit 6c4009
				| ((a)<<(32-16))  \
Packit 6c4009
				| ((b)<<(32-21)) )
Packit 6c4009
/* VCMPNEZB(t,a,b): compares vectors A and B into vector T.  Every call
   site branches on cr6 immediately afterwards ("bne cr6" leads to
   L(different)), so the encoded word is expected to update cr6
   (presumably the record form vcmpnezb., which flags bytes that differ
   or are zero -- TODO confirm the encoding).  */
Packit Service e01443
#define VCMPNEZB(t,a,b) .long (0x10000507 \
Packit 6c4009
				| ((t)<<(32-11))  \
Packit 6c4009
				| ((a)<<(32-16))  \
Packit 6c4009
				| ((b)<<(32-21)) )
Packit 6c4009
Packit 6c4009
/* Get 16 bytes for unaligned case.
Packit 6c4009
   reg1: Vector to hold next 16 bytes.
Packit 6c4009
   reg2: Address to read from.
Packit 6c4009
   reg3: Permute control vector.  */
/* Implicit inputs: v0 = all zero bytes, v2 = all 0xff bytes, r5 = number
   of bytes still to compare, r11 = number of bytes between reg2 and the
   next 16-byte boundary.  Clobbers r6, v8, v9 and cr6.

   The aligned quadword containing reg2 is loaded first.  The following
   quadword is loaded only when the wanted part of the first one holds no
   null byte and r5 says the comparison really extends past it; otherwise
   no second load is issued.

   r5 is a 64-bit size_t, so it is compared with cmpld.  A 32-bit cmplw
   would look only at its low word and skip the second load for lengths
   such as 0x100000004, leaving stale data in v9.  */
Packit Service e01443
#define GET16BYTES(reg1, reg2, reg3) \
Packit 6c4009
	lvx	reg1, 0, reg2; /* Aligned quadword containing reg2.  */ \
Packit 6c4009
	vperm	v8, v2, reg1, reg3; /* Wanted bytes, padded with 0xff.  */ \
Packit 6c4009
	vcmpequb.	v8, v0, v8; /* Look for a null byte in v8.  */ \
Packit 6c4009
	beq	cr6, 1f; /* No null byte: maybe load the next quadword.  */ \
Packit 6c4009
	vspltisb	v9, 0; /* Null found: use zeros, do not read further.  */ \
Packit 6c4009
	b	2f; \
Packit 6c4009
	.align 4; \
Packit 6c4009
1: \
Packit 6c4009
	cmpld	cr6, r5, r11; /* 64-bit compare: r5 is a size_t.  */ \
Packit 6c4009
	ble	cr6, 2f; /* All remaining bytes are already in reg1.  */ \
Packit 6c4009
	addi	r6, reg2, 16; \
Packit 6c4009
	lvx	v9, 0, r6; /* Next aligned quadword.  */ \
Packit 6c4009
2: \
Packit 6c4009
	vperm	reg1, v9, reg1, reg3;
Packit 6c4009
Packit 6c4009
/* TODO: change this to .machine power9 when minimum binutils
Packit 6c4009
   is upgraded to 2.27.  */
/* Entry: r3 = s1, r4 = s2, r5 = n.  The result is returned in r3.  */
Packit 6c4009
	.machine  power7
Packit 6c4009
ENTRY_TOCLESS (STRNCMP, 4)
Packit 6c4009
	/* Check if size is 0.  */
Packit 6c4009
	cmpdi	cr0, r5, 0
Packit 6c4009
	beq	cr0, L(ret0)	/* n == 0: return 0.  */
Packit 6c4009
	li	r0, 0	/* NOTE(review): r0 is not read again in this file.  */
Packit 6c4009
Packit 6c4009
	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
Packit 6c4009
	   the code:
Packit 6c4009
Packit 6c4009
	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
Packit 6c4009
Packit 6c4009
	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
Packit 6c4009
	rldicl	r8, r3, 0, 52	/* r8 = s1 % 4096; L(pagecross) reads it.  */
Packit 6c4009
	cmpldi	cr7, r8, 4096-32
Packit 6c4009
	bgt	cr7, L(pagecross)
Packit 6c4009
	rldicl	r9, r4, 0, 52	/* r9 = s2 % 4096.  */
Packit 6c4009
	cmpldi	cr7, r9, 4096-32
Packit 6c4009
	bgt	cr7, L(pagecross)	/* r8 still holds the s1 offset.  */
Packit 6c4009
Packit 6c4009
	/* For short strings up to 32 bytes, load both s1 and s2 using
Packit 6c4009
	   unaligned dwords and compare.  */
	/* Each 8-byte step builds a mask in r8 that is non-zero when the s1
	   doubleword (r7) contains a null byte or differs from the s2
	   doubleword (r9); L(different1) turns that mask into the result.
	   r5 is reduced by 8 after every doubleword that compared equal, and
	   the pointers are advanced by 16 after every second doubleword.  */
Packit 6c4009
Packit 6c4009
	ld	r7, 0(r3)
Packit 6c4009
	ld	r9, 0(r4)
Packit 6c4009
	li	r8, 0
Packit 6c4009
	cmpb	r8, r7, r8	/* 0xff in each byte where s1 is null.  */
Packit 6c4009
	cmpb	r6, r7, r9	/* 0xff in each byte where s1 == s2.  */
Packit 6c4009
	orc.	r8, r8, r6	/* r8 |= ~r6: non-zero on null or mismatch.  */
Packit 6c4009
	bne	cr0, L(different1)
Packit 6c4009
Packit 6c4009
	/* If the strings compared are equal, but size is less or equal
Packit 6c4009
	   to 8, return 0.  */
Packit 6c4009
	cmpldi	cr7, r5, 8
Packit 6c4009
	li	r9, 0	/* Return value used by L(ret1).  */
Packit 6c4009
	ble	cr7, L(ret1)
Packit 6c4009
	addi	r5, r5, -8
Packit 6c4009
Packit 6c4009
	ld	r7, 8(r3)
Packit 6c4009
	ld	r9, 8(r4)
Packit 6c4009
	cmpb	r8, r7, r8	/* r8 is 0 here: the previous bne fell through.  */
Packit 6c4009
	cmpb	r6, r7, r9
Packit 6c4009
	orc.	r8, r8, r6
Packit 6c4009
	bne	cr0, L(different1)
Packit 6c4009
	cmpldi	cr7, r5, 8
Packit 6c4009
	mr	r9, r8	/* r9 = 0 (r8 is zero here) for L(ret1).  */
Packit 6c4009
	ble	cr7, L(ret1)
Packit 6c4009
	/* Update pointers and size.  */
Packit 6c4009
	addi	r5, r5, -8
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	ld	r7, 0(r3)	/* Bytes 16..23: same steps as above.  */
Packit 6c4009
	ld	r9, 0(r4)
Packit 6c4009
	li	r8, 0
Packit 6c4009
	cmpb	r8, r7, r8
Packit 6c4009
	cmpb	r6, r7, r9
Packit 6c4009
	orc.	r8, r8, r6
Packit 6c4009
	bne	cr0, L(different1)
Packit 6c4009
	cmpldi	cr7, r5, 8
Packit 6c4009
	li	r9, 0
Packit 6c4009
	ble	cr7, L(ret1)
Packit 6c4009
	addi	r5, r5, -8
Packit 6c4009
Packit 6c4009
	ld	r7, 8(r3)	/* Bytes 24..31.  */
Packit 6c4009
	ld	r9, 8(r4)
Packit 6c4009
	cmpb	r8, r7, r8
Packit 6c4009
	cmpb	r6, r7, r9
Packit 6c4009
	orc.	r8, r8, r6
Packit 6c4009
	bne	cr0, L(different1)
Packit 6c4009
	cmpldi	cr7, r5, 8
Packit 6c4009
	mr	r9, r8
Packit 6c4009
	ble	cr7, L(ret1)
Packit 6c4009
Packit 6c4009
	/* Update pointers and size.  */
Packit 6c4009
	addi	r5, r5, -8
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
L(align):
Packit 6c4009
	/* The first 32 bytes (or, coming from the page-cross loop, the bytes
Packit 6c4009
	   up to the page boundary) have been checked.  Continue with 16-byte
	   vector compares, picking a path from the 16-byte alignment of s1
	   and s2.  */
Packit 6c4009
	vspltisb	v0, 0	/* v0 = all zero bytes (used by GET16BYTES).  */
Packit 6c4009
	vspltisb	v2, -1	/* v2 = all 0xff bytes (used by GET16BYTES).  */
Packit 6c4009
	or	r6, r4, r3
Packit 6c4009
	andi.	r6, r6, 0xF
Packit 6c4009
	beq	cr0, L(aligned)	/* Both pointers are 16-byte aligned.  */
Packit 6c4009
	lvsr	v6, 0, r4   /* Compute mask.  */
Packit 6c4009
	clrldi	r6, r4, 60	/* r6 = s2 & 0xF.  */
Packit 6c4009
	subfic	r11, r6, 16	/* r11 = 16 - (s2 & 0xF).  */
Packit 6c4009
	andi.	r6, r3, 0xF
Packit 6c4009
	beq	cr0, L(s1_align)	/* s1 aligned: v6/r11 describe s2.  */
Packit 6c4009
	/* s1 is unaligned (s2 may be unaligned as well).  */
Packit 6c4009
	GET16BYTES(v5, r4, v6)
Packit 6c4009
	lvsr	v10, 0, r3   /* Compute mask.  */
Packit 6c4009
	clrldi	r6, r3, 60
Packit 6c4009
	subfic	r11, r6, 16	/* r11 = 16 - (s1 & 0xF); L(match) adds it.  */
Packit 6c4009
	GET16BYTES(v4, r3, v10)
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	beq	cr6, L(match)	/* Nothing flagged in these 16 bytes.  */
Packit 6c4009
	b	L(different)
Packit 6c4009
Packit 6c4009
	/* Align s1 to qw and adjust s2 address.  */
	/* Reached when 16 unaligned bytes compared equal.  Both pointers move
	   forward by r11 bytes, which brings s1 to a 16-byte boundary; the
	   alignment of s2 then selects L(aligned) or L(s1_align).  */
Packit 6c4009
	.align  4
Packit 6c4009
L(match):
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)	/* All n bytes were covered: return 0.  */
Packit 6c4009
	subf	r5, r11, r5	/* r5 -= r11.  */
Packit 6c4009
	add	r3, r3, r11
Packit 6c4009
	add	r4, r4, r11
Packit 6c4009
	andi.	r11, r4, 0xF	/* Clobbers r11; recomputed below if needed.  */
Packit 6c4009
	beq	cr0, L(aligned)
Packit 6c4009
	lvsr	v6, 0, r4
Packit 6c4009
	clrldi	r6, r4, 60
Packit 6c4009
	subfic	r11, r6, 16	/* r11 = 16 - (s2 & 0xF) for GET16BYTES.  */
Packit 6c4009
	/* There are 2 loops depending on the input alignment.
Packit 6c4009
	   Each loop gets 16 bytes from s1 and s2, checks for null
Packit 6c4009
	   and compares them.  Loops until a mismatch or null occurs.
	   This loop handles s1 16-byte aligned and s2 unaligned: s1 is read
	   with a plain lvx, s2 through GET16BYTES using the permute mask in
	   v6 and the byte count in r11.  The body is unrolled four times.  */
Packit 6c4009
L(s1_align):
Packit 6c4009
	lvx	v4, 0, r3
Packit 6c4009
	GET16BYTES(v5, r4, v6)
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)	/* No more than 16 bytes were left: equal.  */
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Second unrolled copy.  */
Packit 6c4009
	GET16BYTES(v5, r4, v6)
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Third unrolled copy.  */
Packit 6c4009
	GET16BYTES(v5, r4, v6)
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Fourth unrolled copy.  */
Packit 6c4009
	GET16BYTES(v5, r4, v6)
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
	b	L(s1_align)
Packit 6c4009
	/* Loop for the case where both s1 and s2 are 16-byte aligned: each
	   step loads one quadword from each string with lvx, compares them
	   and advances by 16.  The body is unrolled four times.  */
	.align  4
Packit 6c4009
L(aligned):
Packit 6c4009
	lvx	v4, 0, r3	/* v4 = 16 bytes of s1.  */
Packit 6c4009
	lvx	v5, 0, r4	/* v5 = 16 bytes of s2.  */
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)	/* No more than 16 bytes were left: equal.  */
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Second unrolled copy.  */
Packit 6c4009
	lvx	v5, 0, r4
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Third unrolled copy.  */
Packit 6c4009
	lvx	v5, 0, r4
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
Packit 6c4009
	lvx	v4, 0, r3	/* Fourth unrolled copy.  */
Packit 6c4009
	lvx	v5, 0, r4
Packit 6c4009
	VCMPNEZB(v7, v5, v4)
Packit 6c4009
	bne	cr6, L(different)
Packit 6c4009
	cmpldi	cr7, r5, 16
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	addi	r5, r5, -16
Packit 6c4009
	addi	r3, r3, 16
Packit 6c4009
	addi	r4, r4, 16
Packit 6c4009
	b	L(aligned)
Packit 6c4009
	/* Calculate and return the difference.  */
	/* In: v7 = compare result from VCMPNEZB, v4 = 16 bytes of s1,
	   v5 = 16 bytes of s2, r5 = number of bytes still to compare.
	   Returns 0 when the first flagged byte lies at or beyond r5,
	   otherwise the s1 byte minus the s2 byte at that index.

	   r5 is a 64-bit size_t, so it is compared with cmpld.  A 32-bit
	   cmplw would see only its low word and wrongly return 0 whenever
	   that word is not above the index, e.g. for r5 = 0x100000000.  */
Packit 6c4009
L(different):
Packit 6c4009
	VCTZLSBB(r6, v7)	/* r6 = index of the first flagged byte.  */
Packit 6c4009
	cmpld	cr7, r5, r6
Packit 6c4009
	ble	cr7, L(ret0)
Packit 6c4009
	VEXTUBRX(r5, r6, v4)	/* r5 = byte of s1 at index r6.  */
Packit 6c4009
	VEXTUBRX(r4, r6, v5)	/* r4 = byte of s2 at index r6.  */
Packit 6c4009
	subf	r3, r4, r5	/* r3 = r5 - r4.  */
Packit 6c4009
	extsw	r3, r3
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	/* Common exits: L(ret0) returns 0, L(ret1) returns whatever the
	   caller left in r9.  */
	.align 4
Packit 6c4009
L(ret0):
Packit 6c4009
	li	r9, 0
Packit 6c4009
L(ret1):
Packit 6c4009
	mr	r3, r9	/* Result goes in r3.  */
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	/* Result for the doubleword path.  On entry r8 is the cmpb-derived
Packit 6c4009
	   mask (non-zero bytes mark a null or a mismatch), r7 and r9 are the
	   s1 and s2 doublewords and r5 is the remaining size n:
Packit 6c4009
Packit 6c4009
	  leadzero = (__builtin_ffsl (z1) - 1);
Packit 6c4009
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
Packit 6c4009
	  r1 = (r1 >> leadzero) & 0xFFUL;
Packit 6c4009
	  r2 = (r2 >> leadzero) & 0xFFUL;
Packit 6c4009
	  return r1 - r2;  */
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
L(different1):
Packit 6c4009
	neg	r11, r8
Packit 6c4009
	sldi	r5, r5, 3
Packit 6c4009
	and	r8, r11, r8	/* r8 & -r8: keep only the lowest set bit.  */
Packit 6c4009
	addi	r5, r5, -8	/* r5 = (n - 1) * 8: bit offset of byte n-1.  */
Packit 6c4009
	cntlzd	r8, r8
Packit 6c4009
	subfic	r8, r8, 63	/* r8 = bit index of that lowest set bit.  */
Packit 6c4009
	extsw 	r8, r8
Packit 6c4009
	cmpld	cr7, r8, r5
Packit 6c4009
	ble	cr7, L(different2)
Packit 6c4009
	mr	r8, r5	/* Clamp the shift to the last byte within n.  */
Packit 6c4009
L(different2):
Packit 6c4009
	extsw	r8, r8
Packit 6c4009
	srd	r7, r7, r8
Packit 6c4009
	srd	r9, r9, r8
Packit 6c4009
	rldicl	r3, r7, 0, 56	/* r3 = selected byte of s1.  */
Packit 6c4009
	rldicl	r9, r9, 0, 56	/* r9 = selected byte of s2.  */
Packit 6c4009
	subf	r9, r9, 3	/* r9 = r3 - r9 (the bare 3 names register 3).  */
Packit 6c4009
	extsw	r9, r9
Packit 6c4009
	mr	r3, r9
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
	/* If the unaligned 32-byte reads would cross a 4K page boundary, use
Packit 6c4009
	   a simple byte by byte comparison until the page alignment for s1
Packit 6c4009
	   is reached.  On entry r8 = s1 % 4096 (set at function entry).  */
Packit 6c4009
	.align 4
Packit 6c4009
L(pagecross):
Packit 6c4009
	lbz	r7, 0(r3)	/* r7 = first byte of s1.  */
Packit 6c4009
	lbz	r9, 0(r4)	/* r9 = first byte of s2.  */
Packit 6c4009
	subfic	r8, r8,4095	/* r8 = 4095 - (s1 % 4096).  */
Packit 6c4009
	cmplw	cr7, r9, r7
Packit 6c4009
	bne	cr7, L(byte_ne_3)
Packit 6c4009
	cmpdi	cr7, r9, 0
Packit 6c4009
	beq	cr7, L(byte_ne_0)	/* Both bytes are null: result is 0.  */
Packit 6c4009
	addi	r5, r5, -1
Packit 6c4009
	subf	r7, r8, r5	/* r7 = r5 - r8: size left once the loop ends.  */
Packit 6c4009
	subf	r9, r7, r5	/* r9 = r5 - r7, i.e. r8.  */
Packit 6c4009
	addi	r9, r9, 1
Packit 6c4009
	mtctr	r9	/* Loop count = r8 + 1.  */
Packit 6c4009
	b	L(pagecross_loop1)
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
L(pagecross_loop0):
Packit 6c4009
	beq	cr7, L(ret0)	/* r5 reached 0 (tested in loop1): equal.  */
Packit 6c4009
	lbz	r9, 0(r3)	/* r9 = byte of s1.  */
Packit 6c4009
	lbz	r8, 0(r4)	/* r8 = byte of s2.  */
Packit 6c4009
	addi	r5, r5, -1
Packit 6c4009
	cmplw	cr7, r9, r8
Packit 6c4009
	cmpdi	cr5, r9, 0
Packit 6c4009
	bne	cr7, L(byte_ne_2)
Packit 6c4009
	beq	cr5, L(byte_ne_0)	/* Equal and null: end of both strings.  */
Packit 6c4009
L(pagecross_loop1):
Packit 6c4009
	cmpdi	cr7, r5, 0
Packit 6c4009
	addi	r3, r3, 1
Packit 6c4009
	addi	r4, r4, 1
Packit 6c4009
	bdnz	L(pagecross_loop0)
Packit 6c4009
	cmpdi	cr7, r7, 0	/* r7 = size left after the byte loop.  */
Packit 6c4009
	li	r9, 0
Packit 6c4009
	bne+	cr7, L(align)	/* Bytes remain: continue with vector code.  */
Packit 6c4009
	b	L(ret1)	/* Nothing left: return 0 (r9).  */
Packit 6c4009
Packit 6c4009
	/* Byte-difference exits.  L(byte_ne_1) returns r7 - r9, where r7 is
	   the s1 byte and r9 the s2 byte; the other labels only move the
	   bytes into those registers first.  */
	.align 4
Packit 6c4009
L(byte_ne_0):
Packit 6c4009
	li	r7, 0
Packit 6c4009
L(byte_ne_1):
Packit 6c4009
	subf	r9, r9, r7	/* r9 = r7 - r9.  */
Packit 6c4009
	extsw	r9, r9
Packit 6c4009
	b	L(ret1)
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
L(byte_ne_2):
Packit 6c4009
	extsw	r7, r9	/* r7 = s1 byte (loaded into r9 by loop0).  */
Packit 6c4009
	mr	r9, r8	/* r9 = s2 byte (loaded into r8 by loop0).  */
Packit 6c4009
	b	L(byte_ne_1)
Packit 6c4009
L(byte_ne_3):
Packit 6c4009
	extsw	r7, r7	/* r7 already holds the s1 byte, r9 the s2 byte.  */
Packit 6c4009
	b	L(byte_ne_1)
Packit 6c4009
END(STRNCMP)
Packit 6c4009
libc_hidden_builtin_def(strncmp)