/* "Blob Blame History Raw" is web-UI chrome from the hosting VCS page,
   not part of the source; kept here only as a comment.  */
/* Optimized strcasecmp implementation for PowerPC64.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <locale-defines.h>

/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */

/* The same body builds both entry points: strncasecmp when
   USE_AS_STRNCASECMP is defined (length limit in r5), strcasecmp
   otherwise.  */
#ifndef USE_AS_STRNCASECMP
#  define __STRCASECMP __strcasecmp
#  define STRCASECMP   strcasecmp
#else
#  define __STRCASECMP __strncasecmp
#  define STRCASECMP   strncasecmp
#endif
/* Convert 16 bytes (in v4 and v5) to lowercase and compare.
   Expects v1 = 0xbf, v2 = 0x19, v3 = 0x20 replicated in every byte
   (set up before the vector loops): c + 0xbf maps 'A'..'Z' to
   0x00..0x19, so a byte is uppercase iff (c + 0xbf) <= 0x19 unsigned;
   such bytes are replaced by c + 0x20, their lowercase form.
   Leaves the all-bytes-equal result of vcmpequb. in cr6
   (cr6.LT set when all 16 lowered bytes match).  */
#define TOLOWER()     \
	vaddubm	v8, v4, v1; /* v8 = v4 + 0xbf.  */ \
	vaddubm	v7, v4, v3; /* v7 = v4 + 0x20 (lowercase candidate).  */ \
	vcmpgtub	v8, v8, v2; /* Mask: byte is NOT uppercase.  */ \
	vsel	v4, v7, v4, v8; /* Keep original where not uppercase.  */ \
	vaddubm	v8, v5, v1; \
	vaddubm	v7, v5, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v5, v7, v5, v8; \
	vcmpequb.	v7, v5, v4;

/*
 * Get 16 bytes for unaligned case.
 * reg1: Vector to hold next 16 bytes.
 * reg2: Address to read from.
 * reg3: Permute control vector.
 * v8: Tmp vector used to mask unwanted bytes.
 * v9: Tmp vector,0 when null is found on first 16 bytes
 *
 * The second aligned 16-byte chunk is loaded only when the first
 * (masked) chunk contains no NUL terminator; otherwise zeros are
 * substituted, so the sequence never reads past the end of the
 * string into a potentially unmapped page.
 */
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; /* Aligned load covering reg2.  */ \
	vspltisb	v8, -1; \
	vperm	v8, v8, reg1, reg3; /* Fill non-string bytes with 0xff.  */ \
	vcmpequb.	v8, v0, v8; /* Any NUL in the string part?  */ \
	beq	cr6, 1f; /* No NUL: safe to load next 16 bytes.  */ \
	vspltisb	v9, 0; /* NUL present: use zeros instead.  */ \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3; /* Merge into one unaligned view.  */
#else
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	 v8, -1; \
	vperm	v8, reg1, v8,  reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, reg1, v9, reg3;
#endif

/* Check null in v4, v5 and convert to lower.
   TOLOWER is entered unless BOTH chunks contain a NUL: a NUL present
   in only one of v4/v5 shows up inside TOLOWER as a byte difference
   (NUL vs. non-NUL) and is resolved by the L(different) path, so only
   the both-NUL case needs the careful L(null_found) tail.  */
#define CHECKNULLANDCONVERT() \
	vcmpequb.	v7, v0, v5; \
	beq	cr6, 3f; /* No NUL in v5: convert and compare.  */ \
	vcmpequb.	v7, v0, v4; \
	beq	cr6, 3f; /* NUL only in v5: difference found below.  */ \
	b	L(null_found); \
	.align  4; \
3: \
	TOLOWER()

/* POWER8-only instructions are emitted as raw opcodes (.long) when the
   assembler is not told it may use them (_ARCH_PWR8 undefined).  Each
   macro name spells out the operands hard-wired into the raw encoding.
   NOTE(review): VADDUQM_V7_V8 writes v9 (v9 = v7 + v8) despite its
   name — the encoding and the mnemonic form below agree on that.  */
#ifdef _ARCH_PWR8
#  define VCLZD_V8_v7	vclzd	v8, v7;	/* Count leading zeros per dword.  */
#  define MFVRD_R3_V1	mfvrd	r3, v1;	/* Move vr doubleword to GPR.  */
#  define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
#  define VPOPCNTD_V8_V8	vpopcntd v8, v8;	/* Popcount per dword.  */
#  define VADDUQM_V7_V8	vadduqm	v9, v7, v8;	/* Quadword add into v9.  */
#else
#  define VCLZD_V8_v7	.long	0x11003fc2
#  define MFVRD_R3_V1	.long	0x7c230067
#  define VSUBUDM_V9_V8	.long	0x112944c0
#  define VPOPCNTD_V8_V8	.long	0x110047c3
#  define VADDUQM_V7_V8	.long	0x11274100
#endif

	.machine  power7

ENTRY (__STRCASECMP)
#ifdef USE_AS_STRNCASECMP
	CALL_MCOUNT 3
#else
	CALL_MCOUNT 2
#endif
#define rRTN	r3	/* Return value */
#define rSTR1	r10	/* 1st string */
#define rSTR2	r4	/* 2nd string */
#define rCHAR1	r6	/* Byte read from 1st string */
#define rCHAR2	r7	/* Byte read from 2nd string */
#define rADDR1	r8	/* Address of tolower(rCHAR1) */
#define rADDR2	r12	/* Address of tolower(rCHAR2) */
#define rLWR1	r8	/* Word tolower(rCHAR1) */
#define rLWR2	r12	/* Word tolower(rCHAR2) */
#define rTMP	r9
#define rLOC	r11	/* Default locale address */

	/* s1 == s2 implies result 0; the beqlr below returns once rRTN
	   has been cleared.  */
	cmpd	cr7, rRTN, rSTR2

	/* Get locale address.  */
	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
	ld	rLOC, 0(rLOC)

	mr	rSTR1, rRTN
	li	rRTN, 0
	beqlr	cr7		/* Identical pointers: return 0.  */
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 0
	beq	cr7, L(retnull)		/* n == 0: strings compare equal.  */
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)	/* n < 16: scalar loop only.  */
#endif
	vspltisb	v0, 0		/* v0 = 0x00.. for NUL detection.  */
	vspltisb	v8, -1		/* v8 = 0xff.. mask filler.  */
	/* Check for null in initial characters.
	   Check max of 16 char depending on the alignment.
	   If null is present, proceed byte by byte.  */
	lvx	v4, 0, rSTR1
#ifdef  __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
#else
	lvsl	v10, 0, rSTR1
	vperm	v9, v4, v8, v10
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	lvx	v5, 0, rSTR2
	/* Calculate alignment.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
#else
	lvsl	v6, 0, rSTR2
	vperm	v9, v5, v8, v6
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	/* Check if locale has non ascii characters.  */
	ld	rTMP, 0(rLOC)
	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
	lwz	rTMP, 0(r6)
	cmpdi	cr7, rTMP, 1
	beq	cr7, L(bytebybyte)	/* Non-ASCII locale: use tolower table.  */

	/* Load vector registers with values used for TOLOWER.  */
	/* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte.  */
	vspltisb	v3, 2		/* v3 = 0x02 per byte.  */
	vspltisb	v9, 4		/* v9 = shift count 4.  */
	vsl	v3, v3, v9		/* v3 = 0x02 << 4 = 0x20.  */
	vaddubm	v1, v3, v3		/* v1 = 0x40.  */
	vnor	v1, v1, v1		/* v1 = ~0x40 = 0xbf.  */
	vspltisb	v2, 7
	vsububm	v2, v3, v2		/* v2 = 0x20 - 0x07 = 0x19.  */

	/* If s1 is unaligned, merge its two covering aligned chunks into
	   one vector holding the first 16 string bytes (safe: the NUL
	   checks above proved 16 bytes of string exist).  */
	andi.	rADDR1, rSTR1, 0xF
	beq	cr0, L(align)
	addi	r6, rSTR1, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
L(align):
	andi.	rADDR2, rSTR2, 0xF
	beq	cr0, L(align1)
	addi	r6, rSTR2, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
L(align1):
	CHECKNULLANDCONVERT()
	blt	cr6, L(match)	/* cr6.LT: all 16 lowered bytes equal.  */
	b	L(different)
	.align 	4
L(match):
	/* First 16 bytes matched; advance both pointers to the next
	   16-byte boundary of s1, then loop a chunk at a time.  */
	clrldi	r6, rSTR1, 60
	subfic	r7, r6, 16
#ifdef USE_AS_STRNCASECMP
	sub	r5, r5, r7
#endif
	add	rSTR1, rSTR1, r7
	add	rSTR2, rSTR2, r7
	andi.	rADDR2, rSTR2, 0xF
	addi	rSTR1, rSTR1, -16
	addi	rSTR2, rSTR2, -16
	beq	cr0, L(aligned)
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2	/* Permute control for unaligned s2.  */
#else
	lvsl	v6, 0, rSTR2
#endif
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, check for null,
	   convert to lowercase and compare. Loop till difference
	   or null occurs. */
L(s1_align):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)	/* Tail of n handled bytewise.  */
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	GET16BYTES(v5, rSTR2, v6)
	CHECKNULLANDCONVERT()
	blt	cr6, L(s1_align)
	b	L(different)
	.align 	4
L(aligned):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	CHECKNULLANDCONVERT()
	blt	cr6, L(aligned)

	/* Calculate and return the difference. */
L(different):
	/* v7 holds the equal-bytes mask left by TOLOWER.  Invert it,
	   find the first differing byte position by counting zero bits,
	   then return tolower(*s1) - tolower(*s2) for that position.  */
	vaddubm	v1, v3, v3	/* v1 = 0x40 (64 = bits per doubleword).  */
	vcmpequb	v7, v0, v7	/* Set bytes where the strings differ.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero.  */
	vspltisb	v8, -1
	VADDUQM_V7_V8		/* v9 = v7 - 1 (quadword).  */
	vandc	v8, v9, v7	/* Isolate trailing zero bits.  */
	VPOPCNTD_V8_V8		/* Count them per doubleword.  */
	vspltb	v6, v8, 15
	vcmpequb.	v6, v6, v1
	blt	cr6, L(shift8)	/* Count == 64: fold both dword counts.  */
#else
	/* Count leading zero.  */
	VCLZD_V8_v7
	vspltb	v6, v8, 7
	vcmpequb.	v6, v6, v1
	blt	cr6, L(shift8)	/* Count == 64: fold both dword counts.  */
	vsro	v8, v8, v1
#endif
	b	L(skipsum)
	.align  4
L(shift8):
	vsumsws		v8, v8, v0
L(skipsum):
#ifdef __LITTLE_ENDIAN__
	/* Shift registers based on leading zero count.  */
	vsro	v6, v5, v8
	vsro	v7, v4, v8
	/* Merge and move to GPR.  */
	vmrglb	v6, v6, v7
	vslo	v1, v6, v1
	MFVRD_R3_V1
	/* Place the characters that are different in first position.  */
	sldi	rSTR2, rRTN, 56
	srdi	rSTR2, rSTR2, 56
	sldi	rSTR1, rRTN, 48
	srdi	rSTR1, rSTR1, 56
#else
	vslo	v6, v5, v8
	vslo	v7, v4, v8
	vmrghb	v1, v6, v7
	MFVRD_R3_V1
	srdi	rSTR2, rRTN, 48
	sldi	rSTR2, rSTR2, 56
	srdi	rSTR2, rSTR2, 56
	srdi	rSTR1, rRTN, 56
#endif
	subf  	rRTN, rSTR1, rSTR2	/* tolower(*s1) - tolower(*s2).  */
	extsw 	rRTN, rRTN
	blr

	.align  4
	/* OK. We've hit the end of the string. We need to be careful that
	   we don't compare two strings as different because of junk beyond
	   the end of the strings...  */
L(null_found):
	vaddubm	v10, v3, v3	/* v10 = 0x40 (64 = bits per doubleword).  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero.  */
	vspltisb	v8, -1
	VADDUQM_V7_V8		/* v9 = v7 - 1 (quadword).  */
	vandc	v8, v9, v7
	VPOPCNTD_V8_V8
	vspltb	v6, v8, 15
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
#else
	/* Count leading zero.  */
	VCLZD_V8_v7
	vspltb	v6, v8, 7
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
	vsro	v8, v8, v10
#endif
	b	L(skipsum1)
	.align  4
L(shift_8):
	vsumsws	v8, v8, v0
L(skipsum1):
	/* Calculate shift count based on count of zero.  */
	vspltisb	v10, 7
	vslb	v10, v10, v10	/* v10 = 0x80 per byte.  */
	vsldoi	v9, v0, v10, 1	/* v9 = 128 in the low-order byte.  */
	VSUBUDM_V9_V8		/* v9 = 128 - zero count.  */
	vspltisb	v8, 8
	vsldoi	v8, v0, v8, 1	/* v8 = 8 in the low-order byte.  */
	VSUBUDM_V9_V8		/* v9 = 120 - zero count (bit shift).  */
	/* Shift and remove junk after null character.  */
#ifdef __LITTLE_ENDIAN__
	vslo	v5, v5, v9
	vslo	v4, v4, v9
#else
	vsro	v5, v5, v9
	vsro	v4, v4, v9
#endif
	/* Convert and compare 16 bytes.  */
	TOLOWER()
	blt	cr6, L(retnull)	/* Equal through the NUL: return 0.  */
	b	L(different)
	.align  4
L(retnull):
	li	rRTN, 0
	blr
	.align  4
L(bytebybyte):
	/* Unrolling loop for POWER: loads are done with 'lbz' plus
	offset and string descriptors are only updated in the end
	of loop unrolling. */
	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)	/* tolower table base.  */
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	rldicl	rTMP, r5, 62, 2		/* rTMP = n / 4 (unrolled trips).  */
	cmpdi	cr7, rTMP, 0
	beq	cr7, L(lessthan4)
	mtctr	rTMP
#endif
L(loop):
	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 != '\0') || (r == 1) */
	beq	cr1, L(done)
	lbz	rCHAR1, 1(rSTR1)
	lbz	rCHAR2, 1(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 2(rSTR1)
	lbz	rCHAR2, 2(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 3(rSTR1)
	lbz	rCHAR2, 3(rSTR2)
	cmpdi	rCHAR1, 0
	/* Increment both string descriptors */
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq     cr1, L(done)
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	bdnz	L(loop)
#else
	b	L(loop)
#endif
#ifdef USE_AS_STRNCASECMP
L(lessthan4):
	/* Handle the remaining n % 4 bytes, one at a time.  */
	clrldi	r5, r5, 62		/* r5 = n & 3.  */
	cmpdi	cr7, r5, 0
	beq	cr7, L(retnull)
	mtctr	r5
L(loop1):
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	addi	rSTR1, rSTR1, 1
	addi	rSTR2, rSTR2, 1
	lbz	rCHAR1, 0(rSTR1)
	lbz	rCHAR2, 0(rSTR2)
	bdnz	L(loop1)
#endif
L(done):
	/* Difference (or 0) of the last tolower pair compared.  */
	subf	r0, rLWR2, rLWR1
	extsw	rRTN, r0
	blr
END (__STRCASECMP)

weak_alias (__STRCASECMP, STRCASECMP)
libc_hidden_builtin_def (__STRCASECMP)