/* sysdeps/x86_64/multiarch/strcmp-avx2.S  */

/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

/* Name of the generated function.  Files that build another variant
   from this source can define STRCMP before including it.  */
# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

/* Page size used by the page-crossing checks below.  */
# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Compare packed dwords and store minimum.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Compare packed bytes and store minimum.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

/* Instruction issued before every return; can be overridden by the
   including file.  */
# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

/* Warning!
           wcscmp/wcsncmp have to use SIGNED comparison for elements.
           strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison can
   be on either packed bytes or dwords depending on USE_AS_WCSCMP.  In
   order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the costs of comparing VEC_SIZE bytes
   (32 bytes) are two VPCMPEQ and one VPMINU instructions, together
   with vmovdqu and testl instructions.  The main loop (away from a
   page boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
   is the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */

	.section .text.avx,"ax",@progbits
/* STRCMP (s1, s2) or, with USE_AS_STRNCMP, STRCMP (s1, s2, n).
   In:    %rdi = s1, %rsi = s2, %rdx = n (USE_AS_STRNCMP only; counted
	  in characters, i.e. dwords when USE_AS_WCSCMP is defined).
   Out:   %eax = 0 if the strings compare equal, otherwise a negative
	  or positive value according to the first differing element.
   Uses:  %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r10, %r11, %ymm0-%ymm7
	  and the flags.  No stack is used.  VZEROUPPER is issued
	  before every return.
   %ymm7 is kept all-zero for the whole function.  */
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  The length is
	   accessed through the pointer-sized register name so that on
	   x32, where the upper 32 bits of %rdx are not defined, only
	   the low 32 bits are used.  On LP64 these are %rdx/%r11.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.
	   NOTE(review): the shift drops the two top bits of n, so a
	   length of 2^62 characters or more wraps around - confirm
	   whether callers can pass such values.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	/* %rdx = current byte offset into both strings.  */
	xorl	%edx, %edx
	/* Make %ymm7 all zeros in this function.  */
	vpxor	%ymm7, %ymm7, %ymm7
	/* %eax = (s1 | s2) & (PAGE_SIZE - 1).  If it is within
	   4 * VEC_SIZE of the end of a page, take the careful
	   page-crossing path.  The value is below PAGE_SIZE, so the
	   signed branch is safe.  */
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  After VPCMPEQ + VPMINU an element
	   of %ymm0 is zero iff the two strings differ there or the s1
	   element is the null char; the second VPCMPEQ against %ymm7
	   turns that into a mask.  */
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	/* %rdx = byte index of the first mismatch or null char.  */
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is at or after the
	   maximum offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	/* Shared tail for the wide-char paths.  On entry %eax is 0 and
	   the flags hold a signed compare of the s1 element against
	   the s2 element.  Produces -1 if s1 < s2, otherwise 1.  */
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	/* Unsigned byte difference.  */
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER
	ret
/* The first vector matched and %ecx holds the mismatch/null mask of
   the second vector (offset VEC_SIZE from %rdi/%rsi).  */
	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is at or
	   after the maximum offset (%r11).  %rdx then already includes
	   VEC_SIZE, so the loads below need no displacement.  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	/* Wide-char builds fall through here with %eax == 0 when the
	   two elements are equal.  */
	VZEROUPPER
	ret
/* The first two vectors matched and %ecx holds the mismatch/null mask
   of the third vector (offset 2 * VEC_SIZE from %rdi/%rsi).  */
	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is at
	   or after the maximum offset (%r11).  %rdx then already
	   includes 2 * VEC_SIZE.  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	/* Wide-char builds fall through here with %eax == 0 when the
	   two elements are equal.  */
	VZEROUPPER
	ret
/* The first three vectors matched and %ecx holds the mismatch/null
   mask of the fourth vector (offset 3 * VEC_SIZE from %rdi/%rsi).  */
	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is at
	   or after the maximum offset (%r11).  %rdx then already
	   includes 3 * VEC_SIZE.  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	/* Wide-char builds fall through here with %eax == 0 when the
	   two elements are equal.  */
	VZEROUPPER
	ret
/* The first vector matched: check vectors 1..3, then set up the
   registers for the main loop.  */
	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	/* The work for vectors 2 and 3 is interleaved; the mask of
	   vector 2 is tested first.  */
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	/* Reached with the first 4 * VEC_SIZE bytes equal and free of
	   null chars (also entered from L(cross_page_loop)).
	   On exit: %rax = s1 pointer aligned to 4 * VEC_SIZE,
	   %rdx = s2 pointer at the same string offset, %esi = number of
	   whole 4 * VEC_SIZE blocks readable through %rdx before it
	   reaches the end of its page.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  %rdx becomes the distance from %rdi to
	   the aligned address; it is between 1 and 4 * VEC_SIZE.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch, offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
	movl	%ecx, %esi
	jmp	L(loop_start)
/* Main loop.  %rax = s1 pointer (aligned to 4 * VEC_SIZE), %rdx = s2
   pointer, %esi = blocks left before %rdx crosses a page, and with
   USE_AS_STRNCMP %r11 = bytes left relative to %rax.  */
	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	/* Leave the fast loop when no whole block is left before the
	   page boundary.  LEA does not modify the flags, so JE still
	   acts on the result of TESTL.  */
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  Per-vector results
	   stay in %ymm4, %ymm1, %ymm5 and %ymm6 (vectors 0..3) for the
	   L(test_*) code; %ymm0 is the minimum of all four.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because for VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, differing from SSE2
	   strcmp where ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	/* Some vector has a mismatch or null char; start with
	   vector 0.  */
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the maximum offset (%r11) <= the mismatched
	   index (%rcx).  */
	cmpq	%rcx, %r11
	jbe	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	/* %eax must be zero for L(wcscmp_return), so keep the s1
	   pointer in %rsi.  */
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER
	ret
/* Main-loop exit: vector 0 matched, examine vector 1 (%ymm1).
   %rax/%rdx are the s1/s2 block pointers.  */
	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	/* %rdi = index within the block; return 0 if the maximum
	   offset (%r11) <= that index.  */
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER
	ret
/* Main-loop exit: vectors 0 and 1 matched, examine vector 2 (%ymm5).
   %rax/%rdx are the s1/s2 block pointers.  */
	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	/* %rdi = index within the block; return 0 if the maximum
	   offset (%r11) <= that index.  */
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER
	ret
/* Main-loop exit: vectors 0..2 matched, so the mismatch or null char
   reported by the combined mask is in vector 3 (%ymm6).  No test of
   the mask is needed before TZCNT.  */
	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	/* %rcx = index within the block; return 0 if the maximum
	   offset (%r11) <= that index.  */
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER
	ret
/* The next block read through %rdx would run past the end of its
   page.  Both pointers are moved back by %rcx = %rdx modulo
   (4 * VEC_SIZE), which makes the %rdx side aligned; the first %rcx
   bytes of the result are discarded.  %r10 = -%rcx is kept for
   L(loop_cross_page_2_vec).  */
	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64. */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	/* Combine both 32-bit masks into one 64-bit mask in %rdi.  */
	salq	$32, %rsi
	xorq	%rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	/* After the shift, bit 0 corresponds to the byte at %rax, so
	   %rcx is an index relative to %rax/%rdx.  */
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the maximum offset (%r11) <= the mismatched
	   index (%rcx).  */
	cmpq	%rcx, %r11
	jbe	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
# else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER
	ret
/* Second half of the page-crossing block.  On entry %r10 = -(%rdx
   modulo 4 * VEC_SIZE) and %ecx = -%r10, as set up by
   L(loop_cross_page).  */
	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	/* Combine both 32-bit masks into one 64-bit mask in %rdi.  */
	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  MOVL leaves the
	   flags alone and is placed before TESTQ anyway.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi value is 0, it already tested
	   VEC_SIZE*4+%r10 byte starting from %rax. This label
	   checks whether strncmp maximum offset reached or not.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	/* Bit i of %rdi is the byte at %rax + %r10 + 2 * VEC_SIZE
	   + %r8 + i.  Build that index (less 2 * VEC_SIZE) in %rcx.  */
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the maximum offset (%r11) <= the mismatched
	   index.  %r11 is not used again on this path, so SUBQ may
	   overwrite it.  */
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	/* %r10 + 4 * VEC_SIZE bytes starting at %rax have been
	   compared without finding a mismatch or null char.  Return 0
	   if the maximum offset (%r11) <= that count, otherwise resume
	   the main loop.  */
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif
/* Element-at-a-time comparison used near a page boundary.  On entry
   %eax = current s1 element (non-zero), %ecx = current s2 element,
   %rdx = byte offset from %rdi/%rsi.  */
	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	/* Only set the flags; L(different) derives the result from
	   a signed compare.  */
	cmpl	%ecx, %eax
# else
	/* %eax becomes the unsigned byte difference, which is the
	   return value when non-zero.  */
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	/* Once 4 * VEC_SIZE bytes have matched, switch to the
	   vectorized main loop.  */
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Flags hold the signed compare of the s1 element against the
	   s2 element.  Produce -1 if s1 < s2, otherwise 1.  */
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER
	ret
# endif
/* strncmp/wcsncmp only: shared "equal" exit and the n == 1 case.  */
# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	/* The maximum offset was reached without a difference (or
	   n == 0): return 0.  */
	xorl	%eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(char0):
	/* n == 1: only the first element of each string is compared.  */
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER
	ret
# endif
/* A vector compared on the page-crossing path contains a mismatch or
   null char.  On entry %rdx = byte offset of that vector from
   %rdi/%rsi and %ecx = its mask.  */
	.p2align 4
L(last_vector):
	/* Rebase both pointers (and the maximum offset) to the start
	   of the vector so that %rdx can hold the index inside it.  */
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is at or after the
	   maximum offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER
	ret
/* Comparing on page boundary region requires special treatment:
   It must be done one vector at a time, starting with the wider
   ymm vector if possible, if not, with xmm.  If fetching 16 bytes
   (xmm) still passes the boundary, byte comparison must be done.

   On entry %eax = (s1 | s2) & (PAGE_SIZE - 1), %rdx = 0 and, with
   USE_AS_STRNCMP, %r11 = maximum offset.  %eax is advanced together
   with %rdx and is compared against PAGE_SIZE minus the load size
   before each load.  */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	/* The wide-char build has no 8/4-byte steps below and reloads
	   %eax before its next use, so it does not need this update.  */
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* A non-zero s1 element continues in L(cross_page_loop);
	   otherwise s1 ends here and the result is 0 - s2 element.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER
	ret
END (STRCMP)
#endif