|
Packit |
6c4009 |
/* memcmp with SSE2
|
|
Packit |
6c4009 |
Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
|
Packit |
6c4009 |
Contributed by Intel Corporation.
|
|
Packit |
6c4009 |
This file is part of the GNU C Library.
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
The GNU C Library is free software; you can redistribute it and/or
|
|
Packit |
6c4009 |
modify it under the terms of the GNU Lesser General Public
|
|
Packit |
6c4009 |
License as published by the Free Software Foundation; either
|
|
Packit |
6c4009 |
version 2.1 of the License, or (at your option) any later version.
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
The GNU C Library is distributed in the hope that it will be useful,
|
|
Packit |
6c4009 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
6c4009 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Packit |
6c4009 |
Lesser General Public License for more details.
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
You should have received a copy of the GNU Lesser General Public
|
|
Packit |
6c4009 |
License along with the GNU C Library; if not, see
|
|
Packit |
6c4009 |
<http://www.gnu.org/licenses/>. */
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
#include <sysdep.h>
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
.text
|
|
Packit |
6c4009 |
/* int memcmp (const void *s1, const void *s2, size_t n)

   ABI:  System V AMD64, GAS AT&T syntax (source, destination).
   In:   %rdi = s1, %rsi = s2, %rdx = n.
   Out:  %eax = 0 when the first n bytes are equal, otherwise the
	 difference of the first pair of differing bytes, each taken as
	 an unsigned char (s1 byte minus s2 byte).
   Uses: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r10, %r11, %xmm0, %xmm1
	 and the flags.  No stack is used.

   Register roles after the entry checks:
     %rdi  cursor into s1.
     %rsi  s2 - s1, so (%rdi, %rsi) addresses the byte of s2 that
	   corresponds to (%rdi).
     %r10  bytes still to compare in L(small); loop end bound in the
	   16-byte loops.
     %r11  s1 + n (one past the last byte of s1) in the large path.

   N is a size_t and the loop bounds are addresses, so every comparison
   on them uses the unsigned condition codes (jbe/jae).  Signed codes
   (jle/jge) would treat a length with the top bit set as negative and
   return after looking at only a few bytes.  */
ENTRY (memcmp)
	test	%rdx, %rdx
	jz	L(finz)			/* n == 0: equal.  */
	cmpq	$1, %rdx
	jbe	L(finr1b)		/* n == 1 (unsigned compare).  */
	subq	%rdi, %rsi		/* %rsi = s2 - s1.  */
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)			/* n >= 32 (unsigned compare).  */
	/* Handle small chunks and last block of less than 32 bytes.
	   The remaining count in %r10 is consumed bit by bit: 1, 2, 4,
	   8 and finally 16 bytes.  Each step leaves early when the
	   count reaches zero or a difference is found.  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)		/* That was the last byte.  */
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)			/* %eax already holds the result.  */
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
L(s16b):
	/* Only reached with exactly 16 bytes left: every smaller
	   remainder has already left through one of the je above.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx		/* One mask bit per equal byte.  */
	xorl	%eax, %eax
	subl	$0xffff, %edx		/* Zero iff all 16 bytes match.  */
	jz	L(finz)
	bsfl	%edx, %ecx		/* Index of first differing byte.  */
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(finr1b):
	/* n == 1.  %rsi still holds the s2 pointer itself here.  */
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

	.p2align 4,, 4
L(fin2_7):
	/* %rax and %rdx hold zero-extended 2, 4 or 8 byte little-endian
	   chunks of s1 and s2.  Locate the lowest differing byte, shift
	   it down into the low byte of each register and subtract.  */
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx		/* Lowest differing bit.  */
	sarq	$3, %rcx
	salq	$3, %rcx		/* Round down to a byte boundary.  */
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

	/* For blocks of 32 bytes or more
	   1. Advance one of the addr pointer to be 16B aligned.
	   2. Treat the case of both addr pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one addr pointer is 16B aligned, use memory version
	      of pcmpeqb.
	*/
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11		/* %r11 = s1 + n.  */
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi	/* Round %rdi up to 16B.  */
L(16am):
	/* Handle two 16B aligned pointers separately.  %rdi is aligned
	   here, so s2 is aligned exactly when s2 - s1 is.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jae	L(mt32)			/* Address compare: unsigned.  */

L(A64main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */

L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
	/* Fewer than 32 bytes are left; finish in L(small).  */
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
	/* %edx is non-zero: a 16-byte block at %rdi differs.  */
	bsfl	%edx, %ecx		/* Index of first differing byte.  */
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi		/* Turn the offset back into a pointer.  */
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(ATR):
	/* Both pointers are 16B aligned: same scheme with movdqa.  */
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */

L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16byte to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)
|
|
Packit |
6c4009 |
|
|
Packit |
6c4009 |
/* Remove any preprocessor definition of bcmp, then hand memcmp to the
   weak_alias and libc_hidden_builtin_def macros (both defined outside
   this file).  Presumably this exports bcmp as a weak alias of memcmp
   and sets up the library-internal hidden definition of memcmp --
   confirm against the macro definitions.  */
#undef bcmp
|
|
Packit |
6c4009 |
weak_alias (memcmp, bcmp)
|
|
Packit |
6c4009 |
libc_hidden_builtin_def (memcmp)
|