Blame sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

Packit 6c4009
/* memset/bzero with unaligned store and rep stosb
Packit 6c4009
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
/* memset is implemented as:
Packit 6c4009
   1. Use overlapping store to avoid branch.
Packit 6c4009
   2. If size is less than VEC, use integer register stores.
Packit 6c4009
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
Packit 6c4009
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
Packit 6c4009
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
Packit 6c4009
      4 VEC stores and store 4 * VEC at a time until done.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Unless the including file overrides them, the _chk entry points are
   named with the same macros as the plain memset/wmemset entry
   points.  */
#ifndef MEMSET_CHK_SYMBOL
Packit 6c4009
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#ifndef WMEMSET_CHK_SYMBOL
Packit 6c4009
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* VZEROUPPER is emitted in front of most returns in this file and
   before rep stosb.  It expands to vzeroupper when VEC_SIZE > 16 and
   to nothing otherwise.  */
#ifndef VZEROUPPER
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
#  define VZEROUPPER			vzeroupper
Packit 6c4009
# else
Packit 6c4009
#  define VZEROUPPER
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* VZEROUPPER_SHORT_RETURN is placed directly in front of the ret that
   follows L(loop).  When VEC_SIZE <= 16 it expands to the rep prefix,
   so that return is assembled as "rep ret".  */
#ifndef VZEROUPPER_SHORT_RETURN
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
Packit 6c4009
# else
Packit 6c4009
#  define VZEROUPPER_SHORT_RETURN	rep
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* MOVQ copies the low 64 bits of %xmm0 into %rcx in L(less_vec) so the
   fill pattern can be stored with integer moves.  The VEX-encoded form
   is selected when VEC_SIZE > 16.  */
#ifndef MOVQ
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
#  define MOVQ				vmovq
Packit 6c4009
# else
Packit 6c4009
#  define MOVQ				movq
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit Service 3b0880
/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
Packit Service 3b0880
   up REP STOSB operation, REP STOSB isn't faster on short data.  The
Packit Service 3b0880
   memset micro benchmark in glibc shows that 2KB is the approximate
Packit Service 3b0880
   value above which REP STOSB becomes faster on processors with
Packit Service 3b0880
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
Packit Service 3b0880
   size has minimal impact on threshold.  */
Packit Service 3b0880
/* Byte count above which L(stosb_more_2x_vec) branches to L(stosb).
   May be overridden by the including file.  */
#ifndef REP_STOSB_THRESHOLD
Packit Service 3b0880
# define REP_STOSB_THRESHOLD		2048
Packit Service 3b0880
#endif
Packit Service 3b0880
Packit 6c4009
/* The including file must define SECTION(name); it is applied to .text
   to name the section that receives all code below.  */
#ifndef SECTION
Packit 6c4009
# error SECTION is not defined!
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
	.section SECTION(.text),"ax",@progbits
Packit 6c4009
#if VEC_SIZE == 16 && IS_IN (libc)
Packit 6c4009
/* void __bzero (void *s, size_t n)
   In:   %rdi = s, %rsi = n.
   Sets %rax = s, %rdx = n and %xmm0 = 0, then continues in the memset
   body at L(entry_from_bzero).  Only built when VEC_SIZE == 16.  */
ENTRY (__bzero)
Packit 6c4009
	movq	%rdi, %rax /* Set return value.  */
Packit 6c4009
# ifdef __ILP32__
	/* x32: size_t is 32 bits wide and the upper half of %rsi is
	   unspecified.  A 32-bit move zero-extends into %rdx, so the
	   64-bit length compares at L(entry_from_bzero) see the real
	   byte count.  */
	movl	%esi, %edx /* Set n.  */
# else
	movq	%rsi, %rdx /* Set n.  */
# endif
Packit 6c4009
	pxor	%xmm0, %xmm0
Packit 6c4009
	jmp	L(entry_from_bzero)
Packit 6c4009
END (__bzero)
Packit 6c4009
weak_alias (__bzero, bzero)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#if IS_IN (libc)
Packit 6c4009
# if defined SHARED
Packit 6c4009
/* __wmemset_chk: %rdx = element count, %rcx = destination length.
   Branches to __chk_fail when %rcx is below %rdx (unsigned);
   otherwise execution runs on into __wmemset, which follows
   immediately.  */
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
Packit 6c4009
#  ifdef __ILP32__
	/* x32: both sizes are 32-bit values whose upper register halves
	   are unspecified, so only the low halves may be compared.  */
	cmpl	%edx, %ecx
#  else
	cmpq	%rdx, %rcx
#  endif
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
/* __wmemset: %rdi = destination, %esi = wide character, %rdx = number
   of 4-byte elements.  Scales the count to bytes, lets
   WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (supplied by the including file)
   prepare VEC(0) and the return value, then joins the memset body at
   L(entry_from_bzero).  */
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
Packit 6c4009
# ifdef __ILP32__
	/* x32: shift the 32-bit count in place.  Writing %edx
	   zero-extends, which discards the unspecified upper half of
	   %rdx before the 64-bit length compares in the memset body.  */
	shll	$2, %edx
# else
	shlq	$2, %rdx
# endif
Packit 6c4009
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
Packit 6c4009
	jmp	L(entry_from_bzero)
Packit 6c4009
END (WMEMSET_SYMBOL (__wmemset, unaligned))
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#if defined SHARED && IS_IN (libc)
Packit 6c4009
/* __memset_chk (unaligned variant): %rdx = byte count, %rcx =
   destination size.  Branches to __chk_fail when %rcx is below %rdx
   (unsigned); otherwise execution runs on into the memset entry point
   that follows immediately.  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
Packit 6c4009
# ifdef __ILP32__
	/* x32: both sizes are 32-bit values whose upper register halves
	   are unspecified, so only the low halves may be compared.  */
	cmpl	%edx, %ecx
# else
	cmpq	%rdx, %rcx
# endif
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* memset entry point, unaligned-store variant.
   In:   %rdi = destination, %esi = fill value, %rdx = byte count.
   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN is supplied by the including
   file; the code below relies on it leaving the fill pattern in
   VEC(0) and the return value in %rax.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
Packit 6c4009
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
Packit 6c4009
#ifdef __ILP32__
	/* x32: size_t is 32 bits wide and the upper half of %rdx is
	   unspecified.  Clear it before the length is used in 64-bit
	   compares and address arithmetic.  */
	movl	%edx, %edx
#endif
	/* Other entry points in this file jump here with %rax, %rdx
	   (byte count) and VEC(0) already prepared.  */
L(entry_from_bzero):
Packit 6c4009
	cmpq	$VEC_SIZE, %rdx
Packit 6c4009
	jb	L(less_vec)
Packit 6c4009
	cmpq	$(VEC_SIZE * 2), %rdx
Packit 6c4009
	ja	L(more_2x_vec)
Packit 6c4009
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	/* One VEC ending at the last byte and one starting at the first
	   byte; the two stores overlap when size < 2 * VEC_SIZE.  */
Packit 6c4009
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
#if defined USE_MULTIARCH && IS_IN (libc)
Packit 6c4009
END (MEMSET_SYMBOL (__memset, unaligned))
Packit 6c4009
Packit 6c4009
# if VEC_SIZE == 16
Packit 6c4009
/* __memset_chk_erms: %rdx = byte count, %rcx = destination size.
   Branches to __chk_fail when %rcx is below %rdx (unsigned);
   otherwise execution runs on into __memset_erms.  */
ENTRY (__memset_chk_erms)
Packit 6c4009
#  ifdef __ILP32__
	/* x32: both sizes are 32-bit values whose upper register halves
	   are unspecified, so only the low halves may be compared.  */
	cmpl	%edx, %ecx
#  else
	cmpq	%rdx, %rcx
#  endif
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (__memset_chk_erms)
Packit 6c4009
Packit 6c4009
/* Only used to measure performance of REP STOSB.  */
/* In:  %rdi = destination, %sil = fill byte, %rdx = byte count.  */
Packit 6c4009
ENTRY (__memset_erms)
Packit 6c4009
#  ifdef __ILP32__
	/* x32: clear the unspecified upper half of the byte count so
	   both the zero test and the rep stosb count are correct.  */
	movl	%edx, %edx
#  endif
	/* Skip zero length.  */
Packit 6c4009
	testq	%rdx, %rdx
Packit 6c4009
	jnz	 L(stosb)
Packit 6c4009
	movq	%rdi, %rax
Packit 6c4009
	ret
Packit 6c4009
# else
Packit 6c4009
/* Provide a hidden symbol to debugger.  */
/* This entry has no instructions of its own; it falls straight
   through to L(stosb).  */
Packit 6c4009
	.hidden	MEMSET_SYMBOL (__memset, erms)
Packit 6c4009
ENTRY (MEMSET_SYMBOL (__memset, erms))
Packit 6c4009
# endif
Packit 6c4009
/* Fill with rep stosb.
   In:   %rdi = destination, %sil = fill byte, %rdx = byte count.
   Out:  %rax = original destination.
   %rdx is reused to keep the destination because rep stosb advances
   %rdi and needs the fill byte in %al.  */
L(stosb):
Packit 6c4009
	/* Issue vzeroupper before rep stosb.  */
Packit 6c4009
	VZEROUPPER
Packit 6c4009
# ifdef __ILP32__
	/* x32: size_t is 32 bits wide; a 32-bit move zero-extends, so
	   an unspecified upper half of %rdx never reaches the rep
	   count in %rcx.  */
	movl	%edx, %ecx
# else
	movq	%rdx, %rcx
# endif
Packit 6c4009
	movzbl	%sil, %eax
Packit 6c4009
	movq	%rdi, %rdx
Packit 6c4009
	rep stosb
Packit 6c4009
	movq	%rdx, %rax
Packit 6c4009
	ret
Packit 6c4009
# if VEC_SIZE == 16
Packit 6c4009
END (__memset_erms)
Packit 6c4009
# else
Packit 6c4009
END (MEMSET_SYMBOL (__memset, erms))
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
# if defined SHARED && IS_IN (libc)
Packit 6c4009
/* __memset_chk (unaligned_erms variant): %rdx = byte count, %rcx =
   destination size.  Branches to __chk_fail when %rcx is below %rdx
   (unsigned); otherwise execution runs on into the memset entry point
   that follows immediately.  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
Packit 6c4009
#  ifdef __ILP32__
	/* x32: both sizes are 32-bit values whose upper register halves
	   are unspecified, so only the low halves may be compared.  */
	cmpl	%edx, %ecx
#  else
	cmpq	%rdx, %rcx
#  endif
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
/* memset entry point, unaligned-store variant that switches to
   rep stosb for sizes above REP_STOSB_THRESHOLD.
   In:   %rdi = destination, %esi = fill value, %rdx = byte count.
   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN is supplied by the including
   file; the code below relies on it leaving the fill pattern in
   VEC(0) and the return value in %rax.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
Packit 6c4009
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
Packit 6c4009
# ifdef __ILP32__
	/* x32: size_t is 32 bits wide and the upper half of %rdx is
	   unspecified.  Clear it before the length is used in 64-bit
	   compares, address arithmetic and the rep stosb count.  */
	movl	%edx, %edx
# endif
	cmpq	$VEC_SIZE, %rdx
Packit 6c4009
	jb	L(less_vec)
Packit 6c4009
	cmpq	$(VEC_SIZE * 2), %rdx
Packit 6c4009
	ja	L(stosb_more_2x_vec)
Packit 6c4009
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	/* One VEC ending at the last byte and one starting at the first
	   byte; the two stores overlap when size < 2 * VEC_SIZE.  */
Packit 6c4009
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
	/* More than 2 * VEC_SIZE bytes: use rep stosb above the
	   threshold, otherwise fall through to the vector paths.  */
L(stosb_more_2x_vec):
Packit Service 3b0880
	cmpq	$REP_STOSB_THRESHOLD, %rdx
Packit 6c4009
	ja	L(stosb)
Packit 6c4009
#endif
Packit 6c4009
/* Reached with more than 2 * VEC_SIZE bytes to set.
   %rdi = destination, %rdx = byte count, VEC(0) = fill pattern.  */
L(more_2x_vec):
Packit 6c4009
	cmpq  $(VEC_SIZE * 4), %rdx
Packit 6c4009
	ja	L(loop_start)
	/* At most 4 * VEC_SIZE bytes: two VECs from the start and two
	   VECs ending at the last byte cover the whole range; the
	   stores overlap in the middle when size < 4 * VEC_SIZE.  */
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
Packit 6c4009
	/* Shared exit, also used by L(loop_start) when no aligned
	   blocks remain.  */
L(return):
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* Reached with more than 4 * VEC_SIZE bytes to set.
   %rdi = destination, %rdx = byte count, VEC(0) = fill pattern.
   The first and last 4 * VEC_SIZE bytes are written with unaligned
   stores; whatever lies between is written in the loop in blocks of
   4 * VEC_SIZE starting at a 4 * VEC_SIZE boundary.  */
L(loop_start):
Packit 6c4009
	/* %rcx = (%rdi + 4 * VEC_SIZE) rounded down to a multiple of
	   4 * VEC_SIZE (the rounding is the andq below), i.e. the
	   first such boundary above %rdi.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	andq	$-(VEC_SIZE * 4), %rcx
Packit 6c4009
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
Packit 6c4009
	/* %rdx = end address rounded down to a multiple of
	   4 * VEC_SIZE.  Bytes from there to the end were already
	   written by the tail stores above.  */
	addq	%rdi, %rdx
Packit 6c4009
	andq	$-(VEC_SIZE * 4), %rdx
Packit 6c4009
	/* Nothing left between the head and tail stores.  */
	cmpq	%rdx, %rcx
Packit 6c4009
	je	L(return)
Packit 6c4009
	/* Store 4 VECs per iteration from %rcx until it reaches %rdx.
	   %rcx is a multiple of 4 * VEC_SIZE here, which is what the
	   VMOVA stores (presumably the aligned-store form; the macro
	   is supplied by the including file) depend on.  */
L(loop):
Packit 6c4009
	VMOVA	%VEC(0), (%rcx)
Packit 6c4009
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
Packit 6c4009
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
Packit 6c4009
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
Packit 6c4009
	addq	$(VEC_SIZE * 4), %rcx
Packit 6c4009
	cmpq	%rcx, %rdx
Packit 6c4009
	jne	L(loop)
Packit 6c4009
	VZEROUPPER_SHORT_RETURN
Packit 6c4009
	ret
Packit 6c4009
/* Reached with fewer than VEC_SIZE bytes to set.
   %rdi = destination, %rdx = byte count, VEC(0) = fill pattern.
   VEC_SIZE is at most 64, so the count fits in %dl and byte-sized
   compares are sufficient.  Each size class writes one store at the
   start and one store ending at the last byte; the two overlap unless
   the size is exactly twice the store width.  */
L(less_vec):
Packit 6c4009
	/* Less than 1 VEC.  */
Packit 6c4009
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
Packit 6c4009
#  error Unsupported VEC_SIZE!
Packit 6c4009
# endif
Packit 6c4009
# if VEC_SIZE > 32
Packit 6c4009
	cmpb	$32, %dl
Packit 6c4009
	jae	L(between_32_63)
Packit 6c4009
# endif
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
	cmpb	$16, %dl
Packit 6c4009
	jae	L(between_16_31)
Packit 6c4009
# endif
Packit 6c4009
	/* Below 16 bytes: copy the low 8 bytes of the pattern into
	   %rcx and finish with integer stores.  */
	MOVQ	%xmm0, %rcx
Packit 6c4009
	cmpb	$8, %dl
Packit 6c4009
	jae	L(between_8_15)
Packit 6c4009
	cmpb	$4, %dl
Packit 6c4009
	jae	L(between_4_7)
Packit 6c4009
	/* One compare decides three cases: above 1 -> 2..3 bytes,
	   below 1 -> zero bytes (store nothing), equal -> one byte.  */
	cmpb	$1, %dl
Packit 6c4009
	ja	L(between_2_3)
Packit 6c4009
	jb	1f
Packit 6c4009
	movb	%cl, (%rdi)
Packit 6c4009
1:
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
# if VEC_SIZE > 32
Packit 6c4009
	/* From 32 to 63.  No branch when size == 32.  */
Packit 6c4009
L(between_32_63):
Packit 6c4009
	vmovdqu	%ymm0, -32(%rdi,%rdx)
Packit 6c4009
	vmovdqu	%ymm0, (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
# endif
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
	/* From 16 to 31.  No branch when size == 16.  */
Packit 6c4009
L(between_16_31):
Packit 6c4009
	vmovdqu	%xmm0, -16(%rdi,%rdx)
Packit 6c4009
	vmovdqu	%xmm0, (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
# endif
Packit 6c4009
	/* From 8 to 15.  No branch when size == 8.  */
Packit 6c4009
L(between_8_15):
Packit 6c4009
	movq	%rcx, -8(%rdi,%rdx)
Packit 6c4009
	movq	%rcx, (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
L(between_4_7):
Packit 6c4009
	/* From 4 to 7.  No branch when size == 4.  */
Packit 6c4009
	movl	%ecx, -4(%rdi,%rdx)
Packit 6c4009
	movl	%ecx, (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
L(between_2_3):
Packit 6c4009
	/* From 2 to 3.  No branch when size == 2.  */
Packit 6c4009
	movw	%cx, -2(%rdi,%rdx)
Packit 6c4009
	movw	%cx, (%rdi)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
END (MEMSET_SYMBOL (__memset, unaligned_erms))