/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	movq	%rdi, %rax /* Set return value.  */
	movq	%rsi, %rdx /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shlq	$2, %rdx
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	testq	%rdx, %rdx
	jnz	 L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	movq	%rdx, %rcx
	movzbl	%sil, %eax
	movq	%rdi, %rdx
	rep stosb
	movq	%rdx, %rax
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq  $(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))