Blame sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S

Packit 6c4009
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
Packit 6c4009
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
/* memmove/memcpy/mempcpy is implemented as:
Packit 6c4009
   1. Use overlapping load and store to avoid branch.
Packit 6c4009
   2. Load all sources into registers and store them together to avoid
Packit 6c4009
      possible address overlap between source and destination.
Packit 6c4009
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
Packit 6c4009
      and store them together.
Packit 6c4009
   4. If address of destination > address of source, backward copy
Packit 6c4009
      4 * VEC_SIZE at a time with unaligned load and aligned store.
Packit 6c4009
      Load the first 4 * VEC and last VEC before the loop and store
Packit 6c4009
      them after the loop to support overlapping addresses.
Packit 6c4009
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
Packit 6c4009
      load and aligned store.  Load the last 4 * VEC and first VEC
Packit 6c4009
      before the loop and store them after the loop to support
Packit 6c4009
      overlapping addresses.
Packit 6c4009
   6. If size >= __x86_shared_non_temporal_threshold and there is no
Packit 6c4009
      overlap between destination and source, use non-temporal store
Packit 6c4009
      instead of aligned store.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* Default symbol-name macros.  Each one is defined only when no earlier
   definition exists, and then simply forwards its two arguments to
   MEMMOVE_SYMBOL.  */
#ifndef MEMCPY_SYMBOL
Packit 6c4009
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#ifndef MEMPCPY_SYMBOL
Packit 6c4009
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#ifndef MEMMOVE_CHK_SYMBOL
Packit 6c4009
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Unless already defined, VZEROUPPER expands to the vzeroupper
   instruction when VEC_SIZE is larger than 16 and to nothing
   otherwise.  */
#ifndef VZEROUPPER
Packit 6c4009
# if VEC_SIZE > 16
Packit 6c4009
#  define VZEROUPPER vzeroupper
Packit 6c4009
# else
Packit 6c4009
#  define VZEROUPPER
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Unless already defined, PREFETCH (addr) emits a prefetcht0 of
   addr.  */
#ifndef PREFETCH
Packit 6c4009
# define PREFETCH(addr) prefetcht0 addr
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Assume 64-byte prefetch size.  */
Packit 6c4009
#ifndef PREFETCH_SIZE
Packit 6c4009
# define PREFETCH_SIZE 64
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Number of bytes loaded per iteration of the non-temporal loops
   (four vectors); PREFETCH_ONE_SET is sized from it.  */
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
Packit 6c4009
Packit 6c4009
/* PREFETCH_ONE_SET (dir, base, offset) issues one PREFETCH per
   PREFETCH_SIZE bytes of a PREFETCHED_LOAD_SIZE-byte region: the first
   at `offset' from `base', each following one a further
   dir * PREFETCH_SIZE bytes away.  `base' is a parenthesised address
   operand such as (%rsi); the uses in this file pass 1 or -1 for dir.
   Only PREFETCH_SIZE == 64 with a load size of 1, 2 or 4 prefetch
   units is supported; anything else is a compile-time error.  */
#if PREFETCH_SIZE == 64
Packit 6c4009
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
Packit 6c4009
#  define PREFETCH_ONE_SET(dir, base, offset) \
Packit 6c4009
	PREFETCH ((offset)base)
Packit 6c4009
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
Packit 6c4009
#  define PREFETCH_ONE_SET(dir, base, offset) \
Packit 6c4009
	PREFETCH ((offset)base); \
Packit 6c4009
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
Packit 6c4009
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
Packit 6c4009
#  define PREFETCH_ONE_SET(dir, base, offset) \
Packit 6c4009
	PREFETCH ((offset)base); \
Packit 6c4009
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
Packit 6c4009
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
Packit 6c4009
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
Packit 6c4009
# else
Packit 6c4009
#   error Unsupported PREFETCHED_LOAD_SIZE!
Packit 6c4009
# endif
Packit 6c4009
#else
Packit 6c4009
# error Unsupported PREFETCH_SIZE!
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* SECTION must already be defined at this point; it is applied to
   .text to name the section that receives all of the code below.  */
#ifndef SECTION
Packit 6c4009
# error SECTION is not defined!
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
	.section SECTION(.text),"ax",@progbits
Packit 6c4009
#if defined SHARED && IS_IN (libc)
Packit 6c4009
/* Fortified mempcpy entry.  In: RDX = length, RCX = size of the
   destination object (fourth argument).  Branches to __chk_fail when
   RCX is below RDX (unsigned); otherwise, having no ret or jmp of its
   own, execution continues into the mempcpy entry placed right after
   it.  The compare uses the LP-sized register names so that on x32
   only the low 32 bits take part: the upper halves of size_t arguments
   are not defined there.  On LP64 the encoding is unchanged.  */
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* mempcpy entry.  In: RDI = destination, RSI = source, RDX = length.
   Sets RAX to destination + length, the value mempcpy returns, and
   then joins the shared copy code at L(start).  The addition is done
   at pointer width (32-bit on x32) so that undefined upper bits of the
   length cannot leak into the returned pointer.  */
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
Packit 6c4009
	mov	%RDI_LP, %RAX_LP
Packit 6c4009
	add	%RDX_LP, %RAX_LP
Packit 6c4009
	jmp	L(start)
Packit 6c4009
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
Packit 6c4009
Packit 6c4009
#if defined SHARED && IS_IN (libc)
Packit 6c4009
/* Fortified memmove entry.  In: RDX = length, RCX = size of the
   destination object (fourth argument).  Branches to __chk_fail when
   RCX is below RDX (unsigned); otherwise execution continues into the
   memmove entry placed right after it.  The compare is LP-sized so
   that on x32 only the defined low 32 bits are compared.  */
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* memmove entry.  In: RDI = destination, RSI = source, RDX = length.
   Out: RAX = destination.  Dispatches on the length: below VEC_SIZE
   goes to L(less_vec), above 2 * VEC_SIZE to L(more_2x_vec), and the
   range in between is copied right here with two possibly overlapping
   vector moves.  */
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
Packit 6c4009
	movq	%rdi, %rax
Packit 6c4009
L(start):
#ifdef __ILP32__
	/* On x32 the upper 32 bits of the length register are undefined
	   on entry.  Clear them once here so that every later 64-bit use
	   of RDX (compares and addressing) sees the real length.  */
	movl	%edx, %edx
#endif
Packit 6c4009
	cmp	$VEC_SIZE, %RDX_LP
Packit 6c4009
	jb	L(less_vec)
Packit 6c4009
	cmp	$(VEC_SIZE * 2), %RDX_LP
Packit 6c4009
	ja	L(more_2x_vec)
Packit 6c4009
#if !defined USE_MULTIARCH || !IS_IN (libc)
Packit 6c4009
L(last_2x_vec):
Packit 6c4009
#endif
Packit 6c4009
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	/* Both loads are done before either store, so the copy is also
	   correct when source and destination overlap.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
#if !defined USE_MULTIARCH || !IS_IN (libc)
Packit 6c4009
L(nop):
Packit 6c4009
#endif
Packit 6c4009
	ret
Packit 6c4009
#if defined USE_MULTIARCH && IS_IN (libc)
Packit 6c4009
END (MEMMOVE_SYMBOL (__memmove, unaligned))
Packit 6c4009
Packit 6c4009
/* Pure REP MOVSB variants, assembled only when VEC_SIZE == 16.
   All length handling below is done at LP width: on x32 the upper 32
   bits of a size_t argument are undefined, so the length is tested,
   added and moved as a 32-bit value there (which also zero-extends it
   into RCX for REP MOVSB).  On LP64 the encodings are unchanged.  */
# if VEC_SIZE == 16
Packit 6c4009
/* Fortified entry: go to __chk_fail when RCX (destination object size)
   is below RDX (length); otherwise continue into __mempcpy_erms.  */
ENTRY (__mempcpy_chk_erms)
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (__mempcpy_chk_erms)
Packit 6c4009
Packit 6c4009
/* Only used to measure performance of REP MOVSB.  */
/* In: RDI = destination, RSI = source, RDX = length.
   Out: RAX = destination + length.  */
Packit 6c4009
ENTRY (__mempcpy_erms)
Packit 6c4009
	mov	%RDI_LP, %RAX_LP
Packit 6c4009
	/* Skip zero length.  */
Packit 6c4009
	test	%RDX_LP, %RDX_LP
Packit 6c4009
	jz	2f
Packit 6c4009
	add	%RDX_LP, %RAX_LP
Packit 6c4009
	jmp	L(start_movsb)
Packit 6c4009
END (__mempcpy_erms)
Packit 6c4009
Packit 6c4009
/* Fortified entry: go to __chk_fail when RCX (destination object size)
   is below RDX (length); otherwise continue into __memmove_erms.  */
ENTRY (__memmove_chk_erms)
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (__memmove_chk_erms)
Packit 6c4009
Packit 6c4009
/* In: RDI = destination, RSI = source, RDX = length.
   Out: RAX = destination.  Copies forward with REP MOVSB unless the
   destination starts inside the source range, in which case it copies
   backward with the direction flag set and clears the flag again
   before returning.  */
ENTRY (__memmove_erms)
Packit 6c4009
	movq	%rdi, %rax
Packit 6c4009
	/* Skip zero length.  */
Packit 6c4009
	test	%RDX_LP, %RDX_LP
Packit 6c4009
	jz	2f
Packit 6c4009
L(start_movsb):
Packit 6c4009
	/* RCX = byte count for REP MOVSB.  */
	mov	%RDX_LP, %RCX_LP
Packit 6c4009
	cmp	%RSI_LP, %RDI_LP
Packit 6c4009
	/* Destination below source: a forward copy is always safe.  */
	jb	1f
Packit 6c4009
	/* Source == destination is less common.  */
Packit 6c4009
	je	2f
Packit 6c4009
	/* RDX = end of source.  A destination below it overlaps the
	   source from above and needs the backward copy.  */
	lea	(%rsi,%rcx), %RDX_LP
Packit 6c4009
	cmp	%RDX_LP, %RDI_LP
Packit 6c4009
	jb	L(movsb_backward)
Packit 6c4009
1:
Packit 6c4009
	rep movsb
Packit 6c4009
2:
Packit 6c4009
	ret
Packit 6c4009
L(movsb_backward):
Packit 6c4009
	/* Point RDI/RSI at the last byte of each buffer and copy with
	   the direction flag set.  */
	leaq	-1(%rdi,%rcx), %rdi
Packit 6c4009
	leaq	-1(%rsi,%rcx), %rsi
Packit 6c4009
	std
Packit 6c4009
	rep movsb
Packit 6c4009
	cld
Packit 6c4009
	ret
Packit 6c4009
END (__memmove_erms)
Packit 6c4009
strong_alias (__memmove_erms, __memcpy_erms)
Packit 6c4009
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
# ifdef SHARED
Packit 6c4009
/* Fortified mempcpy entry (unaligned_erms variant).  In: RDX = length,
   RCX = size of the destination object (fourth argument).  Branches to
   __chk_fail when RCX is below RDX (unsigned); otherwise execution
   continues into the mempcpy entry placed right after it.  The compare
   is LP-sized so that on x32 only the defined low 32 bits of the two
   size_t values are compared.  */
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
/* mempcpy entry (unaligned_erms variant).  In: RDI = destination,
   RSI = source, RDX = length.  Sets RAX to destination + length and
   joins the shared copy code at L(start_erms).  The addition is done
   at pointer width so that, on x32, undefined upper bits of the length
   cannot reach the returned pointer.  */
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
Packit 6c4009
	mov	%RDI_LP, %RAX_LP
Packit 6c4009
	add	%RDX_LP, %RAX_LP
Packit 6c4009
	jmp	L(start_erms)
Packit 6c4009
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
Packit 6c4009
Packit 6c4009
# ifdef SHARED
Packit 6c4009
/* Fortified memmove entry (unaligned_erms variant).  In: RDX = length,
   RCX = size of the destination object (fourth argument).  Branches to
   __chk_fail when RCX is below RDX (unsigned); otherwise execution
   continues into the memmove entry placed right after it.  The compare
   is LP-sized so that on x32 only the defined low 32 bits are
   compared.  */
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
Packit 6c4009
	cmp	%RDX_LP, %RCX_LP
Packit 6c4009
	jb	HIDDEN_JUMPTARGET (__chk_fail)
Packit 6c4009
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
/* memmove entry (unaligned_erms variant).  In: RDI = destination,
   RSI = source, RDX = length.  Out: RAX = destination.  Lengths below
   VEC_SIZE go to L(less_vec), lengths above 2 * VEC_SIZE go to
   L(movsb_more_2x_vec), and the range in between is copied right here
   with two possibly overlapping vector moves.  */
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
Packit 6c4009
	movq	%rdi, %rax
Packit 6c4009
L(start_erms):
# ifdef __ILP32__
	/* On x32 the upper 32 bits of the length register are undefined
	   on entry.  Clear them once here so that every later 64-bit use
	   of RDX (compares, addressing, REP MOVSB count) sees the real
	   length.  */
	movl	%edx, %edx
# endif
Packit 6c4009
	cmp	$VEC_SIZE, %RDX_LP
Packit 6c4009
	jb	L(less_vec)
Packit 6c4009
	cmp	$(VEC_SIZE * 2), %RDX_LP
Packit 6c4009
	ja	L(movsb_more_2x_vec)
Packit 6c4009
L(last_2x_vec):
Packit 6c4009
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	/* Both loads are done before either store, so the copy is also
	   correct when source and destination overlap.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
L(return):
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* REP MOVSB path.  Lengths at or above the non-temporal threshold
   are handed to L(more_8x_vec) instead.  Otherwise REP MOVSB is used
   when the destination is below the source or at/after the end of the
   source; a destination that starts inside the source range is handed
   to the vector backward copy.  The threshold variable has type long,
   so the compare is LP-sized to match its width on x32.  */
L(movsb):
Packit 6c4009
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
Packit 6c4009
	jae	L(more_8x_vec)
Packit 6c4009
	cmpq	%rsi, %rdi
Packit 6c4009
	jb	1f
Packit 6c4009
	/* Source == destination is less common.  */
Packit 6c4009
	je	L(nop)
Packit 6c4009
	/* R9 = end of source.  */
	leaq	(%rsi,%rdx), %r9
Packit 6c4009
	cmpq	%r9, %rdi
Packit 6c4009
	/* Avoid slow backward REP MOVSB.  */
Packit 6c4009
	jb	L(more_8x_vec_backward)
Packit 6c4009
1:
Packit 6c4009
	movq	%rdx, %rcx
Packit 6c4009
	rep movsb
Packit 6c4009
L(nop):
Packit 6c4009
	ret
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Copy fewer than VEC_SIZE bytes.  In: RDI = destination, RSI = source,
   RDX = length; RAX has already been set by the entry point.  Only DL
   is examined, which suffices because the length is below VEC_SIZE and
   VEC_SIZE is at most 64.  Every size class loads the first and the
   last chunk of the source before storing either one, so the two
   chunks may overlap each other and the source may overlap the
   destination.  */
L(less_vec):
Packit 6c4009
	/* Less than 1 VEC.  */
Packit 6c4009
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
Packit 6c4009
# error Unsupported VEC_SIZE!
Packit 6c4009
#endif
Packit 6c4009
#if VEC_SIZE > 32
Packit 6c4009
	cmpb	$32, %dl
Packit 6c4009
	jae	L(between_32_63)
Packit 6c4009
#endif
Packit 6c4009
#if VEC_SIZE > 16
Packit 6c4009
	cmpb	$16, %dl
Packit 6c4009
	jae	L(between_16_31)
Packit 6c4009
#endif
Packit 6c4009
	cmpb	$8, %dl
Packit 6c4009
	jae	L(between_8_15)
Packit 6c4009
	cmpb	$4, %dl
Packit 6c4009
	jae	L(between_4_7)
Packit 6c4009
	cmpb	$1, %dl
Packit 6c4009
	ja	L(between_2_3)
	/* Flags are still those of the compare with 1: below means the
	   length is 0 and there is nothing to copy.  */
Packit 6c4009
	jb	1f
	/* Length is exactly 1.  */
Packit 6c4009
	movzbl	(%rsi), %ecx
Packit 6c4009
	movb	%cl, (%rdi)
Packit 6c4009
1:
Packit 6c4009
	ret
Packit 6c4009
#if VEC_SIZE > 32
Packit 6c4009
L(between_32_63):
Packit 6c4009
	/* From 32 to 63.  No branch when size == 32.  */
Packit 6c4009
	vmovdqu	(%rsi), %ymm0
Packit 6c4009
	vmovdqu	-32(%rsi,%rdx), %ymm1
Packit 6c4009
	vmovdqu	%ymm0, (%rdi)
Packit 6c4009
	vmovdqu	%ymm1, -32(%rdi,%rdx)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
#endif
Packit 6c4009
#if VEC_SIZE > 16
Packit 6c4009
	/* From 16 to 31.  No branch when size == 16.  */
	/* Only XMM registers are used on this path and it returns
	   without VZEROUPPER.  */
Packit 6c4009
L(between_16_31):
Packit 6c4009
	vmovdqu	(%rsi), %xmm0
Packit 6c4009
	vmovdqu	-16(%rsi,%rdx), %xmm1
Packit 6c4009
	vmovdqu	%xmm0, (%rdi)
Packit 6c4009
	vmovdqu	%xmm1, -16(%rdi,%rdx)
Packit 6c4009
	ret
Packit 6c4009
#endif
Packit 6c4009
L(between_8_15):
Packit 6c4009
	/* From 8 to 15.  No branch when size == 8.  */
	/* The tail is loaded first because the second load reuses RSI
	   as its destination register.  */
Packit 6c4009
	movq	-8(%rsi,%rdx), %rcx
Packit 6c4009
	movq	(%rsi), %rsi
Packit 6c4009
	movq	%rcx, -8(%rdi,%rdx)
Packit 6c4009
	movq	%rsi, (%rdi)
Packit 6c4009
	ret
Packit 6c4009
L(between_4_7):
Packit 6c4009
	/* From 4 to 7.  No branch when size == 4.  */
Packit 6c4009
	movl	-4(%rsi,%rdx), %ecx
Packit 6c4009
	movl	(%rsi), %esi
Packit 6c4009
	movl	%ecx, -4(%rdi,%rdx)
Packit 6c4009
	movl	%esi, (%rdi)
Packit 6c4009
	ret
Packit 6c4009
L(between_2_3):
Packit 6c4009
	/* From 2 to 3.  No branch when size == 2.  */
Packit 6c4009
	movzwl	-2(%rsi,%rdx), %ecx
Packit 6c4009
	movzwl	(%rsi), %esi
Packit 6c4009
	movw	%cx, -2(%rdi,%rdx)
Packit 6c4009
	movw	%si, (%rdi)
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
#if defined USE_MULTIARCH && IS_IN (libc)
Packit 6c4009
/* Reached from L(start_erms) with more than 2 * VEC_SIZE bytes.
   Lengths above __x86_rep_movsb_threshold take the REP MOVSB path at
   L(movsb); everything else falls through to L(more_2x_vec).  */
L(movsb_more_2x_vec):
Packit Service f35bee
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
Packit 6c4009
	ja	L(movsb)
Packit 6c4009
#endif
Packit 6c4009
/* In: RDI = destination, RSI = source, RDX = length.  More than
   8 * VEC_SIZE goes to L(more_8x_vec), fewer than 4 * VEC_SIZE to
   L(last_4x_vec), and the range in between is copied here.  In both
   straight-line copies every load is issued before the first store,
   which is what makes them safe for overlapping buffers.  */
L(more_2x_vec):
Packit 6c4009
	/* More than 2 * VEC and there may be overlap between destination
Packit 6c4009
	   and source.  */
Packit 6c4009
	cmpq	$(VEC_SIZE * 8), %rdx
Packit 6c4009
	ja	L(more_8x_vec)
Packit 6c4009
	cmpq	$(VEC_SIZE * 4), %rdx
Packit 6c4009
	jb	L(last_4x_vec)
Packit 6c4009
	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
	/* First four vectors are addressed from the start, last four
	   from the end; the two groups may overlap in the middle.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
Packit 6c4009
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
Packit 6c4009
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
L(last_4x_vec):
Packit 6c4009
	/* Copy from 2 * VEC to 4 * VEC. */
	/* Two vectors from the start and two from the end.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
Packit 6c4009
	VMOVU	%VEC(0), (%rdi)
Packit 6c4009
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
Packit 6c4009
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* Copy more than 8 * VEC_SIZE bytes.  In: RDI = destination,
   RSI = source, RDX = length.  A destination above the source is
   handled by L(more_8x_vec_backward), identical pointers return at
   once, and the remaining case (destination below source) is copied
   forward here.
   Register roles in the forward copy:
     VEC(4)       first source vector, stored last through R11
     VEC(5)-(8)   last four source vectors, stored through RCX
     R11          original destination
     RCX          address of the last destination vector
     R8           negative distance to the next destination alignment
                  boundary, in the range -VEC_SIZE .. -1
     RSI/RDI/RDX  running source, aligned destination and remaining
                  length inside the loop.  */
L(more_8x_vec):
Packit 6c4009
	cmpq	%rsi, %rdi
Packit 6c4009
	ja	L(more_8x_vec_backward)
Packit 6c4009
	/* Source == destination is less common.  */
Packit 6c4009
	je	L(nop)
Packit 6c4009
	/* Load the first VEC and last 4 * VEC to support overlapping
Packit 6c4009
	   addresses.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(4)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
Packit 6c4009
	/* Save start and stop of the destination buffer.  */
Packit 6c4009
	movq	%rdi, %r11
Packit 6c4009
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
Packit 6c4009
	/* Align destination for aligned stores in the loop.  Compute
Packit 6c4009
	   how much destination is misaligned.  */
Packit 6c4009
	movq	%rdi, %r8
Packit 6c4009
	andq	$(VEC_SIZE - 1), %r8
Packit 6c4009
	/* Get the negative of offset for alignment.  */
Packit 6c4009
	subq	$VEC_SIZE, %r8
Packit 6c4009
	/* Adjust source.  */
Packit 6c4009
	subq	%r8, %rsi
Packit 6c4009
	/* Adjust destination which should be aligned now.  */
Packit 6c4009
	subq	%r8, %rdi
Packit 6c4009
	/* Adjust length.  */
Packit 6c4009
	addq	%r8, %rdx
Packit 6c4009
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
Packit 6c4009
	/* Check non-temporal store threshold.  */
	/* The threshold variable has type long, so compare at LP width:
	   a 64-bit memory operand would read past the 4-byte object on
	   x32.  On LP64 the encoding is unchanged.  */
Packit 6c4009
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
Packit 6c4009
	ja	L(large_forward)
Packit 6c4009
#endif
Packit 6c4009
L(loop_4x_vec_forward):
Packit 6c4009
	/* Copy 4 * VEC at a time forward.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
Packit 6c4009
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
Packit 6c4009
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
Packit 6c4009
	addq	$(VEC_SIZE * 4), %rsi
Packit 6c4009
	subq	$(VEC_SIZE * 4), %rdx
Packit 6c4009
	VMOVA	%VEC(0), (%rdi)
Packit 6c4009
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	addq	$(VEC_SIZE * 4), %rdi
Packit 6c4009
	/* Loop while more than 4 * VEC remain; the rest is covered by
	   the tail vectors saved before the loop.  */
	cmpq	$(VEC_SIZE * 4), %rdx
Packit 6c4009
	ja	L(loop_4x_vec_forward)
Packit 6c4009
	/* Store the last 4 * VEC.  */
Packit 6c4009
	VMOVU	%VEC(5), (%rcx)
Packit 6c4009
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
Packit 6c4009
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
Packit 6c4009
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
Packit 6c4009
	/* Store the first VEC.  */
Packit 6c4009
	VMOVU	%VEC(4), (%r11)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
/* Backward copy of more than 8 * VEC_SIZE bytes.  In: RDI =
   destination, RSI = source, RDX = length.
   Register roles:
     VEC(4)-(7)   first four source vectors, stored last through RDI
     VEC(8)       last source vector, stored through R11
     R11          address of the last destination vector
     R8           misalignment of R11 within VEC_SIZE
     RCX/R9/RDX   running source, aligned destination and remaining
                  length inside the loop, all moving downwards.  */
L(more_8x_vec_backward):
Packit 6c4009
	/* Load the first 4 * VEC and last VEC to support overlapping
Packit 6c4009
	   addresses.  */
Packit 6c4009
	VMOVU	(%rsi), %VEC(4)
Packit 6c4009
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
Packit 6c4009
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
Packit 6c4009
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
Packit 6c4009
	/* Save stop of the destination buffer.  */
Packit 6c4009
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
Packit 6c4009
	/* Align destination end for aligned stores in the loop.  Compute
Packit 6c4009
	   how much destination end is misaligned.  */
Packit 6c4009
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
Packit 6c4009
	movq	%r11, %r9
Packit 6c4009
	movq	%r11, %r8
Packit 6c4009
	andq	$(VEC_SIZE - 1), %r8
Packit 6c4009
	/* Adjust source.  */
Packit 6c4009
	subq	%r8, %rcx
Packit 6c4009
	/* Adjust the end of destination which should be aligned now.  */
Packit 6c4009
	subq	%r8, %r9
Packit 6c4009
	/* Adjust length.  */
Packit 6c4009
	subq	%r8, %rdx
Packit 6c4009
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
Packit 6c4009
	/* Check non-temporal store threshold.  */
	/* The threshold variable has type long, so compare at LP width:
	   a 64-bit memory operand would read past the 4-byte object on
	   x32.  On LP64 the encoding is unchanged.  */
Packit 6c4009
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
Packit 6c4009
	ja	L(large_backward)
Packit 6c4009
#endif
Packit 6c4009
L(loop_4x_vec_backward):
Packit 6c4009
	/* Copy 4 * VEC at a time backward.  */
Packit 6c4009
	VMOVU	(%rcx), %VEC(0)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
Packit 6c4009
	subq	$(VEC_SIZE * 4), %rcx
Packit 6c4009
	subq	$(VEC_SIZE * 4), %rdx
Packit 6c4009
	VMOVA	%VEC(0), (%r9)
Packit 6c4009
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
Packit 6c4009
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
Packit 6c4009
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
Packit 6c4009
	subq	$(VEC_SIZE * 4), %r9
Packit 6c4009
	/* Loop while more than 4 * VEC remain; the rest is covered by
	   the head vectors saved before the loop.  */
	cmpq	$(VEC_SIZE * 4), %rdx
Packit 6c4009
	ja	L(loop_4x_vec_backward)
Packit 6c4009
	/* Store the first 4 * VEC.  */
Packit 6c4009
	VMOVU	%VEC(4), (%rdi)
Packit 6c4009
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	/* Store the last VEC.  */
Packit 6c4009
	VMOVU	%VEC(8), (%r11)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
Packit 6c4009
/* Non-temporal variants of the two 4 * VEC loops.  They are entered
   from the forward/backward setup code once the adjusted length
   exceeds __x86_shared_non_temporal_threshold, and they rely on the
   registers that setup code prepared (saved head/tail vectors in
   VEC(4)-VEC(8), R11, and RCX or R9 as described there).  Each one
   first checks for overlap and, if there is any, goes back to the
   ordinary loop.  */
L(large_forward):
Packit 6c4009
	/* Don't use non-temporal store if there is overlap between
Packit 6c4009
	   destination and source since destination may be in cache
Packit 6c4009
	   when source is loaded.  */
	/* R10 = end of the remaining destination; a source that starts
	   below it overlaps.  */
Packit 6c4009
	leaq    (%rdi, %rdx), %r10
Packit 6c4009
	cmpq    %r10, %rsi
Packit 6c4009
	jb	L(loop_4x_vec_forward)
Packit 6c4009
L(loop_large_forward):
Packit 6c4009
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	/* Prefetch the source two and three load-sized blocks ahead.  */
Packit 6c4009
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
Packit 6c4009
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
Packit 6c4009
	VMOVU	(%rsi), %VEC(0)
Packit 6c4009
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
Packit 6c4009
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
Packit 6c4009
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
Packit 6c4009
	addq	$PREFETCHED_LOAD_SIZE, %rsi
Packit 6c4009
	subq	$PREFETCHED_LOAD_SIZE, %rdx
Packit 6c4009
	VMOVNT	%VEC(0), (%rdi)
Packit 6c4009
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	addq	$PREFETCHED_LOAD_SIZE, %rdi
Packit 6c4009
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
Packit 6c4009
	ja	L(loop_large_forward)
	/* Fence after the non-temporal stores, before the ordinary
	   tail stores below.  */
Packit 6c4009
	sfence
Packit 6c4009
	/* Store the last 4 * VEC.  */
Packit 6c4009
	VMOVU	%VEC(5), (%rcx)
Packit 6c4009
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
Packit 6c4009
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
Packit 6c4009
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
Packit 6c4009
	/* Store the first VEC.  */
Packit 6c4009
	VMOVU	%VEC(4), (%r11)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
Packit 6c4009
L(large_backward):
Packit 6c4009
	/* Don't use non-temporal store if there is overlap between
Packit 6c4009
	   destination and source since destination may be in cache
Packit 6c4009
	   when source is loaded.  */
	/* R10 = running source pointer + remaining length; fall back to
	   the ordinary loop when the aligned destination pointer R9 is
	   below it.  */
Packit 6c4009
	leaq    (%rcx, %rdx), %r10
Packit 6c4009
	cmpq    %r10, %r9
Packit 6c4009
	jb	L(loop_4x_vec_backward)
Packit 6c4009
L(loop_large_backward):
Packit 6c4009
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	/* Prefetch the source two and three load-sized blocks below the
	   current position.  */
Packit 6c4009
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
Packit 6c4009
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
Packit 6c4009
	VMOVU	(%rcx), %VEC(0)
Packit 6c4009
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
Packit 6c4009
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
Packit 6c4009
	subq	$PREFETCHED_LOAD_SIZE, %rcx
Packit 6c4009
	subq	$PREFETCHED_LOAD_SIZE, %rdx
Packit 6c4009
	VMOVNT	%VEC(0), (%r9)
Packit 6c4009
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
Packit 6c4009
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
Packit 6c4009
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
Packit 6c4009
	subq	$PREFETCHED_LOAD_SIZE, %r9
Packit 6c4009
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
Packit 6c4009
	ja	L(loop_large_backward)
	/* Fence after the non-temporal stores, before the ordinary
	   head stores below.  */
Packit 6c4009
	sfence
Packit 6c4009
	/* Store the first 4 * VEC.  */
Packit 6c4009
	VMOVU	%VEC(4), (%rdi)
Packit 6c4009
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
Packit 6c4009
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
Packit 6c4009
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
Packit 6c4009
	/* Store the last VEC.  */
Packit 6c4009
	VMOVU	%VEC(8), (%r11)
Packit 6c4009
	VZEROUPPER
Packit 6c4009
	ret
Packit 6c4009
#endif
Packit 6c4009
/* Close the function that contains the shared copy code above.  */
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
Packit 6c4009
Packit 6c4009
/* Alias list: each memcpy symbol below is declared with strong_alias
   as another name for the corresponding memmove symbol, so memcpy
   shares the overlap-safe memmove code.  The unaligned_erms aliases
   exist only in multiarch libc builds and the _chk aliases only in
   SHARED builds; the plain unaligned memcpy alias is always
   emitted.  */
#if IS_IN (libc)
Packit 6c4009
# ifdef USE_MULTIARCH
Packit 6c4009
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
Packit 6c4009
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
Packit 6c4009
#  ifdef SHARED
Packit 6c4009
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
Packit 6c4009
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
Packit 6c4009
#  endif
Packit 6c4009
# endif
Packit 6c4009
# ifdef SHARED
Packit 6c4009
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
Packit 6c4009
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
Packit 6c4009
	      MEMCPY_SYMBOL (__memcpy, unaligned))