/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Register roles.  Everything lives in x0-x14, which AAPCS64 treats as
   caller-saved; the code in this file never references sp.  */

/* Incoming arguments.  dstin (x0) is never written by this file.  */
#define dstin	x0
#define src	x1
#define count	x2

/* Working pointers: dst is the 16-byte-aligned store cursor used by
   L(copy_long); srcend/dstend are src + count and dstin + count.  */
#define dst	x3
#define srcend	x4
#define dstend	x5

/* Data registers, used as 16-byte pairs (X_l, X_h) with ldp/stp.  The
   *_lw / *_hw names are the 32-bit views of the same registers, used by
   the 4-byte and 1-byte copies in L(copy16).  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13

/* E, F and G are NOT extra registers: they alias src, count, srcend and
   dst.  Loading into them destroys those values, so they are only used
   in tail sequences, after the last use of the value they overwrite.  */
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst

/* Scratch.  MEMMOVE leaves dstin - src here for L(move_long).  */
#define tmp1	x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled. Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap. So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/

/* Default symbol names.  If MEMMOVE and/or MEMCPY is already defined
   when this point is reached, the routines below are emitted under
   those names instead.  */
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* void *MEMMOVE (void *dstin, const void *src, size_t count)

   In:   x0 = dstin, x1 = src, x2 = count.
   Out:  x0 is never written, so it still holds dstin on return.

   Only one case is decided here: a copy of more than 96 bytes whose
   destination starts inside [src, src + count).  That case branches to
   L(move_long), which copies from the end downwards.  Every other case
   falls through into MEMCPY, so MEMCPY must follow immediately.  */
ENTRY_ALIGN (MEMMOVE, 6)

	/* DELOUSE is defined in sysdep.h and is not visible here;
	   presumably it sanitises argument registers 0..2 (e.g. for
	   ILP32) - confirm against sysdep.h.  */
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	/* tmp1 = dstin - src.  Read as an unsigned value, tmp1 < count
	   means dstin lies in [src, src + count).  L(move_long) also
	   tests tmp1 for zero.  */
	sub	tmp1, dstin, src
	cmp	count, 96
	/* If count > 96 (hi), compare tmp1 with count.  Otherwise load
	   NZCV with 2 (carry set) so the b.lo below is not taken.  */
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
/* void *MEMCPY (void *dstin, const void *src, size_t count)

   In:   x0 = dstin, x1 = src, x2 = count.
   Out:  x0 is never written, so it still holds dstin on return.

   Dispatches on count: 0..16 -> L(copy16), 17..96 handled below (with
   65..96 going to L(copy96)), more than 96 -> L(copy_long).  MEMMOVE
   falls through into this entry point.  */
ENTRY (MEMCPY)

	/* DELOUSE is defined in sysdep.h and is not visible here;
	   presumably it sanitises argument registers 0..2 (e.g. for
	   ILP32) - confirm against sysdep.h.  */
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count	/* One past the last source byte.  */
	add	dstend, dstin, count	/* One past the last dest byte.  */
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  tmp1 = count - 1 is in 16..95, so
	   bit 6 set means 65..96 bytes and bit 5 (with bit 6 clear)
	   means 33..64 bytes.  All loads are issued before any store.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	/* 33..64 bytes: also copy the second and second-to-last 16.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	/* 17..32 bytes need only the first and last 16 bytes; the two
	   stores may overlap.  */
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  Each size class loads a chunk from
	   the start and a chunk from the end (the two may overlap) and
	   performs both loads before either store.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: two 8-byte accesses.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here, so bit 2 set means 4..7 bytes: two 4-byte
	   accesses.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1		/* Middle index: 0 for 1, 1 for 2..3.  */
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  Reached from the medium path when bit 6
	   of count - 1 is set; A_l/A_h already hold the first 16 source
	   bytes at that point.

	   E and F alias src/count and srcend/dst.  The E load destroys
	   src and count, and the F load destroys srcend, so every load
	   based on src comes before the E load and the F load is the
	   last use of srcend.  Do not reorder these loads.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96
	   bytes to copy, so copy 16 bytes unaligned and then align.  The
	   loop copies 64 bytes per iteration and loads one iteration
	   ahead of the stores.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* Misalignment of dstin.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]		/* First 16 bytes, stored unaligned.  */
	sub	src, src, tmp1		/* Bias src by the same amount as dst.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src += 64.  */
	/* Besides removing the 16-byte bias, subtract 128: 64 bytes are
	   already held in A..D and L(last64) always copies the final 64
	   bytes from the end.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	/* Store the 64 bytes loaded previously while loading the next
	   64.  Each register pair is stored before it is reloaded.  */
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!	/* Pre-index: dst += 64.  */
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src += 64.  */
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  E aliases src and count, both of
	   which are dead from here on.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
	/* Large overlapping move, copied from the end downwards.  Entered
	   only by the branch in MEMMOVE, with tmp1 = dstin - src and
	   count > 96.  This code sits inside MEMCPY's ENTRY/END range.  */
L(move_long):
	cbz	tmp1, 3f		/* dstin == src: nothing to do.  */

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96
	   bytes to copy, so copy 16 bytes unaligned and then align.  The
	   loop copies 64 bytes per iteration and loads one iteration
	   ahead of the stores.  */

	and	tmp1, dstend, 15	/* Misalignment of dstend.  */
	ldp	D_l, D_h, [srcend, -16]	/* Last 16 bytes, stored unaligned.  */
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!	/* Pre-index: srcend -= 64.  */
	sub	dstend, dstend, tmp1	/* dstend is now 16-byte aligned.  */
	subs	count, count, 128
	b.ls	2f

	/* NOTE(review): this nop looks like padding to position the loop
	   label below - confirm before removing.  */
	nop
1:
	/* Store the 64 bytes loaded previously while loading the next
	   64, moving downwards.  Each pair is stored before its reload.  */
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!	/* Pre-index: dstend -= 64.  */
	ldp	D_l, D_h, [srcend, -64]!	/* Pre-index: srcend -= 64.  */
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  G aliases count and dst, neither of
	   which is read again.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)