Blame sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S

Packit 6c4009
/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8.
Packit 6c4009
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
Packit 6c4009
/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   r3 is never written below, so it still holds 'dst' at every return.

   Dispatch on len:
     len <= 15        scalar copies of 1, 2, 4 and 8 bytes, selected by the
		      low four bits of len.
     16 <= len <= 32  two 16-byte vector copies: the first 16 bytes and the
		      last 16 bytes (they overlap when len < 32).
     33 <= len <= 64  four 16-byte vector copies: the first 32 bytes and the
		      last 32 bytes (they overlap when len < 64).
     len > 64         if dst is not 16-byte aligned, copy 16 bytes and
		      advance both pointers up to the next 16-byte boundary
		      of dst; copy 128 bytes per loop iteration; then an
		      optional 64-byte block; then the tail code.

   Registers written by the code below (not counting whatever CALL_MCOUNT
   expands to): r0, r4-r12, cr0, cr7, ctr, and the vector operands named
   v0-v3 and v10-v12.

   NOTE(review): the vN names are used as operands of the VSX lxvd2x and
   stxvd2x instructions; presumably sysdep.h maps them to plain register
   numbers -- confirm.  */

	.machine power8
ENTRY_TOCLESS (__memcpy_power8_cached, 5)
	CALL_MCOUNT 3

	cmpldi	cr7,r5,15
	bgt	cr7,L(ge_16)

	/* len <= 15.  Test the low four bits of len one at a time and copy
	   1, 2, 4 and finally 8 bytes for each bit that is set.  r4 is the
	   src cursor and r9 the dst cursor.  */
	andi.	r9,r5,0x1	/* Only cr0 is needed; r9 is overwritten next.  */
	mr	r9,r3
	beq	cr0,1f
	lbz	r10,0(r4)
	addi	r9,r3,1
	addi	r4,r4,1
	stb	r10,0(r3)
1:
	andi.	r10,r5,0x2
	beq	cr0,2f
	lhz	r10,0(r4)
	addi	r9,r9,2
	addi	r4,r4,2
	sth	r10,-2(r9)	/* r9 has already been advanced past these bytes.  */
2:
	andi.	r10,r5,0x4
	beq	cr0,3f
	lwz	r10,0(r4)
	addi	r9,r9,4
	addi	r4,r4,4
	stw	r10,-4(r9)	/* r9 has already been advanced past these bytes.  */
3:
	andi.	r10,r5,0x8
	beqlr	cr0
	ld	r10,0(r4)
	std	r10,0(r9)
	blr

	.align 4
L(ge_16):
	cmpldi	cr7,r5,32
	ble	cr7,L(ge_16_le_32)
	cmpldi	cr7,r5,64
	ble	cr7,L(gt_32_le_64)

	/* len > 64 from here on.  r12 becomes the dst cursor.  */

	/* Align dst to 16 bytes.  r9 = dst & 15.  When it is nonzero, copy
	   16 bytes from src to dst and then advance src and the dst cursor
	   by 16 - r9 and reduce len by the same amount, so the bytes past
	   the new cursor are simply copied again later.  */
	andi.	r9,r3,0xf
	mr	r12,r3
	beq	cr0,L(dst_is_align_16)
	lxvd2x	v0,0,r4
	subfic	r12,r9,16
	subf	r5,r12,r5
	add	r4,r4,r12
	add	r12,r3,r12
	stxvd2x	v0,0,r3
L(dst_is_align_16):
	cmpldi	cr7,r5,127
	ble	cr7,L(tail_copy)

	/* Loop setup: r9 = dst cursor used inside the loop, ctr = r5 / 128
	   iterations, r11/r6/r7 = 16/32/48 byte offsets, r0 = r5 rounded
	   down to a multiple of 128, i.e. the byte count the loop copies.  */
	mr	r9,r12
	srdi	r10,r5,7
	li	r11,16
	li	r6,32
	li	r7,48
	mtctr	r10
	clrrdi	r0,r5,7

	/* Main loop, copy 128 bytes each time.  The first 64 bytes go
	   through r4/r9; r8 and r10 are set to src + 64 and dst + 64 before
	   r4 and r9 are advanced, and address the second 64 bytes.  */
	.align 4
L(copy_128):
	lxvd2x	v10,0,r4
	lxvd2x	v11,r4,r11
	addi	r8,r4,64
	addi	r10,r9,64
	lxvd2x	v12,r4,r6
	lxvd2x	v0,r4,r7
	addi	r4,r4,128
	stxvd2x	v10,0,r9
	stxvd2x	v11,r9,r11
	stxvd2x	v12,r9,r6
	stxvd2x	v0,r9,r7
	addi	r9,r9,128
	lxvd2x	v10,0,r8
	lxvd2x	v11,r8,r11
	lxvd2x	v12,r8,r6
	lxvd2x	v0,r8,r7
	stxvd2x	v10,0,r10
	stxvd2x	v11,r10,r11
	stxvd2x	v12,r10,r6
	stxvd2x	v0,r10,r7
	bdnz	L(copy_128)

	/* r4 was advanced inside the loop; bring the dst cursor r12 up by
	   the same amount and keep only the low 7 bits of len (len % 128).  */
	add	r12,r12,r0
	rldicl	r5,r5,0,57
L(tail_copy):
	/* r5 <= 127 here.  If at least 64 bytes remain, copy one 64-byte
	   block and advance both cursors.  */
	cmpldi	cr7,r5,63
	ble	cr7,L(tail_le_64)
	li	r8,16
	li	r10,32
	lxvd2x	v10,0,r4
	li	r9,48
	addi	r5,r5,-64
	lxvd2x	v11,r4,r8
	lxvd2x	v12,r4,r10
	lxvd2x	v0,r4,r9
	addi	r4,r4,64
	stxvd2x	v10,0,r12
	stxvd2x	v11,r12,r8
	stxvd2x	v12,r12,r10
	stxvd2x	v0,r12,r9
	addi	r12,r12,64

L(tail_le_64):
	/* r5 <= 63 here.  More than 32 bytes left: use the four-vector
	   copy.  Nothing left: return.  Otherwise copy the 32 bytes that end
	   at the end of the buffers: r5 - 32 is a zero or negative offset
	   from the cursors, so this may rewrite bytes that were already
	   copied (more than 32 bytes precede the cursors on this path).  */
	cmpldi	cr7,r5,32
	bgt	cr7,L(tail_gt_32_le_64)
	cmpdi	cr7,r5,0
	beqlr	cr7
	addi	r5,r5,-32
	li	r9,16
	add	r8,r4,r5
	add	r10,r12,r5
	lxvd2x	v12,r4,r5
	lxvd2x	v0,r8,r9
	stxvd2x	v12,r12,r5
	stxvd2x	v0,r10,r9
	blr

	/* 16 <= len <= 32: copy the first 16 bytes and the 16 bytes ending
	   at src + len (offset len - 16); both loads are issued before
	   either store.  */
	.align 4
L(ge_16_le_32):
	addi	r5,r5,-16
	lxvd2x	v0,0,r4
	lxvd2x	v1,r4,r5
	stxvd2x	v0,0,r3
	stxvd2x	v1,r3,r5
	blr

	/* 33 <= len <= 64 on entry from the top: set the dst cursor to dst
	   and fall through into the shared four-vector copy.  */
	.align 4
L(gt_32_le_64):
	mr	r12,r3

	/* Copy r5 bytes (more than 32, at most 64) from r4 to r12: the first
	   32 bytes at offsets 0 and 16, and the last 32 bytes at offsets
	   r5 - 32 and r5 - 16.  All four loads are issued before the first
	   store.  */
	.align 4
L(tail_gt_32_le_64):
	li	r9,16
	lxvd2x	v0,0,r4
	addi	r5,r5,-32
	lxvd2x	v1,r4,r9
	add	r8,r4,r5
	lxvd2x	v2,r4,r5
	add	r10,r12,r5
	lxvd2x	v3,r8,r9
	stxvd2x	v0,0,r12
	stxvd2x	v1,r12,r9
	stxvd2x	v2,r12,r5
	stxvd2x	v3,r10,r9
	blr

END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)