/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Allow a wrapper file to assemble this routine under another symbol
   name by defining MEMCPY before including it.  */
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6	/* number of SRC cache lines to prefetch ahead  */
#define ZERO_AHEAD 4		/* number of DST cache lines to zero (dcbz) ahead  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */

.align  7

/* MEMCPY (dst, src, len) -- copy len bytes from src to dst.

   Registers on entry, as used by the code below:
     r3 = dst, r4 = src, r5 = len.
   r3 is never written, so it still holds dst at every blr.

   Working registers:
     r6   running destination pointer (starts as a copy of r3)
     r4   running source pointer
     r5   bytes still to copy
     r7   src - dst offset while copying the unaligned head / the tail;
	  otherwise a loop count or a data temporary
     r8   bytes needed to bring dst to a 16-byte boundary; later data
     r10  whole cachelines copied by the 32-byte stride loop (.Lloop2)
     r11  whole cachelines copied by the prefetching loop (.Lloop);
	  reused inside that loop as the dcbz offset
     r12  dcbt offset (prefetch distance)
     r0, r7, r8, r9  data temporaries inside the copy loops

   From .Ldst_aligned up to .Ldo_lt16 both r4 and r6 are biased by -8
   so that the loops can use "ld 8(rX)" followed by the update forms
   "ldu/stdu 16(rX)".  */

ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* r6 = working dst, r3 stays untouched  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* r8 = -dst  */
	clrldi  r8,r8,64-4	/* low 4 bits = # bytes to 16-byte dst boundary  */
	sub     r7,r4,r3	/* r7 = src - dst, so src byte = r7 + r6  */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	/* Copy 1, 2, 4 and/or 8 bytes as selected by the bits of r8 until
	   dst is 16-byte aligned.  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* len -= bytes copied here  */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* re-derive src from the advanced dst  */

.Ldst_aligned:

	/* The length is unsigned, so use an unsigned compare like the
	   other length checks in this routine.  */
	cmpldi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu  */
	addi	r4,r4,-8	/* prepare for ldu  */

	clrldi  r7,r7,64-7	/* r7 = # bytes to dst cacheline boundary  */
	ble+	cr5,.Llessthancacheline

	cmpldi	cr6,r7,0	/* cr6.eq: dst already cacheline aligned  */
	subf	r5,r7,r5	/* len -= bytes used to reach the boundary  */
	srdi	r7,r7,4		/* # 16-byte chunks up to the boundary  */
	srdi	r10,r5,7	/* number of cache lines to copy  */

	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: the first r11 lines go through
	   the prefetching loop, the last PREFETCH_AHEAD through .Lloop2.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:			/* touch the first r10 source lines  */
	dcbt    r12,r4
	addi    r12,r12,128
	bdnz    .LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7		/* count for .Laligntocacheline  */
	cmpldi	cr1,r5,128	/* cr1.lt: no whole cacheline left  */
	clrldi  r5,r5,64-7	/* r5 = bytes left after the whole cachelines  */
	beq	cr6,.Lcachelinealigned

.Laligntocacheline:		/* 16 bytes per iteration  */
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy while cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128  */

.Louterloop:
	cmpdi   r11,0
	mtctr	r11
	beq-	.Lendloop

	/* r11 now becomes the dcbz offset.  The +8 undoes the -8 bias of
	   r6.  This loop stops PREFETCH_AHEAD lines before the end of the
	   whole-cacheline region and ZERO_AHEAD is smaller than that, so
	   every zeroed line is one that is still going to be copied.  */
	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)	/* 4 register stride copy is optimal  */
	ld	r8, 0x18(r4)	/* to hide 1st level cache latency.  */
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ldu	r0, 0x80(r4)	/* advance src by one cacheline  */
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)	/* advance dst by one cacheline  */

	bdnz	.Lloop

.Lendloop:
	/* Copy the remaining r10 cachelines (already prefetched above)
	   without further dcbt/dcbz.  */
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ldu	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache to do ?  */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* # 16-byte chunks left  */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:		/* 16 bytes per iteration  */
	ld	r8,0x08(r4)
	ldu	r7,0x10(r4)
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 bias on src  */
	addi	r6,r6,8		/* undo the -8 bias on dst  */

.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes  */
	/* Only the low 4 bits of r5 are examined: 8, 4, 2 and 1 byte
	   pieces are copied as selected by those bits.  */
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* r7 = src - dst, so src byte = r7 + r6  */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)