Blame sysdeps/mips/memcpy.S

Packit 6c4009
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#ifdef ANDROID_CHANGES
Packit 6c4009
# include "machine/asm.h"
Packit 6c4009
# include "machine/regdef.h"
Packit 6c4009
# define USE_MEMMOVE_FOR_OVERLAP
Packit 6c4009
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
Packit 6c4009
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
Packit 6c4009
#elif _LIBC
Packit 6c4009
# include <sysdep.h>
Packit 6c4009
# include <regdef.h>
Packit 6c4009
# include <sys/asm.h>
Packit 6c4009
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
Packit 6c4009
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
Packit 6c4009
#elif defined _COMPILING_NEWLIB
Packit 6c4009
# include "machine/asm.h"
Packit 6c4009
# include "machine/regdef.h"
Packit 6c4009
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
Packit 6c4009
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
Packit 6c4009
#else
Packit 6c4009
# include <regdef.h>
Packit 6c4009
# include <sys/asm.h>
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
Packit 6c4009
    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
Packit 6c4009
# ifndef DISABLE_PREFETCH
Packit 6c4009
#  define USE_PREFETCH
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
Packit 6c4009
# ifndef DISABLE_DOUBLE
Packit 6c4009
#  define USE_DOUBLE
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Some asm.h files do not have the L macro definition.  */
Packit 6c4009
#ifndef L
Packit 6c4009
# if _MIPS_SIM == _ABIO32
Packit 6c4009
#  define L(label) $L ## label
Packit 6c4009
# else
Packit 6c4009
#  define L(label) .L ## label
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
Packit 6c4009
#ifndef PTR_ADDIU
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
#  define PTR_ADDIU	daddiu
Packit 6c4009
# else
Packit 6c4009
#  define PTR_ADDIU	addiu
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Some asm.h files do not have the PTR_SRA macro definition.  */
Packit 6c4009
#ifndef PTR_SRA
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
#  define PTR_SRA		dsra
Packit 6c4009
# else
Packit 6c4009
#  define PTR_SRA		sra
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* New R6 instructions that may not be in asm.h.  */
Packit 6c4009
#ifndef PTR_LSA
Packit 6c4009
# if _MIPS_SIM == _ABI64
Packit 6c4009
#  define PTR_LSA	dlsa
Packit 6c4009
# else
Packit 6c4009
#  define PTR_LSA	lsa
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
Packit 6c4009
 * prefetches appears to offer a slight performance advantage.
Packit 6c4009
 *
Packit 6c4009
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
Packit 6c4009
 * or PREFETCH_STORE_STREAMED offers a large performance advantage
Packit 6c4009
 * but PREPAREFORSTORE has some special restrictions to consider.
Packit 6c4009
 *
Packit 6c4009
 * Prefetch with the 'prepare for store' hint does not copy a memory
Packit 6c4009
 * location into the cache, it just allocates a cache line and zeros
Packit 6c4009
 * it out.  This means that if you do not write to the entire cache
Packit 6c4009
 * line before writing it out to memory some data will get zero'ed out
Packit 6c4009
 * when the cache line is written back to memory and data will be lost.
Packit 6c4009
 *
Packit 6c4009
 * Also if you are using this memcpy to copy overlapping buffers it may
Packit 6c4009
 * not behave correctly when using the 'prepare for store' hint.  If you
Packit 6c4009
 * use the 'prepare for store' prefetch on a memory area that is in the
Packit 6c4009
 * memcpy source (as well as the memcpy destination), then you will get
Packit 6c4009
 * some data zero'ed out before you have a chance to read it and data will
Packit 6c4009
 * be lost.
Packit 6c4009
 *
Packit 6c4009
 * If you are going to use this memcpy routine with the 'prepare for store'
Packit 6c4009
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
Packit 6c4009
 * the problem of running memcpy on overlapping buffers.
Packit 6c4009
 *
Packit 6c4009
 * There are ifdef'ed sections of this memcpy to make sure that it does not
Packit 6c4009
 * do prefetches on cache lines that are not going to be completely written.
Packit 6c4009
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
Packit 6c4009
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
Packit 6c4009
 * 32 bytes and if the cache line is larger it will not work correctly.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
#ifdef USE_PREFETCH
Packit 6c4009
# define PREFETCH_HINT_LOAD		0
Packit 6c4009
# define PREFETCH_HINT_STORE		1
Packit 6c4009
# define PREFETCH_HINT_LOAD_STREAMED	4
Packit 6c4009
# define PREFETCH_HINT_STORE_STREAMED	5
Packit 6c4009
# define PREFETCH_HINT_LOAD_RETAINED	6
Packit 6c4009
# define PREFETCH_HINT_STORE_RETAINED	7
Packit 6c4009
# define PREFETCH_HINT_WRITEBACK_INVAL	25
Packit 6c4009
# define PREFETCH_HINT_PREPAREFORSTORE	30
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * If we have not picked out what hints to use at this point use the
Packit 6c4009
 * standard load and store prefetch hints.
Packit 6c4009
 */
Packit 6c4009
# ifndef PREFETCH_STORE_HINT
Packit 6c4009
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
Packit 6c4009
# endif
Packit 6c4009
# ifndef PREFETCH_LOAD_HINT
Packit 6c4009
#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
Packit 6c4009
# endif
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
Packit 6c4009
 * get 64 bytes in that case.  The assumption is that each individual
Packit 6c4009
 * prefetch brings in 32 bytes.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
#  define PREFETCH_CHUNK 64
Packit 6c4009
#  define PREFETCH_FOR_LOAD(chunk, reg) \
Packit 6c4009
 pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
Packit 6c4009
 pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
Packit 6c4009
#  define PREFETCH_FOR_STORE(chunk, reg) \
Packit 6c4009
 pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
Packit 6c4009
 pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
Packit 6c4009
# else
Packit 6c4009
#  define PREFETCH_CHUNK 32
Packit 6c4009
#  define PREFETCH_FOR_LOAD(chunk, reg) \
Packit 6c4009
 pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
Packit 6c4009
#  define PREFETCH_FOR_STORE(chunk, reg) \
Packit 6c4009
 pref PREFETCH_STORE_HINT, (chunk)*32(reg)
Packit 6c4009
# endif
Packit 6c4009
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
Packit 6c4009
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
Packit 6c4009
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
Packit 6c4009
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
Packit 6c4009
 * used then MAX_PREFETCH_SIZE does not matter.  */
Packit 6c4009
# define MAX_PREFETCH_SIZE 128
Packit 6c4009
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
Packit 6c4009
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
Packit 6c4009
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
Packit 6c4009
 * we actually do two prefetches in that case, one 32 bytes after the other.  */
Packit 6c4009
/* The value is wrapped in an outer pair of parentheses so that it expands
 * as a single term no matter which operators surround the expansion.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT ((5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE)
# else
#  define PREFETCH_LIMIT ((5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE)
# endif
Packit 6c4009
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  We start copies with an offset
 * of 4 to avoid this situation when using PREPAREFORSTORE.  The check
 * fires when four chunks are smaller than one maximum-sized prefetch, so
 * the remedy is a larger PREFETCH_CHUNK or a smaller MAX_PREFETCH_SIZE.  */
#error "PREFETCH_CHUNK is too small and/or MAX_PREFETCH_SIZE is too large."
# endif
Packit 6c4009
#else /* USE_PREFETCH not defined */
Packit 6c4009
# define PREFETCH_FOR_LOAD(offset, reg)
Packit 6c4009
# define PREFETCH_FOR_STORE(offset, reg)
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#if __mips_isa_rev > 5
Packit 6c4009
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
#  undef PREFETCH_STORE_HINT
Packit 6c4009
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
Packit 6c4009
# endif
Packit 6c4009
# define R6_CODE
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Allow the routine to be named something else if desired.  */
Packit 6c4009
#ifndef MEMCPY_NAME
Packit 6c4009
# define MEMCPY_NAME memcpy
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* We use these 32/64 bit registers as temporaries to do the copying.  */
Packit 6c4009
#define REG0 t0
Packit 6c4009
#define REG1 t1
Packit 6c4009
#define REG2 t2
Packit 6c4009
#define REG3 t3
Packit 6c4009
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
Packit 6c4009
# define REG4 t4
Packit 6c4009
# define REG5 t5
Packit 6c4009
# define REG6 t6
Packit 6c4009
# define REG7 t7
Packit 6c4009
#else
Packit 6c4009
# define REG4 ta0
Packit 6c4009
# define REG5 ta1
Packit 6c4009
# define REG6 ta2
Packit 6c4009
# define REG7 ta3
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* We load/store 64 bits at a time when USE_DOUBLE is true.
Packit 6c4009
 * The C_ prefix stands for CHUNK and is used to avoid macro name
Packit 6c4009
 * conflicts with system header files.  */
Packit 6c4009
Packit 6c4009
#ifdef USE_DOUBLE
Packit 6c4009
# define C_ST	sd
Packit 6c4009
# define C_LD	ld
Packit 6c4009
# ifdef __MIPSEB
Packit 6c4009
#  define C_LDHI	ldl	/* high part is left in big-endian	*/
Packit 6c4009
#  define C_STHI	sdl	/* high part is left in big-endian	*/
Packit 6c4009
#  define C_LDLO	ldr	/* low part is right in big-endian	*/
Packit 6c4009
#  define C_STLO	sdr	/* low part is right in big-endian	*/
Packit 6c4009
# else
Packit 6c4009
#  define C_LDHI	ldr	/* high part is right in little-endian	*/
Packit 6c4009
#  define C_STHI	sdr	/* high part is right in little-endian	*/
Packit 6c4009
#  define C_LDLO	ldl	/* low part is left in little-endian	*/
Packit 6c4009
#  define C_STLO	sdl	/* low part is left in little-endian	*/
Packit 6c4009
# endif
Packit 6c4009
# define C_ALIGN	dalign	/* r6 align instruction			*/
Packit 6c4009
#else
Packit 6c4009
# define C_ST	sw
Packit 6c4009
# define C_LD	lw
Packit 6c4009
# ifdef __MIPSEB
Packit 6c4009
#  define C_LDHI	lwl	/* high part is left in big-endian	*/
Packit 6c4009
#  define C_STHI	swl	/* high part is left in big-endian	*/
Packit 6c4009
#  define C_LDLO	lwr	/* low part is right in big-endian	*/
Packit 6c4009
#  define C_STLO	swr	/* low part is right in big-endian	*/
Packit 6c4009
# else
Packit 6c4009
#  define C_LDHI	lwr	/* high part is right in little-endian	*/
Packit 6c4009
#  define C_STHI	swr	/* high part is right in little-endian	*/
Packit 6c4009
#  define C_LDLO	lwl	/* low part is left in little-endian	*/
Packit 6c4009
#  define C_STLO	swl	/* low part is left in little-endian	*/
Packit 6c4009
# endif
Packit 6c4009
# define C_ALIGN	align	/* r6 align instruction			*/
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Bookkeeping values for 32 vs. 64 bit mode.  */
Packit 6c4009
#ifdef USE_DOUBLE
Packit 6c4009
# define NSIZE 8
Packit 6c4009
# define NSIZEMASK 0x3f
Packit 6c4009
# define NSIZEDMASK 0x7f
Packit 6c4009
#else
Packit 6c4009
# define NSIZE 4
Packit 6c4009
# define NSIZEMASK 0x1f
Packit 6c4009
# define NSIZEDMASK 0x3f
Packit 6c4009
#endif
Packit 6c4009
#define UNIT(unit) ((unit)*NSIZE)
Packit 6c4009
#define UNITM1(unit) (((unit)*NSIZE)-1)
Packit 6c4009
Packit 6c4009
#ifdef ANDROID_CHANGES
Packit 6c4009
LEAF(MEMCPY_NAME, 0)
Packit 6c4009
#else
Packit 6c4009
LEAF(MEMCPY_NAME)
Packit 6c4009
#endif
Packit 6c4009
	.set	nomips16
Packit 6c4009
	.set	noreorder
Packit 6c4009
/*
Packit 6c4009
 * Below we handle the case where memcpy is called with overlapping src and dst.
Packit 6c4009
 * Although memcpy is not required to handle this case, some parts of Android
Packit 6c4009
 * like Skia rely on such usage. We call memmove to handle such cases.
Packit 6c4009
 */
Packit 6c4009
#ifdef USE_MEMMOVE_FOR_OVERLAP
/*
 * Overlap test: compute t0 = |dst - src| with the branch-free identity
 *     abs(x) = (x ^ s) - s,   s = x >> (pointer width - 1)   (arithmetic)
 * and pass the call on to memmove when that distance is below the length
 * in a2.  The shift count has to be the pointer width minus one so that s
 * is exactly 0 or -1; a fixed count of 31 is only correct for 32-bit
 * pointers.
 *
 * In:  a0 = dst, a1 = src, a2 = length (all left untouched here).
 * Uses t0, t1, t2 and t9 as scratch.
 */
# ifdef _MIPS_SZPTR
#  define MEMCPY_PTR_SIGN_SHIFT	(_MIPS_SZPTR - 1)
# else
#  define MEMCPY_PTR_SIGN_SHIFT	31	/* historical value */
# endif
	PTR_SUBU t0,a0,a1		/* t0 = dst - src */
	PTR_SRA	t2,t0,MEMCPY_PTR_SIGN_SHIFT /* t2 = 0 or -1 (sign mask) */
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2		/* t0 = |dst - src| */
	sltu	t2,t0,a2		/* t2 = 1 when distance < length */
	beq	t2,zero,L(memcpy)	/* no overlap: do the normal copy */
	la	t9,memmove		/* follows the branch under noreorder */
	jr	t9			/* ra is unchanged, so memmove returns
					   straight to our caller */
	 nop
L(memcpy):
#endif
Packit 6c4009
/*
Packit 6c4009
 * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
Packit 6c4009
 * size, copy dst pointer to v0 for the return value.
Packit 6c4009
 */
Packit 6c4009
	slti	t2,a2,(2 * NSIZE)
Packit 6c4009
	bne	t2,zero,L(lasts)
Packit 6c4009
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
Packit 6c4009
	move	v0,zero
Packit 6c4009
#else
Packit 6c4009
	move	v0,a0
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
#ifndef R6_CODE
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * If src and dst have different alignments, go to L(unaligned), if they
Packit 6c4009
 * have the same alignment (but are not actually aligned) do a partial
Packit 6c4009
 * load/store to make them aligned.  If they are both already aligned
Packit 6c4009
 * we can start copying at L(aligned).
Packit 6c4009
 */
Packit 6c4009
	xor	t8,a1,a0
Packit 6c4009
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
Packit 6c4009
	bne	t8,zero,L(unaligned)
Packit 6c4009
	PTR_SUBU a3, zero, a0
Packit 6c4009
Packit 6c4009
	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
Packit 6c4009
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
Packit 6c4009
	PTR_SUBU a2,a2,a3		/* a2 is the remining bytes count */
Packit 6c4009
Packit 6c4009
	C_LDHI	t8,0(a1)
Packit 6c4009
	PTR_ADDU a1,a1,a3
Packit 6c4009
	C_STHI	t8,0(a0)
Packit 6c4009
	PTR_ADDU a0,a0,a3
Packit 6c4009
Packit 6c4009
#else /* R6_CODE */
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * Align the destination and hope that the source gets aligned too.  If it
Packit 6c4009
 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
Packit 6c4009
 * align instruction.
Packit 6c4009
 */
Packit 6c4009
	andi	t8,a0,7
Packit 6c4009
	lapc	t9,L(atable)
Packit 6c4009
	PTR_LSA	t9,t8,t9,2
Packit 6c4009
	jrc	t9
Packit 6c4009
L(atable):
Packit 6c4009
	bc	L(lb0)
Packit 6c4009
	bc	L(lb7)
Packit 6c4009
	bc	L(lb6)
Packit 6c4009
	bc	L(lb5)
Packit 6c4009
	bc	L(lb4)
Packit 6c4009
	bc	L(lb3)
Packit 6c4009
	bc	L(lb2)
Packit 6c4009
	bc	L(lb1)
Packit 6c4009
L(lb7):
Packit 6c4009
	lb	a3, 6(a1)
Packit 6c4009
	sb	a3, 6(a0)
Packit 6c4009
L(lb6):
Packit 6c4009
	lb	a3, 5(a1)
Packit 6c4009
	sb	a3, 5(a0)
Packit 6c4009
L(lb5):
Packit 6c4009
	lb	a3, 4(a1)
Packit 6c4009
	sb	a3, 4(a0)
Packit 6c4009
L(lb4):
Packit 6c4009
	lb	a3, 3(a1)
Packit 6c4009
	sb	a3, 3(a0)
Packit 6c4009
L(lb3):
Packit 6c4009
	lb	a3, 2(a1)
Packit 6c4009
	sb	a3, 2(a0)
Packit 6c4009
L(lb2):
Packit 6c4009
	lb	a3, 1(a1)
Packit 6c4009
	sb	a3, 1(a0)
Packit 6c4009
L(lb1):
Packit 6c4009
	lb	a3, 0(a1)
Packit 6c4009
	sb	a3, 0(a0)
Packit 6c4009
Packit 6c4009
	li	t9,8
Packit 6c4009
	subu	t8,t9,t8
Packit 6c4009
	PTR_SUBU a2,a2,t8
Packit 6c4009
	PTR_ADDU a0,a0,t8
Packit 6c4009
	PTR_ADDU a1,a1,t8
Packit 6c4009
L(lb0):
Packit 6c4009
Packit 6c4009
	andi	t8,a1,(NSIZE-1)
Packit 6c4009
	lapc	t9,L(jtable)
Packit 6c4009
	PTR_LSA	t9,t8,t9,2
Packit 6c4009
	jrc	t9
Packit 6c4009
L(jtable):
Packit 6c4009
        bc      L(aligned)
Packit 6c4009
        bc      L(r6_unaligned1)
Packit 6c4009
        bc      L(r6_unaligned2)
Packit 6c4009
        bc      L(r6_unaligned3)
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
        bc      L(r6_unaligned4)
Packit 6c4009
        bc      L(r6_unaligned5)
Packit 6c4009
        bc      L(r6_unaligned6)
Packit 6c4009
        bc      L(r6_unaligned7)
Packit 6c4009
# endif
Packit 6c4009
#endif /* R6_CODE */
Packit 6c4009
Packit 6c4009
L(aligned):
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * Now dst/src are both aligned to (word or double word) aligned addresses
Packit 6c4009
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
Packit 6c4009
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
Packit 6c4009
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
Packit 6c4009
 * equals a3.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
Packit 6c4009
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
Packit 6c4009
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
Packit 6c4009
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
Packit 6c4009
Packit 6c4009
/* When in the loop we may prefetch with the 'prepare to store' hint,
Packit 6c4009
 * in this case the a0+x should not be past the "t0-32" address.  This
Packit 6c4009
 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
Packit 6c4009
 * for x=64 the last "safe" a0 address is "t0-96" In the current version we
Packit 6c4009
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
Packit 6c4009
 */
Packit 6c4009
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
Packit 6c4009
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
Packit 6c4009
#endif
Packit 6c4009
	PREFETCH_FOR_LOAD  (0, a1)
Packit 6c4009
	PREFETCH_FOR_LOAD  (1, a1)
Packit 6c4009
	PREFETCH_FOR_LOAD  (2, a1)
Packit 6c4009
	PREFETCH_FOR_LOAD  (3, a1)
Packit 6c4009
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	PREFETCH_FOR_STORE (1, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (2, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (3, a0)
Packit 6c4009
#endif
Packit 6c4009
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
Packit 6c4009
# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
Packit 6c4009
	sltu    v1,t9,a0
Packit 6c4009
	bgtz    v1,L(skip_set)
Packit 6c4009
	nop
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
Packit 6c4009
L(skip_set):
Packit 6c4009
# else
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
Packit 6c4009
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
	PTR_ADDIU v0,v0,32
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
L(loop16w):
Packit 6c4009
	C_LD	t0,UNIT(0)(a1)
Packit 6c4009
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
Packit 6c4009
	bgtz	v1,L(skip_pref)
Packit 6c4009
#endif
Packit 6c4009
	C_LD	t1,UNIT(1)(a1)
Packit 6c4009
#ifdef R6_CODE
Packit 6c4009
	PREFETCH_FOR_STORE (2, a0)
Packit 6c4009
#else
Packit 6c4009
	PREFETCH_FOR_STORE (4, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (5, a0)
Packit 6c4009
#endif
Packit 6c4009
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
	PTR_ADDIU v0,v0,32
Packit 6c4009
# endif
Packit 6c4009
#endif
Packit 6c4009
L(skip_pref):
Packit 6c4009
	C_LD	REG2,UNIT(2)(a1)
Packit 6c4009
	C_LD	REG3,UNIT(3)(a1)
Packit 6c4009
	C_LD	REG4,UNIT(4)(a1)
Packit 6c4009
	C_LD	REG5,UNIT(5)(a1)
Packit 6c4009
	C_LD	REG6,UNIT(6)(a1)
Packit 6c4009
	C_LD	REG7,UNIT(7)(a1)
Packit 6c4009
#ifdef R6_CODE
Packit 6c4009
	PREFETCH_FOR_LOAD (3, a1)
Packit 6c4009
#else
Packit 6c4009
	PREFETCH_FOR_LOAD (4, a1)
Packit 6c4009
#endif
Packit 6c4009
	C_ST	t0,UNIT(0)(a0)
Packit 6c4009
	C_ST	t1,UNIT(1)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(2)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(3)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(4)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(5)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(6)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(7)(a0)
Packit 6c4009
Packit 6c4009
	C_LD	t0,UNIT(8)(a1)
Packit 6c4009
	C_LD	t1,UNIT(9)(a1)
Packit 6c4009
	C_LD	REG2,UNIT(10)(a1)
Packit 6c4009
	C_LD	REG3,UNIT(11)(a1)
Packit 6c4009
	C_LD	REG4,UNIT(12)(a1)
Packit 6c4009
	C_LD	REG5,UNIT(13)(a1)
Packit 6c4009
	C_LD	REG6,UNIT(14)(a1)
Packit 6c4009
	C_LD	REG7,UNIT(15)(a1)
Packit 6c4009
#ifndef R6_CODE
Packit 6c4009
        PREFETCH_FOR_LOAD (5, a1)
Packit 6c4009
#endif
Packit 6c4009
	C_ST	t0,UNIT(8)(a0)
Packit 6c4009
	C_ST	t1,UNIT(9)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(10)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(11)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(12)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(13)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(14)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(15)(a0)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
Packit 6c4009
	bne	a0,a3,L(loop16w)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
Packit 6c4009
	move	a2,t8
Packit 6c4009
Packit 6c4009
/* Here we have src and dest word-aligned but less than 64-bytes or
Packit 6c4009
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
Packit 6c4009
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
Packit 6c4009
 * the copy.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
L(chkw):
Packit 6c4009
	PREFETCH_FOR_LOAD (0, a1)
Packit 6c4009
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
Packit 6c4009
				/* t8 is the remainder count past 32-bytes */
Packit 6c4009
	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
Packit 6c4009
	nop
Packit 6c4009
	C_LD	t0,UNIT(0)(a1)
Packit 6c4009
	C_LD	t1,UNIT(1)(a1)
Packit 6c4009
	C_LD	REG2,UNIT(2)(a1)
Packit 6c4009
	C_LD	REG3,UNIT(3)(a1)
Packit 6c4009
	C_LD	REG4,UNIT(4)(a1)
Packit 6c4009
	C_LD	REG5,UNIT(5)(a1)
Packit 6c4009
	C_LD	REG6,UNIT(6)(a1)
Packit 6c4009
	C_LD	REG7,UNIT(7)(a1)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(8)
Packit 6c4009
	C_ST	t0,UNIT(0)(a0)
Packit 6c4009
	C_ST	t1,UNIT(1)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(2)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(3)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(4)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(5)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(6)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(7)(a0)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(8)
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
Packit 6c4009
 * copy one word (or double word) at a time.  Set a2 to count how many
Packit 6c4009
 * bytes we have to copy after all the word (or double word) chunks are
Packit 6c4009
 * copied and a3 to the dst pointer after all the (d)word chunks have
Packit 6c4009
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
Packit 6c4009
 */
Packit 6c4009
L(chk1w):
Packit 6c4009
	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
Packit 6c4009
	beq	a2,t8,L(lastw)
Packit 6c4009
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
Packit 6c4009
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
Packit 6c4009
Packit 6c4009
/* copying in words (4-byte or 8-byte chunks) */
Packit 6c4009
L(wordCopy_loop):
Packit 6c4009
	C_LD	REG3,UNIT(0)(a1)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(1)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(1)
Packit 6c4009
	bne	a0,a3,L(wordCopy_loop)
Packit 6c4009
	C_ST	REG3,UNIT(-1)(a0)
Packit 6c4009
Packit 6c4009
/* If we have been copying double words, see if we can copy a single word
Packit 6c4009
   before doing byte copies.  We can have, at most, one word to copy.  */
Packit 6c4009
Packit 6c4009
L(lastw):
Packit 6c4009
#ifdef USE_DOUBLE
Packit 6c4009
	andi    t8,a2,3		/* a2 is the remainder past 4 byte chunks.  */
Packit 6c4009
	beq	t8,a2,L(lastb)
Packit 6c4009
	move	a2,t8
Packit 6c4009
	lw	REG3,0(a1)
Packit 6c4009
	sw	REG3,0(a0)
Packit 6c4009
	PTR_ADDIU a0,a0,4
Packit 6c4009
	PTR_ADDIU a1,a1,4
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Copy the last 8 (or 16) bytes */
Packit 6c4009
L(lastb):
Packit 6c4009
	blez	a2,L(leave)
Packit 6c4009
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
Packit 6c4009
L(lastbloop):
Packit 6c4009
	lb	v1,0(a1)
Packit 6c4009
	PTR_ADDIU a0,a0,1
Packit 6c4009
	PTR_ADDIU a1,a1,1
Packit 6c4009
	bne	a0,a3,L(lastbloop)
Packit 6c4009
	sb	v1,-1(a0)
Packit 6c4009
L(leave):
Packit 6c4009
	j	ra
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
Packit 6c4009
   whether or not USE_DOUBLE is defined.  Instead of just doing byte
Packit 6c4009
   copies, check the alignment and size and use lw/sw if possible.
Packit 6c4009
   Otherwise, do byte copies.  */
Packit 6c4009
Packit 6c4009
L(lasts):
Packit 6c4009
	andi	t8,a2,3
Packit 6c4009
	beq	t8,a2,L(lastb)
Packit 6c4009
Packit 6c4009
	andi	t9,a0,3
Packit 6c4009
	bne	t9,zero,L(lastb)
Packit 6c4009
	andi	t9,a1,3
Packit 6c4009
	bne	t9,zero,L(lastb)
Packit 6c4009
Packit 6c4009
	PTR_SUBU a3,a2,t8
Packit 6c4009
	PTR_ADDU a3,a0,a3
Packit 6c4009
Packit 6c4009
L(wcopy_loop):
Packit 6c4009
	lw	REG3,0(a1)
Packit 6c4009
	PTR_ADDIU a0,a0,4
Packit 6c4009
	PTR_ADDIU a1,a1,4
Packit 6c4009
	bne	a0,a3,L(wcopy_loop)
Packit 6c4009
	sw	REG3,-4(a0)
Packit 6c4009
Packit 6c4009
	b	L(lastb)
Packit 6c4009
	move	a2,t8
Packit 6c4009
Packit 6c4009
#ifndef R6_CODE
Packit 6c4009
/*
Packit 6c4009
 * UNALIGNED case, got here with a3 = "negu a0"
Packit 6c4009
 * This code is nearly identical to the aligned code above
Packit 6c4009
 * but only the destination (not the source) gets aligned
Packit 6c4009
 * so we need to do partial loads of the source followed
Packit 6c4009
 * by normal stores to the destination (once we have aligned
Packit 6c4009
 * the destination).
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
L(unaligned):
Packit 6c4009
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
Packit 6c4009
	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
Packit 6c4009
	PTR_SUBU a2,a2,a3	/* a2 is the remining bytes count */
Packit 6c4009
Packit 6c4009
	C_LDHI	v1,UNIT(0)(a1)
Packit 6c4009
	C_LDLO	v1,UNITM1(1)(a1)
Packit 6c4009
	PTR_ADDU a1,a1,a3
Packit 6c4009
	C_STHI	v1,UNIT(0)(a0)
Packit 6c4009
	PTR_ADDU a0,a0,a3
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
 *  Now the destination (but not the source) is aligned
Packit 6c4009
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
Packit 6c4009
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
Packit 6c4009
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
Packit 6c4009
 * equals a3.
Packit 6c4009
 */
Packit 6c4009
Packit 6c4009
L(ua_chk16w):
Packit 6c4009
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
Packit 6c4009
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
Packit 6c4009
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
Packit 6c4009
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
Packit 6c4009
Packit 6c4009
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
Packit 6c4009
	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
Packit 6c4009
# endif
Packit 6c4009
	PREFETCH_FOR_LOAD  (0, a1)
Packit 6c4009
	PREFETCH_FOR_LOAD  (1, a1)
Packit 6c4009
	PREFETCH_FOR_LOAD  (2, a1)
Packit 6c4009
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	PREFETCH_FOR_STORE (1, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (2, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (3, a0)
Packit 6c4009
# endif
Packit 6c4009
# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
Packit 6c4009
#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	sltu    v1,t9,a0
Packit 6c4009
	bgtz    v1,L(ua_skip_set)
Packit 6c4009
	nop
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
Packit 6c4009
L(ua_skip_set):
Packit 6c4009
#  else
Packit 6c4009
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
Packit 6c4009
#  endif
Packit 6c4009
# endif
Packit 6c4009
L(ua_loop16w):
Packit 6c4009
	PREFETCH_FOR_LOAD  (3, a1)
Packit 6c4009
	C_LDHI	t0,UNIT(0)(a1)
Packit 6c4009
	C_LDHI	t1,UNIT(1)(a1)
Packit 6c4009
	C_LDHI	REG2,UNIT(2)(a1)
Packit 6c4009
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
Packit 6c4009
	sltu	v1,t9,a0
Packit 6c4009
	bgtz	v1,L(ua_skip_pref)
Packit 6c4009
# endif
Packit 6c4009
	C_LDHI	REG3,UNIT(3)(a1)
Packit 6c4009
	PREFETCH_FOR_STORE (4, a0)
Packit 6c4009
	PREFETCH_FOR_STORE (5, a0)
Packit 6c4009
L(ua_skip_pref):
Packit 6c4009
	C_LDHI	REG4,UNIT(4)(a1)
Packit 6c4009
	C_LDHI	REG5,UNIT(5)(a1)
Packit 6c4009
	C_LDHI	REG6,UNIT(6)(a1)
Packit 6c4009
	C_LDHI	REG7,UNIT(7)(a1)
Packit 6c4009
	C_LDLO	t0,UNITM1(1)(a1)
Packit 6c4009
	C_LDLO	t1,UNITM1(2)(a1)
Packit 6c4009
	C_LDLO	REG2,UNITM1(3)(a1)
Packit 6c4009
	C_LDLO	REG3,UNITM1(4)(a1)
Packit 6c4009
	C_LDLO	REG4,UNITM1(5)(a1)
Packit 6c4009
	C_LDLO	REG5,UNITM1(6)(a1)
Packit 6c4009
	C_LDLO	REG6,UNITM1(7)(a1)
Packit 6c4009
	C_LDLO	REG7,UNITM1(8)(a1)
Packit 6c4009
        PREFETCH_FOR_LOAD (4, a1)
Packit 6c4009
	C_ST	t0,UNIT(0)(a0)
Packit 6c4009
	C_ST	t1,UNIT(1)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(2)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(3)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(4)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(5)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(6)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(7)(a0)
Packit 6c4009
	C_LDHI	t0,UNIT(8)(a1)
Packit 6c4009
	C_LDHI	t1,UNIT(9)(a1)
Packit 6c4009
	C_LDHI	REG2,UNIT(10)(a1)
Packit 6c4009
	C_LDHI	REG3,UNIT(11)(a1)
Packit 6c4009
	C_LDHI	REG4,UNIT(12)(a1)
Packit 6c4009
	C_LDHI	REG5,UNIT(13)(a1)
Packit 6c4009
	C_LDHI	REG6,UNIT(14)(a1)
Packit 6c4009
	C_LDHI	REG7,UNIT(15)(a1)
Packit 6c4009
	C_LDLO	t0,UNITM1(9)(a1)
Packit 6c4009
	C_LDLO	t1,UNITM1(10)(a1)
Packit 6c4009
	C_LDLO	REG2,UNITM1(11)(a1)
Packit 6c4009
	C_LDLO	REG3,UNITM1(12)(a1)
Packit 6c4009
	C_LDLO	REG4,UNITM1(13)(a1)
Packit 6c4009
	C_LDLO	REG5,UNITM1(14)(a1)
Packit 6c4009
	C_LDLO	REG6,UNITM1(15)(a1)
Packit 6c4009
	C_LDLO	REG7,UNITM1(16)(a1)
Packit 6c4009
        PREFETCH_FOR_LOAD (5, a1)
Packit 6c4009
	C_ST	t0,UNIT(8)(a0)
Packit 6c4009
	C_ST	t1,UNIT(9)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(10)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(11)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(12)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(13)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(14)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(15)(a0)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
Packit 6c4009
	bne	a0,a3,L(ua_loop16w)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
Packit 6c4009
	move	a2,t8
Packit 6c4009
Packit 6c4009
/* Here we have dest word-aligned (src is not) but less than 64-bytes or
Packit 6c4009
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
Packit 6c4009
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
Packit 6c4009
 * the copy.  */
Packit 6c4009
Packit 6c4009
L(ua_chkw):
Packit 6c4009
	PREFETCH_FOR_LOAD (0, a1)
Packit 6c4009
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
Packit 6c4009
				  /* t8 is the remainder count past 32-bytes */
Packit 6c4009
	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
Packit 6c4009
	nop
Packit 6c4009
	C_LDHI	t0,UNIT(0)(a1)
Packit 6c4009
	C_LDHI	t1,UNIT(1)(a1)
Packit 6c4009
	C_LDHI	REG2,UNIT(2)(a1)
Packit 6c4009
	C_LDHI	REG3,UNIT(3)(a1)
Packit 6c4009
	C_LDHI	REG4,UNIT(4)(a1)
Packit 6c4009
	C_LDHI	REG5,UNIT(5)(a1)
Packit 6c4009
	C_LDHI	REG6,UNIT(6)(a1)
Packit 6c4009
	C_LDHI	REG7,UNIT(7)(a1)
Packit 6c4009
	C_LDLO	t0,UNITM1(1)(a1)
Packit 6c4009
	C_LDLO	t1,UNITM1(2)(a1)
Packit 6c4009
	C_LDLO	REG2,UNITM1(3)(a1)
Packit 6c4009
	C_LDLO	REG3,UNITM1(4)(a1)
Packit 6c4009
	C_LDLO	REG4,UNITM1(5)(a1)
Packit 6c4009
	C_LDLO	REG5,UNITM1(6)(a1)
Packit 6c4009
	C_LDLO	REG6,UNITM1(7)(a1)
Packit 6c4009
	C_LDLO	REG7,UNITM1(8)(a1)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(8)
Packit 6c4009
	C_ST	t0,UNIT(0)(a0)
Packit 6c4009
	C_ST	t1,UNIT(1)(a0)
Packit 6c4009
	C_ST	REG2,UNIT(2)(a0)
Packit 6c4009
	C_ST	REG3,UNIT(3)(a0)
Packit 6c4009
	C_ST	REG4,UNIT(4)(a0)
Packit 6c4009
	C_ST	REG5,UNIT(5)(a0)
Packit 6c4009
	C_ST	REG6,UNIT(6)(a0)
Packit 6c4009
	C_ST	REG7,UNIT(7)(a0)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(8)
Packit 6c4009
/*
Packit 6c4009
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
Packit 6c4009
 * copy one word (or double word) at a time.
Packit 6c4009
 */
Packit 6c4009
L(ua_chk1w):
Packit 6c4009
	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
Packit 6c4009
	beq	a2,t8,L(ua_smallCopy)
Packit 6c4009
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
Packit 6c4009
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
Packit 6c4009
Packit 6c4009
/* copying in words (4-byte or 8-byte chunks) */
Packit 6c4009
L(ua_wordCopy_loop):
Packit 6c4009
	C_LDHI	v1,UNIT(0)(a1)
Packit 6c4009
	C_LDLO	v1,UNITM1(1)(a1)
Packit 6c4009
	PTR_ADDIU a0,a0,UNIT(1)
Packit 6c4009
	PTR_ADDIU a1,a1,UNIT(1)
Packit 6c4009
	bne	a0,a3,L(ua_wordCopy_loop)
Packit 6c4009
	C_ST	v1,UNIT(-1)(a0)
Packit 6c4009
Packit 6c4009
/* Copy the last 8 (or 16) bytes */
Packit 6c4009
L(ua_smallCopy):
Packit 6c4009
	beqz	a2,L(leave)
Packit 6c4009
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
Packit 6c4009
L(ua_smallCopy_loop):
Packit 6c4009
	lb	v1,0(a1)
Packit 6c4009
	PTR_ADDIU a0,a0,1
Packit 6c4009
	PTR_ADDIU a1,a1,1
Packit 6c4009
	bne	a0,a3,L(ua_smallCopy_loop)
Packit 6c4009
	sb	v1,-1(a0)
Packit 6c4009
Packit 6c4009
	j	ra
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
#else /* R6_CODE */
Packit 6c4009
Packit 6c4009
/* Endian-dependent helpers for the C_ALIGN instruction used by
   R6_UNALIGNED_WORD_COPY below.
   SWAP_REGS(X,Y) emits its two register operands in the order X, Y on
   big-endian (__MIPSEB) builds and Y, X otherwise.
   ALIGN_OFFSET(N) yields N on big-endian builds and NSIZE-N otherwise.
   Note that N is not parenthesized in the expansion, so only simple
   literals (as used in this file) should be passed.  */
# ifdef __MIPSEB
Packit 6c4009
#  define SWAP_REGS(X,Y) X, Y
Packit 6c4009
#  define ALIGN_OFFSET(N) (N)
Packit 6c4009
# else
Packit 6c4009
#  define SWAP_REGS(X,Y) Y, X
Packit 6c4009
#  define ALIGN_OFFSET(N) (NSIZE-N)
Packit 6c4009
# endif
Packit 6c4009
/* R6_UNALIGNED_WORD_COPY(BYTEOFFSET): copy the whole (d)words of an
   a2-byte request, then jump to L(lastb) for the leftover bytes.

   Registers read: a0 (dst), a1 (src), a2 (byte count) and t8, which is
   subtracted from a1 to obtain the aligned source address.
   On exit to L(lastb): a2 holds the original a2 & (NSIZE-1); when any
   (d)words were copied, a0 and a1 have been advanced past them.
   Registers written: REG7, a3, REG6, REG2, REG3, t0, t1 (plus a0, a1, a2).

   Each iteration does one aligned load into t1, combines it with the
   previously loaded t0 through C_ALIGN (presumably the R6 align/dalign
   instruction -- confirm against its definition), and stores the result.
   The instructions following the beq and the bne appear to execute in
   their branch delay slots.

   BYTEOFFSET is pasted into the loop label name, so each expansion must
   use a distinct value.  */
# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
Packit 6c4009
	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy by bytes. */ \
Packit 6c4009
	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word	   */ \
Packit 6c4009
	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in   */ \
Packit 6c4009
				/* (d)word chunks.			   */ \
Packit 6c4009
	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte   */ \
Packit 6c4009
				/* after word loop is finished.		   */ \
Packit 6c4009
	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.	   */ \
Packit 6c4009
	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.	   */ \
Packit 6c4009
	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop.   */ \
Packit 6c4009
	C_LD	t0, UNIT(0)(REG2);  /* Load first part of source.	   */ \
Packit 6c4009
L(r6_ua_wordcopy##BYTEOFFSET):						      \
Packit 6c4009
	C_LD	t1, UNIT(1)(REG2);  /* Load second part of source.	   */ \
Packit 6c4009
	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);	      \
Packit 6c4009
	PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.	   */ \
Packit 6c4009
	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
Packit 6c4009
	move	t0, t1;		/* Move second part of source to first.	   */ \
Packit 6c4009
	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);			      \
Packit 6c4009
	C_ST	REG3, UNIT(-1)(a0);					      \
Packit 6c4009
	j	L(lastb);						      \
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	/* We are generating R6 code, the destination is 4 byte aligned
Packit 6c4009
	   (presumably 8 byte aligned when USE_DOUBLE is defined) and the
Packit 6c4009
	   source is not.  t8 is 1, 2, or 3 depending on the alignment of
	   the source; with USE_DOUBLE the values 4 through 7 are handled
	   as well.  Each label below expands the word-copy loop for one
	   source offset.  Every expansion ends with a jump to L(lastb), so
	   the labels do not fall through into one another.  */
Packit 6c4009
Packit 6c4009
L(r6_unaligned1):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(1)
Packit 6c4009
L(r6_unaligned2):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(2)
Packit 6c4009
L(r6_unaligned3):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(3)
Packit 6c4009
# ifdef USE_DOUBLE
Packit 6c4009
L(r6_unaligned4):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(4)
Packit 6c4009
L(r6_unaligned5):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(5)
Packit 6c4009
L(r6_unaligned6):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(6)
Packit 6c4009
L(r6_unaligned7):
Packit 6c4009
	R6_UNALIGNED_WORD_COPY(7)
Packit 6c4009
# endif
Packit 6c4009
#endif /* R6_CODE */
Packit 6c4009
Packit 6c4009
	/* End of the function body.  Re-enable the assembler's use of $at
	   and its automatic instruction reordering; presumably these were
	   switched off earlier in the file -- confirm.  */
	.set	at
Packit 6c4009
	.set	reorder
Packit 6c4009
END(MEMCPY_NAME)
Packit 6c4009
/* The hidden-builtin definition is emitted only for glibc builds
   (_LIBC defined) and never when ANDROID_CHANGES is defined.  */
#ifndef ANDROID_CHANGES
Packit 6c4009
# ifdef _LIBC
Packit 6c4009
libc_hidden_builtin_def (MEMCPY_NAME)
Packit 6c4009
# endif
Packit 6c4009
#endif