Blame pixman/pixman-mips-memcpy-asm.S

Packit 030a23
/*
Packit 030a23
 * Copyright (c) 2012
Packit 030a23
 *      MIPS Technologies, Inc., California.
Packit 030a23
 *
Packit 030a23
 * Redistribution and use in source and binary forms, with or without
Packit 030a23
 * modification, are permitted provided that the following conditions
Packit 030a23
 * are met:
Packit 030a23
 * 1. Redistributions of source code must retain the above copyright
Packit 030a23
 *    notice, this list of conditions and the following disclaimer.
Packit 030a23
 * 2. Redistributions in binary form must reproduce the above copyright
Packit 030a23
 *    notice, this list of conditions and the following disclaimer in the
Packit 030a23
 *    documentation and/or other materials provided with the distribution.
Packit 030a23
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
Packit 030a23
 *    contributors may be used to endorse or promote products derived from
Packit 030a23
 *    this software without specific prior written permission.
Packit 030a23
 *
Packit 030a23
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
Packit 030a23
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
Packit 030a23
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Packit 030a23
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
Packit 030a23
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
Packit 030a23
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
Packit 030a23
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
Packit 030a23
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
Packit 030a23
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
Packit 030a23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
Packit 030a23
 * SUCH DAMAGE.
Packit 030a23
 */
Packit 030a23
Packit 030a23
#include "pixman-mips-dspr2-asm.h"
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * This routine could be optimized for MIPS64. The current code only
Packit 030a23
 * uses MIPS32 instructions.
Packit 030a23
 */
Packit 030a23
Packit 030a23
/*
 * Endian-neutral names for the partial-word load/store instructions.
 * In this file the *HI forms are always applied at the lowest address of a
 * (possibly unaligned) word, e.g. "LWHI r, 0(p)", and the *LO forms at its
 * highest address, e.g. "LWLO r, 3(p)"; the pair together loads one word.
 * EB selects the big-endian mapping.
 */
#ifdef EB
Packit 030a23
#  define LWHI	lwl		/* high part is left in big-endian */
Packit 030a23
#  define SWHI	swl		/* high part is left in big-endian */
Packit 030a23
#  define LWLO	lwr		/* low part is right in big-endian */
Packit 030a23
#  define SWLO	swr		/* low part is right in big-endian */
Packit 030a23
#else
Packit 030a23
#  define LWHI	lwr		/* high part is right in little-endian */
Packit 030a23
#  define SWHI	swr		/* high part is right in little-endian */
Packit 030a23
#  define LWLO	lwl		/* low part is left in little-endian */
Packit 030a23
#  define SWLO	swl		/* low part is left in little-endian */
Packit 030a23
#endif
Packit 030a23
Packit 030a23
LEAF_MIPS32R2(pixman_mips_fast_memcpy)
/*
 * pixman_mips_fast_memcpy: copy a2 bytes from a1 (src) to a0 (dst).
 *
 * In:   a0 = dst, a1 = src, a2 = byte count
 * Out:  v0 = the original dst pointer
 * Overwrites: AT, v1, a0-a3, t0-t9.  No stack is used.
 *
 * Overview:
 *   - count < 8: plain byte loop ($last8).  The count is tested with signed
 *     instructions (slti/blez), so a count with the top bit set copies
 *     nothing.
 *   - src and dst have the same low two address bits: copy the bytes up to
 *     the next word boundary with one partial-word load/store, then copy
 *     64-byte chunks, at most one 32-byte chunk, single words and finally
 *     the trailing bytes, all with lw/sw.
 *   - otherwise ($unaligned): same structure, but dst is word-aligned while
 *     src is not, so every source word is assembled with an LWHI/LWLO pair.
 *
 * The instruction after each branch is written as a branch delay slot, i.e.
 * it is meant to execute whether or not the branch is taken.
 * NOTE(review): this presumably relies on LEAF_MIPS32R2 selecting
 * ".set noreorder" - confirm in pixman-mips-dspr2-asm.h.
 *
 * NOTE(review): the "pref 30" limits below (t0-32, t0-160) look like they
 * assume a 32-byte line for the destination prefetch - confirm for the
 * target core.
 */
Packit 030a23
Packit 030a23
	slti	AT, a2, 8
Packit 030a23
	bne	AT, zero, $last8
Packit 030a23
	move	v0, a0	/* delay slot: memcpy returns the dst pointer */
Packit 030a23
Packit 030a23
/* Test if the src and dst are word-aligned, or can be made word-aligned */
Packit 030a23
	xor	t8, a1, a0
Packit 030a23
	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
Packit 030a23
Packit 030a23
	bne	t8, zero, $unaligned
Packit 030a23
	negu	a3, a0	/* delay slot: $unaligned also expects a3 = -a0 */
Packit 030a23
Packit 030a23
	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
Packit 030a23
	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
Packit 030a23
	subu	a2, a2, a3	/* delay slot: now a2 is the remaining bytes count */
Packit 030a23
Packit 030a23
	/* one partial-word load/store moves the a3 bytes up to the word boundary */
	LWHI	t8, 0(a1)
Packit 030a23
	addu	a1, a1, a3
Packit 030a23
	SWHI	t8, 0(a0)
Packit 030a23
	addu	a0, a0, a3
Packit 030a23
Packit 030a23
/* Now the dst/src are mutually word-aligned with word-aligned addresses */
Packit 030a23
$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
Packit 030a23
				/* t8 is the byte count after 64-byte chunks */
Packit 030a23
Packit 030a23
	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
Packit 030a23
				/* There will be at most 1 32-byte chunk after it */
Packit 030a23
	subu	a3, a2, t8	/* delay slot: subtract the remainder from a2 */
Packit 030a23
                                /* Here a3 counts bytes in 16w chunks */
Packit 030a23
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
Packit 030a23
Packit 030a23
	addu	t0, a0, a2	/* t0 is the "past the end" address */
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
Packit 030a23
 * the "t0-32" address
Packit 030a23
 * This means: for x=128 the last "safe" a0 address is "t0-160"
Packit 030a23
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
Packit 030a23
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
Packit 030a23
 */
Packit 030a23
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
Packit 030a23
Packit 030a23
	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
Packit 030a23
	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
Packit 030a23
	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
Packit 030a23
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
Packit 030a23
/* In case the a0 > t9 don't use "pref 30" at all */
Packit 030a23
	sgtu	v1, a0, t9	/* v1 = 1 means "pref 30" must be skipped */
Packit 030a23
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
Packit 030a23
	nop
Packit 030a23
/* otherwise, start with using pref30 */
Packit 030a23
	pref	30, 64(a0)
Packit 030a23
/*
 * Main aligned loop: 64 bytes per iteration as two 32-byte halves through
 * t0-t7.  t0 no longer holds the "past the end" address from here on; only
 * t9 (pref limit), a3 (loop end) and t8 (remainder) are kept.
 */
$loop16w:
Packit 030a23
	pref	0, 96(a1)
Packit 030a23
	lw	t0, 0(a1)
Packit 030a23
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
Packit 030a23
	lw	t1, 4(a1)	/* delay slot */
Packit 030a23
	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
Packit 030a23
$skip_pref30_96:
Packit 030a23
	lw	t2, 8(a1)
Packit 030a23
	lw	t3, 12(a1)
Packit 030a23
	lw	t4, 16(a1)
Packit 030a23
	lw	t5, 20(a1)
Packit 030a23
	lw	t6, 24(a1)
Packit 030a23
	lw	t7, 28(a1)
Packit 030a23
        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
Packit 030a23
Packit 030a23
	sw	t0, 0(a0)
Packit 030a23
	sw	t1, 4(a0)
Packit 030a23
	sw	t2, 8(a0)
Packit 030a23
	sw	t3, 12(a0)
Packit 030a23
	sw	t4, 16(a0)
Packit 030a23
	sw	t5, 20(a0)
Packit 030a23
	sw	t6, 24(a0)
Packit 030a23
	sw	t7, 28(a0)
Packit 030a23
Packit 030a23
	lw	t0, 32(a1)
Packit 030a23
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
Packit 030a23
	lw	t1, 36(a1)	/* delay slot */
Packit 030a23
	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
Packit 030a23
$skip_pref30_128:
Packit 030a23
	lw	t2, 40(a1)
Packit 030a23
	lw	t3, 44(a1)
Packit 030a23
	lw	t4, 48(a1)
Packit 030a23
	lw	t5, 52(a1)
Packit 030a23
	lw	t6, 56(a1)
Packit 030a23
	lw	t7, 60(a1)
Packit 030a23
        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
Packit 030a23
Packit 030a23
	sw	t0, 32(a0)
Packit 030a23
	sw	t1, 36(a0)
Packit 030a23
	sw	t2, 40(a0)
Packit 030a23
	sw	t3, 44(a0)
Packit 030a23
	sw	t4, 48(a0)
Packit 030a23
	sw	t5, 52(a0)
Packit 030a23
	sw	t6, 56(a0)
Packit 030a23
	sw	t7, 60(a0)
Packit 030a23
Packit 030a23
	addiu	a0, a0, 64	/* adding 64 to dest */
Packit 030a23
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
Packit 030a23
	bne	a0, a3, $loop16w
Packit 030a23
	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
Packit 030a23
	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
Packit 030a23
Packit 030a23
/* Here we have src and dest word-aligned but less than 64-bytes to go */
Packit 030a23
Packit 030a23
$chk8w:
Packit 030a23
	pref 0, 0x0(a1)
Packit 030a23
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
Packit 030a23
				/* the t8 is the remainder count past 32-bytes */
Packit 030a23
	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
Packit 030a23
	 nop
Packit 030a23
Packit 030a23
	lw	t0, 0(a1)
Packit 030a23
	lw	t1, 4(a1)
Packit 030a23
	lw	t2, 8(a1)
Packit 030a23
	lw	t3, 12(a1)
Packit 030a23
	lw	t4, 16(a1)
Packit 030a23
	lw	t5, 20(a1)
Packit 030a23
	lw	t6, 24(a1)
Packit 030a23
	lw	t7, 28(a1)
Packit 030a23
	addiu	a1, a1, 32
Packit 030a23
Packit 030a23
	sw	t0, 0(a0)
Packit 030a23
	sw	t1, 4(a0)
Packit 030a23
	sw	t2, 8(a0)
Packit 030a23
	sw	t3, 12(a0)
Packit 030a23
	sw	t4, 16(a0)
Packit 030a23
	sw	t5, 20(a0)
Packit 030a23
	sw	t6, 24(a0)
Packit 030a23
	sw	t7, 28(a0)
Packit 030a23
	addiu	a0, a0, 32
Packit 030a23
Packit 030a23
$chk1w:
Packit 030a23
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
Packit 030a23
	beq	a2, t8, $last8
Packit 030a23
	/* delay slot: also runs when the branch is taken; $last8 recomputes a3 */
	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
Packit 030a23
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
Packit 030a23
Packit 030a23
/* copying in words (4-byte chunks) */
Packit 030a23
$wordCopy_loop:
Packit 030a23
	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
Packit 030a23
	addiu	a1, a1, 4
Packit 030a23
	addiu	a0, a0, 4
Packit 030a23
	bne	a0, a3, $wordCopy_loop
Packit 030a23
	sw	t3, -4(a0)	/* delay slot: a0 was already advanced, hence -4 */
Packit 030a23
Packit 030a23
/* For the last (<8) bytes */
Packit 030a23
$last8:
Packit 030a23
	blez	a2, leave
Packit 030a23
	addu	a3, a0, a2	/* delay slot: a3 is the address just past the last dst byte */
Packit 030a23
$last8loop:
Packit 030a23
	lb	v1, 0(a1)
Packit 030a23
	addiu	a1, a1, 1
Packit 030a23
	addiu	a0, a0, 1
Packit 030a23
	bne	a0, a3, $last8loop
Packit 030a23
	sb	v1, -1(a0)	/* delay slot: a0 was already advanced, hence -1 */
Packit 030a23
Packit 030a23
/* common return point; also the target of "beqz a2, leave" in $ua_smallCopy */
leave:	j	ra
Packit 030a23
	nop
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * UNALIGNED case
Packit 030a23
 */
Packit 030a23
Packit 030a23
$unaligned:
Packit 030a23
	/* got here with a3="negu a0" */
Packit 030a23
	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
Packit 030a23
	beqz	a3, $ua_chk16w
Packit 030a23
	subu	a2, a2, a3	/* delay slot: bytes left after initial a3 bytes */
Packit 030a23
Packit 030a23
	/* load a whole source word, but store only the a3 bytes up to dst's word boundary */
	LWHI	v1, 0(a1)
Packit 030a23
	LWLO	v1, 3(a1)
Packit 030a23
	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
Packit 030a23
	SWHI	v1, 0(a0)
Packit 030a23
	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
Packit 030a23
Packit 030a23
$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
Packit 030a23
				/* t8 is the byte count after 64-byte chunks */
Packit 030a23
	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
Packit 030a23
				/* There will be at most 1 32-byte chunk after it */
Packit 030a23
	subu	a3, a2, t8	/* delay slot: subtract the remainder from a2 */
Packit 030a23
                                /* Here a3 counts bytes in 16w chunks */
Packit 030a23
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
Packit 030a23
Packit 030a23
	addu	t0, a0, a2	/* t0 is the "past the end" address */
Packit 030a23
Packit 030a23
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
Packit 030a23
Packit 030a23
	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
Packit 030a23
	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
Packit 030a23
	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
Packit 030a23
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
Packit 030a23
/* In case the a0 > t9 don't use "pref 30" at all */
Packit 030a23
	sgtu	v1, a0, t9	/* v1 = 1 means "pref 30" must be skipped */
Packit 030a23
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
Packit 030a23
	nop
Packit 030a23
/* otherwise,  start with using pref30 */
Packit 030a23
	pref	30, 64(a0)
Packit 030a23
/*
 * Main unaligned loop: 64 bytes per iteration.  Each source word is built
 * from an LWHI at offset n and an LWLO at offset n+3; stores are plain sw
 * because dst is word-aligned.
 */
$ua_loop16w:
Packit 030a23
	pref	0, 96(a1)
Packit 030a23
	LWHI	t0, 0(a1)
Packit 030a23
	LWLO	t0, 3(a1)
Packit 030a23
	LWHI	t1, 4(a1)
Packit 030a23
	bgtz	v1, $ua_skip_pref30_96
Packit 030a23
	LWLO	t1, 7(a1)	/* delay slot */
Packit 030a23
	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
Packit 030a23
$ua_skip_pref30_96:
Packit 030a23
	LWHI	t2, 8(a1)
Packit 030a23
	LWLO	t2, 11(a1)
Packit 030a23
	LWHI	t3, 12(a1)
Packit 030a23
	LWLO	t3, 15(a1)
Packit 030a23
	LWHI	t4, 16(a1)
Packit 030a23
	LWLO	t4, 19(a1)
Packit 030a23
	LWHI	t5, 20(a1)
Packit 030a23
	LWLO	t5, 23(a1)
Packit 030a23
	LWHI	t6, 24(a1)
Packit 030a23
	LWLO	t6, 27(a1)
Packit 030a23
	LWHI	t7, 28(a1)
Packit 030a23
	LWLO	t7, 31(a1)
Packit 030a23
        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
Packit 030a23
Packit 030a23
	sw	t0, 0(a0)
Packit 030a23
	sw	t1, 4(a0)
Packit 030a23
	sw	t2, 8(a0)
Packit 030a23
	sw	t3, 12(a0)
Packit 030a23
	sw	t4, 16(a0)
Packit 030a23
	sw	t5, 20(a0)
Packit 030a23
	sw	t6, 24(a0)
Packit 030a23
	sw	t7, 28(a0)
Packit 030a23
Packit 030a23
	LWHI	t0, 32(a1)
Packit 030a23
	LWLO	t0, 35(a1)
Packit 030a23
	LWHI	t1, 36(a1)
Packit 030a23
	bgtz	v1, $ua_skip_pref30_128
Packit 030a23
	LWLO	t1, 39(a1)	/* delay slot */
Packit 030a23
	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
Packit 030a23
$ua_skip_pref30_128:
Packit 030a23
	LWHI	t2, 40(a1)
Packit 030a23
	LWLO	t2, 43(a1)
Packit 030a23
	LWHI	t3, 44(a1)
Packit 030a23
	LWLO	t3, 47(a1)
Packit 030a23
	LWHI	t4, 48(a1)
Packit 030a23
	LWLO	t4, 51(a1)
Packit 030a23
	LWHI	t5, 52(a1)
Packit 030a23
	LWLO	t5, 55(a1)
Packit 030a23
	LWHI	t6, 56(a1)
Packit 030a23
	LWLO	t6, 59(a1)
Packit 030a23
	LWHI	t7, 60(a1)
Packit 030a23
	LWLO	t7, 63(a1)
Packit 030a23
        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
Packit 030a23
Packit 030a23
	sw	t0, 32(a0)
Packit 030a23
	sw	t1, 36(a0)
Packit 030a23
	sw	t2, 40(a0)
Packit 030a23
	sw	t3, 44(a0)
Packit 030a23
	sw	t4, 48(a0)
Packit 030a23
	sw	t5, 52(a0)
Packit 030a23
	sw	t6, 56(a0)
Packit 030a23
	sw	t7, 60(a0)
Packit 030a23
Packit 030a23
	addiu	a0, a0, 64	/* adding 64 to dest */
Packit 030a23
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
Packit 030a23
	bne	a0, a3, $ua_loop16w
Packit 030a23
	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
Packit 030a23
	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
Packit 030a23
Packit 030a23
/* Here dst is word-aligned (src is not) and less than 64 bytes are left */
Packit 030a23
Packit 030a23
$ua_chk8w:
Packit 030a23
	pref 0, 0x0(a1)
Packit 030a23
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
Packit 030a23
				/* the t8 is the remainder count */
Packit 030a23
	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
Packit 030a23
Packit 030a23
	/*
	 * NOTE(review): unlike $chk8w there is no explicit nop after the beq
	 * above, so (assuming noreorder) the LWHI below sits in its delay slot
	 * and also executes when the branch is taken.  t0 is not read on the
	 * $ua_chk1w path.
	 */
	LWHI	t0, 0(a1)
Packit 030a23
	LWLO	t0, 3(a1)
Packit 030a23
	LWHI	t1, 4(a1)
Packit 030a23
	LWLO	t1, 7(a1)
Packit 030a23
	LWHI	t2, 8(a1)
Packit 030a23
	LWLO	t2, 11(a1)
Packit 030a23
	LWHI	t3, 12(a1)
Packit 030a23
	LWLO	t3, 15(a1)
Packit 030a23
	LWHI	t4, 16(a1)
Packit 030a23
	LWLO	t4, 19(a1)
Packit 030a23
	LWHI	t5, 20(a1)
Packit 030a23
	LWLO	t5, 23(a1)
Packit 030a23
	LWHI	t6, 24(a1)
Packit 030a23
	LWLO	t6, 27(a1)
Packit 030a23
	LWHI	t7, 28(a1)
Packit 030a23
	LWLO	t7, 31(a1)
Packit 030a23
	addiu	a1, a1, 32
Packit 030a23
Packit 030a23
	sw	t0, 0(a0)
Packit 030a23
	sw	t1, 4(a0)
Packit 030a23
	sw	t2, 8(a0)
Packit 030a23
	sw	t3, 12(a0)
Packit 030a23
	sw	t4, 16(a0)
Packit 030a23
	sw	t5, 20(a0)
Packit 030a23
	sw	t6, 24(a0)
Packit 030a23
	sw	t7, 28(a0)
Packit 030a23
	addiu	a0, a0, 32
Packit 030a23
Packit 030a23
$ua_chk1w:
Packit 030a23
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
Packit 030a23
	beq	a2, t8, $ua_smallCopy
Packit 030a23
	/* delay slot: also runs when the branch is taken; $ua_smallCopy recomputes a3 */
	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
Packit 030a23
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
Packit 030a23
Packit 030a23
/* copying in words (4-byte chunks) */
Packit 030a23
$ua_wordCopy_loop:
Packit 030a23
	LWHI	v1, 0(a1)
Packit 030a23
	LWLO	v1, 3(a1)
Packit 030a23
	addiu	a1, a1, 4
Packit 030a23
	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
Packit 030a23
	bne	a0, a3, $ua_wordCopy_loop
Packit 030a23
	sw	v1, -4(a0)	/* delay slot: a0 was already advanced, hence -4 */
Packit 030a23
Packit 030a23
/* Now less than 4 bytes (value in a2) left to copy */
Packit 030a23
$ua_smallCopy:
Packit 030a23
	beqz	a2, leave
Packit 030a23
	addu	a3, a0, a2	/* delay slot: a3 is the address just past the last dst byte */
Packit 030a23
$ua_smallCopy_loop:
Packit 030a23
	lb	v1, 0(a1)
Packit 030a23
	addiu	a1, a1, 1
Packit 030a23
	addiu	a0, a0, 1
Packit 030a23
	bne	a0, a3, $ua_smallCopy_loop
Packit 030a23
	sb	v1, -1(a0)	/* delay slot: a0 was already advanced, hence -1 */
Packit 030a23
Packit 030a23
	j	ra
Packit 030a23
	nop
Packit 030a23
Packit 030a23
END(pixman_mips_fast_memcpy)