Blame sysdeps/powerpc/powerpc64/power6/memcpy.S

Packit 6c4009
/* Optimized memcpy implementation for PowerPC64.
Packit 6c4009
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
Packit 6c4009
   Returns 'dst'.
Packit 6c4009
Packit 6c4009
   Memcpy handles short copies (< 32-bytes) using binary move blocks
Packit 6c4009
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
Packit 6c4009
   with the appropriate combination of byte and halfword load/stores.
Packit 6c4009
   There is minimal effort to optimize the alignment of short moves.
Packit 6c4009
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
Packit 6c4009
   of handling unaligned load/stores that do not cross 32-byte boundaries.
Packit 6c4009
Packit 6c4009
   Longer moves (>= 32-bytes) justify the effort to get at least the
Packit 6c4009
   destination doubleword (8-byte) aligned.  Further optimization is
Packit 6c4009
   possible when both source and destination are doubleword aligned.
Packit 6c4009
   Each case has an optimized unrolled loop.
Packit 6c4009
Packit 6c4009
   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
Packit 6c4009
   L1 cache miss that crosses a 32- or 128-byte boundary.  Store
Packit 6c4009
   is more forgiving and does not take a hiccup until page or
Packit 6c4009
   segment boundaries.  So we require doubleword alignment for
Packit 6c4009
   the source but may take a risk and only require word alignment
Packit 6c4009
   for the destination.  */
Packit 6c4009
Packit 6c4009
#ifndef MEMCPY
Packit 6c4009
# define MEMCPY memcpy
Packit 6c4009
#endif
Packit 6c4009
	.machine	"power6"
Packit 6c4009
ENTRY_TOCLESS (MEMCPY, 7)
Packit 6c4009
	CALL_MCOUNT 3
Packit 6c4009
Packit 6c4009
    cmpldi cr1,5,31
Packit 6c4009
    neg   0,3
Packit 6c4009
    std   3,-16(1)
Packit 6c4009
    std   31,-8(1)
Packit 6c4009
    andi. 11,3,7	/* check alignment of dst.  */
Packit 6c4009
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
Packit 6c4009
    clrldi 10,4,61	/* check alignment of src.  */
Packit 6c4009
    cmpldi cr6,5,8
Packit 6c4009
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
Packit 6c4009
    mtcrf 0x01,0
Packit 6c4009
    cmpld cr6,10,11
Packit 6c4009
    srdi  9,5,3		/* Number of full double words remaining.  */
Packit 6c4009
    beq   .L0
Packit 6c4009
Packit 6c4009
    subf  5,0,5
Packit 6c4009
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
Packit 6c4009
     Duplicate some code to maximize fall-through and minimize agen delays.  */
Packit 6c4009
1:  bf    31,2f
Packit 6c4009
    lbz   6,0(4)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
    bf    30,5f
Packit 6c4009
    lhz   6,1(4)
Packit 6c4009
    sth   6,1(3)
Packit 6c4009
    bf    29,0f
Packit 6c4009
    lwz   6,3(4)
Packit 6c4009
    stw   6,3(3)
Packit 6c4009
    b     0f
Packit 6c4009
5:
Packit 6c4009
    bf    29,0f
Packit 6c4009
    lwz   6,1(4)
Packit 6c4009
    stw   6,1(3)
Packit 6c4009
    b     0f
Packit 6c4009
Packit 6c4009
2:  bf    30,4f
Packit 6c4009
    lhz   6,0(4)
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    bf    29,0f
Packit 6c4009
    lwz   6,2(4)
Packit 6c4009
    stw   6,2(3)
Packit 6c4009
    b     0f
Packit 6c4009
Packit 6c4009
4:  bf    29,0f
Packit 6c4009
    lwz   6,0(4)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
0:
Packit 6c4009
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
Packit 6c4009
    add   4,4,0
Packit 6c4009
    add   3,3,0
Packit 6c4009
Packit 6c4009
    clrldi 10,4,61	/* check alignment of src again.  */
Packit 6c4009
    srdi  9,5,3	/* Number of full double words remaining.  */
Packit 6c4009
Packit 6c4009
  /* Copy doublewords from source to destination, assuming the
Packit 6c4009
     destination is aligned on a doubleword boundary.
Packit 6c4009
Packit 6c4009
     At this point we know there are at least 25 bytes left (32-7) to copy.
Packit 6c4009
     The next step is to determine if the source is also doubleword aligned.
Packit 6c4009
     If not branch to the unaligned move code at .L6. which uses
Packit 6c4009
     a load, shift, store strategy.
Packit 6c4009
Packit 6c4009
     Otherwise source and destination are doubleword aligned, and we can
Packit 6c4009
     use the optimized doubleword copy loop.  */
Packit 6c4009
    .align  4
Packit 6c4009
.L0:
Packit 6c4009
    clrldi  11,5,61
Packit 6c4009
    andi.   0,5,0x78
Packit 6c4009
    srdi    12,5,7	/* Number of 128-byte blocks to move.  */
Packit 6c4009
    cmpldi  cr1,11,0	/* If the tail is 0 bytes  */
Packit 6c4009
    bne-    cr6,.L6     /* If source is not DW aligned.  */
Packit 6c4009
Packit 6c4009
  /* Move doublewords where destination and source are DW aligned.
Packit 6c4009
     Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
Packit 6c4009
     If the copy is not an exact multiple of 128 bytes, 1-15
Packit 6c4009
     doublewords are copied as needed to set up the main loop.  After
Packit 6c4009
     the main loop exits there may be a tail of 1-7 bytes. These bytes
Packit 6c4009
     are copied a word/halfword/byte at a time as needed to preserve
Packit 6c4009
     alignment.
Packit 6c4009
Packit 6c4009
     For POWER6 the L1 is store-through and the L2 is store-in.  The
Packit 6c4009
     L2 is clocked at half CPU clock so we can store 16 bytes every
Packit 6c4009
     other cycle.  POWER6 also has a load/store bypass so we can do
Packit 6c4009
     load, load, store, store every 2 cycles.
Packit 6c4009
Packit 6c4009
     The following code is sensitive to cache line alignment.  Do not
Packit 6c4009
     make any changes without first making sure they don't result in
Packit 6c4009
     splitting ld/std pairs across a cache line.  */
Packit 6c4009
Packit 6c4009
    mtcrf 0x02,5
Packit 6c4009
    mtcrf 0x01,5
Packit 6c4009
    cmpldi  cr5,12,1
Packit 6c4009
    beq   L(das_loop)
Packit 6c4009
Packit 6c4009
    bf    25,4f
Packit 6c4009
    .align  3
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    ld    7,8(4)
Packit 6c4009
    mr    11,4
Packit 6c4009
    mr    10,3
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    std   7,8(3)
Packit 6c4009
    ld    6,16(4)
Packit 6c4009
    ld    7,24(4)
Packit 6c4009
    std   6,16(3)
Packit 6c4009
    std   7,24(3)
Packit 6c4009
    ld    6,0+32(4)
Packit 6c4009
    ld    7,8+32(4)
Packit 6c4009
    addi  4,4,64
Packit 6c4009
    addi  3,3,64
Packit 6c4009
    std   6,0+32(10)
Packit 6c4009
    std   7,8+32(10)
Packit 6c4009
    ld    6,16+32(11)
Packit 6c4009
    ld    7,24+32(11)
Packit 6c4009
    std   6,16+32(10)
Packit 6c4009
    std   7,24+32(10)
Packit 6c4009
4:
Packit 6c4009
    mr    10,3
Packit 6c4009
    bf    26,2f
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    ld    7,8(4)
Packit 6c4009
    mr    11,4
Packit 6c4009
    nop
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    std   7,8(3)
Packit 6c4009
    ld    6,16(4)
Packit 6c4009
    ld    7,24(4)
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    std   6,16(3)
Packit 6c4009
    std   7,24(3)
Packit 6c4009
    addi  3,3,32
Packit 6c4009
6:
Packit 6c4009
    nop
Packit 6c4009
    bf    27,5f
Packit 6c4009
    ld    6,0+32(11)
Packit 6c4009
    ld    7,8+32(11)
Packit 6c4009
    addi  4,4,16
Packit 6c4009
    addi  3,3,16
Packit 6c4009
    std   6,0+32(10)
Packit 6c4009
    std   7,8+32(10)
Packit 6c4009
    bf    28,L(das_loop_s)
Packit 6c4009
    ld    0,16+32(11)
Packit 6c4009
    addi  4,4,8
Packit 6c4009
    addi  3,3,8
Packit 6c4009
    std   0,16+32(10)
Packit 6c4009
    blt   cr5,L(das_tail)
Packit 6c4009
    b     L(das_loop)
Packit 6c4009
    .align  3
Packit 6c4009
5:
Packit 6c4009
    nop
Packit 6c4009
    bf    28,L(das_loop_s)
Packit 6c4009
    ld    6,32(11)
Packit 6c4009
    addi  4,4,8
Packit 6c4009
    addi  3,3,8
Packit 6c4009
    std   6,32(10)
Packit 6c4009
    blt   cr5,L(das_tail)
Packit 6c4009
    b     L(das_loop)
Packit 6c4009
    .align  3
Packit 6c4009
2:
Packit 6c4009
    mr    11,4
Packit 6c4009
    bf    27,1f
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    ld    7,8(4)
Packit 6c4009
    addi  4,4,16
Packit 6c4009
    addi  3,3,16
Packit 6c4009
    std   6,0(10)
Packit 6c4009
    std   7,8(10)
Packit 6c4009
    bf    28,L(das_loop_s)
Packit 6c4009
    ld    0,16(11)
Packit 6c4009
    addi  4,11,24
Packit 6c4009
    addi  3,10,24
Packit 6c4009
    std   0,16(10)
Packit 6c4009
    blt   cr5,L(das_tail)
Packit 6c4009
    b     L(das_loop)
Packit 6c4009
    .align  3
Packit 6c4009
1:
Packit 6c4009
    nop
Packit 6c4009
    bf    28,L(das_loop_s)
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    addi  4,4,8
Packit 6c4009
    addi  3,3,8
Packit 6c4009
    std   6,0(10)
Packit 6c4009
L(das_loop_s):
Packit 6c4009
    nop
Packit 6c4009
    blt   cr5,L(das_tail)
Packit 6c4009
    .align  4
Packit 6c4009
L(das_loop):
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    ld    7,8(4)
Packit 6c4009
    mr    10,3
Packit 6c4009
    mr    11,4
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    std   7,8(3)
Packit 6c4009
    addi  12,12,-1
Packit 6c4009
    nop
Packit 6c4009
    ld    8,16(4)
Packit 6c4009
    ld    0,24(4)
Packit 6c4009
    std   8,16(3)
Packit 6c4009
    std   0,24(3)
Packit 6c4009
Packit 6c4009
    ld    6,0+32(4)
Packit 6c4009
    ld    7,8+32(4)
Packit 6c4009
    std   6,0+32(3)
Packit 6c4009
    std   7,8+32(3)
Packit 6c4009
    ld    8,16+32(4)
Packit 6c4009
    ld    0,24+32(4)
Packit 6c4009
    std   8,16+32(3)
Packit 6c4009
    std   0,24+32(3)
Packit 6c4009
Packit 6c4009
    ld    6,0+64(11)
Packit 6c4009
    ld    7,8+64(11)
Packit 6c4009
    std   6,0+64(10)
Packit 6c4009
    std   7,8+64(10)
Packit 6c4009
    ld    8,16+64(11)
Packit 6c4009
    ld    0,24+64(11)
Packit 6c4009
    std   8,16+64(10)
Packit 6c4009
    std   0,24+64(10)
Packit 6c4009
Packit 6c4009
    ld    6,0+96(11)
Packit 6c4009
    ld    7,8+96(11)
Packit 6c4009
    addi  4,4,128
Packit 6c4009
    addi  3,3,128
Packit 6c4009
    std   6,0+96(10)
Packit 6c4009
    std   7,8+96(10)
Packit 6c4009
    ld    8,16+96(11)
Packit 6c4009
    ld    0,24+96(11)
Packit 6c4009
    std   8,16+96(10)
Packit 6c4009
    std   0,24+96(10)
Packit 6c4009
    ble   cr5,L(das_loop_e)
Packit 6c4009
Packit 6c4009
    mtctr   12
Packit 6c4009
    .align  4
Packit 6c4009
L(das_loop2):
Packit 6c4009
    ld    6,0(4)
Packit 6c4009
    ld    7,8(4)
Packit 6c4009
    mr    10,3
Packit 6c4009
    mr    11,4
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    std   7,8(3)
Packit 6c4009
    ld    8,16(4)
Packit 6c4009
    ld    0,24(4)
Packit 6c4009
    std   8,16(3)
Packit 6c4009
    std   0,24(3)
Packit 6c4009
Packit 6c4009
    ld    6,0+32(4)
Packit 6c4009
    ld    7,8+32(4)
Packit 6c4009
    std   6,0+32(3)
Packit 6c4009
    std   7,8+32(3)
Packit 6c4009
    ld    8,16+32(4)
Packit 6c4009
    ld    0,24+32(4)
Packit 6c4009
    std   8,16+32(3)
Packit 6c4009
    std   0,24+32(3)
Packit 6c4009
Packit 6c4009
    ld    6,0+64(11)
Packit 6c4009
    ld    7,8+64(11)
Packit 6c4009
    std   6,0+64(10)
Packit 6c4009
    std   7,8+64(10)
Packit 6c4009
    ld    8,16+64(11)
Packit 6c4009
    ld    0,24+64(11)
Packit 6c4009
    std   8,16+64(10)
Packit 6c4009
    std   0,24+64(10)
Packit 6c4009
Packit 6c4009
    ld    6,0+96(11)
Packit 6c4009
    ld    7,8+96(11)
Packit 6c4009
    addi  4,4,128
Packit 6c4009
    addi  3,3,128
Packit 6c4009
    std   6,0+96(10)
Packit 6c4009
    std   7,8+96(10)
Packit 6c4009
    ld    8,16+96(11)
Packit 6c4009
    ld    0,24+96(11)
Packit 6c4009
    std   8,16+96(10)
Packit 6c4009
    std   0,24+96(10)
Packit 6c4009
    bdnz  L(das_loop2)
Packit 6c4009
L(das_loop_e):
Packit 6c4009
/* Check for a 1-7 byte tail, return if none.  */
Packit 6c4009
    bne   cr1,L(das_tail2)
Packit 6c4009
/* Return original dst pointer.  */
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(das_tail):
Packit 6c4009
    beq   cr1,0f
Packit 6c4009
Packit 6c4009
L(das_tail2):
Packit 6c4009
/*  At this point we have a tail of 0-7 bytes and we know that the
Packit 6c4009
    destination is double word aligned.  */
Packit 6c4009
4:  bf    29,2f
Packit 6c4009
    lwz   6,0(4)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    bf    30,5f
Packit 6c4009
    lhz   6,4(4)
Packit 6c4009
    sth   6,4(3)
Packit 6c4009
    bf    31,0f
Packit 6c4009
    lbz   6,6(4)
Packit 6c4009
    stb   6,6(3)
Packit 6c4009
    b     0f
Packit 6c4009
5:  bf    31,0f
Packit 6c4009
    lbz   6,4(4)
Packit 6c4009
    stb   6,4(3)
Packit 6c4009
    b     0f
Packit 6c4009
Packit 6c4009
2:  bf    30,1f
Packit 6c4009
    lhz   6,0(4)
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    bf    31,0f
Packit 6c4009
    lbz   6,2(4)
Packit 6c4009
    stb   6,2(3)
Packit 6c4009
    b     0f
Packit 6c4009
Packit 6c4009
1:  bf    31,0f
Packit 6c4009
    lbz   6,0(4)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
0:
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
Packit 6c4009
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
Packit 6c4009
   tests.
Packit 6c4009
Packit 6c4009
   In the short (0-8 byte) case no attempt is made to force alignment
Packit 6c4009
   of either source or destination.  The hardware will handle the
Packit 6c4009
   unaligned load/stores with small delays for crossing 32-, 128-byte,
Packit 6c4009
   and 4096-byte boundaries. Since these short moves are unlikely to be
Packit 6c4009
   unaligned or cross these boundaries, the overhead to force
Packit 6c4009
   alignment is not justified.
Packit 6c4009
Packit 6c4009
   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
Packit 6c4009
   boundaries.  Since only loads are sensitive to the 32-/128-byte
Packit 6c4009
   boundaries it is more important to align the source than the
Packit 6c4009
   destination.  If the source is not already word aligned, we first
Packit 6c4009
   move 1-3 bytes as needed.  Since we are only word aligned we don't
Packit 6c4009
   use double word load/stores to ensure that all loads are aligned.
Packit 6c4009
   While the destination and stores may still be unaligned, this
Packit 6c4009
   is only an issue for page (4096 byte boundary) crossing, which
Packit 6c4009
   should be rare for these short moves.  The hardware handles this
Packit 6c4009
   case automatically with a small (~20 cycle) delay.  */
Packit 6c4009
    .align  4
Packit 6c4009
.L2:
Packit 6c4009
    mtcrf 0x01,5
Packit 6c4009
    neg   8,4
Packit 6c4009
    clrrdi	11,4,2
Packit 6c4009
    andi. 0,8,3
Packit 6c4009
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
Packit 6c4009
/* At least 9 bytes left.  Get the source word aligned.  */
Packit 6c4009
    cmpldi	cr1,5,16
Packit 6c4009
    mr    10,5
Packit 6c4009
    mr    12,4
Packit 6c4009
    cmpldi	cr6,0,2
Packit 6c4009
    beq   L(dus_tail)	/* If the source is already word aligned skip this.  */
Packit 6c4009
/* Copy 1-3 bytes to get source address word aligned.  */
Packit 6c4009
    lwz   6,0(11)
Packit 6c4009
    subf  10,0,5
Packit 6c4009
    add   12,4,0
Packit 6c4009
    blt   cr6,5f
Packit 6c4009
    srdi  7,6,16
Packit 6c4009
    bgt	  cr6,3f
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    sth   7,0(3)
Packit 6c4009
#else
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
#endif
Packit 6c4009
    b     7f
Packit 6c4009
    .align  4
Packit 6c4009
3:
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    rotlwi 6,6,24
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
    sth   7,1(3)
Packit 6c4009
#else
Packit 6c4009
    stb   7,0(3)
Packit 6c4009
    sth   6,1(3)
Packit 6c4009
#endif
Packit 6c4009
    b     7f
Packit 6c4009
    .align  4
Packit 6c4009
5:
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    rotlwi 6,6,8
Packit 6c4009
#endif
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
7:
Packit 6c4009
    cmpldi	cr1,10,16
Packit 6c4009
    add   3,3,0
Packit 6c4009
    mtcrf 0x01,10
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail):
Packit 6c4009
/* At least 6 bytes left and the source is word aligned.  This allows
Packit 6c4009
   some speculative loads up front.  */
Packit 6c4009
/* We need to special case the fall-through because the biggest delays
Packit 6c4009
   are due to address computation not being ready in time for the
Packit 6c4009
   AGEN.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    lwz   7,4(12)
Packit 6c4009
    blt   cr1,L(dus_tail8)
Packit 6c4009
    cmpldi	cr0,10,24
Packit 6c4009
L(dus_tail16): /* Move 16 bytes.  */
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    stw   7,4(3)
Packit 6c4009
    lwz   6,8(12)
Packit 6c4009
    lwz   7,12(12)
Packit 6c4009
    stw   6,8(3)
Packit 6c4009
    stw   7,12(3)
Packit 6c4009
/* Move 8 bytes more.  */
Packit 6c4009
    bf    28,L(dus_tail16p8)
Packit 6c4009
    cmpldi	cr1,10,28
Packit 6c4009
    lwz   6,16(12)
Packit 6c4009
    lwz   7,20(12)
Packit 6c4009
    stw   6,16(3)
Packit 6c4009
    stw   7,20(3)
Packit 6c4009
/* Move 4 bytes more.  */
Packit 6c4009
    bf    29,L(dus_tail16p4)
Packit 6c4009
    lwz   6,24(12)
Packit 6c4009
    stw   6,24(3)
Packit 6c4009
    addi  12,12,28
Packit 6c4009
    addi  3,3,28
Packit 6c4009
    bgt   cr1,L(dus_tail2)
Packit 6c4009
 /* exactly 28 bytes.  Return original dst pointer and exit.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail16p8):  /* less than 8 bytes left.  */
Packit 6c4009
    beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
Packit 6c4009
    cmpldi	cr1,10,20
Packit 6c4009
    bf    29,L(dus_tail16p2)
Packit 6c4009
/* Move 4 bytes more.  */
Packit 6c4009
    lwz   6,16(12)
Packit 6c4009
    stw   6,16(3)
Packit 6c4009
    addi  12,12,20
Packit 6c4009
    addi  3,3,20
Packit 6c4009
    bgt   cr1,L(dus_tail2)
Packit 6c4009
 /* exactly 20 bytes.  Return original dst pointer and exit.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail16p4):  /* less than 4 bytes left.  */
Packit 6c4009
    addi  12,12,24
Packit 6c4009
    addi  3,3,24
Packit 6c4009
    bgt   cr0,L(dus_tail2)
Packit 6c4009
 /* exactly 24 bytes.  Return original dst pointer and exit.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
Packit 6c4009
    addi  12,12,16
Packit 6c4009
    addi  3,3,16
Packit 6c4009
    b     L(dus_tail2)
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail8):  /* Move 8 bytes.  */
Packit 6c4009
/*  r6, r7 already loaded speculatively.  */
Packit 6c4009
    cmpldi	cr1,10,8
Packit 6c4009
    cmpldi	cr0,10,12
Packit 6c4009
    bf    28,L(dus_tail4)
Packit 6c4009
    .align  2
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    stw   7,4(3)
Packit 6c4009
/* Move 4 bytes more.  */
Packit 6c4009
    bf    29,L(dus_tail8p4)
Packit 6c4009
    lwz   6,8(12)
Packit 6c4009
    stw   6,8(3)
Packit 6c4009
    addi  12,12,12
Packit 6c4009
    addi  3,3,12
Packit 6c4009
    bgt   cr0,L(dus_tail2)
Packit 6c4009
 /* exactly 12 bytes.  Return original dst pointer and exit.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail8p4):  /* less than 4 bytes left.  */
Packit 6c4009
    addi  12,12,8
Packit 6c4009
    addi  3,3,8
Packit 6c4009
    bgt   cr1,L(dus_tail2)
Packit 6c4009
 /* exactly 8 bytes.  Return original dst pointer and exit.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_tail4):  /* Move 4 bytes.  */
Packit 6c4009
/*  r6 already loaded speculatively.  If we are here we know there is
Packit 6c4009
    more than 4 bytes left.  So there is no need to test.  */
Packit 6c4009
    addi  12,12,4
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    addi  3,3,4
Packit 6c4009
L(dus_tail2):  /* Move 2-3 bytes.  */
Packit 6c4009
    bf    30,L(dus_tail1)
Packit 6c4009
    lhz   6,0(12)
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    bf    31,L(dus_tailX)
Packit 6c4009
    lbz   7,2(12)
Packit 6c4009
    stb   7,2(3)
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
L(dus_tail1):  /* Move 1 byte.  */
Packit 6c4009
    bf    31,L(dus_tailX)
Packit 6c4009
    lbz   6,0(12)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
L(dus_tailX):
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
/* Special case to copy 0-8 bytes.  */
Packit 6c4009
    .align  4
Packit 6c4009
.LE8:
Packit 6c4009
    mr    12,4
Packit 6c4009
    bne   cr6,L(dus_4)
Packit 6c4009
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
Packit 6c4009
   cycle delay.  This case should be rare and any attempt to avoid this
Packit 6c4009
   would take most of 20 cycles anyway.  */
Packit 6c4009
    ld   6,0(4)
Packit 6c4009
    std   6,0(3)
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_4):
Packit 6c4009
    bf    29,L(dus_tail2)
Packit 6c4009
    lwz   6,0(4)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    bf    30,L(dus_5)
Packit 6c4009
    lhz   7,4(4)
Packit 6c4009
    sth   7,4(3)
Packit 6c4009
    bf    31,L(dus_0)
Packit 6c4009
    lbz   8,6(4)
Packit 6c4009
    stb   8,6(3)
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
L(dus_5):
Packit 6c4009
    bf    31,L(dus_0)
Packit 6c4009
    lbz   6,4(4)
Packit 6c4009
    stb   6,4(3)
Packit 6c4009
L(dus_0):
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
.L6:
Packit 6c4009
    cfi_offset(31,-8)
Packit 6c4009
    mr    12,4
Packit 6c4009
    mr    31,5
Packit 6c4009
  /* Copy doublewords where the destination is aligned but the source is
Packit 6c4009
     not.  Use aligned doubleword loads from the source, shifted to realign
Packit 6c4009
     the data, to allow aligned destination stores.  */
Packit 6c4009
    addi    11,9,-1  /* loop DW count is one less than total */
Packit 6c4009
    subf    5,10,12  /* Move source addr to previous full double word.  */
Packit 6c4009
    cmpldi  cr5, 10, 2
Packit 6c4009
    cmpldi  cr0, 10, 4
Packit 6c4009
    mr      4,3
Packit 6c4009
    srdi    8,11,2   /* calculate the 32 byte loop count */
Packit 6c4009
    ld      6,0(5)   /* pre load 1st full doubleword.  */
Packit 6c4009
    mtcrf   0x01,11
Packit 6c4009
    cmpldi  cr6,9,4
Packit 6c4009
    mtctr   8
Packit 6c4009
    ld      7,8(5)   /* pre load 2nd full doubleword.  */
Packit 6c4009
    bge     cr0, L(du4_do)
Packit 6c4009
    blt     cr5, L(du1_do)
Packit 6c4009
    beq     cr5, L(du2_do)
Packit 6c4009
    b       L(du3_do)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du1_do):
Packit 6c4009
    bf      30,L(du1_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
    /* FIXME: can combine last shift and "or" into "rldimi" */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 8
Packit 6c4009
    sldi     8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 8
Packit 6c4009
    srdi     8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 8
Packit 6c4009
    sldi     8,6, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 8
Packit 6c4009
    srdi     8,6, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du1_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 8
Packit 6c4009
    sldi     8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 8
Packit 6c4009
    srdi     8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du1_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du1_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 8
Packit 6c4009
    sldi     8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 8
Packit 6c4009
    srdi     8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du1_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du1_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 8
Packit 6c4009
    sldi   8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 8
Packit 6c4009
    srdi   8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 8
Packit 6c4009
    sldi   8,6, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 8
Packit 6c4009
    srdi   8,6, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 8
Packit 6c4009
    sldi   8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 8
Packit 6c4009
    srdi   8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 8
Packit 6c4009
    sldi   8,6, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 8
Packit 6c4009
    srdi   8,6, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du1_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du1_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 8
Packit 6c4009
    sldi   8,7, 64-8
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 8
Packit 6c4009
    srdi   8,7, 64-8
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    std   0,0(4)
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du2_do):
Packit 6c4009
    bf      30,L(du2_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 16
Packit 6c4009
    sldi     8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 16
Packit 6c4009
    srdi     8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 16
Packit 6c4009
    sldi     8,6, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 16
Packit 6c4009
    srdi     8,6, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du2_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 16
Packit 6c4009
    sldi     8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 16
Packit 6c4009
    srdi     8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du2_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du2_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 16
Packit 6c4009
    sldi     8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 16
Packit 6c4009
    srdi     8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du2_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du2_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 16
Packit 6c4009
    sldi   8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 16
Packit 6c4009
    srdi   8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 16
Packit 6c4009
    sldi   8,6, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 16
Packit 6c4009
    srdi   8,6, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 16
Packit 6c4009
    sldi   8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 16
Packit 6c4009
    srdi   8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 16
Packit 6c4009
    sldi   8,6, 64-16
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 16
Packit 6c4009
    srdi   8,6, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du2_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 16-bit (2-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du2_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 16 (drops r6's low 16 bits),
       r8 = r7 << 48 (r7's low 16 bits become the top of the result).  */
    srdi   0,6, 16
Packit 6c4009
    sldi   8,7, 64-16
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 16 and r8 = r7 >> 48.  */
    sldi   0,6, 16
Packit 6c4009
    srdi   8,7, 64-16
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du3_do):
Packit 6c4009
    bf      30,L(du3_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 24
Packit 6c4009
    sldi     8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 24
Packit 6c4009
    srdi     8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 24
Packit 6c4009
    sldi     8,6, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 24
Packit 6c4009
    srdi     8,6, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du3_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 24
Packit 6c4009
    sldi     8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 24
Packit 6c4009
    srdi     8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du3_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du3_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 24
Packit 6c4009
    sldi     8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 24
Packit 6c4009
    srdi     8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du3_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du3_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 24
Packit 6c4009
    sldi   8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 24
Packit 6c4009
    srdi   8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 24
Packit 6c4009
    sldi   8,6, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 24
Packit 6c4009
    srdi   8,6, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 24
Packit 6c4009
    sldi   8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 24
Packit 6c4009
    srdi   8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 24
Packit 6c4009
    sldi   8,6, 64-24
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 24
Packit 6c4009
    srdi   8,6, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du3_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 24-bit (3-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du3_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 24 (drops r6's low 24 bits),
       r8 = r7 << 40 (r7's low 24 bits become the top of the result).  */
    srdi   0,6, 24
Packit 6c4009
    sldi   8,7, 64-24
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 24 and r8 = r7 >> 40.  */
    sldi   0,6, 24
Packit 6c4009
    srdi   8,7, 64-24
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
/* Dispatch for the larger shift amounts.  r10 is compared (unsigned)
   against 6 into cr5, and the result selects one of four paths:
     cr0 "equal"     -> L(du4_dox)  (32-bit shift)
     r10 <  6        -> L(du5_do)   (40-bit shift)
     r10 == 6        -> L(du6_do)   (48-bit shift)
     otherwise       -> L(du7_do)   (56-bit shift)
   cr0 is not set in this block; it is presumably the result of an earlier
   compare of r10 against 4 -- TODO confirm against the code that branches
   here.  r10 looks like the source byte offset within a doubleword, given
   that the targets shift by 8 * {4,5,6,7} bits -- verify.  */
L(du4_do):
Packit 6c4009
    cmpldi  cr5, 10, 6		/* cr5 = unsigned compare of r10 with 6 */
Packit 6c4009
    beq     cr0, L(du4_dox)	/* cr0 comes from code outside this block */
Packit 6c4009
    blt     cr5, L(du5_do)	/* r10 < 6 */
Packit 6c4009
    beq     cr5, L(du6_do)	/* r10 == 6 */
Packit 6c4009
    b       L(du7_do)		/* r10 > 6 */
Packit 6c4009
L(du4_dox):
Packit 6c4009
    bf      30,L(du4_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 32
Packit 6c4009
    sldi     8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 32
Packit 6c4009
    srdi     8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 32
Packit 6c4009
    sldi     8,6, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 32
Packit 6c4009
    srdi     8,6, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du4_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 32
Packit 6c4009
    sldi     8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 32
Packit 6c4009
    srdi     8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du4_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du4_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 32
Packit 6c4009
    sldi     8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 32
Packit 6c4009
    srdi     8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du4_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du4_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 32
Packit 6c4009
    sldi   8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 32
Packit 6c4009
    srdi   8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 32
Packit 6c4009
    sldi   8,6, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 32
Packit 6c4009
    srdi   8,6, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 32
Packit 6c4009
    sldi   8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 32
Packit 6c4009
    srdi   8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 32
Packit 6c4009
    sldi   8,6, 64-32
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 32
Packit 6c4009
    srdi   8,6, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du4_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 32-bit (4-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du4_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 32 (drops r6's low 32 bits),
       r8 = r7 << 32 (r7's low 32 bits become the top of the result).  */
    srdi   0,6, 32
Packit 6c4009
    sldi   8,7, 64-32
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 32 and r8 = r7 >> 32.  */
    sldi   0,6, 32
Packit 6c4009
    srdi   8,7, 64-32
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du5_do):
Packit 6c4009
    bf      30,L(du5_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 40
Packit 6c4009
    sldi     8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 40
Packit 6c4009
    srdi     8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 40
Packit 6c4009
    sldi     8,6, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 40
Packit 6c4009
    srdi     8,6, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du5_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 40
Packit 6c4009
    sldi     8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 40
Packit 6c4009
    srdi     8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du5_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du5_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 40
Packit 6c4009
    sldi     8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 40
Packit 6c4009
    srdi     8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du5_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du5_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 40
Packit 6c4009
    sldi   8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 40
Packit 6c4009
    srdi   8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 40
Packit 6c4009
    sldi   8,6, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 40
Packit 6c4009
    srdi   8,6, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 40
Packit 6c4009
    sldi   8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 40
Packit 6c4009
    srdi   8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 40
Packit 6c4009
    sldi   8,6, 64-40
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 40
Packit 6c4009
    srdi   8,6, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du5_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 40-bit (5-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du5_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 40 (drops r6's low 40 bits),
       r8 = r7 << 24 (r7's low 40 bits become the top of the result).  */
    srdi   0,6, 40
Packit 6c4009
    sldi   8,7, 64-40
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 40 and r8 = r7 >> 24.  */
    sldi   0,6, 40
Packit 6c4009
    srdi   8,7, 64-40
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du6_do):
Packit 6c4009
    bf      30,L(du6_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 48
Packit 6c4009
    sldi     8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 48
Packit 6c4009
    srdi     8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 48
Packit 6c4009
    sldi     8,6, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 48
Packit 6c4009
    srdi     8,6, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du6_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 48
Packit 6c4009
    sldi     8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 48
Packit 6c4009
    srdi     8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du6_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du6_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 48
Packit 6c4009
    sldi     8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 48
Packit 6c4009
    srdi     8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du6_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du6_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 48
Packit 6c4009
    sldi   8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 48
Packit 6c4009
    srdi   8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 48
Packit 6c4009
    sldi   8,6, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 48
Packit 6c4009
    srdi   8,6, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 48
Packit 6c4009
    sldi   8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 48
Packit 6c4009
    srdi   8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 48
Packit 6c4009
    sldi   8,6, 64-48
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 48
Packit 6c4009
    srdi   8,6, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du6_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 48-bit (6-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du6_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 48 (drops r6's low 48 bits),
       r8 = r7 << 16 (r7's low 48 bits become the top of the result).  */
    srdi   0,6, 48
Packit 6c4009
    sldi   8,7, 64-48
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 48 and r8 = r7 >> 16.  */
    sldi   0,6, 48
Packit 6c4009
    srdi   8,7, 64-48
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
L(du7_do):
Packit 6c4009
    bf      30,L(du7_1dw)
Packit 6c4009
Packit 6c4009
    /* there are at least two DWs to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 56
Packit 6c4009
    sldi     8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 56
Packit 6c4009
    srdi     8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      6,16(5)
Packit 6c4009
    std     0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,7, 56
Packit 6c4009
    sldi     8,6, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi     0,7, 56
Packit 6c4009
    srdi     8,6, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    ld      7,24(5)
Packit 6c4009
    std     0,8(4)
Packit 6c4009
    addi    4,4,16
Packit 6c4009
    addi    5,5,32
Packit 6c4009
    blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
Packit 6c4009
    bf      31,L(du7_loop)
Packit 6c4009
    /* there is a third DW to copy */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 56
Packit 6c4009
    sldi     8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 56
Packit 6c4009
    srdi     8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or      0,0,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
Packit 6c4009
    b       L(du7_loop)
Packit 6c4009
    .align 4
Packit 6c4009
L(du7_1dw):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi     0,6, 56
Packit 6c4009
    sldi     8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi     0,6, 56
Packit 6c4009
    srdi     8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    addi    5,5,16
Packit 6c4009
    or      0,0,8
Packit 6c4009
    bf      31,L(du7_loop)
Packit 6c4009
    mr      6,7
Packit 6c4009
    ld      7,0(5)
Packit 6c4009
    addi    5,5,8
Packit 6c4009
    std     0,0(4)
Packit 6c4009
    addi    4,4,8
Packit 6c4009
    .align 4
Packit 6c4009
/* copy 32 bytes at a time */
Packit 6c4009
L(du7_loop):
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 56
Packit 6c4009
    sldi   8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 56
Packit 6c4009
    srdi   8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 56
Packit 6c4009
    sldi   8,6, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 56
Packit 6c4009
    srdi   8,6, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,6, 56
Packit 6c4009
    sldi   8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi   0,6, 56
Packit 6c4009
    srdi   8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    6,16(5)
Packit 6c4009
    std   0,16(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srdi   0,7, 56
Packit 6c4009
    sldi   8,6, 64-56
Packit 6c4009
#else
Packit 6c4009
    sldi   0,7, 56
Packit 6c4009
    srdi   8,6, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    ld    7,24(5)
Packit 6c4009
    std   0,24(4)
Packit 6c4009
    addi  5,5,32
Packit 6c4009
    addi  4,4,32
Packit 6c4009
    bdnz+ L(du7_loop)
Packit 6c4009
    .align 4
Packit 6c4009
/* Final doubleword of the 56-bit (7-byte) shift path.  The two source
   doublewords still held in r6 and r7 are merged into r0, stored through
   r4, and control transfers to L(du_done).  No loads are issued here.  */
L(du7_fini):
Packit 6c4009
    /* calculate and store the final DW */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    /* LE: r0 = r6 >> 56 (drops r6's low 56 bits),
       r8 = r7 << 8 (r7's low 56 bits become the top of the result).  */
    srdi   0,6, 56
Packit 6c4009
    sldi   8,7, 64-56
Packit 6c4009
#else
Packit 6c4009
    /* BE: mirror image, r0 = r6 << 56 and r8 = r7 >> 8.  */
    sldi   0,6, 56
Packit 6c4009
    srdi   8,7, 64-56
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8		/* r0 = merged doubleword */
Packit 6c4009
    std   0,0(4)	/* store it at 0(r4) */
Packit 6c4009
    b     L(du_done)
Packit 6c4009
Packit 6c4009
    .align 4
Packit 6c4009
/* Common exit of all L(duN_fini) paths.  Copies the remaining tail in
   4-, 2- and 1-byte pieces, then restores r31, reloads r3 and returns.
   r31 supplies both the doubleword-rounded byte count (added to r3 and
   r12) and the low bits that select the tail pieces; it is presumably the
   remaining length set up before this chunk -- TODO confirm.  cr1 is also
   set outside this block.  */
L(du_done):
Packit 6c4009
    rldicr 0,31,0,60	/* r0 = r31 with its low 3 bits cleared */
Packit 6c4009
    mtcrf 0x01,31	/* cr7 (CR bits 28-31) = low 4 bits of r31 */
Packit 6c4009
    beq   cr1,0f	/* If the tail is 0 bytes we are done!  */
Packit 6c4009
Packit 6c4009
    add   3,3,0		/* advance the store pointer r3 by r0 bytes */
Packit 6c4009
    add   12,12,0	/* advance the load pointer r12 by r0 bytes */
Packit 6c4009
/*  At this point we have a tail of 0-7 bytes and we know that the
Packit 6c4009
    destination is double word aligned.  */
Packit 6c4009
4:  bf    29,2f		/* CR bit 29 clear: no 4-byte piece */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    addi  12,12,4
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    addi  3,3,4
Packit 6c4009
2:  bf    30,1f		/* CR bit 30 clear: no 2-byte piece */
Packit 6c4009
    lhz   6,0(12)
Packit 6c4009
    addi  12,12,2
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    addi  3,3,2
Packit 6c4009
1:  bf    31,0f		/* CR bit 31 clear: no final byte */
Packit 6c4009
    lbz   6,0(12)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
0:
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    /* Both slots sit below the stack pointer r1; they are presumably
       written at function entry, outside this chunk -- verify.  */
    ld 31,-8(1)
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
END_GEN_TB (MEMCPY,TB_TOCLESS)
Packit 6c4009
libc_hidden_builtin_def (memcpy)