Blame sysdeps/powerpc/powerpc64/memcpy.S

Packit 6c4009
/* Optimized memcpy implementation for PowerPC64.
Packit 6c4009
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
Packit 6c4009
   Returns 'dst'.
Packit 6c4009
Packit 6c4009
   Memcpy handles short copies (< 32-bytes) using binary move blocks
Packit 6c4009
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
Packit 6c4009
   with the appropriate combination of byte and halfword load/stores.
Packit 6c4009
   There is minimal effort to optimize the alignment of short moves.
Packit 6c4009
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
Packit 6c4009
   of handling unaligned load/stores that do not cross 32-byte boundaries.
Packit 6c4009
Packit 6c4009
   Longer moves (>= 32-bytes) justify the effort to get at least the
Packit 6c4009
   destination doubleword (8-byte) aligned.  Further optimization is
Packit 6c4009
   possible when both source and destination are doubleword aligned.
Packit 6c4009
   Each case has an optimized unrolled loop.   */
Packit 6c4009
Packit 6c4009
/* Name of the entry point defined below.  It defaults to memcpy unless
   MEMCPY has already been defined before this point.  */
#ifndef MEMCPY
Packit 6c4009
# define MEMCPY memcpy
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
/* Entry point: r3 = dst, r4 = src, r5 = len.

   The original dst is stored at -16(r1) and r31 at -8(r1); every return
   path reloads r3 from that slot, and the long-copy paths also reload r31.

   Copies of 31 bytes or fewer branch to .L2.  For longer copies, 0-7
   leading bytes are moved so that dst becomes doubleword aligned.  When
   control reaches .L0 the state is:
     r3  = dst write pointer (doubleword aligned)
     r12 = src read pointer
     r31 = bytes still to copy
     r9  = r31 / 8 (full doublewords), r10 = r12 & 7
     cr6 = eq iff (src & 7) == (dst & 7) on entry.  Advancing both
	   pointers by the same amount keeps that relation, so eq also
	   means the src pointer is doubleword aligned at .L0.  */
ENTRY_TOCLESS (MEMCPY, 5)
Packit 6c4009
	CALL_MCOUNT 3
Packit 6c4009
Packit 6c4009
    cmpldi cr1,5,31	/* cr1: len vs 31 (short or long copy).  */
Packit 6c4009
    neg   0,3		/* r0 = -dst.  */
Packit 6c4009
    std   3,-16(1)	/* Save original dst for the return value.  */
Packit 6c4009
    std   31,-8(1)	/* Save r31; the long path keeps the length in it.  */
Packit 6c4009
    cfi_offset(31,-8)
Packit 6c4009
    andi. 11,3,7	/* check alignment of dst.  */
Packit 6c4009
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
Packit 6c4009
    clrldi 10,4,61	/* check alignment of src.  */
Packit 6c4009
    cmpldi cr6,5,8	/* cr6: len vs 8; consumed at .L2.  */
Packit 6c4009
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
Packit 6c4009
    cmpld cr6,10,11	/* cr6: (src & 7) vs (dst & 7); consumed at .L0.  */
Packit 6c4009
    mr    12,4		/* r12 = src read pointer.  */
Packit 6c4009
    srdi  9,5,3		/* Number of full double words remaining.  */
Packit 6c4009
    mtcrf 0x01,0	/* cr7 = low 4 bits of the alignment byte count.  */
Packit 6c4009
    mr    31,5		/* r31 = len.  */
Packit 6c4009
    beq   .L0		/* cr0 from andi.: dst is already DW aligned.  */
Packit 6c4009
Packit 6c4009
    subf  31,0,5	/* r31 = len minus the bytes moved to align dst.  */
Packit 6c4009
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
     cr7 bits 31, 30 and 29 hold the 1, 2 and 4 bits of the byte count.  */
Packit 6c4009
1:  bf    31,2f		/* No single byte to move.  */
Packit 6c4009
    lbz   6,0(12)
Packit 6c4009
    addi  12,12,1
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
    addi  3,3,1
Packit 6c4009
2:  bf    30,4f		/* No halfword to move.  */
Packit 6c4009
    lhz   6,0(12)
Packit 6c4009
    addi  12,12,2
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    addi  3,3,2
Packit 6c4009
4:  bf    29,0f		/* No word to move.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    addi  12,12,4
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    addi  3,3,4
Packit 6c4009
0:
Packit 6c4009
    clrldi 10,12,61	/* check alignment of src again.  */
Packit 6c4009
    srdi  9,31,3	/* Number of full double words remaining.  */
Packit 6c4009
Packit 6c4009
  /* Copy doublewords from source to destination, assuming the
Packit 6c4009
     destination is aligned on a doubleword boundary.
Packit 6c4009
Packit 6c4009
     At this point we know there are at least 25 bytes left (32-7) to copy.
Packit 6c4009
     The next step is to determine if the source is also doubleword aligned.
Packit 6c4009
     If not, branch to the unaligned move code at .L6, which uses
Packit 6c4009
     a load, shift, store strategy.
Packit 6c4009
Packit 6c4009
     Otherwise source and destination are doubleword aligned, and we can
Packit 6c4009
     use the optimized doubleword copy loop.  */
Packit 6c4009
.L0:
Packit 6c4009
    clrldi	11,31,61	/* r11 = r31 & 7: bytes left after the doublewords.  */
Packit 6c4009
    mtcrf 0x01,9	/* cr7 = low 4 bits of the doubleword count.  */
Packit 6c4009
    bne-  cr6,.L6   /* If source is not DW aligned.  */
Packit 6c4009
Packit 6c4009
  /* Move doublewords where destination and source are DW aligned.
Packit 6c4009
     Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
Packit 6c4009
     If the copy is not an exact multiple of 32 bytes, 1-3
Packit 6c4009
     doublewords are copied as needed to set up the main loop.  After
Packit 6c4009
     the main loop exits there may be a tail of 1-7 bytes. These bytes are
Packit 6c4009
     copied a word/halfword/byte at a time as needed to preserve alignment.  */
Packit 6c4009
Packit 6c4009
    srdi  8,31,5	/* r8 = number of 32-byte loop iterations.  */
Packit 6c4009
    cmpldi	cr1,9,4	/* cr1: doubleword count vs 4.  */
Packit 6c4009
    cmpldi	cr6,11,0	/* cr6: eq when there is no byte tail.  */
Packit 6c4009
    mr    11,12		/* r11 = src pointer used by the loop.  */
Packit 6c4009
Packit 6c4009
    bf    30,1f		/* 2-bit of the count clear: no leading pair.  */
Packit 6c4009
    ld    6,0(12)
Packit 6c4009
    ld    7,8(12)
Packit 6c4009
    addi  11,12,16
Packit 6c4009
    mtctr 8		/* ctr = loop iterations.  */
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    std   7,8(3)
Packit 6c4009
    addi  10,3,16	/* r10 = dst pointer used by the loop.  */
Packit 6c4009
    bf    31,4f		/* 1-bit of the count clear: enter the loop.  */
Packit 6c4009
    ld    0,16(12)	/* Third leading doubleword.  */
Packit 6c4009
    std   0,16(3)
Packit 6c4009
  /* With at least 25 bytes left the count is at least 3, so a count below
     4 means exactly 3 doublewords, all already copied, and ctr is 0: the
     loop must be skipped.  */
    blt   cr1,3f
Packit 6c4009
    addi  11,12,24
Packit 6c4009
    addi  10,3,24
Packit 6c4009
    b     4f
Packit 6c4009
    .align  4
Packit 6c4009
1:
Packit 6c4009
    mr    10,3		/* r10 = dst pointer used by the loop.  */
Packit 6c4009
    mtctr 8		/* ctr = loop iterations.  */
Packit 6c4009
    bf    31,4f		/* 1-bit of the count clear: enter the loop.  */
Packit 6c4009
    ld    6,0(12)	/* Single leading doubleword.  */
Packit 6c4009
    addi  11,12,8
Packit 6c4009
    std   6,0(3)
Packit 6c4009
    addi  10,3,8
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
  /* Main loop: load 32 bytes through r11, store them through r10, repeat
     ctr times.  r3 and r12 are not advanced here; they are stepped past
     the doublewords at .L9.  */
4:
Packit 6c4009
    ld    6,0(11)
Packit 6c4009
    ld    7,8(11)
Packit 6c4009
    ld    8,16(11)
Packit 6c4009
    ld    0,24(11)
Packit 6c4009
    addi  11,11,32
Packit 6c4009
2:
Packit 6c4009
    std   6,0(10)
Packit 6c4009
    std   7,8(10)
Packit 6c4009
    std   8,16(10)
Packit 6c4009
    std   0,24(10)
Packit 6c4009
    addi  10,10,32
Packit 6c4009
    bdnz  4b
Packit 6c4009
3:
Packit 6c4009
Packit 6c4009
    rldicr 0,31,0,60	/* r0 = r31 with the low 3 bits cleared.  */
Packit 6c4009
    mtcrf 0x01,31	/* cr7 = low 4 bits of r31 (selects tail moves).  */
Packit 6c4009
    beq   cr6,0f	/* No tail bytes: return.  */
Packit 6c4009
  /* .L9 is also entered from the unaligned-source code (.L6) with r0, cr7
     and r31 set up the same way.  Step dst and src past the bytes that
     were copied as doublewords.  */
.L9:
Packit 6c4009
    add   3,3,0
Packit 6c4009
    add   12,12,0
Packit 6c4009
Packit 6c4009
/*  At this point we have a tail of 0-7 bytes and we know that the
Packit 6c4009
    destination is double word aligned.  */
Packit 6c4009
4:  bf    29,2f		/* No word in the tail.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    addi  12,12,4
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    addi  3,3,4
Packit 6c4009
2:  bf    30,1f		/* No halfword in the tail.  */
Packit 6c4009
    lhz   6,0(12)
Packit 6c4009
    addi  12,12,2
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    addi  3,3,2
Packit 6c4009
1:  bf    31,0f		/* No final byte.  */
Packit 6c4009
    lbz   6,0(12)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
0:
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld 31,-8(1)		/* Restore r31 saved at entry.  */
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
Packit 6c4009
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
Packit 6c4009
   tests.
Packit 6c4009
Packit 6c4009
   In the short (0-8 byte) case no attempt is made to force alignment
Packit 6c4009
   of either source or destination.  The hardware will handle the
Packit 6c4009
   unaligned load/stores with small delays for crossing 32-, 64-byte, and
Packit 6c4009
   4096-byte boundaries. Since these short moves are unlikely to be
Packit 6c4009
   unaligned or cross these boundaries, the overhead to force
Packit 6c4009
   alignment is not justified.
Packit 6c4009
Packit 6c4009
   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
Packit 6c4009
   boundaries.  Since only loads are sensitive to the 32-/64-byte
Packit 6c4009
   boundaries it is more important to align the source than the
Packit 6c4009
   destination.  If the source is not already word aligned, we first
Packit 6c4009
   move 1-3 bytes as needed.  Since we are only word aligned we don't
Packit 6c4009
   use double word load/stores to ensure that all loads are aligned.
Packit 6c4009
   While the destination and stores may still be unaligned, this
Packit 6c4009
   is only an issue for page (4096 byte boundary) crossing, which
Packit 6c4009
   should be rare for these short moves.  The hardware handles this
Packit 6c4009
   case automatically with a small delay.  */
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
.L2:
Packit 6c4009
/* Short copy: len <= 31, r3 = dst, r4 = src, r5 = len.  cr6 still holds
   len compared with 8 from the function entry.  cr7 receives the low 4
   bits of the remaining length and drives the 8/4/2/1 byte moves below;
   cr1 holds the remaining length compared with 16.  r31 is not modified on
   this path, so the returns here reload only r3.  */
    mtcrf 0x01,5	/* cr7 = low 4 bits of len.  */
Packit 6c4009
    neg   8,4		/* r8 = -src.  */
Packit 6c4009
    clrrdi	11,4,2	/* r11 = src rounded down to a word boundary.  */
Packit 6c4009
    andi. 0,8,3		/* r0 = bytes up to the next word boundary of src.  */
Packit 6c4009
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
Packit 6c4009
/* At least 9 bytes left.  Get the source word aligned.  */
Packit 6c4009
    cmpldi	cr1,5,16	/* cr1: len vs 16.  */
Packit 6c4009
    mr    10,5		/* r10 = remaining length.  */
Packit 6c4009
    mr    12,4		/* r12 = src read pointer.  */
Packit 6c4009
    cmpldi	cr6,0,2	/* cr6: alignment byte count vs 2.  */
Packit 6c4009
    beq   .L3	/* If the source is already word aligned skip this.  */
Packit 6c4009
/* Copy 1-3 bytes to get source address word aligned.  The bytes are taken
   from the aligned word that contains src, so this load is aligned.  */
Packit 6c4009
    lwz   6,0(11)
Packit 6c4009
    subf  10,0,5	/* r10 = len minus the alignment bytes.  */
Packit 6c4009
    add   12,4,0	/* r12 = src advanced to the word boundary.  */
Packit 6c4009
    blt   cr6,5f	/* One byte to move.  */
Packit 6c4009
    srdi  7,6,16	/* r7 = upper halfword of the loaded word.  */
Packit 6c4009
    bgt	  cr6,3f	/* Three bytes to move.  */
Packit 6c4009
/* Two bytes: they are the last two bytes of the word in memory, which is
   the upper halfword of the register on little-endian and the lower
   halfword on big-endian.  */
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    sth   7,0(3)
Packit 6c4009
#else
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
#endif
Packit 6c4009
    b     7f
Packit 6c4009
    .align  4
Packit 6c4009
/* Three bytes: store the second byte of the word, then its last two.  */
3:
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    rotlwi 6,6,24	/* Bring the second memory byte to the low byte.  */
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
    sth   7,1(3)
Packit 6c4009
#else
Packit 6c4009
    stb   7,0(3)
Packit 6c4009
    sth   6,1(3)
Packit 6c4009
#endif
Packit 6c4009
    b     7f
Packit 6c4009
    .align  4
Packit 6c4009
/* One byte: store the last byte of the word.  */
5:
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    rotlwi 6,6,8	/* Bring the last memory byte to the low byte.  */
Packit 6c4009
#endif
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
7:
Packit 6c4009
    cmpldi	cr1,10,16	/* cr1: remaining length vs 16.  */
Packit 6c4009
    add   3,3,0		/* dst += alignment bytes just stored.  */
Packit 6c4009
    mtcrf 0x01,10	/* cr7 = low 4 bits of the remaining length.  */
Packit 6c4009
    .align  4
Packit 6c4009
.L3:
Packit 6c4009
/* At least 6 bytes left and the source is word aligned.  */
Packit 6c4009
    blt   cr1,8f	/* Fewer than 16 bytes left.  */
Packit 6c4009
16: /* Move 16 bytes.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    lwz   7,4(12)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    lwz   6,8(12)
Packit 6c4009
    stw   7,4(3)
Packit 6c4009
    lwz   7,12(12)
Packit 6c4009
    addi  12,12,16
Packit 6c4009
    stw   6,8(3)
Packit 6c4009
    stw   7,12(3)
Packit 6c4009
    addi  3,3,16
Packit 6c4009
8:  /* Move 8 bytes.  */
Packit 6c4009
    bf    28,4f		/* 8-bit of the length clear.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    lwz   7,4(12)
Packit 6c4009
    addi  12,12,8
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    stw   7,4(3)
Packit 6c4009
    addi  3,3,8
Packit 6c4009
4:  /* Move 4 bytes.  */
Packit 6c4009
    bf    29,2f		/* 4-bit of the length clear.  */
Packit 6c4009
    lwz   6,0(12)
Packit 6c4009
    addi  12,12,4
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    addi  3,3,4
Packit 6c4009
2:  /* Move 2-3 bytes.  Also the target of the "2b" branch in .LE8.  */
Packit 6c4009
    bf    30,1f		/* 2-bit of the length clear.  */
Packit 6c4009
    lhz   6,0(12)
Packit 6c4009
    sth   6,0(3)
Packit 6c4009
    bf    31,0f		/* No third byte.  */
Packit 6c4009
    lbz   7,2(12)
Packit 6c4009
    stb   7,2(3)
Packit 6c4009
    ld 3,-16(1)		/* Return original dst pointer.  */
Packit 6c4009
    blr
Packit 6c4009
1:  /* Move 1 byte.  */
Packit 6c4009
    bf    31,0f
Packit 6c4009
    lbz   6,0(12)
Packit 6c4009
    stb   6,0(3)
Packit 6c4009
0:
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
/* Special case to copy 0-8 bytes.  Reached from .L2 with r3 = dst,
   r4 = src, cr6 = len compared with 8 and cr7 = low 4 bits of len.
   No alignment is attempted; offsets are applied directly to r3/r4.  */
Packit 6c4009
    .align  4
Packit 6c4009
.LE8:
Packit 6c4009
    mr    12,4		/* r12 = src, needed by the shared code at "2b".  */
Packit 6c4009
    bne   cr6,4f	/* len is not exactly 8.  */
Packit 6c4009
/* Would have liked to use ld/std here but the 630 processors are
Packit 6c4009
   slow for load/store doubles that are not at least word aligned.
Packit 6c4009
   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
Packit 6c4009
    lwz   6,0(4)
Packit 6c4009
    lwz   7,4(4)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
    stw   7,4(3)
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
/* len is 0-7.  If the 4-bit is clear (len 0-3), branch back to the nearest
   preceding label 2, the "Move 2-3 bytes" code after .L3, which copies
   through r12 and r3 and returns.  Otherwise copy the first word here and
   handle the remaining 0-3 bytes at offsets 4-6.  */
4:  bf    29,2b
Packit 6c4009
    lwz   6,0(4)
Packit 6c4009
    stw   6,0(3)
Packit 6c4009
6:
Packit 6c4009
    bf    30,5f		/* 2-bit of len clear: at most one more byte.  */
Packit 6c4009
    lhz   7,4(4)
Packit 6c4009
    sth   7,4(3)
Packit 6c4009
    bf    31,0f		/* len was 6: done.  */
Packit 6c4009
    lbz   8,6(4)	/* len was 7: seventh byte.  */
Packit 6c4009
    stb   8,6(3)
Packit 6c4009
    ld 3,-16(1)		/* Return original dst pointer.  */
Packit 6c4009
    blr
Packit 6c4009
    .align  4
Packit 6c4009
5:
Packit 6c4009
    bf    31,0f		/* len was 4: done.  */
Packit 6c4009
    lbz   6,4(4)	/* len was 5: fifth byte.  */
Packit 6c4009
    stb   6,4(3)
Packit 6c4009
    .align  4
Packit 6c4009
0:
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld    3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
Packit 6c4009
    .align  4
Packit 6c4009
.L6:
Packit 6c4009
Packit 6c4009
  /* Copy doublewords where the destination is aligned but the source is
Packit 6c4009
     not.  Use aligned doubleword loads from the source, shifted to realign
Packit 6c4009
     the data, to allow aligned destination stores.

     On entry (from .L0): r3 = dst, r12 = src, r10 = src & 7, r9 = number
     of doublewords to store, r11 = tail byte count, r31 = bytes remaining.

     Register use in the loop:
       r5       = aligned source address (src rounded down to a doubleword)
       r10, r9  = shift counts in bits: 8 * (src & 7) and 64 minus that
       r11      = doublewords still to store
       r4       = destination store pointer
       r6, r7   = consecutive aligned source doublewords
       r0, r8   = the two shifted pieces merged into each stored doubleword

     Each pass of the loop at label 0 stores two doublewords.  When the
     count is odd the loop is entered at label 1, in the middle of a pass,
     so that only one doubleword is stored the first time through.  */
Packit 6c4009
    subf  5,10,12	/* r5 = src - (src & 7).  */
Packit 6c4009
    andi. 0,9,1		/* cr0: eq when the doubleword count is even.  */
Packit 6c4009
    cmpldi cr6,11,0	/* cr6: eq when there is no byte tail.  */
Packit 6c4009
    sldi  10,10,3	/* r10 = misalignment in bits.  */
Packit 6c4009
    mr    11,9		/* r11 = doubleword count.  */
Packit 6c4009
    mr    4,3		/* r4 = dst store pointer.  */
Packit 6c4009
    ld    6,0(5)
Packit 6c4009
    ld    7,8(5)
Packit 6c4009
    subfic  9,10,64	/* r9 = 64 - r10.  */
Packit 6c4009
    beq   2f		/* Even count: enter the loop at the top.  */
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srd   0,6,10
Packit 6c4009
#else
Packit 6c4009
    sld   0,6,10
Packit 6c4009
#endif
Packit 6c4009
    cmpldi  11,1	/* cr0: eq when only one doubleword is left.  */
Packit 6c4009
    mr    6,7
Packit 6c4009
    addi  4,4,-8	/* Bias r4 so the store at 8(r4) lands on dst.  */
Packit 6c4009
    addi  11,11,-1
Packit 6c4009
    b     1f
Packit 6c4009
2:  addi  5,5,8
Packit 6c4009
    .align  4
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
0:  srd   0,6,10
Packit 6c4009
    sld   8,7,9
Packit 6c4009
#else
Packit 6c4009
0:  sld   0,6,10
Packit 6c4009
    srd   8,7,9
Packit 6c4009
#endif
Packit 6c4009
    cmpldi  11,2	/* cr0: eq when this is the last pair.  */
Packit 6c4009
    ld    6,8(5)
Packit 6c4009
    or    0,0,8
Packit 6c4009
    addi  11,11,-2
Packit 6c4009
    std   0,0(4)
Packit 6c4009
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
    srd   0,7,10
Packit 6c4009
1:  sld   8,6,9
Packit 6c4009
#else
Packit 6c4009
    sld   0,7,10
Packit 6c4009
1:  srd   8,6,9
Packit 6c4009
#endif
Packit 6c4009
    or    0,0,8
Packit 6c4009
    beq   8f		/* Last doubleword is in r0: leave the loop.  */
Packit 6c4009
    ld    7,16(5)
Packit 6c4009
    std   0,8(4)
Packit 6c4009
    addi  5,5,16
Packit 6c4009
    addi  4,4,16
Packit 6c4009
    b     0b
Packit 6c4009
    .align 4
Packit 6c4009
8:
Packit 6c4009
    std   0,8(4)	/* Store the final doubleword.  */
Packit 6c4009
    rldicr 0,31,0,60	/* r0 = r31 with the low 3 bits cleared.  */
Packit 6c4009
    mtcrf 0x01,31	/* cr7 = low 4 bits of r31 (selects tail moves).  */
Packit 6c4009
    bne   cr6,.L9	/* Tail bytes remain: copy them at .L9.  Otherwise we are done!  */
Packit 6c4009
  /* Return original dst pointer.  */
Packit 6c4009
    ld 31,-8(1)		/* Restore r31 saved at entry.  */
Packit 6c4009
    ld 3,-16(1)
Packit 6c4009
    blr
Packit 6c4009
END_GEN_TB (MEMCPY,TB_TOCLESS)
Packit 6c4009
libc_hidden_builtin_def (memcpy)