Blame sysdeps/powerpc/powerpc32/strlen.S

Packit 6c4009
/* Optimized strlen implementation for PowerPC.
Packit 6c4009
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
/* The algorithm here uses the following techniques:
Packit 6c4009
Packit 6c4009
   1) Given a word 'x', we can test to see if it contains any 0 bytes
Packit 6c4009
      by subtracting 0x01010101, and seeing if any of the high bits of each
Packit 6c4009
      byte changed from 0 to 1. This works because the least significant
Packit 6c4009
      0 byte must have had no incoming carry (otherwise it's not the least
Packit 6c4009
      significant), so it is 0x00 - 0x01 == 0xff. For all other
Packit 6c4009
      byte values, either they have the high bit set initially, or when
Packit 6c4009
      1 is subtracted you get a value in the range 0x00-0x7f, none of which
Packit 6c4009
      have their high bit set. The expression here is
Packit 6c4009
      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
Packit 6c4009
      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
Packit 6c4009
      match, but possibly false 0x80 matches in the next more significant
Packit 6c4009
      byte to a true match due to carries.  For little-endian this is
Packit 6c4009
      of no consequence since the least significant match is the one
Packit 6c4009
      we're interested in, but big-endian needs method 2 to find which
Packit 6c4009
      byte matches.
Packit 6c4009
Packit 6c4009
   2) Given a word 'x', we can test to see _which_ byte was zero by
Packit 6c4009
      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
Packit 6c4009
      This produces 0x80 in each byte that was zero, and 0x00 in all
Packit 6c4009
      the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
Packit 6c4009
      byte, and the '| x' part ensures that bytes with the high bit set
Packit 6c4009
      produce 0x00. The addition will carry into the high bit of each byte
Packit 6c4009
      iff that byte had one of its low 7 bits set. We can then count the
Packit 6c4009
      leading zero bits above the most significant set bit and divide by 8
Packit 6c4009
      to find how many bytes to add to the index.
Packit 6c4009
      This is from the book 'The PowerPC Compiler Writer's Guide',
Packit 6c4009
      by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
Packit 6c4009
Packit 6c4009
   We deal with strings not aligned to a word boundary by taking the
Packit 6c4009
   first word and ensuring that bytes not part of the string
Packit 6c4009
   are treated as nonzero. To allow for memory latency, we unroll the
Packit 6c4009
   loop a few times, being careful to ensure that we do not read ahead
Packit 6c4009
   across cache line boundaries.
Packit 6c4009
Packit 6c4009
   Questions to answer:
Packit 6c4009
   1) How long are strings passed to strlen? If they're often really long,
Packit 6c4009
   we should probably use cache management instructions and/or unroll the
Packit 6c4009
   loop more. If they're often quite short, it might be better to use
Packit 6c4009
   fact (2) in the inner loop than have to recalculate it.
Packit 6c4009
   2) How popular are bytes with the high bit set? If they are very rare,
Packit 6c4009
   on some processors it might be useful to use the simpler expression
Packit 6c4009
   ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
Packit 6c4009
   ALU), but this fails when any character has its high bit set.  */
Packit 6c4009
Packit 6c4009
/* Some notes on register usage: Under the SVR4 ABI, we can use registers
Packit 6c4009
   0 and 3 through 12 (so long as we don't call any procedures) without
Packit 6c4009
   saving them. We can also use registers 14 through 31 if we save them.
Packit 6c4009
   We can't use r1 (it's the stack pointer), r2 nor r13 because the user
Packit 6c4009
   program may expect them to hold their usual value if we get sent
Packit 6c4009
   a signal. Integer parameters are passed in r3 through r10.
Packit 6c4009
   We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
Packit 6c4009
   them, the others we must save.  */
Packit 6c4009
Packit 6c4009
/* size_t [r3] strlen (const char *s [r3])  */
Packit 6c4009
Packit 6c4009
ENTRY (strlen)
/* Returns in r3 the number of bytes that precede the first zero byte of
   the string whose address arrives in r3.  Memory is only ever read as
   aligned words: the word containing the first byte (with the bytes ahead
   of the string masked out of the test), optionally one more word, and
   then two words per loop iteration.  Only r0, r4-r12, cr0 and cr7 are
   written besides the r3 result, and nothing is stored to memory.

   NOTE(review): the "Packit 6c4009" lines interleaved through this
   listing look like blame-view annotations rather than assembler input;
   confirm and strip them before assembling.  */
Packit 6c4009
Packit 6c4009
#define rTMP4	r0	/* scratch; only used as a nor/and./andc operand */
Packit 6c4009
#define rRTN	r3	/* incoming STR arg, outgoing result */
Packit 6c4009
#define rSTR	r4	/* current string position */
Packit 6c4009
#define rPADN	r5	/* number of padding bits we prepend to the
Packit 6c4009
			   string to make it start at a word boundary */
Packit 6c4009
#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
Packit 6c4009
#define r7F7F	r7	/* constant 0x7f7f7f7f */
Packit 6c4009
#define rWORD1	r8	/* current string word */
Packit 6c4009
#define rWORD2	r9	/* next string word */
Packit 6c4009
#define rMASK	r9	/* mask for first string word */
/* rMASK and rWORD2 share r9: rMASK is last read by the first-word test,
   rWORD2 is first written inside the loop.  */
Packit 6c4009
#define rTMP1	r10
Packit 6c4009
#define rTMP2	r11
Packit 6c4009
#define rTMP3	r12
Packit 6c4009
Packit 6c4009
Packit 6c4009
	clrrwi	rSTR, rRTN, 2	/* rSTR = s & ~3, the word holding s[0].  */
Packit 6c4009
	lis	r7F7F, 0x7f7f	/* Upper half of 0x7f7f7f7f.  */
Packit 6c4009
	rlwinm	rPADN, rRTN, 3, 27, 28	/* rPADN = (s & 3) * 8 bits.  */
Packit 6c4009
	lwz	rWORD1, 0(rSTR)
Packit 6c4009
	li	rMASK, -1
Packit 6c4009
	addi	r7F7F, r7F7F, 0x7f7f	/* r7F7F = 0x7f7f7f7f.  */
Packit 6c4009
/* We use method (2) on the first two words, because rFEFE isn't
Packit 6c4009
   required which reduces setup overhead.  Also gives a faster return
Packit 6c4009
   for small strings on big-endian due to needing to recalculate with
Packit 6c4009
   method (2) anyway.  */
Packit 6c4009
/* Shift zeros into rMASK over the rPADN bits that belong to the bytes
   ahead of s[0]: the low-order bits on little-endian, the high-order
   bits on big-endian.  */
#ifdef __LITTLE_ENDIAN__
Packit 6c4009
	slw	rMASK, rMASK, rPADN
Packit 6c4009
#else
Packit 6c4009
	srw	rMASK, rMASK, rPADN
Packit 6c4009
#endif
Packit 6c4009
	and	rTMP1, r7F7F, rWORD1	/* x & 0x7f7f7f7f  */
Packit 6c4009
	or	rTMP2, r7F7F, rWORD1	/* x | 0x7f7f7f7f  */
Packit 6c4009
	add	rTMP1, rTMP1, r7F7F	/* (x & 0x7f7f7f7f) + 0x7f7f7f7f  */
Packit 6c4009
	nor	rTMP3, rTMP2, rTMP1	/* Method (2): 0x80 per zero byte.  */
Packit 6c4009
	and.	rTMP3, rTMP3, rMASK	/* Drop hits in the padding; set cr0.  */
Packit 6c4009
	mtcrf	0x01, rRTN	/* Low 4 bits of s into cr7 (CR bits 28-31).  */
Packit 6c4009
	bne	L(done0)	/* A zero byte of the string is in word 0.  */
Packit 6c4009
	lis	rFEFE, -0x101
Packit 6c4009
	addi	rFEFE, rFEFE, -0x101	/* rFEFE = 0xfefefeff.  */
Packit 6c4009
/* Are we now aligned to a doubleword boundary?  */
/* CR bit 29 holds the 4s bit of s.  When it is set, the word just tested
   was the second word of an aligned doubleword, so the next word starts
   one and the loop can load whole pairs straight away.  */
Packit 6c4009
	bt	29, L(loop)
Packit 6c4009
Packit 6c4009
/* Handle second word of pair.  */
Packit 6c4009
/* Perhaps use method (1) here for little-endian, saving one instruction?  */
Packit 6c4009
	lwzu	rWORD1, 4(rSTR)	/* Load next word; rSTR += 4.  */
Packit 6c4009
	and	rTMP1, r7F7F, rWORD1
Packit 6c4009
	or	rTMP2, r7F7F, rWORD1
Packit 6c4009
	add	rTMP1, rTMP1, r7F7F
Packit 6c4009
	nor.	rTMP3, rTMP2, rTMP1	/* Method (2) again; no mask needed.  */
Packit 6c4009
	bne	L(done0)
Packit 6c4009
Packit 6c4009
/* The loop.  */
/* Each pass tests the two words after rSTR with method (1) and leaves
   rSTR pointing at the second of them (rWORD2).  The add/nor for rWORD2
   sit between the and. for rWORD1 and the branch on it; being non-record
   forms they leave cr0 alone.  */
Packit 6c4009
Packit 6c4009
L(loop):
Packit 6c4009
	lwz	rWORD1, 4(rSTR)
Packit 6c4009
	lwzu	rWORD2, 8(rSTR)	/* rSTR += 8, now addresses rWORD2.  */
Packit 6c4009
	add	rTMP1, rFEFE, rWORD1	/* x + 0xfefefeff  */
Packit 6c4009
	nor	rTMP2, r7F7F, rWORD1	/* ~(x | 0x7f7f7f7f)  */
Packit 6c4009
	and.	rTMP1, rTMP1, rTMP2	/* Nonzero iff rWORD1 has a zero byte.  */
Packit 6c4009
	add	rTMP3, rFEFE, rWORD2
Packit 6c4009
	nor	rTMP4, r7F7F, rWORD2
Packit 6c4009
	bne	L(done1)	/* Zero byte in rWORD1.  */
Packit 6c4009
	and.	rTMP3, rTMP3, rTMP4	/* Nonzero iff rWORD2 has a zero byte.  */
Packit 6c4009
	beq	L(loop)
Packit 6c4009
Packit 6c4009
#ifndef __LITTLE_ENDIAN__
/* Big-endian, zero byte in rWORD2: redo the test with method (2) so that
   only true zero bytes are flagged, reusing rTMP4 = ~(x | 0x7f7f7f7f).  */
Packit 6c4009
	and	rTMP1, r7F7F, rWORD2
Packit 6c4009
	add	rTMP1, rTMP1, r7F7F
Packit 6c4009
	andc	rTMP3, rTMP4, rTMP1
Packit 6c4009
	b	L(done0)
Packit 6c4009
Packit 6c4009
/* Big-endian, zero byte in rWORD1: same recalculation from rTMP2, and
   step rSTR back from rWORD2 to rWORD1.  Falls through into done0.  */
L(done1):
Packit 6c4009
	and	rTMP1, r7F7F, rWORD1
Packit 6c4009
	subi	rSTR, rSTR, 4
Packit 6c4009
	add	rTMP1, rTMP1, r7F7F
Packit 6c4009
	andc	rTMP3, rTMP2, rTMP1
Packit 6c4009
Packit 6c4009
/* When we get to here, rSTR points to the first word in the string that
Packit 6c4009
   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
Packit 6c4009
   and 0x00 otherwise.  */
Packit 6c4009
L(done0):
Packit 6c4009
	cntlzw	rTMP3, rTMP3	/* 8 * index of the first zero byte.  */
Packit 6c4009
	subf	rTMP1, rRTN, rSTR	/* rSTR - s (may be <= 0 for word 0).  */
Packit 6c4009
	srwi	rTMP3, rTMP3, 3	/* Bits to bytes.  */
Packit 6c4009
	add	rRTN, rTMP1, rTMP3
Packit 6c4009
	blr
Packit 6c4009
#else
Packit 6c4009
Packit 6c4009
/* Little-endian: rSTR points to the word holding the zero byte and
   rTMP3 has 0x80 set in it; only the lowest set bit is used, so the
   possible false matches of method (1) in higher bytes do not matter.  */
L(done0):
Packit 6c4009
	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
Packit 6c4009
	andc	rTMP1, rTMP1, rTMP3
Packit 6c4009
	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
Packit 6c4009
	subf	rTMP3, rRTN, rSTR	/* rSTR - s.  */
Packit 6c4009
	subfic	rTMP1, rTMP1, 32-7	/* 8 * index of the first zero byte.  */
Packit 6c4009
	srwi	rTMP1, rTMP1, 3
Packit 6c4009
	add	rRTN, rTMP1, rTMP3
Packit 6c4009
	blr
Packit 6c4009
Packit 6c4009
/* Little-endian, zero byte in rWORD1 while rSTR already points at
   rWORD2.  Same computation on rTMP1, but biased by -32 bits and shifted
   arithmetically so that the byte index comes out 4 smaller.  */
L(done1):
Packit 6c4009
	addi	rTMP3, rTMP1, -1
Packit 6c4009
	andc	rTMP3, rTMP3, rTMP1
Packit 6c4009
	cntlzw	rTMP3, rTMP3
Packit 6c4009
	subf	rTMP1, rRTN, rSTR
Packit 6c4009
	subfic	rTMP3, rTMP3, 32-7-32
Packit 6c4009
	srawi	rTMP3, rTMP3, 3
Packit 6c4009
	add	rRTN, rTMP1, rTMP3
Packit 6c4009
	blr
Packit 6c4009
#endif
Packit 6c4009
Packit 6c4009
END (strlen)
Packit 6c4009
/* NOTE(review): libc_hidden_builtin_def is defined outside this file;
   presumably it provides the hidden alias used for calls from inside
   libc -- confirm against the glibc headers.  */
libc_hidden_builtin_def (strlen)