Blame sysdeps/powerpc/powerpc32/strlen.S

Packit Service 82fcde
/* Optimized strlen implementation for PowerPC.
Packit Service 82fcde
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
Packit Service 82fcde
   This file is part of the GNU C Library.
Packit Service 82fcde
Packit Service 82fcde
   The GNU C Library is free software; you can redistribute it and/or
Packit Service 82fcde
   modify it under the terms of the GNU Lesser General Public
Packit Service 82fcde
   License as published by the Free Software Foundation; either
Packit Service 82fcde
   version 2.1 of the License, or (at your option) any later version.
Packit Service 82fcde
Packit Service 82fcde
   The GNU C Library is distributed in the hope that it will be useful,
Packit Service 82fcde
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit Service 82fcde
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit Service 82fcde
   Lesser General Public License for more details.
Packit Service 82fcde
Packit Service 82fcde
   You should have received a copy of the GNU Lesser General Public
Packit Service 82fcde
   License along with the GNU C Library; if not, see
Packit Service 82fcde
   <http://www.gnu.org/licenses/>.  */
Packit Service 82fcde
Packit Service 82fcde
#include <sysdep.h>
Packit Service 82fcde
Packit Service 82fcde
/* The algorithm here uses the following techniques:
Packit Service 82fcde
Packit Service 82fcde
   1) Given a word 'x', we can test to see if it contains any 0 bytes
Packit Service 82fcde
      by subtracting 0x01010101, and seeing if any of the high bits of each
Packit Service 82fcde
      byte changed from 0 to 1. This works because the least significant
Packit Service 82fcde
      0 byte must have had no incoming carry (otherwise it's not the least
Packit Service 82fcde
      significant), so it is 0x00 - 0x01 == 0xff. For all other
Packit Service 82fcde
      byte values, either they have the high bit set initially, or when
Packit Service 82fcde
      1 is subtracted you get a value in the range 0x00-0x7f, none of which
Packit Service 82fcde
      have their high bit set. The expression here is
Packit Service 82fcde
      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
Packit Service 82fcde
      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
Packit Service 82fcde
      match, but possibly false 0x80 matches in the next more significant
Packit Service 82fcde
      byte to a true match due to carries.  For little-endian this is
Packit Service 82fcde
      of no consequence since the least significant match is the one
Packit Service 82fcde
      we're interested in, but big-endian needs method 2 to find which
Packit Service 82fcde
      byte matches.
Packit Service 82fcde
Packit Service 82fcde
   2) Given a word 'x', we can test to see _which_ byte was zero by
Packit Service 82fcde
      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
Packit Service 82fcde
      This produces 0x80 in each byte that was zero, and 0x00 in all
Packit Service 82fcde
      the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
Packit Service 82fcde
      byte, and the '| x' part ensures that bytes with the high bit set
Packit Service 82fcde
      produce 0x00. The addition will carry into the high bit of each byte
Packit Service 82fcde
      iff that byte had one of its low 7 bits set. We can then just see
Packit Service 82fcde
      which was the most significant bit set and divide by 8 to find how
Packit Service 82fcde
      many to add to the index.
Packit Service 82fcde
      This is from the book 'The PowerPC Compiler Writer's Guide',
Packit Service 82fcde
      by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
Packit Service 82fcde
Packit Service 82fcde
   We deal with strings not aligned to a word boundary by taking the
Packit Service 82fcde
   first word and ensuring that bytes not part of the string
Packit Service 82fcde
   are treated as nonzero. To allow for memory latency, we unroll the
Packit Service 82fcde
   loop a few times, being careful to ensure that we do not read ahead
Packit Service 82fcde
   across cache line boundaries.
Packit Service 82fcde
Packit Service 82fcde
   Questions to answer:
Packit Service 82fcde
   1) How long are strings passed to strlen? If they're often really long,
Packit Service 82fcde
   we should probably use cache management instructions and/or unroll the
Packit Service 82fcde
   loop more. If they're often quite short, it might be better to use
Packit Service 82fcde
   fact (2) in the inner loop than have to recalculate it.
Packit Service 82fcde
   2) How popular are bytes with the high bit set? If they are very rare,
Packit Service 82fcde
   on some processors it might be useful to use the simpler expression
Packit Service 82fcde
   ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
Packit Service 82fcde
   ALU), but this fails when any character has its high bit set.  */
Packit Service 82fcde
Packit Service 82fcde
/* Some notes on register usage: Under the SVR4 ABI, we can use registers
Packit Service 82fcde
   0 and 3 through 12 (so long as we don't call any procedures) without
Packit Service 82fcde
   saving them. We can also use registers 14 through 31 if we save them.
Packit Service 82fcde
   We can't use r1 (it's the stack pointer), r2 nor r13 because the user
Packit Service 82fcde
   program may expect them to hold their usual value if we get sent
Packit Service 82fcde
   a signal. Integer parameters are passed in r3 through r10.
Packit Service 82fcde
   We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
Packit Service 82fcde
   them, the others we must save.  */
Packit Service 82fcde
Packit Service 82fcde
/* size_t [r3] strlen (const char *s [r3])  */
Packit Service 82fcde
Packit Service 82fcde
ENTRY (strlen)
/* Overview of the body below:
     1. Round the pointer down to a word boundary, load that word and
	mask off the bytes that precede the string.  Method (2) from the
	file header is used so the exact zero byte is known immediately.
     2. If the following word is the second half of an aligned
	doubleword, check it on its own (again with method (2)).
     3. Loop over two words per iteration using the cheaper method (1)
	test; both loads of an iteration lie in one aligned doubleword.
     4. L(done0)/L(done1) convert the 0x80 marker bits into a byte index
	and add it to the distance between the word pointer and the
	original pointer.
   The instructions in this body write only r0, r3-r12, cr0 and cr7 and
   make no stack accesses.  */
Packit Service 82fcde
Packit Service 82fcde
#define rTMP4	r0	/* scratch */
Packit Service 82fcde
#define rRTN	r3	/* incoming STR arg, outgoing result */
Packit Service 82fcde
#define rSTR	r4	/* current string position */
Packit Service 82fcde
#define rPADN	r5	/* number of padding bits we prepend to the
Packit Service 82fcde
			   string to make it start at a word boundary */
Packit Service 82fcde
#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
Packit Service 82fcde
#define r7F7F	r7	/* constant 0x7f7f7f7f */
Packit Service 82fcde
#define rWORD1	r8	/* current string word */
Packit Service 82fcde
#define rWORD2	r9	/* next string word (same register as rMASK; only
			   loaded inside L(loop), after rMASK's last use) */
Packit Service 82fcde
#define rMASK	r9	/* mask for first string word */
Packit Service 82fcde
#define rTMP1	r10	/* scratch */
Packit Service 82fcde
#define rTMP2	r11	/* scratch */
Packit Service 82fcde
#define rTMP3	r12	/* scratch; holds the zero-byte markers at L(done0) */
Packit Service 82fcde
Packit Service 82fcde
Packit Service 82fcde
	clrrwi	rSTR, rRTN, 2		/* rSTR = s rounded down to a word boundary.  */
Packit Service 82fcde
	lis	r7F7F, 0x7f7f		/* r7F7F = 0x7f7f0000 (upper half of constant).  */
Packit Service 82fcde
	rlwinm	rPADN, rRTN, 3, 27, 28	/* rPADN = (s & 3) * 8, i.e. 0, 8, 16 or 24.  */
Packit Service 82fcde
	lwz	rWORD1, 0(rSTR)		/* First aligned word; may contain bytes before s.  */
Packit Service 82fcde
	li	rMASK, -1		/* All ones; shifted below to drop the padding.  */
Packit Service 82fcde
	addi	r7F7F, r7F7F, 0x7f7f	/* r7F7F = 0x7f7f7f7f.  */
Packit Service 82fcde
/* We use method (2) on the first two words, because rFEFE isn't
Packit Service 82fcde
   required which reduces setup overhead.  Also gives a faster return
Packit Service 82fcde
   for small strings on big-endian due to needing to recalculate with
Packit Service 82fcde
   method (2) anyway.  */
Packit Service 82fcde
#ifdef __LITTLE_ENDIAN__
Packit Service 82fcde
	slw	rMASK, rMASK, rPADN	/* Zero the rPADN low-order bits (lowest addresses).  */
Packit Service 82fcde
#else
Packit Service 82fcde
	srw	rMASK, rMASK, rPADN	/* Zero the rPADN high-order bits (lowest addresses).  */
Packit Service 82fcde
#endif
Packit Service 82fcde
	and	rTMP1, r7F7F, rWORD1	/* rTMP1 = x & 0x7f7f7f7f.  */
Packit Service 82fcde
	or	rTMP2, r7F7F, rWORD1	/* rTMP2 = x | 0x7f7f7f7f.  */
Packit Service 82fcde
	add	rTMP1, rTMP1, r7F7F	/* rTMP1 = (x & 0x7f7f7f7f) + 0x7f7f7f7f.  */
Packit Service 82fcde
	nor	rTMP3, rTMP2, rTMP1	/* rTMP3 = method (2): 0x80 in each zero byte.  */
Packit Service 82fcde
	and.	rTMP3, rTMP3, rMASK	/* Drop markers in the padding bytes; sets cr0.  */
Packit Service 82fcde
	mtcrf	0x01, rRTN		/* Low 4 bits of s -> cr7, for the 'bt 29' below.  */
Packit Service 82fcde
	bne	L(done0)		/* cr0 from 'and.': zero byte in the first word.  */
Packit Service 82fcde
	lis	rFEFE, -0x101		/* rFEFE = 0xfeff0000.  */
Packit Service 82fcde
	addi	rFEFE, rFEFE, -0x101	/* rFEFE = 0xfeff0000 - 0x101 = 0xfefefeff.  */
Packit Service 82fcde
/* Are we now aligned to a doubleword boundary?  */
/* CR bit 29 is the bit of s with value 4.  When it is set, the word just
   checked was the second half of a doubleword, so rSTR+4 is 8-byte
   aligned and the loop can start directly.  */
Packit Service 82fcde
	bt	29, L(loop)
Packit Service 82fcde
Packit Service 82fcde
/* Handle second word of pair.  */
Packit Service 82fcde
/* Perhaps use method (1) here for little-endian, saving one instruction?  */
Packit Service 82fcde
	lwzu	rWORD1, 4(rSTR)		/* Load the next word and advance rSTR to it.  */
Packit Service 82fcde
	and	rTMP1, r7F7F, rWORD1	/* Method (2) again, no mask needed this time.  */
Packit Service 82fcde
	or	rTMP2, r7F7F, rWORD1
Packit Service 82fcde
	add	rTMP1, rTMP1, r7F7F
Packit Service 82fcde
	nor.	rTMP3, rTMP2, rTMP1	/* rTMP3 = zero-byte markers; sets cr0.  */
Packit Service 82fcde
	bne	L(done0)
Packit Service 82fcde
Packit Service 82fcde
/* The loop.  */
/* On entry rSTR+4 is 8-byte aligned, and rSTR advances by 8 per
   iteration, so rWORD1 and rWORD2 always come from one aligned
   doubleword.  Both words are tested with method (1); the instructions
   for the two words are interleaved.  */
Packit Service 82fcde
Packit Service 82fcde
L(loop):
Packit Service 82fcde
	lwz	rWORD1, 4(rSTR)
Packit Service 82fcde
	lwzu	rWORD2, 8(rSTR)		/* rSTR += 8; it now points at rWORD2.  */
Packit Service 82fcde
	add	rTMP1, rFEFE, rWORD1	/* rTMP1 = x1 + 0xfefefeff.  */
Packit Service 82fcde
	nor	rTMP2, r7F7F, rWORD1	/* rTMP2 = ~(x1 | 0x7f7f7f7f).  */
Packit Service 82fcde
	and.	rTMP1, rTMP1, rTMP2	/* Method (1) for rWORD1; sets cr0.  */
Packit Service 82fcde
	add	rTMP3, rFEFE, rWORD2	/* rTMP3 = x2 + 0xfefefeff.  */
Packit Service 82fcde
	nor	rTMP4, r7F7F, rWORD2	/* rTMP4 = ~(x2 | 0x7f7f7f7f).  */
Packit Service 82fcde
	bne	L(done1)		/* cr0 still from the rWORD1 test: zero in rWORD1.  */
Packit Service 82fcde
	and.	rTMP3, rTMP3, rTMP4	/* Method (1) for rWORD2; sets cr0.  */
Packit Service 82fcde
	beq	L(loop)			/* No zero byte in either word: keep going.  */
Packit Service 82fcde
Packit Service 82fcde
#ifndef __LITTLE_ENDIAN__
/* Big-endian: the zero byte is in rWORD2 and rSTR already points at it.
   Recompute exact markers with method (2), reusing
   rTMP4 = ~(x2 | 0x7f7f7f7f) from the loop.  */
Packit Service 82fcde
	and	rTMP1, r7F7F, rWORD2
Packit Service 82fcde
	add	rTMP1, rTMP1, r7F7F
Packit Service 82fcde
	andc	rTMP3, rTMP4, rTMP1	/* rTMP3 = rTMP4 & ~rTMP1 = method (2) result.  */
Packit Service 82fcde
	b	L(done0)
Packit Service 82fcde
Packit Service 82fcde
/* Big-endian: the zero byte is in rWORD1.  Same method (2) recomputation,
   reusing rTMP2 = ~(x1 | 0x7f7f7f7f) from the loop.  */
L(done1):
Packit Service 82fcde
	and	rTMP1, r7F7F, rWORD1
Packit Service 82fcde
	subi	rSTR, rSTR, 4		/* Step rSTR back from rWORD2 to rWORD1.  */
Packit Service 82fcde
	add	rTMP1, rTMP1, r7F7F
Packit Service 82fcde
	andc	rTMP3, rTMP2, rTMP1
Packit Service 82fcde
Packit Service 82fcde
/* When we get to here, rSTR points to the first word in the string that
Packit Service 82fcde
   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
Packit Service 82fcde
   and 0x00 otherwise.  */
Packit Service 82fcde
L(done0):
Packit Service 82fcde
	cntlzw	rTMP3, rTMP3		/* Leading zero bits before the first marker.  */
Packit Service 82fcde
	subf	rTMP1, rRTN, rSTR	/* rTMP1 = rSTR - s (negative if s was unaligned).  */
Packit Service 82fcde
	srwi	rTMP3, rTMP3, 3		/* Bits -> byte index of the zero within the word.  */
Packit Service 82fcde
	add	rRTN, rTMP1, rTMP3	/* Length = word offset + byte index.  */
Packit Service 82fcde
	blr
Packit Service 82fcde
#else
Packit Service 82fcde
Packit Service 82fcde
/* Little-endian: rSTR points at the word containing the first zero byte
   and rTMP3 holds marker bits whose lowest set bit belongs to that byte
   (higher bits may be false matches from method (1) and are ignored).  */
L(done0):
Packit Service 82fcde
	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
Packit Service 82fcde
	andc	rTMP1, rTMP1, rTMP3
Packit Service 82fcde
	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
Packit Service 82fcde
	subf	rTMP3, rRTN, rSTR	/* rTMP3 = rSTR - s.  */
Packit Service 82fcde
	subfic	rTMP1, rTMP1, 32-7	/* = index of lowest marker bit - 7 = 8 * byte index.  */
Packit Service 82fcde
	srwi	rTMP1, rTMP1, 3		/* Byte index of the zero within the word.  */
Packit Service 82fcde
	add	rRTN, rTMP1, rTMP3	/* Length = word offset + byte index.  */
Packit Service 82fcde
	blr
Packit Service 82fcde
Packit Service 82fcde
/* Little-endian: the zero byte is in rWORD1 and its method (1) markers
   are in rTMP1, but rSTR points at rWORD2, 4 bytes further on.  The extra
   -32 in the subfic makes the shifted result (byte index - 4), hence the
   arithmetic shift to keep the sign.  */
L(done1):
Packit Service 82fcde
	addi	rTMP3, rTMP1, -1	/* Mask of the bits below the lowest marker.  */
Packit Service 82fcde
	andc	rTMP3, rTMP3, rTMP1
Packit Service 82fcde
	cntlzw	rTMP3, rTMP3
Packit Service 82fcde
	subf	rTMP1, rRTN, rSTR	/* rTMP1 = rSTR - s, with rSTR at rWORD2.  */
Packit Service 82fcde
	subfic	rTMP3, rTMP3, 32-7-32	/* = 8 * byte index - 32.  */
Packit Service 82fcde
	srawi	rTMP3, rTMP3, 3		/* = byte index - 4 (signed).  */
Packit Service 82fcde
	add	rRTN, rTMP1, rTMP3
Packit Service 82fcde
	blr
Packit Service 82fcde
#endif
Packit Service 82fcde
Packit Service 82fcde
END (strlen)
Packit Service 82fcde
libc_hidden_builtin_def (strlen)