Blame sysdeps/ia64/memchr.S

Packit 6c4009
/* Optimized version of the standard memchr() function.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
   Copyright (C) 2000-2018 Free Software Foundation, Inc.
Packit 6c4009
   Contributed by Dan Pop <Dan.Pop@cern.ch>.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
/* Return: the address of the first occurrence of chr in str or NULL
Packit 6c4009
Packit 6c4009
   Inputs:
Packit 6c4009
	in0:	str
Packit 6c4009
	in1:	chr
Packit 6c4009
	in2:	byte count
Packit 6c4009
Packit 6c4009
   This implementation assumes little endian mode.  For big endian mode,
Packit 6c4009
   the instruction czx1.r should be replaced by czx1.l.
Packit 6c4009
Packit 6c4009
   The algorithm is fairly straightforward: search byte by byte until we
Packit 6c4009
   get to a word aligned address, then search word by word as much as
Packit 6c4009
   possible; the remaining few bytes are searched one at a time.
Packit 6c4009
Packit 6c4009
   The word by word search is performed by xor-ing the word with a word
Packit 6c4009
   containing chr in every byte.  If there is a hit, the result will
Packit 6c4009
   contain a zero byte in the corresponding position.  The presence and
Packit 6c4009
   position of that zero byte is detected with a czx instruction.
Packit 6c4009
Packit 6c4009
   All the loops in this function could have had the internal branch removed
Packit 6c4009
   if br.ctop and br.cloop could be predicated :-(.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#undef ret
Packit 6c4009
Packit 6c4009
/* Register aliases used by __memchr:

	saved_pr	copy of the predicate registers taken on entry and
			restored just before returning
	saved_lc	copy of ar.lc taken on entry and restored just
			before returning
	chr		the byte searched for (low 8 bits of in1, zero extended)
	len		number of bytes still to be searched
	last		str + byte count, forced to -1 if that sum wraps around
	val		byte fetched by the byte-at-a-time loops
	tmp		scratch: str % 8 during setup, and the word fetched
			by the non-speculative recovery loop
	chrx8		chr replicated into all eight bytes of a word
	loopcnt		staging register for the trip counts written to ar.lc
	str		first argument (in0), the start of the search area  */
#define saved_pr	r15
Packit 6c4009
#define saved_lc	r16
Packit 6c4009
#define	chr		r17
Packit 6c4009
#define len		r18
Packit 6c4009
#define last		r20
Packit 6c4009
#define val		r21
Packit 6c4009
#define tmp		r24
Packit 6c4009
#define chrx8		r25
Packit 6c4009
#define loopcnt		r30
Packit 6c4009
Packit 6c4009
#define str		in0
Packit 6c4009
Packit 6c4009
/* __memchr (in0 = str, in1 = chr, in2 = byte count)

   Phases, in program order:
     .l1	byte loop that advances ret0 to an 8-byte boundary
		(skipped when str is already aligned);
     .l2	software-pipelined loop that tests one 8-byte word per
		iteration using speculative loads;
     .l3	byte loop for the leftover bytes, also used on its own
		when the byte count is below 16;
     .l4	non-speculative word loop, entered only through .recovery
		when a speculative load in .l2 fails its chk.s.

   ret0 doubles as the running pointer.  The byte loops leave it one
   past the matching byte and signal the hit in p6; the word loops
   signal the hit in p7 and leave the word address and byte index in
   addr[MEMLAT+2] and poschr[1].  .foundit turns either form into the
   returned address.  */
ENTRY(__memchr)
Packit 6c4009
	.prologue
Packit 6c4009
	// 3 inputs, 0 locals, 29 outputs, 32 rotating registers
	alloc r2 = ar.pfs, 3, 0, 29, 32
Packit 6c4009
#include "softpipe.h"
Packit 6c4009
	// Names for the rotating registers and predicates used by .l2.
	// MEMLAT is not defined in this file (presumably it comes from
	// softpipe.h); the check in .recovery requires it to be 6.
	.rotr	value[MEMLAT+1], addr[MEMLAT+3], aux[2], poschr[2]
Packit 6c4009
	.rotp	p[MEMLAT+3]
Packit 6c4009
	.save ar.lc, saved_lc
Packit 6c4009
	mov	saved_lc = ar.lc	// save the loop counter
Packit 6c4009
	.save pr, saved_pr
Packit 6c4009
	mov	saved_pr = pr		// save the predicates
Packit 6c4009
	.body
Packit 6c4009
	mov	ret0 = str		// ret0 is the running pointer
Packit 6c4009
	add	last = str, in2		// one past the final byte to search
Packit 6c4009
	;;
Packit 6c4009
	cmp.ltu	p6, p0 = last, str	// p6 = str + count wrapped around
Packit 6c4009
	;;
Packit 6c4009
(p6)	mov	last = -1		// on wraparound use the highest address
Packit 6c4009
	and	tmp = 7, str		// tmp = str % 8
Packit 6c4009
	cmp.ne	p7, p0 = r0, r0		// clear p7
Packit 6c4009
	extr.u	chr = in1, 0, 8		// chr = (unsigned char) in1
Packit 6c4009
	mov	len = in2
Packit 6c4009
	cmp.gtu	p6, p0 = 16, in2	// use a simple loop for short
Packit 6c4009
(p6)	br.cond.spnt .srchfew ;;	// searches
Packit 6c4009
	sub	loopcnt = 8, tmp	// loopcnt = 8 - tmp
Packit 6c4009
	cmp.eq	p6, p0 = tmp, r0	// p6 = str is already 8-byte aligned
Packit 6c4009
(p6)	br.cond.sptk	.str_aligned;;
Packit 6c4009
	sub	len = len, loopcnt	// bytes left after the alignment loop
Packit 6c4009
	adds	loopcnt = -1, loopcnt;;	// ar.lc holds the trip count minus one
Packit 6c4009
	mov	ar.lc = loopcnt
Packit 6c4009
	// Byte loop up to the next 8-byte boundary.
.l1:
Packit 6c4009
	ld1	val = [ret0], 1		// fetch one byte, then ret0 += 1
Packit 6c4009
	;;
Packit 6c4009
	cmp.eq	p6, p0 = val, chr
Packit 6c4009
(p6)	br.cond.spnt	.foundit	// hit: ret0 is one past the match
Packit 6c4009
	br.cloop.sptk	.l1 ;;
Packit 6c4009
	// ret0 is 8-byte aligned from here on; set up the word loop.
.str_aligned:
Packit 6c4009
	cmp.ne	p6, p0 = r0, r0		// clear p6
Packit 6c4009
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
Packit 6c4009
	and	len = 7, len ;;		// remaining len = len & 7
Packit 6c4009
	adds	loopcnt = -1, loopcnt	// ar.lc holds the trip count minus one
Packit 6c4009
	mov	ar.ec = MEMLAT + 3	// epilogue count = number of stages
Packit 6c4009
	mux1	chrx8 = chr, @brcst ;;	// get a word full of chr
Packit 6c4009
	mov	ar.lc = loopcnt
Packit 6c4009
	mov	pr.rot = 1 << 16 ;;	// only p16 set among rotating predicates
Packit 6c4009
	// Pipelined word loop.  Each word goes through these stages:
	//   0		remember its address and issue the speculative load
	//   MEMLAT	check the load, then xor the word with chrx8
	//   MEMLAT+1	locate the first zero byte of the xor result
	//   MEMLAT+2	test whether a zero byte (a match) was located
.l2:
Packit 6c4009
(p[0])		mov	addr[0] = ret0
Packit 6c4009
(p[0])		ld8.s	value[0] = [ret0], 8	 // speculative load
Packit 6c4009
(p[MEMLAT])	chk.s	value[MEMLAT], .recovery // check and recovery
Packit 6c4009
(p[MEMLAT])	xor	aux[0] = value[MEMLAT], chrx8
Packit 6c4009
		// czx1.r gives the index of the first zero byte, 8 if none
(p[MEMLAT+1])	czx1.r	poschr[0] = aux[1]
Packit 6c4009
(p[MEMLAT+2])	cmp.ne	p7, p0 = 8, poschr[1]
Packit 6c4009
(p7)		br.cond.dpnt .foundit
Packit 6c4009
		br.ctop.dptk .l2
Packit 6c4009
	// Byte loop over the remaining len bytes.  Reached by falling out of
	// .l2 (len = leftover count below 8) or directly from the entry code
	// when the byte count is below 16 (len = full count).
.srchfew:
Packit 6c4009
	adds	loopcnt = -1, len	// ar.lc holds the trip count minus one
Packit 6c4009
	cmp.eq	p6, p0 = len, r0	// nothing left to search
Packit 6c4009
(p6)	br.cond.spnt .notfound ;;
Packit 6c4009
	mov	ar.lc = loopcnt
Packit 6c4009
.l3:
Packit 6c4009
	ld1	val = [ret0], 1		// fetch one byte, then ret0 += 1
Packit 6c4009
	;;
Packit 6c4009
	cmp.eq	p6, p0 = val, chr
Packit 6c4009
(p6)	br.cond.dpnt	.foundit	// hit: ret0 is one past the match
Packit 6c4009
	br.cloop.sptk	.l3 ;;
Packit 6c4009
	// With p6 and p7 both clear, neither adjustment in .foundit runs,
	// so the NULL stored here is what gets returned.
.notfound:
Packit 6c4009
	cmp.ne	p6, p0 = r0, r0	// clear p6 (p7 was already 0 when we got here)
Packit 6c4009
	mov	ret0 = r0 ;;	// return NULL
Packit 6c4009
	// Common exit: compute the result, restore pr and ar.lc, return.
.foundit:
Packit 6c4009
	.pred.rel "mutex" p6, p7
Packit 6c4009
(p6)	adds	ret0 = -1, ret0			   // if we got here from l1 or l3
Packit 6c4009
(p7)	add	ret0 = addr[MEMLAT+2], poschr[1]   // if we got here from l2
Packit 6c4009
	mov	pr = saved_pr, -1
Packit 6c4009
	mov	ar.lc = saved_lc
Packit 6c4009
	br.ret.sptk.many b0
Packit 6c4009
Packit 6c4009
	// Target of the chk.s in .l2.  Every stage predicate that is set
	// moves ret0 back by one word, which presumably leaves ret0 at the
	// oldest word that has not been tested yet.  The stage list is
	// written out by hand for nine stages, hence the MEMLAT check.
.recovery:
Packit 6c4009
#if MEMLAT != 6
Packit 6c4009
# error "MEMLAT must be 6!"
Packit 6c4009
#endif
Packit 6c4009
(p[MEMLAT-6])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT-5])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT-4])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT-3])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT-2])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT-1])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT+1])	add	ret0 = -8, ret0;;
Packit 6c4009
(p[MEMLAT+2])	add	ret0 = -8, ret0;;
Packit 6c4009
	// Non-speculative word loop, one word per pass, no pipelining.
	// NOTE(review): when the word holds no match (p7 clear) nothing
	// below compares ret0 against last before the next ld8 is issued;
	// confirm that this loop cannot run past the end of the search area.
.l4:
Packit 6c4009
	mov     addr[MEMLAT+2] = ret0
Packit 6c4009
	ld8	tmp = [ret0];;		// load the first unchecked 8byte
Packit 6c4009
	xor	aux[1] = tmp, chrx8;;
Packit 6c4009
	czx1.r	poschr[1] = aux[1];;
Packit 6c4009
	cmp.ne	p7, p0 = 8, poschr[1];;	// p7 = this word contains chr
Packit 6c4009
(p7)	add	ret0 = addr[MEMLAT+2], poschr[1];;	// address of the match
Packit 6c4009
(p7)	cmp.geu	p6, p7 = ret0, last	// don't go over the last byte
Packit 6c4009
(p6)	br.cond.spnt	.notfound;;
Packit 6c4009
(p7)	br.cond.spnt	.foundit;;
Packit 6c4009
	adds	ret0 = 8, ret0		// load the next unchecked 8byte
Packit 6c4009
	br.sptk	.l4;;
Packit 6c4009
Packit 6c4009
END(__memchr)
Packit 6c4009
Packit 6c4009
/* Publish __memchr under the name memchr.  Both macros are defined
   outside this file; presumably weak_alias emits memchr as a weak alias
   of __memchr and libc_hidden_builtin_def sets up the library-internal
   hidden definition -- confirm against their definitions.  */
weak_alias (__memchr, memchr)
Packit 6c4009
libc_hidden_builtin_def (memchr)