Blame sysdeps/alpha/alphaev6/stxncpy.S

Packit 6c4009
/* Copyright (C) 2000-2018 Free Software Foundation, Inc.
Packit 6c4009
   Contributed by Richard Henderson (rth@tamu.edu)
Packit 6c4009
   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library.  If not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
/* Copy no more than COUNT bytes of the null-terminated string from
Packit 6c4009
   SRC to DST.
Packit 6c4009
Packit 6c4009
   This is an internal routine used by strncpy, stpncpy, and strncat.
Packit 6c4009
   As such, it uses special linkage conventions to make implementation
Packit 6c4009
   of these public functions more efficient.
Packit 6c4009
Packit 6c4009
   On input:
Packit 6c4009
	t9 = return address
Packit 6c4009
	a0 = DST
Packit 6c4009
	a1 = SRC
Packit 6c4009
	a2 = COUNT
Packit 6c4009
Packit 6c4009
   Furthermore, COUNT may not be zero.
Packit 6c4009
Packit 6c4009
   On output:
Packit 6c4009
	t0  = last word written
Packit 6c4009
	t8  = bitmask (with one bit set) indicating the last byte written
Packit 6c4009
	t10 = bitmask (with one bit set) indicating the byte position of
Packit 6c4009
	      the end of the range specified by COUNT
Packit 6c4009
	a0  = unaligned address of the last *word* written
Packit 6c4009
	a2  = the number of full words left in COUNT
Packit 6c4009
Packit 6c4009
   Furthermore, v0, a3-a5, t11, and t12 are untouched.
Packit 6c4009
*/
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
Packit 6c4009
	.arch ev6
Packit 6c4009
	.set noat
Packit 6c4009
	.set noreorder
Packit 6c4009
Packit 6c4009
	.text
Packit 6c4009
	.type	__stxncpy, @function
Packit 6c4009
	.globl	__stxncpy
Packit 6c4009
	.usepv	__stxncpy, no
Packit 6c4009
Packit 6c4009
	cfi_startproc
Packit 6c4009
	cfi_return_column (t9)
Packit 6c4009
Packit 6c4009
	/* On entry to this basic block:
Packit 6c4009
	   t0 == the first destination word for masking back in
Packit 6c4009
	   t1 == the first source word.  */
Packit 6c4009
	.align 4
Packit 6c4009
stxncpy_aligned:
Packit 6c4009
	/* Co-aligned first word.  The low three bits of a1 give the byte
	   offset at which the string starts inside the word: bytes below
	   that offset are kept from the destination word (t0), bytes at
	   or above it are taken from the source word (t1).  */
	/* Create the 1st output word and detect 0's in the 1st input word.  */
Packit 6c4009
	lda	t2, -1		# E : build a mask against false zero
Packit 6c4009
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
Packit 6c4009
	mskqh	t1, a1, t3	# U : t3 = src bytes at/above the start offset
Packit 6c4009
	ornot	t1, t2, t2	# E : t2 = src with pre-start bytes forced nonzero (stall)
Packit 6c4009
Packit 6c4009
	mskql	t0, a1, t0	# U : assemble the first output word
Packit 6c4009
	cmpbge	zero, t2, t7	# E : bits set iff null found
Packit 6c4009
	or	t0, t3, t0	# E : t0 = kept dest bytes | src bytes (stall)
Packit 6c4009
	beq	a2, $a_eoc	# U : COUNT ends inside this word
Packit 6c4009
Packit 6c4009
	bne	t7, $a_eos	# U : null found in the first word
Packit 6c4009
	/* Padding; execution falls through into $a_loop.  */
	nop
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	/* On entry to this basic block:
Packit 6c4009
	   t0 == a source word not containing a null.  */
Packit 6c4009
Packit 6c4009
	/*
Packit 6c4009
	 * nops here to:
Packit 6c4009
	 *	separate store quads from load quads
Packit 6c4009
	 *	limit of 1 bcond/quad to permit training
Packit 6c4009
	 */
Packit 6c4009
$a_loop:
Packit 6c4009
	/* Aligned main loop: store the word in t0, fetch the next source
	   word into t0 and test it.  Leaves through $a_eoc when a2 reaches
	   zero, or by falling through to $a_eos when a zero byte is seen.  */
	stq_u	t0, 0(a0)	# L : store the completed word
Packit 6c4009
	addq	a0, 8, a0	# E : advance dst by one word
Packit 6c4009
	subq	a2, 1, a2	# E : one fewer whole word left in COUNT
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	ldq_u	t0, 0(a1)	# L : fetch the next source word
Packit 6c4009
	addq	a1, 8, a1	# E : advance src by one word
Packit 6c4009
	cmpbge	zero, t0, t7	# E : t7 = bitmask of zero bytes in t0
Packit 6c4009
	beq	a2, $a_eoc      # U : COUNT ends inside this word
Packit 6c4009
Packit 6c4009
	beq	t7, $a_loop	# U : no null in this word, keep copying
Packit 6c4009
	/* Padding; execution falls through into $a_eos.  */
	nop
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	/* Take care of the final (partial) word store.  At this point
Packit 6c4009
	   the end-of-count bit is set in t7 iff it applies.
Packit 6c4009
Packit 6c4009
	   On entry to this basic block we have:
Packit 6c4009
	   t0 == the source word containing the null
Packit 6c4009
	   t7 == the cmpbge mask that found it.  */
Packit 6c4009
$a_eos:
Packit 6c4009
	/* t7 & -t7 isolates the lowest set bit of t7, i.e. the first byte
	   position that ends the copy.  The result is left in t8.  */
	negq	t7, t8		# E : find low bit set
Packit 6c4009
	and	t7, t8, t8	# E : t8 = lowest set bit of t7 (stall)
Packit 6c4009
	/* For the sake of the cache, don't read a destination word
Packit 6c4009
	   if we're not going to need it.  */
Packit 6c4009
	and	t8, 0x80, t6	# E : is the final byte the top byte of the word? (stall)
Packit 6c4009
	bne	t6, 1f		# U : yes: the whole word is written, no merge (stall)
Packit 6c4009
Packit 6c4009
	/* We're doing a partial word store and so need to combine
Packit 6c4009
	   our source and original destination words.  */
Packit 6c4009
	ldq_u	t1, 0(a0)	# L : t1 = current destination word
Packit 6c4009
	subq	t8, 1, t6	# E : t6 = bits below the final byte
Packit 6c4009
	or	t8, t6, t7	# E : t7 = byte mask up to and including it (stall)
Packit 6c4009
	zapnot	t0, t7, t0	# U : clear src bytes > null (stall)
Packit 6c4009
Packit 6c4009
	zap	t1, t7, t1	# U : clear dst bytes <= null
Packit 6c4009
	or	t0, t1, t0	# E : merge src and dst bytes (stall)
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
1:	stq_u	t0, 0(a0)	# L : write the final word
Packit 6c4009
	ret	(t9)		# L0 : Latency=3
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	/* Add the end-of-count bit to the eos detection bitmask.  */
Packit 6c4009
$a_eoc:
Packit 6c4009
	/* Reached with a2 == 0.  OR the end-of-count bit (t10) into the
	   zero-byte mask (t7) so that $a_eos stops at whichever of the two
	   comes first in the word.  */
	or	t10, t7, t7	# E : t7 |= end-of-count bit
Packit 6c4009
	br	$a_eos		# L0 : Latency=3
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
__stxncpy:
Packit 6c4009
	/* Entry point.  Inputs are a0 = DST, a1 = SRC, a2 = COUNT and
	   t9 = return address, as described in the header comment of this
	   file.  This block bounds and biases COUNT, computes t10 and a2,
	   and dispatches on whether SRC and DST share the same alignment
	   within a word.  */
	/* Are source and destination co-aligned?  */
Packit 6c4009
	lda	t2, -1		# E : t2 = all ones
Packit 6c4009
	xor	a0, a1, t1	# E : t1 = differing address bits
Packit 6c4009
	and	a0, 7, t0	# E : find dest misalignment
Packit 6c4009
	nop			# E :
Packit 6c4009
Packit 6c4009
	srl	t2, 1, t2	# U : t2 = LONG_MAX
Packit 6c4009
	and	t1, 7, t1	# E : nonzero iff src and dst misalignments differ
Packit 6c4009
	cmovlt	a2, t2, a2	# E : bound count to LONG_MAX (stall)
Packit 6c4009
	nop			# E :
Packit 6c4009
Packit 6c4009
	addq	a2, t0, a2	# E : bias count by dest misalignment
Packit 6c4009
	subq	a2, 1, a2	# E : offset of the last byte allowed by COUNT (stall)
Packit 6c4009
	and	a2, 7, t2	# E : its byte position within a word (stall)
Packit 6c4009
	lda	t10, 1		# E :
Packit 6c4009
Packit 6c4009
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8
Packit 6c4009
	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
Packit 6c4009
	nop			# E :
Packit 6c4009
	bne	t1, $unaligned	# U : (stall)
Packit 6c4009
Packit 6c4009
	/* We are co-aligned; take care of a partial first word.  */
Packit 6c4009
	ldq_u	t1, 0(a1)	# L : load first src word
Packit 6c4009
	addq	a1, 8, a1	# E : step src past it (low three bits unchanged)
Packit 6c4009
	beq	t0, stxncpy_aligned # U : avoid loading dest word if not needed
Packit 6c4009
	ldq_u	t0, 0(a0)	# L : load first dest word for merging
Packit 6c4009
Packit 6c4009
	br	stxncpy_aligned	# U :
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
Packit 6c4009
Packit 6c4009
/* The source and destination are not co-aligned.  Align the destination
Packit 6c4009
   and cope.  We have to be very careful about not reading too much and
Packit 6c4009
   causing a SEGV.  */
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
$u_head:
Packit 6c4009
	/* We know just enough now to be able to assemble the first
Packit 6c4009
	   full source word.  We can still find a zero at the end of it
Packit 6c4009
	   that prevents us from outputting the whole thing.
Packit 6c4009
Packit 6c4009
	   On entry to this basic block:
Packit 6c4009
	   t0 == the first dest word, unmasked
Packit 6c4009
	   t1 == the shifted low bits of the first source word
Packit 6c4009
	   t6 == bytemask that is -1 in dest word bytes */
Packit 6c4009
Packit 6c4009
	/* Here a1 has already had the destination misalignment subtracted
	   (done in $unaligned), so the low three bits of a1 are the byte
	   shift between source words and destination words.  */
	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
Packit 6c4009
	addq	a1, 8, a1	# E :
Packit 6c4009
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
Packit 6c4009
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)
Packit 6c4009
Packit 6c4009
	or	t1, t4, t1	# E : first aligned src word complete (stall)
Packit 6c4009
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
Packit 6c4009
	or	t0, t1, t0	# E : first output word complete (stall)
Packit 6c4009
	or	t0, t6, t6	# E : mask original data for zero test (stall)
Packit 6c4009
Packit 6c4009
	cmpbge	zero, t6, t7	# E : t7 = zero bytes among the copied src bytes
Packit 6c4009
	beq	a2, $u_eocfin	# U : COUNT ends inside this first word
Packit 6c4009
	lda	t6, -1		# E : all ones, turned into a byte mask below
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
	bne	t7, $u_final	# U : null inside the first output word
Packit 6c4009
	mskql	t6, a1, t6	# U : mask out bits already seen
Packit 6c4009
	stq_u	t0, 0(a0)	# L : store first output word
Packit 6c4009
	or      t6, t2, t2	# E : force already-consumed bytes of t2 nonzero
Packit 6c4009
Packit 6c4009
	cmpbge	zero, t2, t7	# E : find nulls in second partial
Packit 6c4009
	addq	a0, 8, a0	# E :
Packit 6c4009
	subq	a2, 1, a2	# E :
Packit 6c4009
	bne	t7, $u_late_head_exit	# U :
Packit 6c4009
Packit 6c4009
	/* Finally, we've got all the stupid leading edge cases taken care
Packit 6c4009
	   of and we can set up to enter the main loop.  */
Packit 6c4009
	extql	t2, a1, t1	# U : position hi-bits of lo word
Packit 6c4009
	beq	a2, $u_eoc	# U : no whole words left in COUNT
Packit 6c4009
	ldq_u	t2, 8(a1)	# L : read next high-order source word
Packit 6c4009
	addq	a1, 8, a1	# E :
Packit 6c4009
Packit 6c4009
	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
Packit 6c4009
	cmpbge	zero, t2, t7	# E : test the new source word for a null
Packit 6c4009
	nop
Packit 6c4009
	bne	t7, $u_eos	# U : null found; otherwise fall into $u_loop
Packit 6c4009
Packit 6c4009
	/* Unaligned copy main loop.  In order to avoid reading too much,
Packit 6c4009
	   the loop is structured to detect zeros in aligned source words.
Packit 6c4009
	   This has, unfortunately, effectively pulled half of a loop
Packit 6c4009
	   iteration out into the head and half into the tail, but it does
Packit 6c4009
	   prevent nastiness from accumulating in the very thing we want
Packit 6c4009
	   to run as fast as possible.
Packit 6c4009
Packit 6c4009
	   On entry to this basic block:
Packit 6c4009
	   t0 == the shifted low-order bits from the current source word
Packit 6c4009
	   t1 == the shifted high-order bits from the previous source word
Packit 6c4009
	   t2 == the unshifted current source word
Packit 6c4009
Packit 6c4009
	   We further know that t2 does not contain a null terminator.  */
Packit 6c4009
Packit 6c4009
	.align 4
Packit 6c4009
$u_loop:
Packit 6c4009
	/* Each iteration writes one destination word built from the high
	   part of the previous source word (t1) and the low part of the
	   current one (t0), then reads the next source word.  Exits to
	   $u_eoc when a2 reaches zero, or falls through to $u_eos when the
	   newly read source word contains a zero byte.  */
	or	t0, t1, t0	# E : current dst word now complete
Packit 6c4009
	subq	a2, 1, a2	# E : decrement word count
Packit 6c4009
	extql	t2, a1, t1	# U : extract high bits for next time
Packit 6c4009
	addq	a0, 8, a0	# E : advance dst before the store below
Packit 6c4009
Packit 6c4009
	stq_u	t0, -8(a0)	# L : save the current word
Packit 6c4009
	beq	a2, $u_eoc	# U : no whole words left in COUNT
Packit 6c4009
	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
Packit 6c4009
	addq	a1, 8, a1	# E :
Packit 6c4009
Packit 6c4009
	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
Packit 6c4009
	cmpbge	zero, t2, t7	# E : test new word for eos
Packit 6c4009
	nop
Packit 6c4009
	beq	t7, $u_loop	# U : no null in the new word, keep copying
Packit 6c4009
Packit 6c4009
	/* We've found a zero somewhere in the source word we just read.
Packit 6c4009
	   If it resides in the lower half, we have one (probably partial)
Packit 6c4009
	   word to write out, and if it resides in the upper half, we
Packit 6c4009
	   have one full and one partial word left to write out.
Packit 6c4009
Packit 6c4009
	   On entry to this basic block:
Packit 6c4009
	   t0 == the shifted low-order bits from the current source word
Packit 6c4009
	   t1 == the shifted high-order bits from the previous source word
Packit 6c4009
	   t2 == the unshifted current source word.  */
Packit 6c4009
$u_eos:
Packit 6c4009
	or	t0, t1, t0	# E : first (partial) source word complete
Packit 6c4009
	nop
Packit 6c4009
	cmpbge	zero, t0, t7	# E : is the null in this first bit? (stall)
Packit 6c4009
	bne	t7, $u_final	# U : yes: t0 is the last word to write (stall)
Packit 6c4009
Packit 6c4009
	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
Packit 6c4009
	addq	a0, 8, a0	# E :
Packit 6c4009
	subq	a2, 1, a2	# E :
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
$u_late_head_exit:
Packit 6c4009
	/* The remaining bytes all come from the high part of the current
	   source word t2.  If no whole words remain in COUNT (a2 == 0), the
	   end-of-count bit t10 is folded into the zero-byte mask as well.
	   Falls through into $u_final.  */
	extql	t2, a1, t0	# U : t0 = high part of t2 shifted into place
Packit 6c4009
	cmpbge	zero, t0, t7	# E : t7 = zero bytes in that final word
Packit 6c4009
	or	t7, t10, t6	# E : t6 = t7 with end-of-count bit added (stall)
Packit 6c4009
	cmoveq	a2, t6, t7	# E : Latency=2, extra map slot (stall)
Packit 6c4009
Packit 6c4009
	/* Take care of a final (probably partial) result word.
Packit 6c4009
	   On entry to this basic block:
Packit 6c4009
	   t0 == assembled source word
Packit 6c4009
	   t7 == cmpbge mask that found the null.  */
Packit 6c4009
$u_final:
Packit 6c4009
	/* Same final-store sequence as $a_eos: isolate the lowest set bit
	   of t7 into t8, and unless that is the top byte of the word, merge
	   the source bytes up to and including it with the destination
	   bytes above it.  */
	negq	t7, t6		# E : isolate low bit set
Packit 6c4009
	and	t6, t7, t8	# E : t8 = lowest set bit of t7 (stall)
Packit 6c4009
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
Packit 6c4009
	bne	t6, 1f		# U : final byte is the top byte: store whole word (stall)
Packit 6c4009
Packit 6c4009
	ldq_u	t1, 0(a0)	# L : t1 = current destination word
Packit 6c4009
	subq	t8, 1, t6	# E : t6 = bits below the final byte
Packit 6c4009
	or	t6, t8, t7	# E : t7 = byte mask up to and including it (stall)
Packit 6c4009
	zapnot	t0, t7, t0	# U : kill source bytes > null
Packit 6c4009
Packit 6c4009
	zap	t1, t7, t1	# U : kill dest bytes <= null
Packit 6c4009
	or	t0, t1, t0	# E : merge src and dst bytes (stall)
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
Packit 6c4009
1:	stq_u	t0, 0(a0)	# L : write the final word
Packit 6c4009
	ret	(t9)		# L0 : Latency=3
Packit 6c4009
Packit 6c4009
        /* Got to end-of-count before end of string.
Packit 6c4009
           On entry to this basic block:
Packit 6c4009
           t1 == the shifted high-order bits from the previous source word  */
Packit 6c4009
$u_eoc:
Packit 6c4009
	/* t10 has one bit set at the position of the last byte allowed by
	   COUNT within the destination word.  Shifting it left by the
	   source byte offset (a1 & 7) and keeping the low 8 bits is nonzero
	   exactly when that byte is already among the bytes carried over
	   in t1; in that case the next source word is not loaded.
	   NOTE(review): presumably this is what keeps the routine from
	   reading a source word it does not need (see the SEGV remark
	   before $u_head) -- confirm.  */
	and	a1, 7, t6	# E : t6 = source byte offset
Packit 6c4009
	sll	t10, t6, t6	# U : (stall)
Packit 6c4009
	and	t6, 0xff, t6	# E : (stall)
Packit 6c4009
	bne	t6, 1f		# U : t1 already holds every byte needed (stall)
Packit 6c4009
Packit 6c4009
	ldq_u	t2, 8(a1)	# L : load final src word
Packit 6c4009
	nop
Packit 6c4009
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
Packit 6c4009
	or	t1, t0, t1	# E : t1 = complete final word (stall)
Packit 6c4009
Packit 6c4009
1:	cmpbge	zero, t1, t7	# E : t7 = zero bytes in the final word
Packit 6c4009
	mov	t1, t0
Packit 6c4009
Packit 6c4009
$u_eocfin:			# end-of-count, final word
Packit 6c4009
	/* On entry: t0 == final word, t7 == its zero-byte mask.  */
	or	t10, t7, t7	# E : t7 |= end-of-count bit
Packit 6c4009
	br	$u_final	# L0 : Latency=3
Packit 6c4009
Packit 6c4009
	/* Unaligned copy entry point.  */
Packit 6c4009
	.align 4
Packit 6c4009
$unaligned:
Packit 6c4009
Packit 6c4009
	/* Entered from __stxncpy when the low three bits of a0 and a1
	   differ.  Sets up t0, t1 and t6 as $u_head expects, and handles
	   here the case where the copy ends inside the very first source
	   word.  */
	ldq_u	t1, 0(a1)	# L : load first source word
Packit 6c4009
	and	a0, 7, t4	# E : find dest misalignment
Packit 6c4009
	and	a1, 7, t5	# E : find src misalignment
Packit 6c4009
	/* Conditionally load the first destination word and a bytemask
Packit 6c4009
	   with 0xff indicating that the destination byte is sacrosanct.  */
Packit 6c4009
	mov	zero, t0	# E : default: no dest bytes to keep
Packit 6c4009
Packit 6c4009
	mov	zero, t6	# E : default: empty keep-mask
Packit 6c4009
	beq	t4, 1f		# U : dest is word aligned, nothing to preserve
Packit 6c4009
	ldq_u	t0, 0(a0)	# L : t0 = first dest word
Packit 6c4009
	lda	t6, -1		# E :
Packit 6c4009
Packit 6c4009
	mskql	t6, a0, t6	# U : t6 = 0xff in the dest bytes below the start
Packit 6c4009
	nop
Packit 6c4009
	nop
Packit 6c4009
1:	subq	a1, t4, a1	# E : sub dest misalignment from src addr
Packit 6c4009
Packit 6c4009
	/* If source misalignment is larger than dest misalignment, we need
Packit 6c4009
	   extra startup checks to avoid SEGV.  */
Packit 6c4009
Packit 6c4009
	cmplt	t4, t5, t8	# E : t8 = (dest misalignment < src misalignment)
Packit 6c4009
	extql	t1, a1, t1	# U : shift src into place
Packit 6c4009
	lda	t2, -1		# E : for creating masks later
Packit 6c4009
	beq	t8, $u_head	# U : (stall)
Packit 6c4009
Packit 6c4009
	/* t2 becomes a validity mask: 0xff in the bytes of the shifted
	   first source word that really belong to the string.  t3 then has
	   a bit set for every byte that is NOT valid, so that zero bytes
	   seen there can be ignored.  */
	mskqh	t2, t5, t2	# U : begin src byte validity mask
Packit 6c4009
	cmpbge	zero, t1, t7	# E : is there a zero?
Packit 6c4009
	extql	t2, a1, t2	# U : shift the validity mask like the source
Packit 6c4009
	or	t7, t10, t5	# E : test for end-of-count too
Packit 6c4009
Packit 6c4009
	cmpbge	zero, t2, t3	# E : t3 = bitmask of invalid byte positions
Packit 6c4009
	cmoveq	a2, t5, t7	# E : Latency=2, extra map slot
Packit 6c4009
	nop			# E : keep with cmoveq
Packit 6c4009
	andnot	t7, t3, t7	# E : drop hits in invalid bytes (stall)
Packit 6c4009
Packit 6c4009
	beq	t7, $u_head	# U : copy does not end in this first word
Packit 6c4009
	/* At this point we've found a zero in the first partial word of
Packit 6c4009
	   the source.  We need to isolate the valid source data and mask
Packit 6c4009
	   it into the original destination data.  (Incidentally, we know
Packit 6c4009
	   that we'll need at least one byte of that original dest word.) */
Packit 6c4009
	ldq_u	t0, 0(a0)	# L : t0 = first dest word
Packit 6c4009
	negq	t7, t6		# E : build bitmask of bytes <= zero
Packit 6c4009
	mskqh	t1, t4, t1	# U : clear src bytes below the dest start offset
Packit 6c4009
Packit 6c4009
	and	t6, t7, t8	# E : t8 = lowest set bit of t7
Packit 6c4009
	subq	t8, 1, t6	# E : (stall)
Packit 6c4009
	or	t6, t8, t7	# E : t7 = byte mask up to and including it (stall)
Packit 6c4009
	zapnot	t2, t7, t2	# U : trim validity mask to bytes <= final byte (stall)
Packit 6c4009
Packit 6c4009
	zapnot	t1, t7, t1	# U : trim the source word the same way
Packit 6c4009
	andnot	t0, t2, t0	# E : zero place for source to reside
Packit 6c4009
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
Packit 6c4009
	stq_u	t0, 0(a0)	# L : (stall)
Packit 6c4009
Packit 6c4009
	ret	(t9)		# L0 : Latency=3
Packit 6c4009
Packit 6c4009
	cfi_endproc