Blame sysdeps/x86_64/multiarch/strspn-c.c

Packit 6c4009
/* strspn with SSE4.2 intrinsics
Packit 6c4009
   Copyright (C) 2009-2018 Free Software Foundation, Inc.
Packit 6c4009
   Contributed by Intel Corporation.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <nmmintrin.h>
Packit 6c4009
#include <string.h>
Packit 6c4009
#include "varshift.h"
Packit 6c4009
Packit 6c4009
/* We use 0x12:
Packit 6c4009
	_SIDD_SBYTE_OPS
Packit 6c4009
	| _SIDD_CMP_EQUAL_ANY
Packit 6c4009
	| _SIDD_NEGATIVE_POLARITY
Packit 6c4009
	| _SIDD_LEAST_SIGNIFICANT
Packit 6c4009
   on pcmpistri to compare xmm/mem128
Packit 6c4009
Packit 6c4009
   0 1 2 3 4 5 6 7 8 9 A B C D E F
Packit 6c4009
   X X X X X X X X X X X X X X X X
Packit 6c4009
Packit 6c4009
   against xmm
Packit 6c4009
Packit 6c4009
   0 1 2 3 4 5 6 7 8 9 A B C D E F
Packit 6c4009
   A A A A A A A A A A A A A A A A
Packit 6c4009
Packit 6c4009
   to find out whether the first 16-byte data element has any non-A byte
Packit 6c4009
   and, if so, the offset of the first such byte.  There are 2 cases:
Packit 6c4009
Packit 6c4009
   1. The first 16byte data element has the non-A byte, including
Packit 6c4009
      EOS, at the offset X.
Packit 6c4009
   2. The first 16byte data element is valid and doesn't have the non-A
Packit 6c4009
      byte.
Packit 6c4009
Packit 6c4009
   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
Packit 6c4009
Packit 6c4009
   case		ECX	CFlag	ZFlag	SFlag
Packit 6c4009
    1		 X	  1	 0/1	  0
Packit 6c4009
    2		16	  0	  0	  0
Packit 6c4009
Packit 6c4009
   We exit from the loop for case 1.  */
Packit 6c4009
Packit 6c4009
/* Alternative strspn implementation (the SSE2 variant, judging by its
   name), defined outside this file.  __strspn_sse42 below hands the whole
   call over to it whenever the accept set A is longer than 16 bytes and
   therefore does not fit into a single XMM register.  */
extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
Packit 6c4009
Packit 6c4009
Packit 6c4009
/* SSE4.2 implementation of strspn: return the number of leading bytes of
   S that all occur in the NUL-terminated accept set A.

   Outline:
   1. An empty A yields 0.
   2. Up to 16 bytes of A are loaded into MASK.  If A is longer than 16
      bytes the call is handed to __strspn_sse2 instead.
   3. If S is not 16-byte aligned, the bytes from S up to the next
      16-byte boundary are checked first.
   4. The remainder of S is scanned one aligned 16-byte block at a time
      until a byte outside A, or the terminating NUL, is found.

   Every load except one is an aligned _mm_load_si128; the single
   _mm_loadu_si128 only re-reads bytes from two blocks that have already
   been loaded.

   Control byte 0x3a used below selects byte elements, equal-each
   comparison and masked negative polarity (see the _SIDD_* constants in
   <nmmintrin.h>).  Comparing a vector with itself that way returns the
   index of its first NUL byte, or 16 when it contains none.  */
size_t
Packit 6c4009
__attribute__ ((section (".text.sse4.2")))
Packit 6c4009
__strspn_sse42 (const char *s, const char *a)
Packit 6c4009
{
Packit 6c4009
  /* An empty accept set matches nothing.  */
  if (*a == 0)
Packit 6c4009
    return 0;
Packit 6c4009
Packit 6c4009
  const char *aligned;
Packit 6c4009
  __m128i mask;
Packit 6c4009
  /* Misalignment of A relative to a 16-byte boundary.  */
  int offset = (int) ((size_t) a & 15);
Packit 6c4009
  if (offset != 0)
Packit 6c4009
    {
Packit 6c4009
      /* Load masks.  */
Packit 6c4009
      /* Round A down to the enclosing aligned 16-byte block.  */
      aligned = (const char *) ((size_t) a & -16L);
Packit 6c4009
      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
Packit 6c4009
Packit 6c4009
      /* NOTE(review): __m128i_shift_right comes from varshift.h, which
	 is not visible here.  The comparisons against 16 - offset below
	 presumably rely on it dropping the OFFSET bytes that precede A
	 and zero-filling the vacated top bytes -- confirm.  */
      mask = __m128i_shift_right (mask0, offset);
Packit 6c4009
Packit 6c4009
      /* Find where the NUL terminator is.  */
Packit 6c4009
      int length = _mm_cmpistri (mask, mask, 0x3a);
Packit 6c4009
      if (length == 16 - offset)
Packit 6c4009
	{
Packit 6c4009
	  /* There is no NUL terminator.  */
Packit 6c4009
	  /* A continues into the next aligned block; INDEX is the number
	     of bytes of A found there before its NUL (16 if none).  */
	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
Packit 6c4009
	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
Packit 6c4009
	  length += index;
Packit 6c4009
Packit 6c4009
	  /* Don't use SSE4.2 if the length of A > 16.  */
Packit 6c4009
	  if (length > 16)
Packit 6c4009
	    return __strspn_sse2 (s, a);
Packit 6c4009
Packit 6c4009
	  /* When INDEX is 0 the NUL is the first byte of the second
	     block, so MASK already holds all of A and no merge is
	     needed.  */
	  if (index != 0)
Packit 6c4009
	    {
Packit 6c4009
	      /* Combine mask0 and mask1.  We could play games with
Packit 6c4009
		 palignr, but frankly this data should be in L1 now
Packit 6c4009
		 so do the merge via an unaligned load.  */
Packit 6c4009
	      /* The 16 bytes starting at A lie entirely within the two
		 aligned blocks loaded above.  */
	      mask = _mm_loadu_si128 ((__m128i *) a);
Packit 6c4009
	    }
Packit 6c4009
	}
Packit 6c4009
    }
Packit 6c4009
  else
Packit 6c4009
    {
Packit 6c4009
      /* A is aligned.  */
Packit 6c4009
      mask = _mm_load_si128 ((__m128i *) a);
Packit 6c4009
Packit 6c4009
      /* Find where the NUL terminator is.  */
Packit 6c4009
      int length = _mm_cmpistri (mask, mask, 0x3a);
Packit 6c4009
      if (length == 16)
Packit 6c4009
	{
Packit 6c4009
	  /* There is no NUL terminator.  Don't use SSE4.2 if the length
Packit 6c4009
	     of A > 16.  */
Packit 6c4009
	  /* A[0..15] are all non-NUL here, so A[16] is still part of the
	     string (or its terminator) and may be read.  */
	  if (a[16] != 0)
Packit 6c4009
	    return __strspn_sse2 (s, a);
Packit 6c4009
	}
Packit 6c4009
    }
Packit 6c4009
Packit 6c4009
  /* MASK now holds the accept set.  OFFSET is reused for the
     misalignment of S.  */
  offset = (int) ((size_t) s & 15);
Packit 6c4009
  if (offset != 0)
Packit 6c4009
    {
Packit 6c4009
      /* Check partial string.  */
Packit 6c4009
      /* Load the aligned block containing S and move the byte at S to
	 position 0, leaving 16 - offset bytes of S to examine.  */
      aligned = (const char *) ((size_t) s & -16L);
Packit 6c4009
      __m128i value = _mm_load_si128 ((__m128i *) aligned);
Packit 6c4009
Packit 6c4009
      value = __m128i_shift_right (value, offset);
Packit 6c4009
Packit 6c4009
      /* Index of the first byte of VALUE that is not in A (control
	 byte 0x12 is described in the comment at the top of the
	 file).  */
      int length = _mm_cmpistri (mask, value, 0x12);
Packit 6c4009
      /* No need to check CFlag since it is always 1.  */
Packit 6c4009
      /* A mismatch inside the 16 - offset bytes taken from S ends the
	 span right here.  */
      if (length < 16 - offset)
Packit 6c4009
	return length;
Packit 6c4009
      /* Find where the NUL terminator is.  */
Packit 6c4009
      int index = _mm_cmpistri (value, value, 0x3a);
Packit 6c4009
      /* S ends inside this partial block: nothing more to scan.  */
      if (index < 16 - offset)
Packit 6c4009
	return length;
Packit 6c4009
      /* Every byte of the partial block is in A; continue with the next
	 aligned block.  */
      aligned += 16;
Packit 6c4009
    }
Packit 6c4009
  else
Packit 6c4009
    aligned = s;
Packit 6c4009
Packit 6c4009
  /* Main loop: ALIGNED is 16-byte aligned from here on.  CFlag is set
     when the current block contains a byte that is not in A (the
     terminating NUL included); INDEX is then that byte's position in the
     block, which is case 1 in the comment at the top of the file.
     NOTE(review): both intrinsics take identical operands, so the
     compiler can presumably emit a single pcmpistri for the pair -- not
     verified here.  */
  while (1)
Packit 6c4009
    {
Packit 6c4009
      __m128i value = _mm_load_si128 ((__m128i *) aligned);
Packit 6c4009
      int index = _mm_cmpistri (mask, value, 0x12);
Packit 6c4009
      int cflag = _mm_cmpistrc (mask, value, 0x12);
Packit 6c4009
      if (cflag)
Packit 6c4009
	/* Distance from S to the first byte that is not in A.  */
	return (size_t) (aligned + index - s);
Packit 6c4009
      aligned += 16;
Packit 6c4009
    }
Packit 6c4009
}