/* -*- c -*- */
/*
* This file is for the definitions of simd vectorized operations.
*
 * Currently it contains sse2 functions that are built for amd64, x32 or
 * non-generic builds (CFLAGS=-march=...).
 * In the future it may contain other instruction sets like AVX or NEON,
 * detected at runtime, in which case it needs to be included indirectly via
 * a file compiled with special options (or use gcc target attributes) so the
 * binary stays portable.
*/
#ifndef __NPY_SIMD_INC
#define __NPY_SIMD_INC
#include "lowlevel_strided_loops.h"
#include "numpy/npy_common.h"
/* for NO_FLOATING_POINT_SUPPORT */
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
#include <assert.h>
#include <stdlib.h>
#include <float.h>
#include <string.h> /* for memcpy */
/* Figure out the right abs function for pointer addresses */
static NPY_INLINE npy_intp
abs_intp(npy_intp x)
{
#if (NPY_SIZEOF_INTP <= NPY_SIZEOF_INT)
return abs(x);
#elif (NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG)
return labs(x);
#elif defined(_MSC_VER) && (_MSC_VER < 1600)
/* llabs is not available with Visual Studio 2008 */
return x > 0 ? x : -x;
#else
return llabs(x);
#endif
}
/*
 * stride is equal to the element size and the input and destination are
 * either identical or do not overlap within one register
*/
#define IS_BLOCKABLE_UNARY(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && \
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
((abs_intp(args[1] - args[0]) >= (vsize)) || \
((abs_intp(args[1] - args[0]) == 0))))
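/*
 * example: for npy_float (esize 4) with 16 byte SSE registers,
 * IS_BLOCKABLE_UNARY(4, 16) accepts a contiguous, element-aligned loop whose
 * input and output either alias exactly (in-place) or lie at least one
 * register width apart, so the 16 byte stores cannot clobber unread input
 */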
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_intp(args[1] - args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))
#define IS_BLOCKABLE_BINARY(esize, vsize) \
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)) && \
(abs_intp(args[2] - args[0]) >= (vsize) || \
abs_intp(args[2] - args[0]) == 0) && \
(abs_intp(args[2] - args[1]) >= (vsize) || \
      abs_intp(args[2] - args[1]) == 0))
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
((abs_intp(args[2] - args[1]) >= (vsize)) || \
(abs_intp(args[2] - args[1]) == 0)) && \
abs_intp(args[2] - args[0]) >= (esize))
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
((abs_intp(args[2] - args[0]) >= (vsize)) || \
(abs_intp(args[2] - args[0]) == 0)) && \
abs_intp(args[2] - args[1]) >= (esize))
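/*
 * the SCALAR1/SCALAR2 variants cover calls where the first/second operand is
 * a zero stride (broadcast) scalar; besides the usual overlap check against
 * the strided input, the output must be at least one element away from the
 * scalar so the vector stores cannot modify it
 */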
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)))
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
npy_is_aligned(args[0], (esize)))
/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
alignment, n);\
for(i = 0; i < peel; i++)
#define LOOP_BLOCKED(type, vsize)\
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
i += (vsize / sizeof(type)))
#define LOOP_BLOCKED_END\
for (; i < n; i++)
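/*
 * typical shape of a loop built from these macros (illustrative sketch only,
 * scalar_op stands in for the element-wise operation):
 *
 *     LOOP_BLOCK_ALIGN_VAR(op, npy_float, 16)
 *         op[i] = scalar_op(ip[i]);   // peel until op is 16 byte aligned
 *     LOOP_BLOCKED(npy_float, 16) {
 *         // process one aligned 16 byte vector per iteration
 *     }
 *     LOOP_BLOCKED_END {
 *         op[i] = scalar_op(ip[i]);   // scalar tail
 *     }
 */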
/*
* Dispatcher functions
 * decide whether the operation can be vectorized and run it;
 * return true if the vectorized loop was run, false if nothing was done
*/
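/*
 * sketch of the expected caller (the generated inner loops in loops.c.src):
 * try the SIMD dispatcher first and fall back to the scalar loop when it
 * reports that nothing was done, roughly:
 *
 *     if (run_binary_simd_add_FLOAT(args, dimensions, steps)) {
 *         return;
 *     }
 *     // ... scalar fallback loop ...
 */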
/*
*****************************************************************************
** FLOAT DISPATCHERS
*****************************************************************************
*/
/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
*/
/**begin repeat1
* #func = sqrt, absolute, negative, minimum, maximum#
* #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
* #name = unary*3, unary_reduce*2#
* #minmax = 0*3, 1*2#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
/* prototypes */
static void
sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
#endif
static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @minmax@ && (defined NO_FLOATING_POINT_SUPPORT)
return 0;
#else
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
if (@check@(sizeof(@type@), 16)) {
sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
}
#endif
return 0;
#endif
}
/**end repeat1**/
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
#endif
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
@type@ * op = (@type@ *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), 16)) {
sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 16)) {
sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
else if (IS_BLOCKABLE_BINARY(sizeof(@type@), 16)) {
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#endif
return 0;
}
/**end repeat1**/
/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal,
* logical_and, logical_or#
* #simd = 1, 1, 1, 1, 1, 1, 0, 0#
*/
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
#endif
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
npy_bool * op = (npy_bool *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), 16)) {
sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), 16)) {
sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), 16)) {
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#endif
return 0;
}
/**end repeat1**/
/**begin repeat1
* #kind = isnan, isfinite, isinf, signbit#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
#endif
static NPY_INLINE int
run_@kind@_simd_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
npy_is_aligned(args[0], sizeof(@type@))) {
sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
}
#endif
return 0;
}
/**end repeat1**/
/**end repeat**/
/*
*****************************************************************************
** BOOL DISPATCHERS
*****************************************************************************
*/
/**begin repeat
* # kind = logical_or, logical_and#
*/
#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
npy_intp n);
static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
#endif
static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
(npy_bool*)args[1], dimensions[0]);
return 1;
}
#endif
return 0;
}
static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) {
sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
dimensions[0]);
return 1;
}
#endif
return 0;
}
/**end repeat**/
/**begin repeat
* # kind = absolute, logical_not#
*/
#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
#endif
static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
return 1;
}
#endif
return 0;
}
/**end repeat**/
#ifdef NPY_HAVE_SSE2_INTRINSICS
/*
* Vectorized operations
*/
/*
*****************************************************************************
** FLOAT LOOPS
*****************************************************************************
*/
/**begin repeat
* horizontal reductions on a vector
* # VOP = min, max#
*/
static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
npy_float r;
__m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
__m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
_mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
return r;
}
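/*
 * e.g. for v = {1, 5, 3, 2} the low lane of the final result above holds
 * 5 for max and 1 for min
 */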
static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
npy_double r;
__m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
_mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
return r;
}
/**end repeat**/
/**begin repeat
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
* #scalarf = npy_sqrtf, npy_sqrt#
* #c = f, #
* #vtype = __m128, __m128d#
* #vpre = _mm, _mm#
* #vsuf = ps, pd#
* #vsufs = ss, sd#
* #nan = NPY_NANF, NPY_NAN#
* #double = 0, 1#
* #cast = _mm_castps_si128, _mm_castpd_si128#
*/
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
if (npy_is_aligned(&ip1[i], 16) && npy_is_aligned(&ip2[i], 16)) {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
}
else if (npy_is_aligned(&ip1[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else if (npy_is_aligned(&ip2[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
}
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[i];
}
}
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[0] @OP@ ip2[i];
if (npy_is_aligned(&ip2[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
LOOP_BLOCKED_END {
op[i] = ip1[0] @OP@ ip2[i];
}
}
static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @OP@ ip2[0];
if (npy_is_aligned(&ip1[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[0];
}
}
/**end repeat1**/
/*
 * compress 4 vectors of comparison results to 16/8 bytes in op, each byte
 * filled with 0 or 1; the last vector is passed as a pointer because MSVC
 * 2010 is unable to ignore the calling convention, leading to C2719 on
 * 32 bit, see #4795
*/
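/*
 * the comparison masks arrive as all-ones/all-zero lanes; packs_epi32 and
 * packs_epi16 narrow them with signed saturation (0 stays 0, -1 stays -1)
 * and the final AND with 0x1 leaves each output byte as exactly 0 or 1; the
 * double variant packs one extra time and stores only the low 8 bytes
 */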
static NPY_INLINE void
sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
npy_bool * op)
{
const __m128i mask = @vpre@_set1_epi8(0x1);
__m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
__m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
__m128i rr = @vpre@_packs_epi16(ir1, ir2);
#if @double@
rr = @vpre@_packs_epi16(rr, rr);
rr = @vpre@_and_si128(rr, mask);
@vpre@_storel_epi64((__m128i*)op, rr);
#else
rr = @vpre@_and_si128(rr, mask);
@vpre@_storeu_si128((__m128i*)op, rr);
#endif
}
static void
sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = npy_signbit(ip1[i]) != 0;
}
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
int r = @vpre@_movemask_@vsuf@(a);
if (sizeof(@type@) == 8) {
op[i] = r & 1;
op[i + 1] = (r >> 1);
}
else {
op[i] = r & 1;
op[i + 1] = (r >> 1) & 1;
op[i + 2] = (r >> 2) & 1;
op[i + 3] = (r >> 3);
}
}
LOOP_BLOCKED_END {
op[i] = npy_signbit(ip1[i]) != 0;
}
}
/**begin repeat1
* #kind = isnan, isfinite, isinf#
* #var = 0, 1, 2#
*/
static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
#if @var@ != 0 /* isinf/isfinite */
/* signbit mask 0x7FFFFFFF after andnot */
const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
@vpre@_setzero_@vsuf@());
#if @double@
const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
#else
const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
#endif
#endif
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = npy_@kind@(ip1[i]) != 0;
}
LOOP_BLOCKED(@type@, 64) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
@vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
@vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
@vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
@vtype@ r1, r2, r3, r4;
#if @var@ != 0 /* isinf/isfinite */
/* fabs via masking of sign bit */
r1 = @vpre@_andnot_@vsuf@(mask, a);
r2 = @vpre@_andnot_@vsuf@(mask, b);
r3 = @vpre@_andnot_@vsuf@(mask, c);
r4 = @vpre@_andnot_@vsuf@(mask, d);
#if @var@ == 1 /* isfinite */
/* negative compare against max float, nan is always true */
r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
#else /* isinf */
r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
#endif
/* flip results to what we want (andnot as there is no sse not) */
r1 = @vpre@_andnot_@vsuf@(r1, ones);
r2 = @vpre@_andnot_@vsuf@(r2, ones);
r3 = @vpre@_andnot_@vsuf@(r3, ones);
r4 = @vpre@_andnot_@vsuf@(r4, ones);
#endif
#if @var@ == 0 /* isnan */
r1 = @vpre@_cmpneq_@vsuf@(a, a);
r2 = @vpre@_cmpneq_@vsuf@(b, b);
r3 = @vpre@_cmpneq_@vsuf@(c, c);
r4 = @vpre@_cmpneq_@vsuf@(d, d);
#endif
sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = npy_@kind@(ip1[i]) != 0;
}
}
/**end repeat1**/
/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal#
* #OP = ==, !=, <, <=, >, >=#
* #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
*/
/* sets invalid fpu flag on QNaN for consistency with packed compare */
static NPY_INLINE int
sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
{
@vtype@ one = @vpre@_set1_@vsuf@(1);
@type@ tmp;
@vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
@vpre@_load_@vsufs@(&b));
v = @vpre@_and_@vsuf@(v, one);
@vpre@_store_@vsufs@(&tmp, v);
return tmp;
}
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
}
LOOP_BLOCKED(@type@, 64) {
@vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
@vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
@vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
@vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
@vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
@vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
@vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
@vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
@vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
}
}
static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
}
LOOP_BLOCKED(@type@, 64) {
@vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
@vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
@vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
@vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
}
}
static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
}
LOOP_BLOCKED(@type@, 64) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
@vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
@vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
@vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
@vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
}
}
/**end repeat1**/
static void
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
/* align output to 16 bytes */
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
op[i] = @scalarf@(ip[i]);
}
assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16));
if (npy_is_aligned(&ip[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
}
LOOP_BLOCKED_END {
op[i] = @scalarf@(ip[i]);
}
}
static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
/* add 0 to clear -0.0 */
return (v > 0 ? v: -v) + 0;
}
static NPY_INLINE
@type@ scalar_neg_@type@(@type@ v)
{
return -v;
}
/**begin repeat1
* #kind = absolute, negative#
* #VOP = andnot, xor#
* #scalar = scalar_abs, scalar_neg#
**/
static void
sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
/*
* get 0x7FFFFFFF mask (everything but signbit set)
* float & ~mask will remove the sign, float ^ mask flips the sign
* this is equivalent to how the compiler implements fabs on amd64
*/
const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
/* align output to 16 bytes */
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
op[i] = @scalar@_@type@(ip[i]);
}
assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16));
if (npy_is_aligned(&ip[i], 16)) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
}
}
else {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
}
}
LOOP_BLOCKED_END {
op[i] = @scalar@_@type@(ip[i]);
}
}
/**end repeat1**/
/**begin repeat1
* #kind = maximum, minimum#
* #VOP = max, min#
* #OP = >=, <=#
**/
/*
 * ip and op are swapped relative to the unary loops: for a reduction args[0]
 * is the accumulator (op) and args[1] the input array (ip), but the shared
 * dispatcher passes (args[1], args[0])
 */
static void
sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
{
const size_t stride = 16 / sizeof(@type@);
LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) {
*op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
}
assert(n < (stride) || npy_is_aligned(&ip[i], 16));
if (i + 3 * stride <= n) {
/* load the first elements */
@vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
i += 2 * stride;
/* minps/minpd will set invalid flag if nan is encountered */
npy_clear_floatstatus();
LOOP_BLOCKED(@type@, 32) {
@vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
}
c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
if (npy_get_floatstatus() & NPY_FPE_INVALID) {
*op = @nan@;
}
else {
@type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
*op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
}
}
LOOP_BLOCKED_END {
*op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
}
}
/**end repeat1**/
/**end repeat**/
/*
*****************************************************************************
** BOOL LOOPS
*****************************************************************************
*/
/**begin repeat
* # kind = logical_or, logical_and#
* # and = 0, 1#
* # op = ||, &&#
* # sc = !=, ==#
* # vpre = _mm*2#
* # vsuf = si128*2#
* # vtype = __m128i*2#
* # type = npy_bool*2#
* # vload = _mm_load_si128*2#
* # vloadu = _mm_loadu_si128*2#
* # vstore = _mm_store_si128*2#
*/
/*
 * convert any byte with a bit set to boolean true so that vectorized and
 * normal operations are consistent; this should not be required if bool is
 * used correctly everywhere, but you never know
*/
#if !@and@
static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
{
const @vtype@ zero = @vpre@_setzero_@vsuf@();
const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
/* get 0xFF for zeros */
@vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
/* filled with 0xFF/0x00, negate and mask to boolean true */
return @vpre@_andnot_@vsuf@(tmp, truemask);
}
#endif
static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @op@ ip2[i];
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
@vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
#if @and@
const @vtype@ zero = @vpre@_setzero_@vsuf@();
        /* get 0xFF for zero bytes of ip1 */
        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
        /* andnot -> 0x00 for zeros, 0xFF for non zeros, then & with ip2 */
tmp = @vpre@_andnot_@vsuf@(tmp, b);
#else
@vtype@ tmp = @vpre@_or_@vsuf@(a, b);
#endif
@vstore@((@vtype@*)&op[i], byte_to_true(tmp));
}
LOOP_BLOCKED_END {
op[i] = (ip1[i] @op@ ip2[i]);
}
}
static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
{
const @vtype@ zero = @vpre@_setzero_@vsuf@();
LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, 16) {
*op = *op @op@ ip[i];
if (*op @sc@ 0) {
return;
}
}
/* unrolled once to replace a slow movmsk with a fast pmaxb */
LOOP_BLOCKED(npy_bool, 32) {
@vtype@ v = @vload@((@vtype@*)&ip[i]);
@vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
v = @vpre@_cmpeq_epi8(v, zero);
v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
*op = 0;
#else
if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
*op = 1;
#endif
return;
}
}
LOOP_BLOCKED_END {
*op = *op @op@ ip[i];
if (*op @sc@ 0) {
return;
}
}
}
/**end repeat**/
/**begin repeat
* # kind = absolute, logical_not#
* # op = !=, ==#
* # not = 0, 1#
* # vpre = _mm*2#
* # vsuf = si128*2#
* # vtype = __m128i*2#
* # type = npy_bool*2#
* # vloadu = _mm_loadu_si128*2#
* # vstore = _mm_store_si128*2#
*/
static void
sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = (ip[i] @op@ 0);
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vloadu@((@vtype@*)&ip[i]);
#if @not@
const @vtype@ zero = @vpre@_setzero_@vsuf@();
const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
/* equivalent to byte_to_true but can skip the negation */
a = @vpre@_cmpeq_epi8(a, zero);
a = @vpre@_and_@vsuf@(a, truemask);
#else
        /* abs is pointless here; byte_to_true maps any nonzero byte to 1 */
a = byte_to_true(a);
#endif
@vstore@((@vtype@*)&op[i], a);
}
LOOP_BLOCKED_END {
op[i] = (ip[i] @op@ 0);
}
}
/**end repeat**/
#endif /* NPY_HAVE_SSE2_INTRINSICS */
#endif /* __NPY_SIMD_INC */