Blob Blame History Raw
/* -*- c -*- */

#define _UMATHMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "Python.h"

#include "npy_config.h"
#define PY_ARRAY_UNIQUE_SYMBOL _npy_umathmodule_ARRAY_API
#define NO_IMPORT_ARRAY

#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"

#include "npy_pycompat.h"

#include "ufunc_object.h"

#include <string.h> /* for memchr */

/*
 * cutoff blocksize for pairwise summation
 * decreasing it decreases errors slightly as more pairs are summed but
 * also lowers performance, as the inner loop is unrolled eight times it is
 * effectively 16
 */
#define PW_BLOCKSIZE    128

/*
 * include vectorized functions and dispatchers
 * this file is safe to include also for generic builds
 * platform specific instructions are either masked via the proprocessor or
 * runtime detected
 */
#include "simd.inc"


/*
 *****************************************************************************
 **                             UFUNC LOOPS                                 **
 *****************************************************************************
 */

/* unary loop input and output contiguous */
#define IS_UNARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                  steps[1] == sizeof(tout))

#define IS_BINARY_REDUCE ((args[0] == args[2])\
        && (steps[0] == steps[2])\
        && (steps[0] == 0))

/* binary loop input and output contiguous */
#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))
/* binary loop input and output contiguous with first scalar */
#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))
/* binary loop input and output contiguous with second scalar */
#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == 0 && \
                                   steps[2] == sizeof(tout))

#define OUTPUT_LOOP\
    char *op1 = args[1];\
    npy_intp os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, op1 += os1)

#define UNARY_LOOP\
    char *ip1 = args[0], *op1 = args[1];\
    npy_intp is1 = steps[0], os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1)

/*
 * loop with contiguous specialization
 * op should be the code working on `tin in` and
 * storing the result in `tout * out`
 * combine with NPY_GCC_OPT_3 to allow autovectorization
 * should only be used where its worthwhile to avoid code bloat
 */
#define BASE_UNARY_LOOP(tin, tout, op) \
    UNARY_LOOP { \
        const tin in = *(tin *)ip1; \
        tout * out = (tout *)op1; \
        op; \
    }
#define UNARY_LOOP_FAST(tin, tout, op) \
    do { \
    /* condition allows compiler to optimize the generic macro */ \
    if (IS_UNARY_CONT(tin, tout)) { \
        if (args[0] == args[1]) { \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
        else { \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
    } \
    else { \
        BASE_UNARY_LOOP(tin, tout, op) \
    } \
    } \
    while (0)

#define UNARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *op1 = args[1], *op2 = args[2];\
    npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)

#define BINARY_LOOP\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)

/*
 * loop with contiguous specialization
 * op should be the code working on `tin in1`, `tin in2` and
 * storing the result in `tout * out`
 * combine with NPY_GCC_OPT_3 to allow autovectorization
 * should only be used where its worthwhile to avoid code bloat
 */
#define BASE_BINARY_LOOP(tin, tout, op) \
    BINARY_LOOP { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout * out = (tout *)op1; \
        op; \
    }
/*
 * unfortunately gcc 6/7 regressed and we need to give it additional hints to
 * vectorize inplace operations (PR80198)
 * must only be used after op1 == ip1 or ip2 has been checked
 * TODO: using ivdep might allow other compilers to vectorize too
 */
#if __GNUC__ >= 6
#define IVDEP_LOOP _Pragma("GCC ivdep")
#else
#define IVDEP_LOOP
#endif
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    IVDEP_LOOP \
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout * out = (tout *)op1; \
        op; \
    }
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)cinp; \
    BINARY_LOOP { \
        const tin vin = *(tin *)vinp; \
        tout * out = (tout *)op1; \
        op; \
    }
/* PR80198 again, scalar works without the pragma */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)cinp; \
    BINARY_LOOP { \
        const tin vin = *(tin *)vinp; \
        tout * out = (tout *)vinp; \
        op; \
    }
#define BINARY_LOOP_FAST(tin, tout, op) \
    do { \
    /* condition allows compiler to optimize the generic macro */ \
    if (IS_BINARY_CONT(tin, tout)) { \
        if (args[2] == args[0]) { \
            BASE_BINARY_LOOP_INP(tin, tout, op) \
        } \
        else if (args[2] == args[1]) { \
            BASE_BINARY_LOOP_INP(tin, tout, op) \
        } \
        else { \
            BASE_BINARY_LOOP(tin, tout, op) \
        } \
    } \
    else if (IS_BINARY_CONT_S1(tin, tout)) { \
        if (args[1] == args[2]) { \
            BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
        } \
        else { \
            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
        } \
    } \
    else if (IS_BINARY_CONT_S2(tin, tout)) { \
        if (args[0] == args[2]) { \
            BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
        } \
        else { \
            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
        }\
    } \
    else { \
        BASE_BINARY_LOOP(tin, tout, op) \
    } \
    } \
    while (0)

#define BINARY_REDUCE_LOOP_INNER\
    char *ip2 = args[1]; \
    npy_intp is2 = steps[1]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for(i = 0; i < n; i++, ip2 += is2)

#define BINARY_REDUCE_LOOP(TYPE)\
    char *iop1 = args[0]; \
    TYPE io1 = *(TYPE *)iop1; \
    BINARY_REDUCE_LOOP_INNER

#define BINARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)

/******************************************************************************
 **                          GENERIC FLOAT LOOPS                             **
 *****************************************************************************/


typedef float halfUnaryFunc(npy_half x);
typedef float floatUnaryFunc(float x);
typedef double doubleUnaryFunc(double x);
typedef npy_longdouble longdoubleUnaryFunc(npy_longdouble x);
typedef npy_half halfBinaryFunc(npy_half x, npy_half y);
typedef float floatBinaryFunc(float x, float y);
typedef double doubleBinaryFunc(double x, double y);
typedef npy_longdouble longdoubleBinaryFunc(npy_longdouble x, npy_longdouble y);


/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_e_e(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    halfUnaryFunc *f = (halfUnaryFunc *)func;
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *(npy_half *)op1 = f(in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_e_e_As_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    floatUnaryFunc *f = (floatUnaryFunc *)func;
    UNARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *(npy_half *)op1 = npy_float_to_half(f(in1));
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_e_e_As_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleUnaryFunc *f = (doubleUnaryFunc *)func;
    UNARY_LOOP {
        const double in1 = npy_half_to_double(*(npy_half *)ip1);
        *(npy_half *)op1 = npy_double_to_half(f(in1));
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    floatUnaryFunc *f = (floatUnaryFunc *)func;
    UNARY_LOOP {
        const float in1 = *(float *)ip1;
        *(float *)op1 = f(in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_f_f_As_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleUnaryFunc *f = (doubleUnaryFunc *)func;
    UNARY_LOOP {
        const float in1 = *(float *)ip1;
        *(float *)op1 = (float)f((double)in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_ee_e(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    halfBinaryFunc *f = (halfBinaryFunc *)func;
    BINARY_LOOP {
        npy_half in1 = *(npy_half *)ip1;
        npy_half in2 = *(npy_half *)ip2;
        *(npy_half *)op1 = f(in1, in2);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_ee_e_As_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    floatBinaryFunc *f = (floatBinaryFunc *)func;
    BINARY_LOOP {
        float in1 = npy_half_to_float(*(npy_half *)ip1);
        float in2 = npy_half_to_float(*(npy_half *)ip2);
        *(npy_half *)op1 = npy_float_to_half(f(in1, in2));
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_ee_e_As_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleBinaryFunc *f = (doubleBinaryFunc *)func;
    BINARY_LOOP {
        double in1 = npy_half_to_double(*(npy_half *)ip1);
        double in2 = npy_half_to_double(*(npy_half *)ip2);
        *(npy_half *)op1 = npy_double_to_half(f(in1, in2));
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    floatBinaryFunc *f = (floatBinaryFunc *)func;
    BINARY_LOOP {
        float in1 = *(float *)ip1;
        float in2 = *(float *)ip2;
        *(float *)op1 = f(in1, in2);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_ff_f_As_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleBinaryFunc *f = (doubleBinaryFunc *)func;
    BINARY_LOOP {
        float in1 = *(float *)ip1;
        float in2 = *(float *)ip2;
        *(float *)op1 = (double)f((double)in1, (double)in2);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleUnaryFunc *f = (doubleUnaryFunc *)func;
    UNARY_LOOP {
        double in1 = *(double *)ip1;
        *(double *)op1 = f(in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    doubleBinaryFunc *f = (doubleBinaryFunc *)func;
    BINARY_LOOP {
        double in1 = *(double *)ip1;
        double in2 = *(double *)ip2;
        *(double *)op1 = f(in1, in2);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_g_g(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    longdoubleUnaryFunc *f = (longdoubleUnaryFunc *)func;
    UNARY_LOOP {
        npy_longdouble in1 = *(npy_longdouble *)ip1;
        *(npy_longdouble *)op1 = f(in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_gg_g(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    longdoubleBinaryFunc *f = (longdoubleBinaryFunc *)func;
    BINARY_LOOP {
        npy_longdouble in1 = *(npy_longdouble *)ip1;
        npy_longdouble in2 = *(npy_longdouble *)ip2;
        *(npy_longdouble *)op1 = f(in1, in2);
    }
}



/******************************************************************************
 **                          GENERIC COMPLEX LOOPS                           **
 *****************************************************************************/


typedef void cdoubleUnaryFunc(npy_cdouble *x, npy_cdouble *r);
typedef void cfloatUnaryFunc(npy_cfloat *x, npy_cfloat *r);
typedef void clongdoubleUnaryFunc(npy_clongdouble *x, npy_clongdouble *r);
typedef void cdoubleBinaryFunc(npy_cdouble *x, npy_cdouble *y, npy_cdouble *r);
typedef void cfloatBinaryFunc(npy_cfloat *x, npy_cfloat *y, npy_cfloat *r);
typedef void clongdoubleBinaryFunc(npy_clongdouble *x, npy_clongdouble *y,
                                   npy_clongdouble *r);

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_F_F(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cfloatUnaryFunc *f = (cfloatUnaryFunc *)func;
    UNARY_LOOP {
        npy_cfloat in1 = *(npy_cfloat *)ip1;
        npy_cfloat *out = (npy_cfloat *)op1;
        f(&in1, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_F_F_As_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cdoubleUnaryFunc *f = (cdoubleUnaryFunc *)func;
    UNARY_LOOP {
        npy_cdouble tmp, out;
        tmp.real = (double)((float *)ip1)[0];
        tmp.imag = (double)((float *)ip1)[1];
        f(&tmp, &out);
        ((float *)op1)[0] = (float)out.real;
        ((float *)op1)[1] = (float)out.imag;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_FF_F(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cfloatBinaryFunc *f = (cfloatBinaryFunc *)func;
    BINARY_LOOP {
        npy_cfloat in1 = *(npy_cfloat *)ip1;
        npy_cfloat in2 = *(npy_cfloat *)ip2;
        npy_cfloat *out = (npy_cfloat *)op1;
        f(&in1, &in2, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_FF_F_As_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cdoubleBinaryFunc *f = (cdoubleBinaryFunc *)func;
    BINARY_LOOP {
        npy_cdouble tmp1, tmp2, out;
        tmp1.real = (double)((float *)ip1)[0];
        tmp1.imag = (double)((float *)ip1)[1];
        tmp2.real = (double)((float *)ip2)[0];
        tmp2.imag = (double)((float *)ip2)[1];
        f(&tmp1, &tmp2, &out);
        ((float *)op1)[0] = (float)out.real;
        ((float *)op1)[1] = (float)out.imag;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cdoubleUnaryFunc *f = (cdoubleUnaryFunc *)func;
    UNARY_LOOP {
        npy_cdouble in1 = *(npy_cdouble *)ip1;
        npy_cdouble *out = (npy_cdouble *)op1;
        f(&in1, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    cdoubleBinaryFunc *f = (cdoubleBinaryFunc *)func;
    BINARY_LOOP {
        npy_cdouble in1 = *(npy_cdouble *)ip1;
        npy_cdouble in2 = *(npy_cdouble *)ip2;
        npy_cdouble *out = (npy_cdouble *)op1;
        f(&in1, &in2, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_G_G(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    clongdoubleUnaryFunc *f = (clongdoubleUnaryFunc *)func;
    UNARY_LOOP {
        npy_clongdouble in1 = *(npy_clongdouble *)ip1;
        npy_clongdouble *out = (npy_clongdouble *)op1;
        f(&in1, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_GG_G(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    clongdoubleBinaryFunc *f = (clongdoubleBinaryFunc *)func;
    BINARY_LOOP {
        npy_clongdouble in1 = *(npy_clongdouble *)ip1;
        npy_clongdouble in2 = *(npy_clongdouble *)ip2;
        npy_clongdouble *out = (npy_clongdouble *)op1;
        f(&in1, &in2, out);
    }
}


/******************************************************************************
 **                         GENERIC OBJECT lOOPS                             **
 *****************************************************************************/

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_O_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    unaryfunc f = (unaryfunc)func;
    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = f(in1 ? in1 : Py_None);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    char *meth = (char *)func;
    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, NULL);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    binaryfunc f = (binaryfunc)func;
    BINARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = f(in1 ? in1 : Py_None, in2 ? in2 : Py_None);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    char *meth = (char *)func;
    BINARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None,
                                            meth, "(O)", in2);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*
 * A general-purpose ufunc that deals with general-purpose Python callable.
 * func is a structure with nin, nout, and a Python callable function
 */

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_On_Om(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
{
    npy_intp n =  dimensions[0];
    PyUFunc_PyFuncData *data = (PyUFunc_PyFuncData *)func;
    int nin = data->nin;
    int nout = data->nout;
    PyObject *tocall = data->callable;
    char *ptrs[NPY_MAXARGS];
    PyObject *arglist, *result;
    PyObject *in, **op;
    npy_intp i, j, ntot;

    ntot = nin+nout;

    for(j = 0; j < ntot; j++) {
        ptrs[j] = args[j];
    }
    for(i = 0; i < n; i++) {
        arglist = PyTuple_New(nin);
        if (arglist == NULL) {
            return;
        }
        for(j = 0; j < nin; j++) {
            in = *((PyObject **)ptrs[j]);
            if (in == NULL) {
                in = Py_None;
            }
            PyTuple_SET_ITEM(arglist, j, in);
            Py_INCREF(in);
        }
        result = PyEval_CallObject(tocall, arglist);
        Py_DECREF(arglist);
        if (result == NULL) {
            return;
        }
        if (nout == 0  && result == Py_None) {
            /* No output expected, no output received, continue */
            Py_DECREF(result);
        }
        else if (nout == 1) {
            /* Single output expected, assign and continue */
            op = (PyObject **)ptrs[nin];
            Py_XDECREF(*op);
            *op = result;
        }
        else if (PyTuple_Check(result) && nout == PyTuple_Size(result)) {
            /*
             * Multiple returns match expected number of outputs, assign
             * and continue. Will also gobble empty tuples if nout == 0.
             */
            for(j = 0; j < nout; j++) {
                op = (PyObject **)ptrs[j+nin];
                Py_XDECREF(*op);
                *op = PyTuple_GET_ITEM(result, j);
                Py_INCREF(*op);
            }
            Py_DECREF(result);
        }
        else {
            /* Mismatch between returns and expected outputs, exit */
            Py_DECREF(result);
            return;
        }
        for(j = 0; j < ntot; j++) {
            ptrs[j] += steps[j];
        }
    }
}

/*
 *****************************************************************************
 **                             BOOLEAN LOOPS                               **
 *****************************************************************************
 */

/**begin repeat
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
 * #OP =  ==, !=, >, >=, <, <=#
 **/

NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        npy_bool in1 = *((npy_bool *)ip1) != 0;
        npy_bool in2 = *((npy_bool *)ip2) != 0;
        *((npy_bool *)op1)= in1 @OP@ in2;
    }
}
/**end repeat**/


/**begin repeat
 * #kind = logical_and, logical_or#
 * #OP =  &&, ||#
 * #SC =  ==, !=#
 * #and = 1, 0#
 **/

NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if(IS_BINARY_REDUCE) {
#ifdef NPY_HAVE_SSE2_INTRINSICS
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /* for now only use libc on 32-bit/non-x86 */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if @and@
            /* np.all(), search for a zero (false) */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 @OP@ in2;
                if (io1 @SC@ 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 @OP@ in2;
            }
        }
    }
}
/**end repeat**/

/**begin repeat
 * #kind = absolute, logical_not#
 * #OP =  !=, ==#
 **/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
        return;
    }
    else {
        UNARY_LOOP {
            npy_bool in1 = *(npy_bool *)ip1;
            *((npy_bool *)op1) = in1 @OP@ 0;
        }
    }
}
/**end repeat**/

NPY_NO_EXPORT void
BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((npy_bool *)op1) = 1;
    }
}


/*
 *****************************************************************************
 **                           INTEGER LOOPS
 *****************************************************************************
 */

/**begin repeat
 * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
 *         LONG, ULONG, LONGLONG, ULONGLONG#
 * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
 *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
 * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
 *          npy_double, npy_double, npy_double, npy_double#
 * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
 */

#define @TYPE@_floor_divide @TYPE@_divide
#define @TYPE@_fmax @TYPE@_maximum
#define @TYPE@_fmin @TYPE@_minimum

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = +in);
}

/**begin repeat1
 * #isa = , _avx2#
 * #ISA = , AVX2#
 * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
 * #ATTR = , NPY_GCC_TARGET_AVX2#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_square@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_reciprocal@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_conjugate@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_negative@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = -in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_not@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_invert@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
}
#endif

/**begin repeat2
 * Arithmetic
 * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
 *          left_shift, right_shift#
 * #OP = +, -,*, &, |, ^, <<, >>#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if(IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
    }
}
#endif

/**end repeat2**/

/**begin repeat2
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
 *         logical_and, logical_or#
 * #OP =  ==, !=, >, >=, <, <=, &&, ||#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*
     * gcc vectorization of this is not good (PR60575) but manual integer
     * vectorization is too tedious to be worthwhile
     */
    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
}
#endif

/**end repeat2**/

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_xor@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int t1 = !!*(@type@ *)ip1;
        const int t2 = !!*(@type@ *)ip2;
        *((npy_bool *)op1) = (t1 != t2);
    }
}
#endif

/**end repeat1**/

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >, <#
 **/

NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            const @type@ in2 = *(@type@ *)ip2;
            io1 = (io1 @OP@ in2) ? io1 : in2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
        }
    }
}

/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_power(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        @type@ in1 = *(@type@ *)ip1;
        @type@ in2 = *(@type@ *)ip2;
        @type@ out;

#if @SIGNED@
        if (in2 < 0) {
            NPY_ALLOW_C_API_DEF
            NPY_ALLOW_C_API;
            PyErr_SetString(PyExc_ValueError,
                    "Integers to negative integer powers are not allowed.");
            NPY_DISABLE_C_API;
            return;
        }
#endif
        if (in2 == 0) {
            *((@type@ *)op1) = 1;
            continue;
        }
        if (in1 == 1) {
            *((@type@ *)op1) = 1;
            continue;
        }

        out = in2 & 1 ? in1 : 1;
        in2 >>= 1;
        while (in2 > 0) {
            in1 *= in1;
            if (in2 & 1) {
                out *= in1;
            }
            in2 >>= 1;
        }
        *((@type@ *) op1) = out;
    }
}

NPY_NO_EXPORT void
@TYPE@_fmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1)= in1 % in2;
        }

    }
}

/**end repeat**/

/**begin repeat
 * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
 * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /*
         * FIXME: On x86 at least, dividing the smallest representable integer
         * by -1 causes a SIFGPE (division overflow). We treat this case here
         * (to avoid a SIGFPE crash at python level), but a good solution would
         * be to treat integer division problems separately from FPU exceptions
         * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
         */
        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
            *((@type@ *)op1) = in1/in2 - 1;
        }
        else {
            *((@type@ *)op1) = in1/in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            /* handle mixed case the way Python does */
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = rem;
            }
            else {
                *((@type@ *)op1) = rem + in2;
            }
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /* see FIXME note for divide above */
        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;
        }
        else {
            /* handle mixed case the way Python does */
            const @type@ quo = in1 / in2;
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = quo;
                *((@type@ *)op2) = rem;
            }
            else {
                *((@type@ *)op1) = quo - 1;
                *((@type@ *)op2) = rem + in2;
            }
        }
    }
}

/**end repeat**/

/**begin repeat
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
 */

NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = in1;
    }
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1)= in1/in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1) = in1 % in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;
        }
        else {
            *((@type@ *)op1)= in1/in2;
            *((@type@ *)op2) = in1 % in2;
        }
    }
}

/**end repeat**/

/*
 *****************************************************************************
 **                           DATETIME LOOPS                                **
 *****************************************************************************
 */

NPY_NO_EXPORT void
TIMEDELTA_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = -in1;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        *((npy_timedelta *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
TIMEDELTA_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = (in1 >= 0) ? in1 : -in1;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        *((npy_timedelta *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : 0);
    }
}

/**begin repeat
 * #type = npy_datetime, npy_timedelta#
 * #TYPE = DATETIME, TIMEDELTA#
 */

NPY_NO_EXPORT void
@TYPE@_isnat(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((npy_bool *)op1) = (in1 == NPY_DATETIME_NAT);
    }
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

/**begin repeat1
 * #kind = equal, greater, greater_equal, less, less_equal#
 * #OP =  ==, >, >=, <, <=#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    npy_bool give_future_warning = 0;
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        const npy_bool res = in1 @OP@ in2;
        *((npy_bool *)op1) = res;

        if ((in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) && res) {
            give_future_warning = 1;
        }
    }
    if (give_future_warning) {
        NPY_ALLOW_C_API_DEF
        NPY_ALLOW_C_API;
        /* 2016-01-18, 1.11 */
        if (DEPRECATE_FUTUREWARNING(
                "In the future, 'NAT @OP@ x' and 'x @OP@ NAT' "
                "will always be False.") < 0) {
            /* nothing to do, we return anyway */
        }
        NPY_DISABLE_C_API;
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    npy_bool give_future_warning = 0;
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((npy_bool *)op1) = in1 != in2;

        if (in1 == NPY_DATETIME_NAT && in2 == NPY_DATETIME_NAT) {
            give_future_warning = 1;
        }
    }
    if (give_future_warning) {
        NPY_ALLOW_C_API_DEF
        NPY_ALLOW_C_API;
        /* 2016-01-18, 1.11 */
        if (DEPRECATE_FUTUREWARNING(
                "In the future, NAT != NAT will be True "
                "rather than False.") < 0) {
            /* nothing to do, we return anyway */
        }
        NPY_DISABLE_C_API;
    }
}


/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >, <#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in2;
        }
        else if (in2 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in1;
        }
        else {
            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
        }
    }
}
/**end repeat1**/

/**end repeat**/

NPY_NO_EXPORT void
DATETIME_Mm_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_mM_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_datetime in2 = *(npy_datetime *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_m_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_Mm_M_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 - in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_MM_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_datetime in2 = *(npy_datetime *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 - in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 - in2;
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_mq_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_int64 in2 = *(npy_int64 *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 * in2;
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_qm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_int64 in1 = *(npy_int64 *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 * in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_md_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const double in2 = *(double *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 * in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_dm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const double in1 = *(double *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 * in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_mq_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_int64 in2 = *(npy_int64 *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == 0) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 / in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_md_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const double in2 = *(double *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 / in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((double *)op1) = NPY_NAN;
        }
        else {
            *((double *)op1) = (double)in1 / (double)in2;
        }
    }
}

/*
 *****************************************************************************
 **                             FLOAT LOOPS                                 **
 *****************************************************************************
 */

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #scalarf = npy_sqrtf, npy_sqrt#
 */

NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = @scalarf@(in1);
        }
    }
}

/**end repeat**/


/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble, npy_float#
 *  #dtype = npy_float, npy_double, npy_longdouble, npy_half#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
 *  #c = f, , l, #
 *  #C = F, , L, #
 *  #trf = , , , npy_half_to_float#
 */

/*
 * Pairwise summation, rounding error O(lg n) instead of O(n).
 * The recursion depth is O(lg n) as well.
 * when updating also update similar complex floats summation
 */
static @type@
pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
{
    if (n < 8) {
        npy_intp i;
        @type@ res = 0.;
        for (i = 0; i < n; i++) {
            res += @trf@(*((@dtype@*)(a + i * stride)));
        }
        return res;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @type@ r[8], res;

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization with
         * avx without changing summation ordering
         */
        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seems to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        res = ((r[0] + r[1]) + (r[2] + r[3])) +
              ((r[4] + r[5]) + (r[6] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i++) {
            res += @trf@(*((@dtype@ *)(a + i * stride)));
        }
        return res;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        npy_uintp n2 = n / 2;
        n2 -= n2 % 8;
        return pairwise_sum_@TYPE@(a, n2, stride) +
               pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride);
    }
}

/**end repeat**/

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 *  #c = f, , l#
 *  #C = F, , L#
 */

/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if @PW@
        @type@ * iop1 = (@type@ *)args[0];
        npy_intp n = dimensions[0];

        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
#endif
    }
    else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = in1 @OP@ in2;
        }
    }
}
/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
 *        logical_and, logical_or#
 * #OP = ==, !=, <, <=, >, >=, &&, ||#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((npy_bool *)op1) = in1 @OP@ in2;
        }
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int t1 = !!*(@type@ *)ip1;
        const int t2 = !!*(@type@ *)ip2;
        *((npy_bool *)op1) = (t1 != t2);
    }
}

NPY_NO_EXPORT void
@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((npy_bool *)op1) = !in1;
    }
}

/**begin repeat1
 * #kind = isnan, isinf, isfinite, signbit#
 * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((npy_bool *)op1) = @func@(in1) != 0;
        }
    }
    npy_clear_floatstatus();
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_spacing@c@(in1);
    }
}

NPY_NO_EXPORT void
@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1)= npy_copysign@c@(in1, in2);
    }
}

NPY_NO_EXPORT void
@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1)= npy_nextafter@c@(in1, in2);
    }
}

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >=, <=#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*  */
    if (IS_BINARY_REDUCE) {
        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
            BINARY_REDUCE_LOOP(@type@) {
                const @type@ in2 = *(@type@ *)ip2;
                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
            }
            *((@type@ *)iop1) = io1;
        }
    }
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
        }
    }
}
/**end repeat1**/

/**begin repeat1
 * #kind = fmax, fmin#
 * #OP =  >=, <=#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*  */
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            const @type@ in2 = *(@type@ *)ip2;
            io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
        }
    }
    npy_clear_floatstatus();
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        @type@ mod;
        *((@type@ *)op1) = npy_divmod@c@(in1, in2, &mod);
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        npy_divmod@c@(in1, in2, (@type@ *)op1);
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = npy_divmod@c@(in1, in2, (@type@ *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    char * margs[] = {args[0], args[0], args[1]};
    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
    if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = in1*in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    @type@ one = 1.@c@;
    char * margs[] = {(char*)&one, args[0], args[1]};
    npy_intp msteps[] = {0, steps[0], steps[1]};
    if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = 1/in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = in1;
    }
}

NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ tmp = in1 > 0 ? in1 : -in1;
            /* add 0 to clear -0.0 */
            *((@type@ *)op1) = tmp + 0;
        }
    }
    npy_clear_floatstatus();
}

NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = -in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /* Sign of nan is nan */
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : (in1 == 0 ? 0 : in1));
    }
}

NPY_NO_EXPORT void
@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_modf@c@(in1, (@type@ *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const int in2 = *(int *)ip2;
        *((@type@ *)op1) = npy_ldexp@c@(in1, in2);
    }
}

NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*
     * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
     * npy_long != npy_int on many 64-bit platforms, so we need this second loop
     * to handle the default integer type.
     */
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const long in2 = *(long *)ip2;
        if (((int)in2) == in2) {
            /* Range OK */
            *((@type@ *)op1) = npy_ldexp@c@(in1, ((int)in2));
        }
        else {
            /*
             * Outside npy_int range -- also ldexp will overflow in this case,
             * given that exponent has less bits than npy_int.
             */
            if (in2 > 0) {
                *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MAX_INT);
            }
            else {
                *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MIN_INT);
            }
        }
    }
}

#define @TYPE@_true_divide @TYPE@_divide

/**end repeat**/

/*
 *****************************************************************************
 **                          HALF-FLOAT LOOPS                               **
 *****************************************************************************
 */


/**begin repeat
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        char *iop1 = args[0];
        float io1 = npy_half_to_float(*(npy_half *)iop1);
#if @PW@
        npy_intp n = dimensions[0];

        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP_INNER {
            io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
        }
#endif
        *((npy_half *)iop1) = npy_float_to_half(io1);
    }
    else {
        BINARY_LOOP {
            const float in1 = npy_half_to_float(*(npy_half *)ip1);
            const float in2 = npy_half_to_float(*(npy_half *)ip2);
            *((npy_half *)op1) = npy_float_to_half(in1 @OP@ in2);
        }
    }
}
/**end repeat**/

#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))
/**begin repeat
 * #kind = equal, not_equal, less, less_equal, greater,
 *         greater_equal, logical_and, logical_or#
 * #OP = npy_half_eq, npy_half_ne, npy_half_lt, npy_half_le, npy_half_gt,
 *       npy_half_ge, _HALF_LOGICAL_AND, _HALF_LOGICAL_OR#
 */
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_bool *)op1) = @OP@(in1, in2);
    }
}
/**end repeat**/
#undef _HALF_LOGICAL_AND
#undef _HALF_LOGICAL_OR

NPY_NO_EXPORT void
HALF_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int in1 = !npy_half_iszero(*(npy_half *)ip1);
        const int in2 = !npy_half_iszero(*(npy_half *)ip2);
        *((npy_bool *)op1) = (in1 != in2);
    }
}

NPY_NO_EXPORT void
HALF_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_bool *)op1) = npy_half_iszero(in1);
    }
}

/**begin repeat
 * #kind = isnan, isinf, isfinite, signbit#
 * #func = npy_half_isnan, npy_half_isinf, npy_half_isfinite, npy_half_signbit#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_bool *)op1) = @func@(in1) != 0;
    }
    npy_clear_floatstatus();
}
/**end repeat**/

NPY_NO_EXPORT void
HALF_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = npy_half_spacing(in1);
    }
}

NPY_NO_EXPORT void
HALF_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1)= npy_half_copysign(in1, in2);
    }
}

NPY_NO_EXPORT void
HALF_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1)= npy_half_nextafter(in1, in2);
    }
}

/**begin repeat
 * #kind = maximum, minimum#
 * #OP =  npy_half_ge, npy_half_le#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*  */
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in1)) ? in1 : in2;
    }
}
/**end repeat**/

/**begin repeat
 * #kind = fmax, fmin#
 * #OP =  npy_half_ge, npy_half_le#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*  */
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
    }
    npy_clear_floatstatus();
}
/**end repeat**/

NPY_NO_EXPORT void
HALF_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        npy_half mod;
        *((npy_half *)op1) = npy_half_divmod(in1, in2, &mod);
    }
}

NPY_NO_EXPORT void
HALF_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        npy_half_divmod(in1, in2, (npy_half *)op1);
    }
}

NPY_NO_EXPORT void
HALF_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = npy_half_divmod(in1, in2, (npy_half *)op2);
    }
}

NPY_NO_EXPORT void
HALF_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(in1*in1);
    }
}

NPY_NO_EXPORT void
HALF_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(1/in1);
    }
}

NPY_NO_EXPORT void
HALF__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((npy_half *)op1) = NPY_HALF_ONE;
    }
}

NPY_NO_EXPORT void
HALF_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = in1;
    }
}

NPY_NO_EXPORT void
HALF_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = in1&0x7fffu;
    }
}

NPY_NO_EXPORT void
HALF_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = in1^0x8000u;
    }
}

NPY_NO_EXPORT void
HALF_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
HALF_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /* Sign of nan is nan */
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = npy_half_isnan(in1) ? in1 :
                    (((in1&0x7fffu) == 0) ? 0 :
                      (((in1&0x8000u) == 0) ? NPY_HALF_ONE : NPY_HALF_NEGONE));
    }
}

NPY_NO_EXPORT void
HALF_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    float temp;

    UNARY_LOOP_TWO_OUT {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(npy_modff(in1, &temp));
        *((npy_half *)op2) = npy_float_to_half(temp);
    }
}

NPY_NO_EXPORT void
HALF_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(npy_frexpf(in1, (int *)op2));
    }
}

NPY_NO_EXPORT void
HALF_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        const int in2 = *(int *)ip2;
        *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, in2));
    }
}

NPY_NO_EXPORT void
HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /*
     * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
     * npy_long != npy_int on many 64-bit platforms, so we need this second loop
     * to handle the default integer type.
     */
    BINARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        const long in2 = *(long *)ip2;
        if (((int)in2) == in2) {
            /* Range OK */
            *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, ((int)in2)));
        }
        else {
            /*
             * Outside npy_int range -- also ldexp will overflow in this case,
             * given that exponent has less bits than npy_int.
             */
            if (in2 > 0) {
                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MAX_INT));
            }
            else {
                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MIN_INT));
            }
        }
    }
}

#define HALF_true_divide HALF_divide


/*
 *****************************************************************************
 **                           COMPLEX LOOPS                                 **
 *****************************************************************************
 */

#define CGE(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi >= yi))
#define CLE(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi <= yi))
#define CGT(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi > yi))
#define CLT(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi < yi))
#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi)
#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi)

/**begin repeat
 * complex types
 * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
 * #ftype = npy_float, npy_double, npy_longdouble#
 * #c = f, , l#
 * #C = F, , L#
 */

/* similar to pairwise sum of real floats */
static void
pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
                    npy_intp stride)
{
    assert(n % 2 == 0);
    if (n < 8) {
        npy_intp i;
        *rr = 0.;
        *ri = 0.;
        for (i = 0; i < n; i += 2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @ftype@ r[8];

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization with
         * avx without changing summation ordering
         */
        r[0] = *((@ftype@ *)(a + 0 * stride));
        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
        r[2] = *((@ftype@ *)(a + 2 * stride));
        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
        r[4] = *((@ftype@ *)(a + 4 * stride));
        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
        r[6] = *((@ftype@ *)(a + 6 * stride));
        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seems to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
        *ri = ((r[1] + r[3]) + (r[5] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i+=2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        @ftype@ rr1, ri1, rr2, ri2;
        npy_uintp n2 = n / 2;
        n2 -= n2 % 8;
        pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride);
        pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride);
        *rr = rr1 + rr2;
        *ri = ri1 + ri2;
        return;
    }
}

/**begin repeat1
 * arithmetic
 * #kind = add, subtract#
 * #OP = +, -#
 * #PW = 1, 0#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE && @PW@) {
        npy_intp n = dimensions[0];
        @ftype@ * or = ((@ftype@ *)args[0]);
        @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
        @ftype@ rr, ri;

        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
        *or @OP@= rr;
        *oi @OP@= ri;
        return;
    }
    else {
        BINARY_LOOP {
            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
            ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
            ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
        }
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
        ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
    }
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        const @ftype@ in2r_abs = npy_fabs@c@(in2r);
        const @ftype@ in2i_abs = npy_fabs@c@(in2i);
        if (in2r_abs >= in2i_abs) {
            if (in2r_abs == 0 && in2i_abs == 0) {
                /* divide by zero should yield a complex inf or nan */
                ((@ftype@ *)op1)[0] = in1r/in2r_abs;
                ((@ftype@ *)op1)[1] = in1i/in2i_abs;
            }
            else {
                const @ftype@ rat = in2i/in2r;
                const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
                ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
                ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
            }
        }
        else {
            const @ftype@ rat = in2r/in2i;
            const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
            ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
            ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if (npy_fabs@c@(in2r) >= npy_fabs@c@(in2i)) {
            const @ftype@ rat = in2i/in2r;
            ((@ftype@ *)op1)[0] = npy_floor@c@((in1r + in1i*rat)/(in2r + in2i*rat));
            ((@ftype@ *)op1)[1] = 0;
        }
        else {
            const @ftype@ rat = in2r/in2i;
            ((@ftype@ *)op1)[0] = npy_floor@c@((in1r*rat + in1i)/(in2i + in2r*rat));
            ((@ftype@ *)op1)[1] = 0;
        }
    }
}

/**begin repeat1
 * #kind= greater, greater_equal, less, less_equal, equal, not_equal#
 * #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        *((npy_bool *)op1) = @OP@(in1r,in1i,in2r,in2i);
    }
}
/**end repeat1**/

/**begin repeat1
   #kind = logical_and, logical_or#
   #OP1 = ||, ||#
   #OP2 = &&, ||#
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        *((npy_bool *)op1) = (in1r @OP1@ in1i) @OP2@ (in2r @OP1@ in2i);
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        const npy_bool tmp1 = (in1r || in1i);
        const npy_bool tmp2 = (in2r || in2i);
        *((npy_bool *)op1) = tmp1 != tmp2;
    }
}

NPY_NO_EXPORT void
@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((npy_bool *)op1) = !(in1r || in1i);
    }
}

/**begin repeat1
 * #kind = isnan, isinf, isfinite#
 * #func = npy_isnan, npy_isinf, npy_isfinite#
 * #OP = ||, ||, &&#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
    }
    npy_clear_floatstatus();
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = in1r*in1r - in1i*in1i;
        ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
    }
}

NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        if (npy_fabs@c@(in1i) <= npy_fabs@c@(in1r)) {
            const @ftype@ r = in1i/in1r;
            const @ftype@ d = in1r + in1i*r;
            ((@ftype@ *)op1)[0] = 1/d;
            ((@ftype@ *)op1)[1] = -r/d;
        } else {
            const @ftype@ r = in1r/in1i;
            const @ftype@ d = in1r*r + in1i;
            ((@ftype@ *)op1)[0] = r/d;
            ((@ftype@ *)op1)[1] = -1/d;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        ((@ftype@ *)op1)[0] = 1;
        ((@ftype@ *)op1)[1] = 0;
    }
}

NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) {
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = in1r;
        ((@ftype@ *)op1)[1] = -in1i;
    }
}

NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
    }
}

NPY_NO_EXPORT void
@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((@ftype@ *)op1) = npy_atan2@c@(in1i, in1r);
    }
}

NPY_NO_EXPORT void
@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /* fixme: sign of nan is currently 0 */
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ?  1 :
                            (CLT(in1r, in1i, 0.0, 0.0) ? -1 :
                            (CEQ(in1r, in1i, 0.0, 0.0) ?  0 : NPY_NAN@C@));
        ((@ftype@ *)op1)[1] = 0;
    }
}

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP = CGE, CLE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if (@OP@(in1r, in1i, in2r, in2i) || npy_isnan(in1r) || npy_isnan(in1i)) {
            ((@ftype@ *)op1)[0] = in1r;
            ((@ftype@ *)op1)[1] = in1i;
        }
        else {
            ((@ftype@ *)op1)[0] = in2r;
            ((@ftype@ *)op1)[1] = in2i;
        }
    }
    npy_clear_floatstatus();
}
/**end repeat1**/

/**begin repeat1
 * #kind = fmax, fmin#
 * #OP = CGE, CLE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if (@OP@(in1r, in1i, in2r, in2i) || npy_isnan(in2r) || npy_isnan(in2i)) {
            ((@ftype@ *)op1)[0] = in1r;
            ((@ftype@ *)op1)[1] = in1i;
        }
        else {
            ((@ftype@ *)op1)[0] = in2r;
            ((@ftype@ *)op1)[1] = in2i;
        }
    }
}
/**end repeat1**/

#define @TYPE@_true_divide @TYPE@_divide

/**end repeat**/

#undef CGE
#undef CLE
#undef CGT
#undef CLT
#undef CEQ
#undef CNE

/*
 *****************************************************************************
 **                            OBJECT LOOPS                                 **
 *****************************************************************************
 */

/**begin repeat
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
 * #OP = EQ, NE, GT, GE, LT, LE#
 * #identity = NPY_TRUE, NPY_FALSE, -1*4#
 */
NPY_NO_EXPORT void
OBJECT_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) {
    BINARY_LOOP {
        int ret;
        PyObject *ret_obj;
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;

        in1 = in1 ? in1 : Py_None;
        in2 = in2 ? in2 : Py_None;

        /*
         * Do not use RichCompareBool because it includes an identity check for
         * == and !=. This is wrong for elementwise behaviour, since it means
         * that NaN can be equal to NaN and an array is equal to itself.
         */
        ret_obj = PyObject_RichCompare(in1, in2, Py_@OP@);
        if (ret_obj == NULL) {
            return;
        }
        ret = PyObject_IsTrue(ret_obj);
        Py_DECREF(ret_obj);
        if (ret == -1) {
            return;
        }
        *((npy_bool *)op1) = (npy_bool)ret;
    }
}
/**end repeat**/

NPY_NO_EXPORT void
OBJECT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    PyObject *zero = PyLong_FromLong(0);

    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = NULL;
        int v;

        if (in1 == NULL) {
            in1 = Py_None;
        }

        if ((v = PyObject_RichCompareBool(in1, zero, Py_LT)) == 1) {
            ret = PyLong_FromLong(-1);
        }
        else if (v == 0 &&
                (v = PyObject_RichCompareBool(in1, zero, Py_GT)) == 1) {
            ret = PyLong_FromLong(1);
        }
        else if (v == 0 &&
                (v = PyObject_RichCompareBool(in1, zero, Py_EQ)) == 1) {
            ret = PyLong_FromLong(0);
        }
        else if (v == 0) {
            /* in1 is NaN */
            PyErr_SetString(PyExc_TypeError,
                    "unorderable types for comparison");
        }

        if (ret == NULL) {
            break;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
    Py_XDECREF(zero);
}

/*
 *****************************************************************************
 **                              END LOOPS                                  **
 *****************************************************************************
 */