#define PY_SSIZE_T_CLEAN
#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include "npy_config.h"
#include "numpy/arrayobject.h"
#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
/*
* Functions used to try to avoid/elide temporaries in python expressions
* of type a + b + b by translating some operations into in-place operations.
* This example translates to this bytecode:
*
* 0 LOAD_FAST 0 (a)
* 3 LOAD_FAST 1 (b)
* 6 BINARY_ADD
* 7 LOAD_FAST 1 (b)
* 10 BINARY_ADD
*
* The two named variables get their reference count increased by the load
* instructions so they always have a reference count larger than 1.
* The temporary of the first BINARY_ADD on the other hand only has a count of
* 1. Only temporaries can have a count of 1 in python so we can use this to
* transform the second operation into an in-place operation and not affect the
* output of the program.
* CPython does the same thing to resize memory instead of copying when doing
* string concatenation.
* The gain can be very significant (4x-6x) when avoiding the temporary allows
* the operation to remain in the cpu caches and can still be 50% faster for
* array larger than cpu cache size.
*
* A complication is that a DSO (dynamic shared object) module (e.g. cython)
* could call the PyNumber functions directly on arrays with reference count of
* 1.
* This is handled by checking the call stack to verify that we have been
* called directly from the cpython interpreter.
* To achieve this we check that all functions in the callstack until the
* cpython frame evaluation function are located in cpython or numpy.
* This is an expensive operation so temporaries are only avoided for rather
* large arrays.
*
* A possible future improvement would be to change cpython to give us access
* to the top of the stack. Then we could just check that the objects involved
* are on the cpython stack instead of checking the function callstack.
*
* Elision can be applied to all operations that do have in-place variants and
* do not change types (addition, subtraction, multiplication, float division,
* logical and bitwise operations ...)
* For commutative operations (addition, multiplication, ...) if eliding into
* the lefthand side fails it can succeed on the righthand side by swapping the
* arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b.
*
* TODO only supports systems with backtrace(), Windows can probably be
* supported too by using the appropriate Windows APIs.
*/
#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
/* 1 prints elided operations, 2 prints stacktraces */
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
#if PY_VERSION_HEX >= 0x03060000
/* TODO can pep523 be used to somehow? */
#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
#else
#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
#endif
/*
* Heuristic size of the array in bytes at which backtrace overhead generation
* becomes less than speed gained by in-place operations. Depends on stack depth
* being checked. Measurements with 10 stacks show it getting worthwhile
* around 100KiB but to be conservative put it higher around where the L2 cache
* spills.
*/
#ifndef Py_DEBUG
#define NPY_MIN_ELIDE_BYTES (256 * 1024)
#else
/*
* in debug mode always elide but skip scalars as these can convert to 0d array
* during in-place operations
*/
#define NPY_MIN_ELIDE_BYTES (32)
#endif
#include <dlfcn.h>
#include <execinfo.h>
/*
* linear search pointer in table
* number of pointers is usually quite small but if a performance impact can be
* measured this could be converted to a binary search
*/
static int
find_addr(void * addresses[], npy_intp naddr, void * addr)
{
npy_intp j;
for (j = 0; j < naddr; j++) {
if (addr == addresses[j]) {
return 1;
}
}
return 0;
}
static int
check_callers(int * cannot)
{
/*
* get base addresses of multiarray and python, check if
* backtrace is in these libraries only calling dladdr if a new max address
* is found.
* When after the initial multiarray stack everything is inside python we
* can elide as no C-API user could have messed up the reference counts.
* Only check until the python frame evaluation function is found
* approx 10us overhead for stack size of 10
*
* TODO some calls go over scalarmath in umath but we cannot get the base
* address of it from multiarraymodule as it is not linked against it
*/
static int init = 0;
/*
* measured DSO object memory start and end, if an address is located
* inside these bounds it is part of that library so we don't need to call
* dladdr on it (assuming linear memory)
*/
static void * pos_python_start;
static void * pos_python_end;
static void * pos_ma_start;
static void * pos_ma_end;
/* known address storage to save dladdr calls */
static void * py_addr[64];
static void * pyeval_addr[64];
static npy_intp n_py_addr = 0;
static npy_intp n_pyeval = 0;
void *buffer[NPY_MAX_STACKSIZE];
int i, nptrs;
int ok = 0;
/* cannot determine callers */
if (init == -1) {
*cannot = 1;
return 0;
}
nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
if (nptrs == 0) {
/* complete failure, disable elision */
init = -1;
*cannot = 1;
return 0;
}
/* setup DSO base addresses, ends updated later */
if (NPY_UNLIKELY(init == 0)) {
Dl_info info;
/* get python base address */
if (dladdr(&PyNumber_Or, &info)) {
pos_python_start = info.dli_fbase;
pos_python_end = info.dli_fbase;
}
else {
init = -1;
return 0;
}
/* get multiarray base address */
if (dladdr(&PyArray_SetNumericOps, &info)) {
pos_ma_start = info.dli_fbase;
pos_ma_end = info.dli_fbase;
}
else {
init = -1;
return 0;
}
init = 1;
}
/* loop over callstack addresses to check if they leave numpy or cpython */
for (i = 0; i < nptrs; i++) {
Dl_info info;
int in_python = 0;
int in_multiarray = 0;
#if NPY_ELIDE_DEBUG >= 2
dladdr(buffer[i], &info);
printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
info.dli_sname, info.dli_saddr);
#endif
/* check stored DSO boundaries first */
if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
in_python = 1;
}
else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
in_multiarray = 1;
}
/* update DSO boundaries via dladdr if necessary */
if (!in_python && !in_multiarray) {
if (dladdr(buffer[i], &info) == 0) {
init = -1;
ok = 0;
break;
}
/* update DSO end */
if (info.dli_fbase == pos_python_start) {
pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
in_python = 1;
}
else if (info.dli_fbase == pos_ma_start) {
pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
in_multiarray = 1;
}
}
/* no longer in ok libraries and not reached PyEval -> no elide */
if (!in_python && !in_multiarray) {
ok = 0;
break;
}
/* in python check if the frame eval function was reached */
if (in_python) {
/* if reached eval we are done */
if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
ok = 1;
break;
}
/*
* check if its some other function, use pointer lookup table to
* save expensive dladdr calls
*/
if (find_addr(py_addr, n_py_addr, buffer[i])) {
continue;
}
/* new python address, check for PyEvalFrame */
if (dladdr(buffer[i], &info) == 0) {
init = -1;
ok = 0;
break;
}
if (info.dli_sname &&
strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
if (n_pyeval < sizeof(pyeval_addr) / sizeof(pyeval_addr[0])) {
/* store address to not have to dladdr it again */
pyeval_addr[n_pyeval++] = buffer[i];
}
ok = 1;
break;
}
else if (n_py_addr < sizeof(py_addr) / sizeof(py_addr[0])) {
/* store other py function to not have to dladdr it again */
py_addr[n_py_addr++] = buffer[i];
}
}
}
/* all stacks after numpy are from python, we can elide */
if (ok) {
*cannot = 0;
return 1;
}
else {
#if NPY_ELIDE_DEBUG != 0
puts("cannot elide due to c-api usage");
#endif
*cannot = 1;
return 0;
}
}
/*
* check if in "alhs @op@ orhs" that alhs is a temporary (refcnt == 1) so we
* can do in-place operations instead of creating a new temporary
* "cannot" is set to true if it cannot be done even with swapped arguments
*/
static int
can_elide_temp(PyArrayObject * alhs, PyObject * orhs, int * cannot)
{
/*
* to be a candidate the array needs to have reference count 1, be an exact
* array of a basic type, own its data and size larger than threshold
*/
if (Py_REFCNT(alhs) != 1 || !PyArray_CheckExact(alhs) ||
!PyArray_ISNUMBER(alhs) ||
!PyArray_CHKFLAGS(alhs, NPY_ARRAY_OWNDATA) ||
!PyArray_ISWRITEABLE(alhs) ||
PyArray_CHKFLAGS(alhs, NPY_ARRAY_UPDATEIFCOPY) ||
PyArray_CHKFLAGS(alhs, NPY_ARRAY_WRITEBACKIFCOPY) ||
PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
return 0;
}
if (PyArray_CheckExact(orhs) ||
PyArray_CheckAnyScalar(orhs)) {
PyArrayObject * arhs;
/* create array from right hand side */
Py_INCREF(orhs);
arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
if (arhs == NULL) {
return 0;
}
/*
* if rhs is not a scalar dimensions must match
* TODO: one could allow broadcasting on equal types
*/
if (!(PyArray_NDIM(arhs) == 0 ||
(PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
PyArray_NDIM(arhs))))) {
Py_DECREF(arhs);
return 0;
}
/* must be safe to cast (checks values for scalar in rhs) */
if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
NPY_SAFE_CASTING)) {
Py_DECREF(arhs);
return check_callers(cannot);
}
Py_DECREF(arhs);
}
return 0;
}
/*
* try eliding a binary op, if commutative is true also try swapped arguments
*/
NPY_NO_EXPORT int
try_binary_elide(PyArrayObject * m1, PyObject * m2,
PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
PyObject ** res, int commutative)
{
/* set when no elision can be done independent of argument order */
int cannot = 0;
if (can_elide_temp(m1, m2, &cannot)) {
*res = inplace_op(m1, m2);
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in binary op");
#endif
return 1;
}
else if (commutative && !cannot) {
if (can_elide_temp((PyArrayObject *)m2, (PyObject *)m1, &cannot)) {
*res = inplace_op((PyArrayObject *)m2, (PyObject *)m1);
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in commutative binary op");
#endif
return 1;
}
}
*res = NULL;
return 0;
}
/* try elide unary temporary */
NPY_NO_EXPORT int
can_elide_temp_unary(PyArrayObject * m1)
{
int cannot;
if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
!PyArray_ISNUMBER(m1) ||
!PyArray_CHKFLAGS(m1, NPY_ARRAY_OWNDATA) ||
!PyArray_ISWRITEABLE(m1) ||
PyArray_CHKFLAGS(m1, NPY_ARRAY_UPDATEIFCOPY) ||
PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
return 0;
}
if (check_callers(&cannot)) {
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in unary op");
#endif
return 1;
}
else {
return 0;
}
}
#else /* unsupported interpreter or missing backtrace */
NPY_NO_EXPORT int
can_elide_temp_unary(PyArrayObject * m1)
{
return 0;
}
NPY_NO_EXPORT int
try_binary_elide(PyArrayObject * m1, PyObject * m2,
PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
PyObject ** res, int commutative)
{
*res = NULL;
return 0;
}
#endif