/**
* Copyright (C) Mellanox Technologies Ltd. 2019.  ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#if defined(__x86_64__)

#include <common/test.h>
extern "C" {
#include <ucs/sys/sys.h>
#include <ucs/arch/cpu.h>
#include <ucs/time/time.h>
}

#include <sys/mman.h>

class test_arch : public ucs::test {
protected:
    /* have to add wrapper for ucs_memcpy_relaxed because pure "C" inline call could
     * not be used as template argument */
    static inline void *memcpy_relaxed(void *dst, const void *src, size_t size)
    {
        return ucs_memcpy_relaxed(dst, src, size);
    }

    template <void* (C)(void*, const void*, size_t)>
    double measure_memcpy_bandwidth(size_t size)
    {
        ucs_time_t start_time, end_time;
        void *src, *dst;
        double result = 0.0;
        int iter;

        src = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (src == MAP_FAILED) {
            goto out;
        }

        dst = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (dst == MAP_FAILED) {
            goto out_unmap_src;
        }

        memset(dst, 0, size);
        memset(src, 0, size);
        memcpy(dst, src, size);

        iter = 0;
        start_time = ucs_get_time();
        do {
            C(dst, src, size);
            end_time = ucs_get_time();
            ++iter;
        } while (end_time < start_time + ucs_time_from_sec(0.5));

        result = size * iter / ucs_time_to_sec(end_time - start_time);

        munmap(dst, size);
    out_unmap_src:
        munmap(src, size);
    out:
        return result;
    }
};

/*
 * Compare the bandwidth of memcpy with the bandwidth of ucs_memcpy_relaxed
 * (through the test_arch::memcpy_relaxed wrapper) for buffer sizes from 4096
 * bytes up to 256 * UCS_MBYTE, doubling the size on every step.
 *
 * For every size the measurement is repeated until the relaxed variant reaches
 * at least 'diff' of the memcpy bandwidth, or until 'timeout' seconds have
 * passed; the last measured ratio is then checked with EXPECT_GE.
 *
 * NOTE(review): the macro name suggests the test is skipped when running on
 * valgrind or when ucs::perf_retry_count is zero - confirm against the macro
 * definition.
 */
UCS_TEST_SKIP_COND_F(test_arch, memcpy, RUNNING_ON_VALGRIND || !ucs::perf_retry_count) {
    const double diff      = 0.95; /* allow 5% fluctuations */
    const double timeout   = 30; /* 30 seconds to complete test successfully */
    double memcpy_bw       = 0;
    double memcpy_relax_bw = 0;
    double secs;
    size_t size;
    char memunits_str[256];
    char thresh_min_str[16];
    char thresh_max_str[16];
    int iterations;

    ucs_memunits_to_str(ucs_global_opts.arch.builtin_memcpy_min,
                        thresh_min_str, sizeof(thresh_min_str));
    ucs_memunits_to_str(ucs_global_opts.arch.builtin_memcpy_max,
                        thresh_max_str, sizeof(thresh_max_str));
    UCS_TEST_MESSAGE << "Using memcpy relaxed for size " <<
                        thresh_min_str << ".." <<
                        thresh_max_str;
    for (size = 4096; size <= 256 * UCS_MBYTE; size *= 2) {
        secs       = ucs_get_accurate_time();
        /* Number of measurement attempts which were actually completed. It is
         * counted explicitly, because a loop index incremented in the 'for'
         * header would be one too large when the loop exits on timeout rather
         * than through 'break' */
        iterations = 0;
        while (ucs_get_accurate_time() - secs < timeout) {
            memcpy_bw       = measure_memcpy_bandwidth<memcpy>(size);
            memcpy_relax_bw = measure_memcpy_bandwidth<memcpy_relaxed>(size);
            ++iterations;
            if (memcpy_relax_bw / memcpy_bw >= diff) {
                break;
            }
            usleep(1000); /* allow other tasks to complete */
        }
        ucs_memunits_to_str(size, memunits_str, sizeof(memunits_str));
        UCS_TEST_MESSAGE << memunits_str <<
                            " memcpy: "             << (memcpy_bw / UCS_GBYTE) <<
                            "GB/s memcpy relaxed: " << (memcpy_relax_bw / UCS_GBYTE) <<
                            "GB/s iterations: "     << iterations;
        EXPECT_GE(memcpy_relax_bw / memcpy_bw, diff);
    }
}

#endif