/*
 * BSD LICENSE
 *
 * Copyright(c) 2018-2020 Intel Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <sys/time.h>
#include <pthread.h>
#include <getopt.h>

#ifdef __linux__
#include <sched.h>
#include <cpuid.h>
#endif

#ifdef __FreeBSD__
#include <sys/param.h>
#include <sys/cpuset.h>
#endif

/**
 * MACROS
 */

#ifdef __linux__
#define PAGE_SIZE (4 * 1024)
#endif

#define MEMCHUNK_SIZE (PAGE_SIZE * 32 * 1024) /* 128MB chunk */
#define CL_SIZE       (64)
#define CHUNKS        (128)
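
/*
 * Pacing model: each second is divided into CHUNKS intervals (1 s / 128 =
 * ~7.8 ms). The requested bandwidth is converted into cache lines per
 * interval and the worker sleeps away whatever is left of each interval.
 */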

#ifdef DEBUG
#include <assert.h>
#define ALWAYS_INLINE static inline
#else
#define assert(x)
#define ALWAYS_INLINE static inline __attribute__((always_inline))
#endif

#define MAX_OPTARG_LEN 64

#define MAX_MEM_BW (100 * 1000) /* 100 GB/s expressed in MB/s */

#define CPU_FEATURE_SSE4_2  (1ULL << 0)
#define CPU_FEATURE_CLWB    (1ULL << 1)
#define CPU_FEATURE_AVX512F (1ULL << 2)

/**
 * DATA STRUCTURES
 */

/**
 * Define read and write types
 */
enum cl_type {
        CL_TYPE_INVALID,
        CL_TYPE_PREFETCH_T0,
        CL_TYPE_PREFETCH_T1,
        CL_TYPE_PREFETCH_T2,
        CL_TYPE_PREFETCH_NTA,
        CL_TYPE_PREFETCH_W,
        CL_TYPE_READ_NTQ,
        CL_TYPE_READ_WB,
        CL_TYPE_READ_WB_DQA,
        CL_TYPE_READ_MOD_WRITE,
#ifdef __x86_64__
        CL_TYPE_WRITE_DQA,
        CL_TYPE_WRITE_DQA_FLUSH,
#endif
        CL_TYPE_WRITE_WB,
#ifdef __x86_64__
        CL_TYPE_WRITE_WB_AVX512,
#endif
        CL_TYPE_WRITE_WB_CLWB,
        CL_TYPE_WRITE_WB_FLUSH,
        CL_TYPE_WRITE_NTI,
        CL_TYPE_WRITE_NTI_CLWB,
#ifdef __x86_64__
        CL_TYPE_WRITE_NT512,
        CL_TYPE_WRITE_NTDQ
#endif
};

/* structure to store cpuid values */
struct cpuid_out {
        uint32_t eax;
        uint32_t ebx;
        uint32_t ecx;
        uint32_t edx;
};

static struct cpuid_out cpuid_1_0; /* leaf 1, sub-leaf 0 */
static struct cpuid_out cpuid_7_0; /* leaf 7, sub-leaf 0 */

/**
 * COMMON DATA
 */

static int stop_loop = 0;
static void *memchunk = NULL;
static unsigned memchunk_offset = 0;

/**
 * UTILS
 */

/*
 * A C wrapper for CPUID opcode
 *
 * Parameters:
 *    [in] leaf    - CPUID leaf number (EAX)
 *    [in] subleaf - CPUID sub-leaf number (ECX)
 *    [out] out    - registers structure to store results of CPUID into
 */
static void
lcpuid(const unsigned leaf, const unsigned subleaf, struct cpuid_out *out)
{
        if (out == NULL)
                return;

#ifdef __x86_64__
        asm volatile("mov %4, %%eax\n\t"
                     "mov %5, %%ecx\n\t"
                     "cpuid\n\t"
                     "mov %%eax, %0\n\t"
                     "mov %%ebx, %1\n\t"
                     "mov %%ecx, %2\n\t"
                     "mov %%edx, %3\n\t"
                     : "=g"(out->eax), "=g"(out->ebx), "=g"(out->ecx),
                       "=g"(out->edx)
                     : "g"(leaf), "g"(subleaf)
                     : "%eax", "%ebx", "%ecx", "%edx");
#else
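        /* 32-bit PIC builds may use %ebx as the GOT pointer, so save/restore it */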
        asm volatile("push %%ebx\n\t"
                     "mov %4, %%eax\n\t"
                     "mov %5, %%ecx\n\t"
                     "cpuid\n\t"
                     "mov %%eax, %0\n\t"
                     "mov %%ebx, %1\n\t"
                     "mov %%ecx, %2\n\t"
                     "mov %%edx, %3\n\t"
                     "pop %%ebx\n\t"
                     : "=g"(out->eax), "=g"(out->ebx), "=g"(out->ecx),
                       "=g"(out->edx)
                     : "g"(leaf), "g"(subleaf)
                     : "%eax", "%ecx", "%edx");
#endif
}

static uint32_t
detect_sse42(void)
{
        /* Check presence of SSE4.2 - bit 20 of ECX */
        return (cpuid_1_0.ecx & (1 << 20));
}

static uint32_t
detect_clwb(void)
{
        /* Check presence of CLWB - bit 24 of EBX */
        return (cpuid_7_0.ebx & (1 << 24));
}

static uint32_t
detect_avx512f(void)
{
        /* Check presence of AVX512F - bit 16 of EBX */
        return (cpuid_7_0.ebx & (1 << 16));
}

/**
 * @brief Function to detect CPU features
 *
 * @return Bitmap of supported features
 */
static uint64_t
cpu_feature_detect(void)
{
        static const struct {
                unsigned req_leaf_number;
                uint64_t feat;
                uint32_t (*detect_fn)(void);
        } feat_tab[] = {
            {1, CPU_FEATURE_SSE4_2, detect_sse42},
            {7, CPU_FEATURE_CLWB, detect_clwb},
            {7, CPU_FEATURE_AVX512F, detect_avx512f},
        };
        struct cpuid_out r;
        unsigned hi_leaf_number = 0;
        uint64_t features = 0;
        unsigned i;

        /* Get highest supported CPUID leaf number */
        lcpuid(0x0, 0x0, &r);
        hi_leaf_number = r.eax;

        /* Get the most common CPUID leafs to speed up the detection */
        if (hi_leaf_number >= 1)
                lcpuid(0x1, 0x0, &cpuid_1_0);

        if (hi_leaf_number >= 7)
                lcpuid(0x7, 0x0, &cpuid_7_0);

        for (i = 0; i < (sizeof(feat_tab) / sizeof(feat_tab[0])); i++) {
                if (hi_leaf_number < feat_tab[i].req_leaf_number)
                        continue;

                if (feat_tab[i].detect_fn() != 0)
                        features |= feat_tab[i].feat;
        }

        return features;
}

/**
 * @brief Function to bind thread to a cpu
 *
 * @param cpuid cpu to bind thread to
 */
static void
set_thread_affinity(const unsigned cpuid)
{
#ifdef __linux__
        cpu_set_t cpuset;
#endif
#ifdef __FreeBSD__
        cpuset_t cpuset;
#endif
        int res = -1;

        CPU_ZERO(&cpuset);
        CPU_SET((int)cpuid, &cpuset);

#ifdef __linux__
        res = sched_setaffinity(0, sizeof(cpuset), &cpuset);
#endif
#ifdef __FreeBSD__
        res = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
                                 sizeof(cpuset), &cpuset);
#endif
        if (res != 0)
                perror("Error setting core affinity ");
}

/**
 * @brief Function to flush cache
 *
 * @param p line of cache to flush
 */
ALWAYS_INLINE void
cl_flush(void *p)
{
        asm volatile("clflush (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Serialize store operations, prevent reordering of writes
 */
ALWAYS_INLINE void
sb(void)
{
        asm volatile("sfence\n\t" : : : "memory");
}

/**
 * @brief Cache line write back
 *
 * @param p line of cache
 */
ALWAYS_INLINE void
cl_wb(void *p)
{
        asm volatile("clwb (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Flush memory
 *
 * @param p memory allocated
 * @param s size of memory to flush
 */
ALWAYS_INLINE void
mem_flush(void *p, size_t s)
{
        char *cp = (char *)p;
        size_t i = 0;

        s = s / CL_SIZE; /* mem size in cache lines */
        for (i = 0; i < s; i++)
                cl_flush(&cp[i * CL_SIZE]);
        sb();
}

/**
 * @brief Allocate and initialize the memory chunk used by the worker
 *
 * @param s size of memory to allocate in bytes
 *
 * @return pointer to allocated memory, NULL on failure
 */
static void *
malloc_and_init_memory(size_t s)
{
        void *p = NULL;
        int ret;

        ret = posix_memalign(&p, PAGE_SIZE, s - s % PAGE_SIZE);

        if (ret != 0 || p == NULL) {
                printf("ERROR: Failed to allocate %lu bytes\n",
                       (unsigned long)s - s % PAGE_SIZE);
                stop_loop = 1;
                return NULL;
        }

        uint64_t *p64 = (uint64_t *)p;
        size_t s64 = s / sizeof(uint64_t);
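
        /* Write one random word per cache line to fault in every page */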

        while (s64 > 0) {
                *p64 = (uint64_t)rand();
                p64 += (CL_SIZE / sizeof(uint64_t));
                s64 -= (CL_SIZE / sizeof(uint64_t));
        }

        mem_flush(p, s);
        return p;
}

/**
 * MEMORY OPERATIONS
 */

/**
 * @brief Perform prefetcht0
 *
 * @param p pointer to memory location
 */
ALWAYS_INLINE void
cl_prefetch_t0(void *p)
{
        asm volatile("prefetcht0 (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Perform prefetcht1
 *
 * @param p pointer to memory location
 */
ALWAYS_INLINE void
cl_prefetch_t1(void *p)
{
        asm volatile("prefetcht1 (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Perform prefetcht2
 *
 * @param p pointer to memory location
 */
ALWAYS_INLINE void
cl_prefetch_t2(void *p)
{
        asm volatile("prefetcht2 (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Perform prefetchnta
 *
 * @param p pointer to memory location
 */
ALWAYS_INLINE void
cl_prefetch_nta(void *p)
{
        asm volatile("prefetchnta (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Perform prefetchw
 *
 * @param p pointer to memory location
 */
ALWAYS_INLINE void
cl_prefetch_w(void *p)
{
        asm volatile("prefetchw (%0)\n\t" : : "r"(p) : "memory");
}

/**
 * @brief Load XOR writes
 *
 * @param p pointer to memory
 * @param v value to xor with value in memory location and write back
 */
ALWAYS_INLINE void
cl_read_mod_write(void *p, const uint64_t v)
{
        asm volatile("xor %0, (%1)\n\t"
                     "xor %0, 8(%1)\n\t"
                     "xor %0, 16(%1)\n\t"
                     "xor %0, 24(%1)\n\t"
                     "xor %0, 32(%1)\n\t"
                     "xor %0, 40(%1)\n\t"
                     "xor %0, 48(%1)\n\t"
                     "xor %0, 56(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "memory");
}

#ifdef __x86_64__
#ifdef __AVX512F__
/**
 * @brief WB store vector version
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_avx512(void *p, const uint64_t v)
{
        asm volatile("vmovq   %0, %%xmm1\n\t"
                     "vmovdqa64 %%zmm1, (%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "%zmm1", "memory");
}
#endif

/**
 * @brief WB vector version
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_dqa(void *p, const uint64_t v)
{
        asm volatile("movq   %0, %%xmm1\n\t"
                     "movdqa %%xmm1, (%1)\n\t"
                     "movdqa %%xmm1, 16(%1)\n\t"
                     "movdqa %%xmm1, 32(%1)\n\t"
                     "movdqa %%xmm1, 48(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "%xmm1", "memory");
}

/**
 * @brief Perform SSE write operation to specified cache line with flush
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_dqa_flush(void *p, const uint64_t v)
{
        cl_write_dqa(p, v);
        cl_flush(p);
}
#endif

/**
 * @brief Perform write operation to specified cache line
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write(void *p, const uint64_t v)
{
#ifdef __x86_64__
        asm volatile("movq %0, (%1)\n\t"
                     "movq %0, 8(%1)\n\t"
                     "movq %0, 16(%1)\n\t"
                     "movq %0, 24(%1)\n\t"
                     "movq %0, 32(%1)\n\t"
                     "movq %0, 40(%1)\n\t"
                     "movq %0, 48(%1)\n\t"
                     "movq %0, 56(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "memory");
#else
        asm volatile("movl %0, (%1)\n\t"
                     "movl %0, 4(%1)\n\t"
                     "movl %0, 8(%1)\n\t"
                     "movl %0, 12(%1)\n\t"
                     "movl %0, 16(%1)\n\t"
                     "movl %0, 20(%1)\n\t"
                     "movl %0, 24(%1)\n\t"
                     "movl %0, 28(%1)\n\t"
                     "movl %0, 32(%1)\n\t"
                     "movl %0, 36(%1)\n\t"
                     "movl %0, 40(%1)\n\t"
                     "movl %0, 44(%1)\n\t"
                     "movl %0, 48(%1)\n\t"
                     "movl %0, 52(%1)\n\t"
                     "movl %0, 56(%1)\n\t"
                     "movl %0, 64(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "memory");
#endif
}

#ifdef __CLWB__
/**
 * @brief Perform write operation to specified cache line with clwb
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_clwb(void *p, const uint64_t v)
{
        cl_write(p, v);
        cl_wb(p);
}
#endif

/**
 * @brief Perform write operation to specified cache line with flush
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_flush(void *p, const uint64_t v)
{
        cl_write(p, v);
        cl_flush(p);
}

/**
 * @brief Perform write operation to memory giving non-temporal hint
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_nti(void *p, const uint64_t v)
{
#ifdef __x86_64__
        asm volatile("movnti %0, (%1)\n\t"
                     "movnti %0, 8(%1)\n\t"
                     "movnti %0, 16(%1)\n\t"
                     "movnti %0, 24(%1)\n\t"
                     "movnti %0, 32(%1)\n\t"
                     "movnti %0, 40(%1)\n\t"
                     "movnti %0, 48(%1)\n\t"
                     "movnti %0, 56(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "memory");
#else
        uint32_t v2 = (uint32_t)v;

        asm volatile("movnti %0, (%1)\n\t"
                     "movnti %0, 4(%1)\n\t"
                     "movnti %0, 8(%1)\n\t"
                     "movnti %0, 12(%1)\n\t"
                     "movnti %0, 16(%1)\n\t"
                     "movnti %0, 20(%1)\n\t"
                     "movnti %0, 24(%1)\n\t"
                     "movnti %0, 28(%1)\n\t"
                     "movnti %0, 32(%1)\n\t"
                     "movnti %0, 36(%1)\n\t"
                     "movnti %0, 40(%1)\n\t"
                     "movnti %0, 44(%1)\n\t"
                     "movnti %0, 48(%1)\n\t"
                     "movnti %0, 52(%1)\n\t"
                     "movnti %0, 56(%1)\n\t"
                     "movnti %0, 64(%1)\n\t"
                     :
                     : "r"(v2), "r"(p)
                     : "memory");
#endif
}

#if defined(__x86_64__) && defined(__AVX512F__)
/**
 * @brief non-temporal store vector version
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_nt512(void *p, const uint64_t v)
{
        asm volatile("vmovq   %0, %%xmm1\n\t"
                     "vmovntpd %%zmm1, (%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "%zmm1", "memory");
}
#endif

#ifdef __CLWB__
/**
 * @brief Perform write operation to memory giving non-temporal hint with cache
 * line write back
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_nti_clwb(void *p, const uint64_t v)
{
        cl_write_nti(p, v);
        cl_wb(p);
}
#endif

#ifdef __x86_64__
/**
 * @brief Non temporal store vector version
 *
 * @param p pointer to memory location to be written
 * @param v value to overwrite memory location
 */
ALWAYS_INLINE void
cl_write_ntdq(void *p, const uint64_t v)
{
        asm volatile("movq   %0, %%xmm1\n\t"
                     "movntdq %%xmm1, (%1)\n\t"
                     "movntdq %%xmm1, 16(%1)\n\t"
                     "movntdq %%xmm1, 32(%1)\n\t"
                     "movntdq %%xmm1, 48(%1)\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "%xmm1", "memory");
}
#endif

/**
 * @brief Function to perform non-temporal read operation
 *        from specified memory location, vector version
 *
 * @param p pointer to memory location to read from
 */
ALWAYS_INLINE void
cl_read_ntq(void *p)
{
        asm volatile("movntdqa (%0), %%xmm1\n\t"
                     "movntdqa 16(%0), %%xmm1\n\t"
                     "movntdqa 32(%0), %%xmm1\n\t"
                     "movntdqa 48(%0), %%xmm1\n\t"
                     :
                     : "r"(p)
                     : "%xmm1", "memory");
}

/**
 * @brief Function to perform read operation from specified memory location
 *
 * @param p pointer to memory location to read from
 */
ALWAYS_INLINE void
cl_read(void *p)
{
        register uint64_t v = 0;
#ifdef __x86_64__
        asm volatile("movq (%1), %0\n\t"
                     "movq 8(%1), %0\n\t"
                     "movq 16(%1), %0\n\t"
                     "movq 24(%1), %0\n\t"
                     "movq 32(%1), %0\n\t"
                     "movq 40(%1), %0\n\t"
                     "movq 48(%1), %0\n\t"
                     "movq 56(%1), %0\n\t"
                     : "=r"(v)
                     : "r"(p)
                     : "memory");
#else
        asm volatile("movl (%1), %0\n\t"
                     "movl 4(%1), %0\n\t"
                     "movl 8(%1), %0\n\t"
                     "movl 12(%1), %0\n\t"
                     "movl 16(%1), %0\n\t"
                     "movl 20(%1), %0\n\t"
                     "movl 24(%1), %0\n\t"
                     "movl 28(%1), %0\n\t"
                     "movl 32(%1), %0\n\t"
                     "movl 36(%1), %0\n\t"
                     "movl 40(%1), %0\n\t"
                     "movl 44(%1), %0\n\t"
                     "movl 48(%1), %0\n\t"
                     "movl 52(%1), %0\n\t"
                     "movl 56(%1), %0\n\t"
                     "movl 64(%1), %0\n\t"
                     :
                     : "r"(v), "r"(p)
                     : "memory");
#endif
}

/**
 * @brief Function to perform read operation from specified memory location,
 *        vector version
 *
 * @param p pointer to memory location to read from
 */
ALWAYS_INLINE void
cl_read_dqa(void *p)
{
        asm volatile("movdqa (%0), %%xmm1\n\t"
                     "movdqa 16(%0), %%xmm1\n\t"
                     "movdqa 32(%0), %%xmm1\n\t"
                     "movdqa 48(%0), %%xmm1\n\t"
                     :
                     : "r"(p)
                     : "%xmm1", "memory");
}

/**
 * @brief Execute the selected operation on the requested number of cache lines
 *
 * @param bw number of cache lines to operate on in this call
 * @param type operation type to perform
 */
ALWAYS_INLINE void
mem_execute(const unsigned bw, const enum cl_type type)
{
        const uint64_t val = (uint64_t)rand();
        char *cp = (char *)memchunk;
        unsigned i = 0;
        const size_t s = MEMCHUNK_SIZE / CL_SIZE; /* mem size in cache lines */

        assert(memchunk != NULL);
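
        /*
         * 'bw' is the number of cache lines to touch in this call; the global
         * memchunk_offset persists across calls so successive intervals walk
         * through the whole memory chunk.
         */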

        for (i = 0; i < bw; i++) {
                char *ptr = cp + (memchunk_offset * CL_SIZE);

                switch (type) {
                case CL_TYPE_PREFETCH_T0:
                        cl_prefetch_t0(ptr);
                        break;
                case CL_TYPE_PREFETCH_T1:
                        cl_prefetch_t1(ptr);
                        break;
                case CL_TYPE_PREFETCH_T2:
                        cl_prefetch_t2(ptr);
                        break;
                case CL_TYPE_PREFETCH_NTA:
                        cl_prefetch_nta(ptr);
                        break;
                case CL_TYPE_PREFETCH_W:
                        cl_prefetch_w(ptr);
                        break;
                case CL_TYPE_READ_NTQ:
                        cl_read_ntq(ptr);
                        break;
                case CL_TYPE_READ_WB:
                        cl_read(ptr);
                        break;
                case CL_TYPE_READ_WB_DQA:
                        cl_read_dqa(ptr);
                        break;
                case CL_TYPE_READ_MOD_WRITE:
                        cl_read_mod_write(ptr, val);
                        break;
#ifdef __x86_64__
                case CL_TYPE_WRITE_DQA:
                        cl_write_dqa(ptr, val);
                        break;
                case CL_TYPE_WRITE_DQA_FLUSH:
                        cl_write_dqa_flush(ptr, val);
                        break;
#endif
                case CL_TYPE_WRITE_WB:
                        cl_write(ptr, val);
                        break;
#if defined(__x86_64__) && defined(__AVX512F__)
                case CL_TYPE_WRITE_WB_AVX512:
                        cl_write_avx512(ptr, val);
                        break;
#endif
#ifdef __CLWB__
                case CL_TYPE_WRITE_WB_CLWB:
                        cl_write_clwb(ptr, val);
                        break;
#endif
                case CL_TYPE_WRITE_WB_FLUSH:
                        cl_write_flush(ptr, val);
                        break;
                case CL_TYPE_WRITE_NTI:
                        cl_write_nti(ptr, val);
                        break;
#ifdef __CLWB__
                case CL_TYPE_WRITE_NTI_CLWB:
                        cl_write_nti_clwb(ptr, val);
                        break;
#endif
#ifdef __x86_64__
#ifdef __AVX512F__
                case CL_TYPE_WRITE_NT512:
                        cl_write_nt512(ptr, val);
                        break;
#endif
                case CL_TYPE_WRITE_NTDQ:
                        cl_write_ntdq(ptr, val);
                        break;
#endif
                default:
                        assert(0);
                        break;
                }
                if (++memchunk_offset >= s)
                        memchunk_offset = 0;
        }
        sb();
}

/**
 * MAIN
 */

/**
 * @brief Function to print Membw command line usage
 *
 * @param argv list of arguments supplied by user
 */
static void
usage(char **argv)
{
        printf("Usage: %s -c <cpu> -b <BW [MB/s]> <operation type>\n"
               "Description:\n"
               "  -c, --cpu          cpu to generate B/W\n"
               "  -b, --bandwidth    memory B/W specified in MBps\n"
               "Operation types:\n"
               "  --prefetch-t0      prefetcht0\n"
               "  --prefetch-t1      prefetcht1\n"
               "  --prefetch-t2      prefetcht2\n"
               "  --prefetch-nta     prefetchtnta\n"
               "  --prefetch-w       prefetchw\n"
               "  --read             x86 loads\n"
               "  --read-sse         SSE loads\n"
               "  --nt-read-sse      SSE NT loads\n"
               "  --read-mod-write   x86 load XOR write\n"
               "  --write            x86 stores\n"
#ifdef __x86_64__
               "  --write-avx512     AVX512 stores\n"
#endif
               "  --write-clwb       x86 stores + clwb\n"
               "  --write-flush      x86 stores & clflush (naturally generates "
               "loads & stores)\n"
#ifdef __x86_64__
               "  --write-sse        SSE stores\n"
               "  --write-sse-flush  SSE stores & clflush (naturally generates "
               "loads & stores)\n"
#endif
               "  --nt-write         x86 NT stores\n"
               "  --nt-write-avx512  AVX512 NT stores\n"
               "  --nt-write-clwb    x86 NT stores + clwb\n"
#ifdef __x86_64__
               "  --nt-write-sse     SSE NT stores\n"
#endif
               ,
               argv[0]);
}

/**
 * @brief Calculate elapsed time in microseconds between two timestamps
 *
 * @param tv_s start time of memory operation
 * @param tv_e end time of memory operation
 *
 * @return elapsed time in microseconds
 */
ALWAYS_INLINE long
get_usec_diff(struct timeval *tv_s, struct timeval *tv_e)
{
        long usec_start, usec_end = 0;

        usec_start = ((long)tv_s->tv_usec) + ((long)tv_s->tv_sec * 1000000L);
        usec_end = ((long)tv_e->tv_usec) + ((long)tv_e->tv_sec * 1000000L);

        return usec_end - usec_start;
}

/**
 * @brief Sleep for the remainder of the measurement interval
 *
 * @param interval length of the measurement interval in microseconds
 * @param usec_diff time already spent executing the operation in microseconds
 */
ALWAYS_INLINE void
nano_sleep(const long interval, long usec_diff)
{
        struct timespec req, rem;

        req.tv_sec = (interval - usec_diff) / 1000000L;
        req.tv_nsec = ((interval - usec_diff) % 1000000L) * 1000L;
        if (nanosleep(&req, &rem) == -1)
                nanosleep(&rem, NULL);
}

/**
 * @brief Converts string str to UINT
 *
 * @param [in] str string
 * @param [in] base numerical base
 * @param [out] value UINT value
 *
 * @return operation status
 * @retval 0 on success
 * @retval negative on error (-errno)
 */
static int
str_to_uint(const char *str, const unsigned base, unsigned *value)
{
        const char *str_start = str;
        char *str_end = NULL;
        unsigned long tmp = 0;

        if (NULL == str || NULL == value)
                return -EINVAL;

        while (isblank((unsigned char)*str_start))
                str_start++;

        if (base == 10 && !isdigit((unsigned char)*str_start))
                return -EINVAL;

        if (base == 16 && !isxdigit((unsigned char)*str_start))
                return -EINVAL;

        errno = 0;
        tmp = strtoul(str_start, &str_end, base);
        if (errno != 0 || tmp > UINT_MAX ||
            !(*str_start != '\0' && *str_end == '\0'))
                return -EINVAL;

        *value = (unsigned)tmp;
        return 0;
}
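
/*
 * Example: str_to_uint(" 42", 10, &v) sets v to 42 and returns 0;
 * str_to_uint("42x", 10, &v) returns -EINVAL.
 */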

int
main(int argc, char **argv)
{
        int cmd = EXIT_SUCCESS;
        enum cl_type type = CL_TYPE_INVALID;
        unsigned mem_bw = 0;
        unsigned cpu = UINT_MAX;
        int option_index;
        int ret;
        uint64_t features;

        /* clang-format off */
        struct option options[] = {
            {"bandwidth",       required_argument, 0, 'b'},
            {"cpu",             required_argument, 0, 'c'},
            {"prefetch-t0",     no_argument, 0, CL_TYPE_PREFETCH_T0},
            {"prefetch-t1",     no_argument, 0, CL_TYPE_PREFETCH_T1},
            {"prefetch-t2",     no_argument, 0, CL_TYPE_PREFETCH_T2},
            {"prefetch-nta",    no_argument, 0, CL_TYPE_PREFETCH_NTA},
            {"prefetch-w",      no_argument, 0, CL_TYPE_PREFETCH_W},
            {"read",            no_argument, 0, CL_TYPE_READ_WB},
            {"read-sse",        no_argument, 0, CL_TYPE_READ_WB_DQA},
            {"nt-read-sse",     no_argument, 0, CL_TYPE_READ_NTQ},
            {"read-mod-write",  no_argument, 0, CL_TYPE_READ_MOD_WRITE},
            {"write",           no_argument, 0, CL_TYPE_WRITE_WB},
#ifdef __x86_64__
            {"write-avx512",    no_argument, 0, CL_TYPE_WRITE_WB_AVX512},
#endif
            {"write-clwb",      no_argument, 0, CL_TYPE_WRITE_WB_CLWB},
            {"write-flush",     no_argument, 0, CL_TYPE_WRITE_WB_FLUSH},
#ifdef __x86_64__
            {"write-sse",       no_argument, 0, CL_TYPE_WRITE_DQA},
            {"write-sse-flush", no_argument, 0, CL_TYPE_WRITE_DQA_FLUSH},
#endif
            {"nt-write",        no_argument, 0, CL_TYPE_WRITE_NTI},
#ifdef __x86_64__
            {"nt-write-avx512", no_argument, 0, CL_TYPE_WRITE_NT512},
#endif
            {"nt-write-clwb",   no_argument, 0, CL_TYPE_WRITE_NTI_CLWB},
#ifdef __x86_64__
            {"nt-write-sse",    no_argument, 0, CL_TYPE_WRITE_NTDQ},
#endif
            {0, 0, 0, 0}
        };
        /* clang-format on */
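
        /*
         * Each operation-type option stores its cl_type value in the getopt
         * 'val' field, so getopt_long_only() returns that enum value and the
         * switch below casts it straight back to enum cl_type.
         */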

        /* Process command line arguments */
        while ((cmd = getopt_long_only(argc, argv, "b:c:", options,
                                       &option_index)) != -1) {

                switch (cmd) {
                case 'c':
                        ret = str_to_uint(optarg, 10, &cpu);
                        if (ret != 0) {
                                printf("Invalid CPU specified!\n");
                                return EXIT_FAILURE;
                        }
                        break;
                case 'b':
                        ret = str_to_uint(optarg, 10, &mem_bw);
                        if (ret != 0 || mem_bw == 0 || mem_bw > MAX_MEM_BW) {
                                printf("Invalid B/W specified!\n");
                                return EXIT_FAILURE;
                        }
                        break;
                case CL_TYPE_PREFETCH_T0:
                case CL_TYPE_PREFETCH_T1:
                case CL_TYPE_PREFETCH_T2:
                case CL_TYPE_PREFETCH_NTA:
                case CL_TYPE_PREFETCH_W:
                case CL_TYPE_READ_NTQ:
                case CL_TYPE_READ_WB:
                case CL_TYPE_READ_WB_DQA:
                case CL_TYPE_READ_MOD_WRITE:
#ifdef __x86_64__
                case CL_TYPE_WRITE_DQA:
                case CL_TYPE_WRITE_DQA_FLUSH:
#endif
                case CL_TYPE_WRITE_WB:
#ifdef __x86_64__
                case CL_TYPE_WRITE_WB_AVX512:
#endif
                case CL_TYPE_WRITE_WB_CLWB:
                case CL_TYPE_WRITE_WB_FLUSH:
                case CL_TYPE_WRITE_NTI:
                case CL_TYPE_WRITE_NTI_CLWB:
#ifdef __x86_64__
                case CL_TYPE_WRITE_NT512:
                case CL_TYPE_WRITE_NTDQ:
#endif
                        type = (enum cl_type)cmd;
                        break;
                default:
                        usage(argv);
                        return EXIT_FAILURE;
                        break;
                }
        }

        /* Check if user has supplied all required arguments */
        if (type == CL_TYPE_INVALID || cpu == UINT_MAX || !mem_bw ||
            optind < argc) {
                usage(argv);
                return EXIT_FAILURE;
        }

        features = cpu_feature_detect();

        switch (type) {
        case CL_TYPE_READ_WB_DQA:
#ifdef __x86_64__
        case CL_TYPE_WRITE_DQA:
        case CL_TYPE_WRITE_DQA_FLUSH:
        case CL_TYPE_WRITE_NTDQ:
#endif
                if (!(features & CPU_FEATURE_SSE4_2)) {
                        printf("No CPU support for SSE4.2 instructions!\n");
                        return EXIT_FAILURE;
                }
                break;
        case CL_TYPE_WRITE_NTI_CLWB:
        case CL_TYPE_WRITE_WB_CLWB:
#ifdef __CLWB__
                if (!(features & CPU_FEATURE_CLWB)) {
                        printf("No CPU support for CLWB instructions!\n");
                        return EXIT_FAILURE;
                }
#else
                printf("No compiler support for CLWB instructions!\n");
                return EXIT_FAILURE;
#endif
                break;
#ifdef __x86_64__
        case CL_TYPE_WRITE_NT512:
        case CL_TYPE_WRITE_WB_AVX512:
#ifdef __AVX512F__
                if (!(features & CPU_FEATURE_AVX512F)) {
                        printf("No CPU support for AVX512 instructions!\n");
                        return EXIT_FAILURE;
                }
#else
                printf("No compiler support for AVX512 instructions!\n");
                return EXIT_FAILURE;
#endif
                break;
#endif
        default:
                break;
        }

        printf("- THREAD logical core id: %u, "
               " memory bandwidth [MB]: %u, starting...\n",
               cpu, mem_bw);

        /* Bind thread to cpu */
        set_thread_affinity(cpu);

        /* Allocate memory */
        memchunk = malloc_and_init_memory(MEMCHUNK_SIZE);
        if (memchunk == NULL) {
                printf("Failed to allocate memory!\n");
                return EXIT_FAILURE;
        }

        /* Convert requested MB/s into cache lines per measurement interval */
        mem_bw *= ((1024 * 1024) / CL_SIZE) / CHUNKS;
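
        /*
         * Example: -b 1000 (1000 MB/s) becomes
         * 1000 * ((1024 * 1024) / 64) / 128 = 128000 cache lines per interval.
         */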

        /* Stress memory bandwidth */
        while (stop_loop == 0) {
                struct timeval tv_s, tv_e;
                long usec_diff;
                const long interval = 1000000L / CHUNKS; /* interval in [us] */

                /* Get time before executing operation in loop */
                gettimeofday(&tv_s, NULL);

                /* Execute operation */
                mem_execute(mem_bw, type);

                /* Get time after executing operation */
                gettimeofday(&tv_e, NULL);

                usec_diff = get_usec_diff(&tv_s, &tv_e);

                if (usec_diff < interval) {
                        /* Sleep before executing operation again */
                        nano_sleep(interval, usec_diff);
                }
        }

        /* Terminate thread */
        free(memchunk);
        printf("\nexiting...\n");

        return 0;
}
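
/*
 * Build/run sketch (file name and compiler flags are illustrative, not taken
 * from this source):
 *   cc -O2 membw.c -o membw
 *   ./membw -c 2 -b 2000 --nt-write
 * pins the worker to logical CPU 2 and generates ~2000 MB/s of non-temporal
 * store traffic.
 */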