Blob Blame History Raw
/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2013.  ALL RIGHTS RESERVED.
* Copyright (C) ARM Ltd. 2016-2017.  ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCS_ASM_X86_64_H_
#define UCS_ASM_X86_64_H_

#include <ucs/sys/compiler.h>
#include <ucs/arch/generic/cpu.h>
#include <ucs/sys/compiler_def.h>
#include <ucs/config/types.h>
#include <ucs/config/global_opts.h>
#include <stdint.h>
#include <string.h>

#ifdef __SSE4_1__
#  include <smmintrin.h>
#endif
#ifdef __AVX__
#  include <immintrin.h>
#endif

BEGIN_C_DECLS

/** @file cpu.h */

#define UCS_ARCH_CACHE_LINE_SIZE 64

/**
 * In x86_64, there is strong ordering of each processor with respect to another
 * processor, but weak ordering with respect to the bus.
 */
#define ucs_memory_bus_fence()        asm volatile ("mfence"::: "memory")
#define ucs_memory_bus_store_fence()  asm volatile ("sfence" ::: "memory")
#define ucs_memory_bus_load_fence()   asm volatile ("lfence" ::: "memory")
#define ucs_memory_bus_cacheline_wc_flush()
#define ucs_memory_cpu_fence()        ucs_compiler_fence()
#define ucs_memory_cpu_store_fence()  ucs_compiler_fence()
#define ucs_memory_cpu_load_fence()   ucs_compiler_fence()
#define ucs_memory_cpu_wc_fence()     asm volatile ("sfence" ::: "memory")

extern ucs_ternary_value_t ucs_arch_x86_enable_rdtsc;

double ucs_arch_get_clocks_per_sec();
double ucs_x86_init_tsc_freq();

ucs_cpu_model_t ucs_arch_get_cpu_model() UCS_F_NOOPTIMIZE;
ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE;
ucs_cpu_vendor_t ucs_arch_get_cpu_vendor();
void ucs_cpu_init();
ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len);

static inline int ucs_arch_x86_rdtsc_enabled()
{
    double UCS_V_UNUSED dummy_freq;

    if (ucs_unlikely(ucs_arch_x86_enable_rdtsc == UCS_TRY)) {
        dummy_freq = ucs_x86_init_tsc_freq();
        ucs_assert(ucs_arch_x86_enable_rdtsc != UCS_TRY);
    }

    return ucs_arch_x86_enable_rdtsc;
}

static inline uint64_t ucs_arch_read_hres_clock()
{
    uint32_t low, high;

    if (ucs_unlikely(ucs_arch_x86_rdtsc_enabled() == UCS_NO)) {
        return ucs_arch_generic_read_hres_clock();
    }

    asm volatile ("rdtsc" : "=a" (low), "=d" (high));
    return ((uint64_t)high << 32) | (uint64_t)low;
}

#define ucs_arch_wait_mem ucs_arch_generic_wait_mem

#if !HAVE___CLEAR_CACHE
static inline void ucs_arch_clear_cache(void *start, void *end)
{
    char *ptr;

    for (ptr = (char*)start; ptr < (char*)end; ptr++) {
        asm volatile("mfence; clflush %0; mfence" :: "m" (*ptr));
    }
}
#endif

static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
#if ENABLE_BUILTIN_MEMCPY
    if (ucs_unlikely((len > ucs_global_opts.arch.builtin_memcpy_min) &&
                     (len < ucs_global_opts.arch.builtin_memcpy_max))) {
        asm volatile ("rep movsb"
                      : "=D" (dst),
                      "=S" (src),
                      "=c" (len)
                      : "0" (dst),
                      "1" (src),
                      "2" (len)
                      : "memory");
        return dst;
    }
#endif
    return memcpy(dst, src, len);
}

static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
    ucs_x86_memcpy_sse_movntdqa(dst, src, len);
}

END_C_DECLS

#endif