/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2015.  ALL RIGHTS RESERVED.
* Copyright (C) ARM Ltd. 2016-2019.  ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCS_AARCH64_CPU_H_
#define UCS_AARCH64_CPU_H_

#include "config.h"
#include <time.h>
#include <string.h>
#include <sys/times.h>
#include <ucs/sys/compiler_def.h>
#include <ucs/arch/generic/cpu.h>
#include <ucs/sys/math.h>
#include <ucs/type/status.h>
#if __ARM_NEON
#include <arm_neon.h>
#endif


#define UCS_ARCH_CACHE_LINE_SIZE 64

BEGIN_C_DECLS

/** @file cpu.h */

/**
 * Assume the worst - weak memory ordering.
 */

#define ucs_aarch64_dmb(_op)          asm volatile ("dmb " #_op ::: "memory")
#define ucs_aarch64_isb(_op)          asm volatile ("isb " #_op ::: "memory")
#define ucs_aarch64_dsb(_op)          asm volatile ("dsb " #_op ::: "memory")

/* These macros serialize stores across Normal NC (or Device) and WB memory
 * (see Arm Architecture Reference Manual, B2.7.2). Based on a recent change
 * in the Linux kernel:
 * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22ec71615d824f4f11d38d0e55a88d8956b7e45f
 *
 * The underlying barrier code was changed to use the lighter-weight DMB
 * instead of DSB. The barrier is used to synchronize accesses between
 * write-back and device-mapped memory (e.g. a PCIe BAR).
 */
#define ucs_memory_bus_fence()        ucs_aarch64_dmb(oshsy)
#define ucs_memory_bus_store_fence()  ucs_aarch64_dmb(oshst)
#define ucs_memory_bus_load_fence()   ucs_aarch64_dmb(oshld)
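
/* Illustrative sketch: a typical use of ucs_memory_bus_store_fence() is to
 * order a descriptor written to host (WB) memory before ringing a doorbell
 * mapped from a PCIe BAR; 'desc', 'wqe' and 'db_reg' are hypothetical names:
 *
 *     *desc = wqe;                       // descriptor in host memory
 *     ucs_memory_bus_store_fence();      // dmb oshst: order it before the doorbell
 *     *db_reg = db_value;                // MMIO store to the device BAR
 */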

/* This macro flushes all pending stores from the write-combining buffer.
 * Some microarchitectures flush the stores automatically once a cache line
 * is full, in which case no additional barrier is needed.
 */
#if defined(HAVE_AARCH64_THUNDERX2)
#define ucs_memory_bus_cacheline_wc_flush()
#else
/* The macro is used to flush stores to Normal NC or Device memory */
#define ucs_memory_bus_cacheline_wc_flush()     ucs_aarch64_dmb(oshst)
#endif

#define ucs_memory_cpu_fence()        ucs_aarch64_dmb(ish)
#define ucs_memory_cpu_store_fence()  ucs_aarch64_dmb(ishst)
#define ucs_memory_cpu_load_fence()   ucs_aarch64_dmb(ishld)
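
/* Illustrative sketch: the inner-shareable fences above provide the usual
 * producer/consumer ordering between CPUs ('data' and 'flag' are hypothetical):
 *
 *     // producer:                       // consumer:
 *     data = value;                      while (!flag) ;
 *     ucs_memory_cpu_store_fence();      ucs_memory_cpu_load_fence();
 *     flag = 1;                          consume(data);
 */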

/* This macro serializes stores to Normal NC or Device memory
 * (see Arm Architecture Reference Manual, B2.7.2)
 */
#define ucs_memory_cpu_wc_fence()     ucs_aarch64_dmb(oshst)
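
/* Illustrative sketch: when staging data through a write-combining BAR
 * mapping, ucs_memory_bus_cacheline_wc_flush() can be issued after each
 * cache line and ucs_memory_cpu_wc_fence() once before the final doorbell;
 * 'bar_dst', 'src' and 'db_reg' are hypothetical names:
 *
 *     memcpy(bar_dst, src, UCS_ARCH_CACHE_LINE_SIZE);
 *     ucs_memory_bus_cacheline_wc_flush();   // no-op on ThunderX2
 *     ucs_memory_cpu_wc_fence();             // push WC data out before the doorbell
 *     *db_reg = db_value;
 */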


/*
 * ARM processor ID (Main ID Register, MIDR_EL1)
 */
typedef struct ucs_aarch64_cpuid {
    int       implementer;
    int       architecture;
    int       variant;
    int       part;
    int       revision;
} ucs_aarch64_cpuid_t;


/**
 * Get ARM CPU identifier and version
 */
void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid);
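
/* Illustrative sketch (hypothetical check): the implementer field holds the
 * MIDR implementer code, e.g. 0x41 for Arm Ltd.:
 *
 *     ucs_aarch64_cpuid_t cpuid;
 *     ucs_aarch64_cpuid(&cpuid);
 *     if (cpuid.implementer == 0x41) {
 *         // an Arm Ltd. design; part/variant/revision identify the core
 *     }
 */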


#if HAVE_HW_TIMER
static inline uint64_t ucs_arch_read_hres_clock(void)
{
    uint64_t ticks;
    asm volatile("isb" : : : "memory");
    asm volatile("mrs %0, cntvct_el0" : "=r" (ticks));
    return ticks;
}

static inline double ucs_arch_get_clocks_per_sec()
{
    uint64_t freq;
    asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
    return (double) freq;
}

#else

#define ucs_arch_read_hres_clock ucs_arch_generic_read_hres_clock
#define ucs_arch_get_clocks_per_sec ucs_arch_generic_get_clocks_per_sec

#endif
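
/* Illustrative sketch: elapsed time is the tick difference divided by the
 * counter frequency, e.g.:
 *
 *     uint64_t t0  = ucs_arch_read_hres_clock();
 *     ...                                        // measured region
 *     uint64_t t1  = ucs_arch_read_hres_clock();
 *     double   sec = (t1 - t0) / ucs_arch_get_clocks_per_sec();
 */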

static inline ucs_cpu_model_t ucs_arch_get_cpu_model()
{
    return UCS_CPU_MODEL_ARM_AARCH64;
}

static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor()
{
    return UCS_CPU_VENDOR_GENERIC_ARM;
}

static inline int ucs_arch_get_cpu_flag()
{
    return UCS_CPU_FLAG_UNKNOWN;
}

static inline void ucs_cpu_init()
{
}

/* Wait for a write to the given memory location: LDXRB arms the exclusive
 * monitor for the address and WFE suspends the CPU until an event arrives,
 * such as a store to that location by another CPU clearing the monitor.
 */
static inline void ucs_arch_wait_mem(void *address)
{
    unsigned long tmp;
    asm volatile ("ldxrb %w0, %1 \n"
                  "wfe           \n"
                  : "=&r"(tmp)
                  : "Q"(*(volatile char*)address));
}
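
/* Illustrative sketch of a waiter/waker pair built on ucs_arch_wait_mem();
 * the name 'flag' is hypothetical:
 *
 *     // waiter:
 *     while (*(volatile char*)flag == 0) {
 *         ucs_arch_wait_mem(flag);
 *     }
 *
 *     // waker (another thread): this store wakes the waiter from WFE
 *     *(volatile char*)flag = 1;
 */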

#if !HAVE___CLEAR_CACHE
static inline void ucs_arch_clear_cache(void *start, void *end)
{
#if HAVE___AARCH64_SYNC_CACHE_RANGE
    /* do not allow global declaration of compiler intrinsic */
    void __aarch64_sync_cache_range(void* beg, void* end);

    __aarch64_sync_cache_range(start, end);
#else
    uintptr_t ptr;
    unsigned icache;
    unsigned dcache;
    unsigned ctr_el0;

    /* Get cache line size, using ctr_el0 register
     *
     * Bits    Name      Function
     * *****************************
     * [31]    -         Reserved, res1.
     * [30:28] -         Reserved, res0.
     * [27:24] CWG       Cache Write-Back granule. Log2 of the number of words of the
     *                   maximum size of memory that can be overwritten as a result of
     *                   the eviction of a cache entry that has had a memory location
     *                   in it modified:
     *                   0x4
     *                   Cache Write-Back granule size is 16 words.
     * [23:20] ERG       Exclusives Reservation Granule. Log2 of the number of words of
     *                   the maximum size of the reservation granule that has been
     *                   implemented for the Load-Exclusive and Store-Exclusive instructions:
     *                   0x4
     *                   Exclusive reservation granule size is 16 words.
     * [19:16] DminLine  Log2 of the number of words in the smallest cache line of all the
     *                   data and unified caches that the processor controls:
     *                   0x4
     *                   Smallest data cache line size is 16 words.
     * [15:14] L1Ip      L1 Instruction cache policy. Indicates the indexing and tagging
     *                   policy for the L1 Instruction cache:
     *                   0b10
     *                   Virtually Indexed Physically Tagged (VIPT).
     * [13:4]  -         Reserved, res0.
     * [3:0]   IminLine  Log2 of the number of words in the smallest cache line of all
     *                   the instruction caches that the processor controls.
     *                   0x4
     *                   Smallest instruction cache line size is 16 words.
     */
    asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0));
    icache = sizeof(int) << (ctr_el0 & 0xf);
    dcache = sizeof(int) << ((ctr_el0 >> 16) & 0xf);

    for (ptr = ucs_align_down((uintptr_t)start, dcache); ptr < (uintptr_t)end; ptr += dcache) {
        asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory");
    }
    ucs_aarch64_dsb(ish);

    for (ptr = ucs_align_down((uintptr_t)start, icache); ptr < (uintptr_t)end; ptr += icache) {
        asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory");
    }
    ucs_aarch64_dsb(ish);
    ucs_aarch64_isb();
#endif
}
#endif
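
/* Illustrative sketch: ucs_arch_clear_cache() makes freshly written
 * instructions (e.g. generated or patched code) visible to the instruction
 * stream before they are executed; 'code_buf' and 'len' are hypothetical:
 *
 *     memcpy(code_buf, new_insns, len);
 *     ucs_arch_clear_cache(code_buf, (char*)code_buf + len);
 *     ((void (*)(void))code_buf)();   // safe to execute after the sync
 */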

static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
    return memcpy(dst, src, len);
}

static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
    return UCS_ERR_UNSUPPORTED;
}

END_C_DECLS

#endif