/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2016-2019. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCS_AARCH64_CPU_H_
#define UCS_AARCH64_CPU_H_

#include "config.h"
#include <time.h>
#include <string.h>
#include <sys/times.h>
#include <ucs/sys/compiler_def.h>
#include <ucs/arch/generic/cpu.h>
#include <ucs/sys/math.h>
#include <ucs/type/status.h>
#if __ARM_NEON
#include <arm_neon.h>
#endif


#define UCS_ARCH_CACHE_LINE_SIZE 64

BEGIN_C_DECLS

/** @file cpu.h */

/**
 * Assume the worst - weak memory ordering.
 */

#define ucs_aarch64_dmb(_op)          asm volatile ("dmb " #_op ::: "memory")
#define ucs_aarch64_isb(_op)          asm volatile ("isb " #_op ::: "memory")
#define ucs_aarch64_dsb(_op)          asm volatile ("dsb " #_op ::: "memory")

/* These macros serialize stores across Normal NC (or Device) and WB memory
 * (see Arm Spec, B2.7.2), based on recent changes in the Linux kernel:
 * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22ec71615d824f4f11d38d0e55a88d8956b7e45f
 *
 * The underlying barrier code was changed to use the lighter-weight DMB
 * instead of DSB. The barrier is used to synchronize accesses between
 * write-back and device-mapped memory (e.g. a PCIe BAR).
 */
#define ucs_memory_bus_fence()        ucs_aarch64_dmb(oshsy)
#define ucs_memory_bus_store_fence()  ucs_aarch64_dmb(oshst)
#define ucs_memory_bus_load_fence()   ucs_aarch64_dmb(oshld)

/* This macro flushes all pending stores from the write-combining buffer.
 * Some microarchitectures flush the stores automatically once a cache line
 * is full, in which case no additional barrier is needed.
 */
#if defined(HAVE_AARCH64_THUNDERX2)
#define ucs_memory_bus_cacheline_wc_flush()
#else
/* The macro is used to flush stores to Normal NC or Device memory */
#define ucs_memory_bus_cacheline_wc_flush()     ucs_aarch64_dmb(oshst)
#endif

#define ucs_memory_cpu_fence()        ucs_aarch64_dmb(ish)
#define ucs_memory_cpu_store_fence()  ucs_aarch64_dmb(ishst)
#define ucs_memory_cpu_load_fence()   ucs_aarch64_dmb(ishld)

/* The macro is used to serialize stores to Normal NC or Device memory
 * (see Arm Spec, B2.7.2)
 */
#define ucs_memory_cpu_wc_fence()     ucs_aarch64_dmb(oshst)


/*
 * ARM processor ID (ARM ISA - Main ID Register, EL1)
 */
typedef struct ucs_aarch64_cpuid {
    int implementer;
    int architecture;
    int variant;
    int part;
    int revision;
} ucs_aarch64_cpuid_t;


/**
 * Get ARM CPU identifier and version
 */
void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid);


#if HAVE_HW_TIMER
static inline uint64_t ucs_arch_read_hres_clock(void)
{
    uint64_t ticks;
    asm volatile("isb" : : : "memory");
    asm volatile("mrs %0, cntvct_el0" : "=r" (ticks));
    return ticks;
}

static inline double ucs_arch_get_clocks_per_sec()
{
    uint64_t freq;
    asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
    return (double) freq;
}
#else
#define ucs_arch_read_hres_clock    ucs_arch_generic_read_hres_clock
#define ucs_arch_get_clocks_per_sec ucs_arch_generic_get_clocks_per_sec
#endif

static inline ucs_cpu_model_t ucs_arch_get_cpu_model()
{
    return UCS_CPU_MODEL_ARM_AARCH64;
}

static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor()
{
    return UCS_CPU_VENDOR_GENERIC_ARM;
}

static inline int ucs_arch_get_cpu_flag()
{
    return UCS_CPU_FLAG_UNKNOWN;
}

static inline void ucs_cpu_init()
{
}

static inline void ucs_arch_wait_mem(void *address)
{
    unsigned long tmp;
    asm volatile ("ldxrb %w0, %1 \n"
                  "wfe           \n"
                  : "=&r"(tmp)
                  : "Q"(address));
}
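/*
 * Illustrative sketch (not part of this header's API; "payload",
 * "doorbell_reg" and "value" are hypothetical). A producer posting a work
 * request to device-mapped memory (e.g. a PCIe BAR) might order its payload
 * stores ahead of the doorbell store with the bus store fence defined above:
 *
 *     payload->data = value;
 *     ucs_memory_bus_store_fence();   // make payload visible before doorbell
 *     *doorbell_reg = 1;
 */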
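/*
 * Illustrative sketch (hypothetical helper, not part of this header): a
 * waiter polling a byte flag can pair ucs_arch_wait_mem() with a re-check,
 * since the exclusively-loaded value is discarded and WFE may wake before
 * the flag actually changes:
 *
 *     static void example_wait_for_flag(volatile uint8_t *flag)
 *     {
 *         while (*flag == 0) {
 *             ucs_arch_wait_mem((void*)flag); // arm monitor, then WFE
 *         }
 *         ucs_memory_cpu_load_fence();        // order dependent loads
 *     }
 */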
#if !HAVE___CLEAR_CACHE
static inline void ucs_arch_clear_cache(void *start, void *end)
{
#if HAVE___AARCH64_SYNC_CACHE_RANGE
    /* do not allow global declaration of compiler intrinsic */
    void __aarch64_sync_cache_range(void* beg, void* end);

    __aarch64_sync_cache_range(start, end);
#else
    uintptr_t ptr;
    unsigned icache;
    unsigned dcache;
    unsigned ctr_el0;

    /* Get cache line size, using the ctr_el0 register
     *
     * Bits    Name      Function
     * *****************************
     * [31]    -         Reserved, res1.
     * [30:28] -         Reserved, res0.
     * [27:24] CWG       Cache Write-Back granule. Log2 of the number of words
     *                   of the maximum size of memory that can be overwritten
     *                   as a result of the eviction of a cache entry that has
     *                   had a memory location in it modified:
     *                   0x4
     *                   Cache Write-Back granule size is 16 words.
     * [23:20] ERG       Exclusives Reservation Granule. Log2 of the number of
     *                   words of the maximum size of the reservation granule
     *                   that has been implemented for the Load-Exclusive and
     *                   Store-Exclusive instructions:
     *                   0x4
     *                   Exclusive reservation granule size is 16 words.
     * [19:16] DminLine  Log2 of the number of words in the smallest cache line
     *                   of all the data and unified caches that the processor
     *                   controls:
     *                   0x4
     *                   Smallest data cache line size is 16 words.
     * [15:14] L1Ip      L1 Instruction cache policy. Indicates the indexing
     *                   and tagging policy for the L1 Instruction cache:
     *                   0b10
     *                   Virtually Indexed Physically Tagged (VIPT).
     * [13:4]  -         Reserved, res0.
     * [3:0]   IminLine  Log2 of the number of words in the smallest cache line
     *                   of all the instruction caches that the processor
     *                   controls.
     *                   0x4
     *                   Smallest instruction cache line size is 16 words.
     *
     * A word is 4 bytes, so e.g. IminLine == 0x4 yields sizeof(int) << 4 ==
     * 64-byte instruction cache lines; icache and dcache are computed this
     * way below.
     */
    asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0));
    icache = sizeof(int) << (ctr_el0 & 0xf);
    dcache = sizeof(int) << ((ctr_el0 >> 16) & 0xf);

    /* Clean data cache lines covering [start, end) to the point of
     * unification */
    for (ptr = ucs_align_down((uintptr_t)start, dcache);
         ptr < (uintptr_t)end; ptr += dcache) {
        asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory");
    }
    ucs_aarch64_dsb(ish);

    /* Invalidate the corresponding instruction cache lines */
    for (ptr = ucs_align_down((uintptr_t)start, icache);
         ptr < (uintptr_t)end; ptr += icache) {
        asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory");
    }
    ucs_aarch64_dsb(ish);
    ucs_aarch64_isb();
#endif
}
#endif

static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
    return memcpy(dst, src, len);
}

static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
    return UCS_ERR_UNSUPPORTED;
}

END_C_DECLS

#endif