/** * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ #ifndef UCT_IB_MD_H_ #define UCT_IB_MD_H_ #include "ib_device.h" #include #include #include #include #define UCT_IB_MD_MAX_MR_SIZE 0x80000000UL #define UCT_IB_MD_PACKED_RKEY_SIZE sizeof(uint64_t) #define UCT_IB_MD_DEFAULT_GID_INDEX 0 /**< The gid index used by default for an IB/RoCE port */ #define UCT_IB_MEM_ACCESS_FLAGS (IBV_ACCESS_LOCAL_WRITE | \ IBV_ACCESS_REMOTE_WRITE | \ IBV_ACCESS_REMOTE_READ | \ IBV_ACCESS_REMOTE_ATOMIC) #define UCT_IB_MEM_DEREG 0 /** * IB MD statistics counters */ enum { UCT_IB_MD_STAT_MEM_ALLOC, UCT_IB_MD_STAT_MEM_REG, UCT_IB_MD_STAT_LAST }; enum { UCT_IB_MEM_FLAG_ODP = UCS_BIT(0), /**< The memory region has on demand paging enabled */ UCT_IB_MEM_FLAG_ATOMIC_MR = UCS_BIT(1), /**< The memory region has UMR for the atomic access */ UCT_IB_MEM_ACCESS_REMOTE_ATOMIC = UCS_BIT(2), /**< An atomic access was requested for the memory region */ UCT_IB_MEM_MULTITHREADED = UCS_BIT(3), /**< The memory region registration handled by chunks in parallel threads */ }; enum { UCT_IB_DEVX_OBJ_RCQP, UCT_IB_DEVX_OBJ_RCSRQ, UCT_IB_DEVX_OBJ_DCT, UCT_IB_DEVX_OBJ_DCSRQ }; typedef struct uct_ib_md_ext_config { int eth_pause; /**< Whether or not Pause Frame is enabled on the Ethernet network */ int prefer_nearest_device; /**< Give priority for near device */ int enable_indirect_atomic; /** Enable indirect atomic */ int enable_gpudirect_rdma; /** Enable GPUDirect RDMA */ #if HAVE_EXP_UMR unsigned max_inline_klm_list; /* Maximal length of inline KLM list */ #endif struct { ucs_numa_policy_t numa_policy; /**< NUMA policy flags for ODP */ int prefetch; /**< Auto-prefetch non-blocking memory registrations / allocations */ size_t max_size; /**< Maximal memory region size for ODP */ } odp; size_t gid_index; /**< IB GID index to use */ size_t min_mt_reg; /**< Multi-threaded registration threshold */ size_t mt_reg_chunk; /**< Multi-threaded registration chunk */ int mt_reg_bind; /**< Multi-threaded registration bind to core */ } uct_ib_md_ext_config_t; typedef struct uct_ib_mem { uint32_t lkey; uint32_t rkey; uint32_t atomic_rkey; uint32_t flags; } uct_ib_mem_t; /** * IB memory domain. */ typedef struct uct_ib_md { uct_md_t super; ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ uct_mem_h global_odp;/**< Implicit ODP memory handle */ struct ibv_pd *pd; /**< IB memory domain */ uct_ib_device_t dev; /**< IB device */ uct_linear_growth_t reg_cost; /**< Memory registration cost */ struct uct_ib_md_ops *ops; UCS_STATS_NODE_DECLARE(stats) uct_ib_md_ext_config_t config; /* IB external configuration */ struct { uct_ib_device_spec_t *specs; /* Custom device specifications */ unsigned count; /* Number of custom devices */ } custom_devices; int check_subnet_filter; uint64_t subnet_filter; double pci_bw; } uct_ib_md_t; /** * IB memory domain configuration. */ typedef struct uct_ib_md_config { uct_md_config_t super; /** List of registration methods in order of preference */ UCS_CONFIG_STRING_ARRAY_FIELD(rmtd) reg_methods; uct_md_rcache_config_t rcache; /**< Registration cache config */ uct_linear_growth_t uc_reg_cost; /**< Memory registration cost estimation without using the cache */ unsigned fork_init; /**< Use ibv_fork_init() */ int async_events; /**< Whether async events should be delivered */ uct_ib_md_ext_config_t ext; /**< External configuration */ UCS_CONFIG_STRING_ARRAY_FIELD(spec) custom_devices; /**< Custom device specifications */ char *subnet_prefix; /**< Filter of subnet_prefix for IB ports */ UCS_CONFIG_ARRAY_FIELD(ucs_config_bw_spec_t, device) pci_bw; /**< List of PCI BW for devices */ unsigned devx; /**< DEVX support */ unsigned devx_objs; /**< Objects to be created by DevX */ } uct_ib_md_config_t; /** * Memory domain constructor. * * @param [in] ibv_device IB device. * * @param [in] md_config Memory domain configuration parameters. * * @param [out] md_p Handle to memory domain. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_open_func_t)(struct ibv_device *ibv_device, const uct_ib_md_config_t *md_config, struct uct_ib_md **md_p); /** * Memory domain destructor. * * @param [in] md Memory domain. */ typedef void (*uct_ib_md_cleanup_func_t)(struct uct_ib_md *); /** * Memory domain method to register memory area. * * @param [in] md Memory domain. * * @param [in] address Memory area start address. * * @param [in] length Memory area length. * * @param [in] access IB verbs registration access flags * * @param [in] memh Memory region handle. * Method should initialize lkey & rkey. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_reg_key_func_t)(struct uct_ib_md *md, void *address, size_t length, uint64_t access, uct_ib_mem_t *memh); /** * Memory domain method to deregister memory area. * * @param [in] md Memory domain. * * @param [in] memh Memory region handle registered with * uct_ib_md_reg_key_func_t. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_dereg_key_func_t)(struct uct_ib_md *md, uct_ib_mem_t *memh); /** * Memory domain method to register memory area optimized for atomic ops. * * @param [in] md Memory domain. * * @param [in] memh Memory region handle registered for regular ops. * Method should initialize atomic_rkey * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_reg_atomic_key_func_t)(struct uct_ib_md *md, uct_ib_mem_t *memh); /** * Memory domain method to release resources registered for atomic ops. * * @param [in] md Memory domain. * * @param [in] memh Memory region handle registered with * uct_ib_md_reg_atomic_key_func_t. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_dereg_atomic_key_func_t)(struct uct_ib_md *md, uct_ib_mem_t *memh); /** * Memory domain method to register memory area using multiple threads. * * @param [in] md Memory domain. * * @param [in] address Memory area start address. * * @param [in] length Memory area length. * * @param [in] access IB verbs registration access flags * * @param [in] memh Memory region handle. * Method should initialize lkey & rkey. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_reg_multithreaded_func_t)(uct_ib_md_t *md, void *address, size_t length, uint64_t access, uct_ib_mem_t *memh); /** * Memory domain method to deregister memory area. * * @param [in] md Memory domain. * * @param [in] memh Memory region handle registered with * uct_ib_md_reg_key_func_t. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_dereg_multithreaded_func_t)(uct_ib_md_t *md, uct_ib_mem_t *memh); /** * Memory domain method to prefetch physical memory for virtual memory area. * * @param [in] md Memory domain. * * @param [in] memh Memory region handle. * * @param [in] address Memory area start address. * * @param [in] length Memory area length. * * @return UCS_OK on success or error code in case of failure. */ typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md, uct_ib_mem_t *memh, void *addr, size_t length); typedef struct uct_ib_md_ops { uct_ib_md_open_func_t open; uct_ib_md_cleanup_func_t cleanup; size_t memh_struct_size; uct_ib_md_reg_key_func_t reg_key; uct_ib_md_dereg_key_func_t dereg_key; uct_ib_md_reg_atomic_key_func_t reg_atomic_key; uct_ib_md_dereg_atomic_key_func_t dereg_atomic_key; uct_ib_md_reg_multithreaded_func_t reg_multithreaded; uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; uct_ib_md_mem_prefetch_func_t mem_prefetch; } uct_ib_md_ops_t; /** * IB memory region in the registration cache. */ typedef struct uct_ib_rcache_region { ucs_rcache_region_t super; uct_ib_mem_t memh; /**< mr exposed to the user as the memh */ } uct_ib_rcache_region_t; /** * IB memory domain constructor. Should have following logic: * - probe provided IB device, may return UCS_ERR_UNSUPPORTED * - allocate MD and IB context * - setup atomic MR ops * - determine device attributes and flags */ typedef struct uct_ib_md_ops_entry { ucs_list_link_t list; const char *name; uct_ib_md_ops_t *ops; int priority; } uct_ib_md_ops_entry_t; #define UCT_IB_MD_OPS(_md_ops, _priority) \ UCS_STATIC_INIT { \ extern ucs_list_link_t uct_ib_md_ops_list; \ static uct_ib_md_ops_entry_t *p, entry = { \ .name = UCS_PP_MAKE_STRING(_md_ops), \ .ops = &_md_ops, \ .priority = _priority, \ }; \ ucs_list_for_each(p, &uct_ib_md_ops_list, list) { \ if (p->priority < _priority) { \ ucs_list_insert_before(&p->list, &entry.list); \ return; \ } \ } \ ucs_list_add_tail(&uct_ib_md_ops_list, &entry.list); \ } extern uct_component_t uct_ib_component; static inline uint32_t uct_ib_md_direct_rkey(uct_rkey_t uct_rkey) { return (uint32_t)uct_rkey; } static uint32_t uct_ib_md_indirect_rkey(uct_rkey_t uct_rkey) { return uct_rkey >> 32; } static UCS_F_ALWAYS_INLINE void uct_ib_md_pack_rkey(uint32_t rkey, uint32_t atomic_rkey, void *rkey_buffer) { uint64_t *rkey_p = (uint64_t*)rkey_buffer; *rkey_p = (((uint64_t)atomic_rkey) << 32) | rkey; ucs_trace("packed rkey: direct 0x%x indirect 0x%x", rkey, atomic_rkey); } /** * rkey is packed/unpacked is such a way that: * low 32 bits contain a direct key * high 32 bits contain either UCT_IB_INVALID_RKEY or a valid indirect key. */ static inline uint32_t uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey, uint16_t atomic_mr_offset, uint64_t *remote_addr_p) { uint32_t atomic_rkey = uct_ib_md_indirect_rkey(uct_rkey); if (atomic_rkey == UCT_IB_INVALID_RKEY) { return uct_ib_md_direct_rkey(uct_rkey); } else { *remote_addr_p += atomic_mr_offset; return atomic_rkey; } } static inline uint16_t uct_ib_md_atomic_offset(uint8_t atomic_mr_id) { return 8 * atomic_mr_id; } static inline void uct_ib_memh_init_from_mr(uct_ib_mem_t *memh, struct ibv_mr *mr) { memh->lkey = mr->lkey; memh->rkey = mr->rkey; } ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *uct_md_config, uct_md_h *md_p); ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md, struct ibv_device *ib_device, const uct_ib_md_config_t *md_config); void uct_ib_md_close(uct_md_h uct_md); ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t access, struct ibv_mr **mr_p); ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr); ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num); ucs_status_t uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address, size_t length, uint64_t access, size_t chunk, struct ibv_mr **mrs); #endif