Blob Blame History Raw
/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2016.  ALL RIGHTS RESERVED.
 * Copyright (C) The University of Tennessee and The University
 *               of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_IB_MD_H_
#define UCT_IB_MD_H_

#include "ib_device.h"

#include <uct/base/uct_md.h>
#include <ucs/stats/stats.h>
#include <ucs/memory/numa.h>
#include <ucs/memory/rcache.h>

#define UCT_IB_MD_MAX_MR_SIZE       0x80000000UL
#define UCT_IB_MD_PACKED_RKEY_SIZE  sizeof(uint64_t)

#define UCT_IB_MD_DEFAULT_GID_INDEX 0   /**< The gid index used by default for an IB/RoCE port */

#define UCT_IB_MEM_ACCESS_FLAGS  (IBV_ACCESS_LOCAL_WRITE | \
                                  IBV_ACCESS_REMOTE_WRITE | \
                                  IBV_ACCESS_REMOTE_READ | \
                                  IBV_ACCESS_REMOTE_ATOMIC)

#define UCT_IB_MEM_DEREG          0

/**
 * IB MD statistics counters
 */
enum {
    UCT_IB_MD_STAT_MEM_ALLOC,
    UCT_IB_MD_STAT_MEM_REG,
    UCT_IB_MD_STAT_LAST
};


enum {
    UCT_IB_MEM_FLAG_ODP             = UCS_BIT(0), /**< The memory region has on
                                                       demand paging enabled */
    UCT_IB_MEM_FLAG_ATOMIC_MR       = UCS_BIT(1), /**< The memory region has UMR
                                                       for the atomic access */
    UCT_IB_MEM_ACCESS_REMOTE_ATOMIC = UCS_BIT(2), /**< An atomic access was
                                                       requested for the memory
                                                       region */
    UCT_IB_MEM_MULTITHREADED        = UCS_BIT(3), /**< The memory region registration
                                                       handled by chunks in parallel
                                                       threads */
};

enum {
    UCT_IB_DEVX_OBJ_RCQP,
    UCT_IB_DEVX_OBJ_RCSRQ,
    UCT_IB_DEVX_OBJ_DCT,
    UCT_IB_DEVX_OBJ_DCSRQ
};

typedef struct uct_ib_md_ext_config {
    int                      eth_pause;    /**< Whether or not Pause Frame is
                                                enabled on the Ethernet network */
    int                      prefer_nearest_device; /**< Give priority for near
                                                         device */
    int                      enable_indirect_atomic; /** Enable indirect atomic */
    int                      enable_gpudirect_rdma; /** Enable GPUDirect RDMA */
#if HAVE_EXP_UMR
    unsigned                 max_inline_klm_list; /* Maximal length of inline KLM list */
#endif

    struct {
        ucs_numa_policy_t    numa_policy;  /**< NUMA policy flags for ODP */
        int                  prefetch;     /**< Auto-prefetch non-blocking memory
                                                registrations / allocations */
        size_t               max_size;     /**< Maximal memory region size for ODP */
    } odp;

    size_t                   gid_index;    /**< IB GID index to use  */

    size_t                   min_mt_reg;   /**< Multi-threaded registration threshold */
    size_t                   mt_reg_chunk; /**< Multi-threaded registration chunk */
    int                      mt_reg_bind;  /**< Multi-threaded registration bind to core */
} uct_ib_md_ext_config_t;


typedef struct uct_ib_mem {
    uint32_t                lkey;
    uint32_t                rkey;
    uint32_t                atomic_rkey;
    uint32_t                flags;
} uct_ib_mem_t;

/**
 * IB memory domain.
 */
typedef struct uct_ib_md {
    uct_md_t                 super;
    ucs_rcache_t             *rcache;   /**< Registration cache (can be NULL) */
    uct_mem_h                global_odp;/**< Implicit ODP memory handle */
    struct ibv_pd            *pd;       /**< IB memory domain */
    uct_ib_device_t          dev;       /**< IB device */
    uct_linear_growth_t      reg_cost;  /**< Memory registration cost */
    struct uct_ib_md_ops     *ops;
    UCS_STATS_NODE_DECLARE(stats)
    uct_ib_md_ext_config_t   config;    /* IB external configuration */
    struct {
        uct_ib_device_spec_t *specs;    /* Custom device specifications */
        unsigned             count;     /* Number of custom devices */
    } custom_devices;
    int                      check_subnet_filter;
    uint64_t                 subnet_filter;
    double                   pci_bw;
} uct_ib_md_t;


/**
 * IB memory domain configuration.
 */
typedef struct uct_ib_md_config {
    uct_md_config_t          super;

    /** List of registration methods in order of preference */
    UCS_CONFIG_STRING_ARRAY_FIELD(rmtd) reg_methods;

    uct_md_rcache_config_t   rcache;       /**< Registration cache config */
    uct_linear_growth_t      uc_reg_cost;  /**< Memory registration cost estimation
                                                without using the cache */
    unsigned                 fork_init;    /**< Use ibv_fork_init() */
    int                      async_events; /**< Whether async events should be delivered */

    uct_ib_md_ext_config_t   ext;          /**< External configuration */

    UCS_CONFIG_STRING_ARRAY_FIELD(spec) custom_devices; /**< Custom device specifications */

    char                     *subnet_prefix; /**< Filter of subnet_prefix for IB ports */

    UCS_CONFIG_ARRAY_FIELD(ucs_config_bw_spec_t, device) pci_bw; /**< List of PCI BW for devices */

    unsigned                 devx;         /**< DEVX support */
    unsigned                 devx_objs;    /**< Objects to be created by DevX */
} uct_ib_md_config_t;

/**
 * Memory domain constructor.
 *
 * @param [in]  ibv_device    IB device.
 *
 * @param [in]  md_config     Memory domain configuration parameters.
 *
 * @param [out] md_p          Handle to memory domain.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_open_func_t)(struct ibv_device *ibv_device,
                                              const uct_ib_md_config_t *md_config,
                                              struct uct_ib_md **md_p);

/**
 * Memory domain destructor.
 *
 * @param [in]  md      Memory domain.
 */
typedef void (*uct_ib_md_cleanup_func_t)(struct uct_ib_md *);

/**
 * Memory domain method to register memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  address Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @param [in]  access  IB verbs registration access flags
 *
 * @param [in]  memh    Memory region handle.
 *                      Method should initialize lkey & rkey.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_key_func_t)(struct uct_ib_md *md,
                                                 void *address, size_t length,
                                                 uint64_t access,
                                                 uct_ib_mem_t *memh);

/**
 * Memory domain method to deregister memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered with
 *                      uct_ib_md_reg_key_func_t.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_key_func_t)(struct uct_ib_md *md,
                                                   uct_ib_mem_t *memh);

/**
 * Memory domain method to register memory area optimized for atomic ops.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered for regular ops.
 *                      Method should initialize atomic_rkey
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_atomic_key_func_t)(struct uct_ib_md *md,
                                                        uct_ib_mem_t *memh);

/**
 * Memory domain method to release resources registered for atomic ops.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered with
 *                      uct_ib_md_reg_atomic_key_func_t.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_atomic_key_func_t)(struct uct_ib_md *md,
                                                          uct_ib_mem_t *memh);

/**
 * Memory domain method to register memory area using multiple threads.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  address Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @param [in]  access  IB verbs registration access flags
 *
 * @param [in]  memh    Memory region handle.
 *                      Method should initialize lkey & rkey.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_multithreaded_func_t)(uct_ib_md_t *md,
                                                           void *address,
                                                           size_t length,
                                                           uint64_t access,
                                                           uct_ib_mem_t *memh);

/**
 * Memory domain method to deregister memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered with
 *                      uct_ib_md_reg_key_func_t.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_multithreaded_func_t)(uct_ib_md_t *md,
                                                             uct_ib_mem_t *memh);

/**
 * Memory domain method to prefetch physical memory for virtual memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle.
 *
 * @param [in]  address Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md,
                                                      uct_ib_mem_t *memh,
                                                      void *addr, size_t length);

typedef struct uct_ib_md_ops {
    uct_ib_md_open_func_t                open;
    uct_ib_md_cleanup_func_t             cleanup;

    size_t                               memh_struct_size;

    uct_ib_md_reg_key_func_t             reg_key;
    uct_ib_md_dereg_key_func_t           dereg_key;
    uct_ib_md_reg_atomic_key_func_t      reg_atomic_key;
    uct_ib_md_dereg_atomic_key_func_t    dereg_atomic_key;
    uct_ib_md_reg_multithreaded_func_t   reg_multithreaded;
    uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded;
    uct_ib_md_mem_prefetch_func_t        mem_prefetch;
} uct_ib_md_ops_t;


/**
 * IB memory region in the registration cache.
 */
typedef struct uct_ib_rcache_region {
    ucs_rcache_region_t  super;
    uct_ib_mem_t         memh;      /**<  mr exposed to the user as the memh */
} uct_ib_rcache_region_t;


/**
 * IB memory domain constructor. Should have following logic:
 * - probe provided IB device, may return UCS_ERR_UNSUPPORTED
 * - allocate MD and IB context
 * - setup atomic MR ops
 * - determine device attributes and flags
 */
typedef struct uct_ib_md_ops_entry {
    ucs_list_link_t             list;
    const char                  *name;
    uct_ib_md_ops_t             *ops;
    int                         priority;
} uct_ib_md_ops_entry_t;

#define UCT_IB_MD_OPS(_md_ops, _priority) \
    UCS_STATIC_INIT { \
        extern ucs_list_link_t uct_ib_md_ops_list; \
        static uct_ib_md_ops_entry_t *p, entry = { \
            .name     = UCS_PP_MAKE_STRING(_md_ops), \
            .ops      = &_md_ops, \
            .priority = _priority, \
        }; \
        ucs_list_for_each(p, &uct_ib_md_ops_list, list) { \
            if (p->priority < _priority) { \
                ucs_list_insert_before(&p->list, &entry.list); \
                return; \
            } \
        } \
        ucs_list_add_tail(&uct_ib_md_ops_list, &entry.list); \
    }

extern uct_component_t uct_ib_component;


static inline uint32_t uct_ib_md_direct_rkey(uct_rkey_t uct_rkey)
{
    return (uint32_t)uct_rkey;
}


static uint32_t uct_ib_md_indirect_rkey(uct_rkey_t uct_rkey)
{
    return uct_rkey >> 32;
}


static UCS_F_ALWAYS_INLINE void
uct_ib_md_pack_rkey(uint32_t rkey, uint32_t atomic_rkey, void *rkey_buffer)
{
    uint64_t *rkey_p = (uint64_t*)rkey_buffer;
    *rkey_p = (((uint64_t)atomic_rkey) << 32) | rkey;
     ucs_trace("packed rkey: direct 0x%x indirect 0x%x", rkey, atomic_rkey);
}


/**
 * rkey is packed/unpacked is such a way that:
 * low  32 bits contain a direct key
 * high 32 bits contain either UCT_IB_INVALID_RKEY or a valid indirect key.
 */
static inline uint32_t uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey,
                                                  uint16_t atomic_mr_offset,
                                                  uint64_t *remote_addr_p)
{
    uint32_t atomic_rkey = uct_ib_md_indirect_rkey(uct_rkey);
    if (atomic_rkey == UCT_IB_INVALID_RKEY) {
        return uct_ib_md_direct_rkey(uct_rkey);
    } else {
        *remote_addr_p += atomic_mr_offset;
        return atomic_rkey;
    }
}


static inline uint16_t uct_ib_md_atomic_offset(uint8_t atomic_mr_id)
{
    return 8 * atomic_mr_id;
}

static inline void uct_ib_memh_init_from_mr(uct_ib_mem_t *memh, struct ibv_mr *mr)
{
    memh->lkey = mr->lkey;
    memh->rkey = mr->rkey;
}

ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name,
                            const uct_md_config_t *uct_md_config, uct_md_h *md_p);

ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
                                   struct ibv_device *ib_device,
                                   const uct_ib_md_config_t *md_config);

void uct_ib_md_close(uct_md_h uct_md);

ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
                           uint64_t access, struct ibv_mr **mr_p);
ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr);
ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num);

ucs_status_t
uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address,
                                       size_t length, uint64_t access,
                                       size_t chunk, struct ibv_mr **mrs);
#endif