/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/
#ifndef UCT_IB_IFACE_H
#define UCT_IB_IFACE_H
#include "ib_md.h"
#include <uct/api/uct.h>
#include <uct/base/uct_iface.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/string.h>
#include <ucs/sys/math.h>
#include <ucs/datastruct/mpool.inl>
#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u
#define UCT_IB_MAX_ATOMIC_SIZE sizeof(uint64_t)
/* Forward declarations */
typedef struct uct_ib_iface_config uct_ib_iface_config_t;
typedef struct uct_ib_iface_ops uct_ib_iface_ops_t;
typedef struct uct_ib_iface uct_ib_iface_t;
/**
* IB port/path MTU.
*/
typedef enum uct_ib_mtu {
UCT_IB_MTU_DEFAULT = 0,
UCT_IB_MTU_512 = 1,
UCT_IB_MTU_1024 = 2,
UCT_IB_MTU_2048 = 3,
UCT_IB_MTU_4096 = 4,
UCT_IB_MTU_LAST
} uct_ib_mtu_t;
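/*
 * Example (a minimal sketch, not part of this header): translating
 * uct_ib_mtu_t to the verbs MTU enum, falling back to the port's active MTU
 * for UCT_IB_MTU_DEFAULT:
 *
 *   static enum ibv_mtu uct_ib_mtu_to_ibv(uct_ib_mtu_t mtu,
 *                                         enum ibv_mtu active_mtu)
 *   {
 *       switch (mtu) {
 *       case UCT_IB_MTU_512:  return IBV_MTU_512;
 *       case UCT_IB_MTU_1024: return IBV_MTU_1024;
 *       case UCT_IB_MTU_2048: return IBV_MTU_2048;
 *       case UCT_IB_MTU_4096: return IBV_MTU_4096;
 *       default:              return active_mtu;
 *       }
 *   }
 */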
/**
* Traffic direction.
*/
typedef enum {
UCT_IB_DIR_RX,
UCT_IB_DIR_TX,
UCT_IB_DIR_NUM
} uct_ib_dir_t;
enum {
UCT_IB_QPT_UNKNOWN,
#if HAVE_DC_EXP
UCT_IB_QPT_DCI = IBV_EXP_QPT_DC_INI,
#elif HAVE_DC_DV
UCT_IB_QPT_DCI = IBV_QPT_DRIVER,
#endif
};
/**
* IB address packing flags
*/
enum {
UCT_IB_ADDRESS_PACK_FLAG_ETH = UCS_BIT(0),
UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID = UCS_BIT(1),
UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX = UCS_BIT(2)
};
struct uct_ib_iface_config {
uct_iface_config_t super;
size_t seg_size; /* Maximal size of copy-out sends */
struct {
unsigned queue_len; /* Queue length */
unsigned max_batch; /* How many fragments can be batched to one post send */
unsigned max_poll; /* How many wcs can be picked when polling tx cq */
size_t min_inline; /* Inline space to reserve for sends */
size_t inl_resp; /* Inline space to reserve for responses */
unsigned min_sge; /* How many SG entries to support */
uct_iface_mpool_config_t mp;
/* Event moderation parameters */
unsigned cq_moderation_count;
double cq_moderation_period;
} tx;
struct {
unsigned queue_len; /* Queue length */
unsigned max_batch; /* How many buffers can be batched to one post receive */
unsigned max_poll; /* How many wcs can be picked when polling rx cq */
size_t inl; /* Inline space to reserve in CQ/QP */
uct_iface_mpool_config_t mp;
/* Event moderation parameters */
unsigned cq_moderation_count;
double cq_moderation_period;
} rx;
/* Change the address type */
int addr_type;
/* Force global routing */
int is_global;
/* IB SL to use */
unsigned sl;
/* IB Traffic Class to use */
unsigned long traffic_class;
/* IB hop limit / TTL */
unsigned hop_limit;
/* Ranges of path bits */
UCS_CONFIG_ARRAY_FIELD(ucs_range_spec_t, ranges) lid_path_bits;
/* IB PKEY to use */
unsigned pkey_value;
/* Multiple resource domains */
int enable_res_domain;
};
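/*
 * These fields are filled from the environment through
 * uct_ib_iface_config_table[] (declared below). A typical table entry looks
 * like this (sketch; the exact names, defaults and help strings live in the
 * corresponding .c file):
 *
 *   {"TX_QUEUE_LEN", "256", "Length of send queue in the QP.",
 *    ucs_offsetof(uct_ib_iface_config_t, tx.queue_len), UCS_CONFIG_TYPE_UINT},
 */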
typedef struct uct_ib_qp_attr {
int qp_type;
struct ibv_qp_cap cap;
int port;
struct ibv_srq *srq;
uint32_t srq_num;
unsigned sq_sig_all;
unsigned max_inl_recv;
unsigned max_inl_resp;
#if HAVE_DECL_IBV_EXP_CREATE_QP
struct ibv_exp_qp_init_attr ibv;
#elif HAVE_DECL_IBV_CREATE_QP_EX
struct ibv_qp_init_attr_ex ibv;
#else
struct ibv_qp_init_attr ibv;
#endif
} uct_ib_qp_attr_t;
typedef ucs_status_t (*uct_ib_iface_create_cq_func_t)(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector, int ignore_overrun,
size_t *inl, struct ibv_cq **cq_p);
typedef ucs_status_t (*uct_ib_iface_arm_cq_func_t)(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
int solicited_only);
typedef void (*uct_ib_iface_event_cq_func_t)(uct_ib_iface_t *iface,
uct_ib_dir_t dir);
typedef void (*uct_ib_iface_handle_failure_func_t)(uct_ib_iface_t *iface, void *arg,
ucs_status_t status);
typedef ucs_status_t (*uct_ib_iface_set_ep_failed_func_t)(uct_ib_iface_t *iface, uct_ep_h ep,
ucs_status_t status);
struct uct_ib_iface_ops {
uct_iface_ops_t super;
uct_ib_iface_create_cq_func_t create_cq;
uct_ib_iface_arm_cq_func_t arm_cq;
uct_ib_iface_event_cq_func_t event_cq;
uct_ib_iface_handle_failure_func_t handle_failure;
uct_ib_iface_set_ep_failed_func_t set_ep_failed;
};
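/*
 * A transport provides this vtable when constructing its interface. Sketch
 * for a plain-verbs transport reusing the generic helpers declared below;
 * the uct_foo_* names are illustrative, and handle_failure/set_ep_failed are
 * always transport-specific:
 *
 *   static uct_ib_iface_ops_t uct_foo_iface_ops = {
 *       .super          = { ... uct_iface_ops_t entries ... },
 *       .create_cq      = uct_ib_verbs_create_cq,
 *       .arm_cq         = uct_ib_iface_arm_cq,
 *       .event_cq       = uct_foo_iface_event_cq,
 *       .handle_failure = uct_foo_iface_handle_failure,
 *       .set_ep_failed  = uct_foo_ep_set_failed
 *   };
 */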
struct uct_ib_iface {
uct_base_iface_t super;
struct ibv_cq *cq[UCT_IB_DIR_NUM];
struct ibv_comp_channel *comp_channel;
uct_recv_desc_t release_desc;
uint8_t *path_bits;
unsigned path_bits_count;
uint16_t pkey_index;
uint16_t pkey_value;
uint8_t addr_size;
union ibv_gid gid;
int is_roce_v2;
struct {
unsigned rx_payload_offset; /* offset from desc to payload */
unsigned rx_hdr_offset; /* offset from desc to network header */
unsigned rx_headroom_offset; /* offset from desc to user headroom */
unsigned rx_max_batch;
unsigned rx_max_poll;
unsigned tx_max_poll;
unsigned seg_size;
uint8_t max_inl_resp;
uint8_t port_num;
uint8_t sl;
uint8_t traffic_class;
uint8_t hop_limit;
uint8_t gid_index; /* IB GID index to use */
uint8_t enable_res_domain; /* Enable multiple resource domains */
uint8_t qp_type;
uint8_t force_global_addr;
size_t max_iov; /* Maximum buffers in IOV array */
} config;
uct_ib_iface_ops_t *ops;
};
typedef struct uct_ib_fence_info {
    uint16_t fence_beat; /* 16 bits are enough: by the time the counter wraps
                          * around, the older operations have already completed,
                          * since the QP size is less than 64k */
} uct_ib_fence_info_t;
enum {
UCT_IB_CQ_IGNORE_OVERRUN = UCS_BIT(0),
UCT_IB_TM_SUPPORTED = UCS_BIT(1)
};
typedef struct uct_ib_iface_init_attr {
unsigned rx_priv_len; /* Length of transport private data to reserve */
unsigned rx_hdr_len; /* Length of transport network header */
unsigned tx_cq_len; /* Send CQ length */
unsigned rx_cq_len; /* Receive CQ length */
size_t seg_size; /* Transport segment size */
unsigned fc_req_size; /* Flow control request size */
int qp_type; /* IB QP type */
int flags; /* Various flags (see enum) */
} uct_ib_iface_init_attr_t;
UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h,
const uct_iface_params_t*, const uct_ib_iface_config_t*,
const uct_ib_iface_init_attr_t*);
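/*
 * A derived transport typically fills uct_ib_iface_init_attr_t in its own
 * constructor and chains up to this base class. Sketch (the uct_foo_* names
 * and values are illustrative only):
 *
 *   uct_ib_iface_init_attr_t init_attr = {0};
 *
 *   init_attr.rx_priv_len = sizeof(uct_foo_rx_priv_t);
 *   init_attr.rx_hdr_len  = sizeof(uct_foo_hdr_t);
 *   init_attr.tx_cq_len   = config->super.tx.queue_len;
 *   init_attr.rx_cq_len   = config->super.rx.queue_len;
 *   init_attr.seg_size    = config->super.seg_size;
 *   init_attr.qp_type     = IBV_QPT_RC;
 *
 *   UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &uct_foo_iface_ops, md, worker,
 *                             params, &config->super, &init_attr);
 */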
/*
 * The offset to the payload is the maximum between user-requested headroom
 * and transport-specific data/header. When the active message callback is invoked,
 * it gets a pointer to the beginning of the headroom.
 * The headroom can be either smaller (1) or larger (2) than the transport data.
 *
 * (1)
 *
 * <rx_headroom_offset>
 *                    |
 *                    |
 * uct_recv_desc_t    |
 *                |   |
 *                |   am_callback/tag_unexp_callback
 *                |   |
 * +------+------+---+-----------+---------+
 * | LKey |  ??? | D | Head Room | Payload |
 * +------+------+---+--+--------+---------+
 * | LKey |     TL data | TL hdr | Payload |
 * +------+-------------+--------+---------+
 *                      |
 *                      post_receive
 *
 * (2)
 *             am_callback/tag_unexp_callback
 *             |
 * +------+---+------------------+---------+
 * | LKey | D |     Head Room    | Payload |
 * +------+---+-----+---+--------+---------+
 * | LKey | TL data | ? | TL hdr | Payload |
 * +------+---------+---+--------+---------+
 *                      |
 *                      post_receive
 *         <dsc>
 *             <--- rx_headroom -->
 *         <------- rx_payload_offset --->
 *         <--- rx_hdr_offset -->
 *
 */
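/*
 * In terms of the fields above, the interface constructor derives the offsets
 * roughly as follows (sketch; rx_headroom comes from the interface parameters,
 * the rest from uct_ib_iface_init_attr_t):
 *
 *   rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
 *                        max(sizeof(uct_recv_desc_t) + rx_headroom,
 *                            rx_priv_len + rx_hdr_len);
 *   rx_hdr_offset      = rx_payload_offset - rx_hdr_len;
 *   rx_headroom_offset = rx_payload_offset - rx_headroom;
 */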
typedef struct uct_ib_iface_recv_desc {
uint32_t lkey;
} UCS_S_PACKED uct_ib_iface_recv_desc_t;
extern ucs_config_field_t uct_ib_iface_config_table[];
extern const char *uct_ib_mtu_values[];
/**
* Create memory pool of receive descriptors.
*/
ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
const uct_ib_iface_config_t *config,
const char *name, ucs_mpool_t *mp);
void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc);
static UCS_F_ALWAYS_INLINE void
uct_ib_iface_invoke_am_desc(uct_ib_iface_t *iface, uint8_t am_id, void *data,
unsigned length, uct_ib_iface_recv_desc_t *ib_desc)
{
void *desc = (char*)ib_desc + iface->config.rx_headroom_offset;
ucs_status_t status;
status = uct_iface_invoke_am(&iface->super, am_id, data, length,
UCT_CB_PARAM_FLAG_DESC);
if (status == UCS_OK) {
ucs_mpool_put_inline(ib_desc);
} else {
uct_recv_desc(desc) = &iface->release_desc;
}
}
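/*
 * If the callback returns anything other than UCS_OK, ownership of the
 * descriptor passes to the user, who later returns it via
 * uct_iface_release_desc(); the assignment above stashes the release hook for
 * that purpose. Sketch of an AM handler that keeps the descriptor (enqueue_rx
 * is a hypothetical helper):
 *
 *   static ucs_status_t my_am_handler(void *arg, void *data, size_t length,
 *                                     unsigned flags)
 *   {
 *       if (!(flags & UCT_CB_PARAM_FLAG_DESC)) {
 *           return UCS_OK;        // transient data - must be copied out
 *       }
 *       enqueue_rx(data, length); // keep the descriptor
 *       return UCS_INPROGRESS;    // release later: uct_iface_release_desc()
 *   }
 */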
/**
* @return Whether the port used by this interface is RoCE
*/
int uct_ib_iface_is_roce(uct_ib_iface_t *iface);
/**
* @return Whether the port used by this interface is IB
*/
int uct_ib_iface_is_ib(uct_ib_iface_t *iface);
/**
* Get the expected size of IB packed address.
*
* @param [in] gid GID address to pack.
* @param [in] pack_flags Packing flags, UCT_IB_ADDRESS_PACK_FLAG_xx.
*
* @return IB address size of the given link scope.
*/
size_t uct_ib_address_size(const union ibv_gid *gid, unsigned pack_flags);
/**
* @return IB address size of the given iface.
*/
size_t uct_ib_iface_address_size(uct_ib_iface_t *iface);
/**
* Pack IB address.
*
* @param [in] gid GID address to pack.
* @param [in] lid LID address to pack.
* @param [in] pack_flags Packing flags, UCT_IB_ADDRESS_PACK_FLAG_xx.
* @param [in/out] ib_addr Filled with packed ib address. Size of the structure
* must be at least what @ref uct_ib_address_size()
* returns for the given scope.
*/
void uct_ib_address_pack(const union ibv_gid *gid, uint16_t lid,
unsigned pack_flags, uct_ib_address_t *ib_addr);
/**
* Pack the IB address of the given iface.
*
* @param [in] iface Iface whose IB address to pack.
* @param [in] gid GID address to pack.
* @param [in] lid LID address to pack.
* @param [in/out] ib_addr Filled with packed ib address. Size of the structure
* must be at least what @ref uct_ib_address_size()
* returns for the given scope.
*/
void uct_ib_iface_address_pack(uct_ib_iface_t *iface, const union ibv_gid *gid,
uint16_t lid, uct_ib_address_t *ib_addr);
/**
* Unpack IB address.
*
* @param [in] ib_addr IB address to unpack.
* @param [out] lid Filled with address LID, or 0 if not present.
* @param [out] gid Filled with address GID, or 0 if not present.
*/
void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, uint16_t *lid,
union ibv_gid *gid);
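/*
 * Typical pack/unpack round trip (sketch; the address buffer may live on the
 * stack, e.g. via alloca(), or inside a larger packed worker address):
 *
 *   size_t size            = uct_ib_address_size(&gid, pack_flags);
 *   uct_ib_address_t *addr = alloca(size);
 *
 *   uct_ib_address_pack(&gid, lid, pack_flags, addr);
 *   ...
 *   uct_ib_address_unpack(addr, &remote_lid, &remote_gid);
 */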
/**
* Convert IB address to a human-readable string.
*/
const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
size_t max);
ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
uct_device_addr_t *dev_addr);
int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr,
const uct_iface_addr_t *iface_addr);
/*
* @param xport_hdr_len How many bytes this transport adds on top of IB header (LRH+BTH+iCRC+vCRC)
*/
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
uct_iface_attr_t *iface_attr);
static inline uct_ib_md_t* uct_ib_iface_md(uct_ib_iface_t *iface)
{
return ucs_derived_of(iface->super.md, uct_ib_md_t);
}
static inline uct_ib_device_t* uct_ib_iface_device(uct_ib_iface_t *iface)
{
return &uct_ib_iface_md(iface)->dev;
}
static inline struct ibv_port_attr* uct_ib_iface_port_attr(uct_ib_iface_t *iface)
{
return uct_ib_device_port_attr(uct_ib_iface_device(iface), iface->config.port_num);
}
static inline void* uct_ib_iface_recv_desc_hdr(uct_ib_iface_t *iface,
uct_ib_iface_recv_desc_t *desc)
{
return (void*)((char *)desc + iface->config.rx_hdr_offset);
}
typedef struct uct_ib_recv_wr {
struct ibv_recv_wr ibwr;
struct ibv_sge sg;
} uct_ib_recv_wr_t;
/**
 * Prepare a list of @a n work requests that can be passed to
 * ibv_post_recv().
 *
 * @return Number of prepared WRs.
 */
int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
uct_ib_recv_wr_t *wrs, unsigned n);
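/*
 * Example posting flow (sketch): "mp" is the mpool created by
 * uct_ib_iface_recv_mpool_init() above, and the prepared WRs are assumed to
 * be chained through ibwr.next, so the head can be handed to verbs directly.
 * A transport with an SRQ would use ibv_post_srq_recv() instead.
 *
 *   uct_ib_recv_wr_t wrs[batch];
 *   struct ibv_recv_wr *bad_wr;
 *   int count, ret;
 *
 *   count = uct_ib_iface_prepare_rx_wrs(iface, mp, wrs, batch);
 *   if (count > 0) {
 *       ret = ibv_post_recv(qp, &wrs[0].ibwr, &bad_wr);
 *   }
 */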
ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
struct ibv_ah_attr *ah_attr,
struct ibv_ah **ah_p);
ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface);
ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h iface, int *fd_p);
ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
int solicited_only);
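/*
 * Event-driven progress sketch: obtain the wakeup fd once, then re-arm the CQ
 * before each sleep. uct_ib_iface_pre_arm() drains and acks stale completion
 * channel events so that a subsequent block on the fd only wakes for new
 * completions (the poll/epoll plumbing is the caller's, shown schematically):
 *
 *   int fd;
 *
 *   uct_ib_iface_event_fd_get(tl_iface, &fd);
 *   ...
 *   if (uct_ib_iface_arm_cq(iface, UCT_IB_DIR_RX, 0) == UCS_OK) {
 *       // safe to block on fd until the next RX completion
 *   }
 */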
ucs_status_t uct_ib_verbs_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector, int ignore_overrun,
size_t *inl, struct ibv_cq **cq_p);
ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
uct_ib_qp_attr_t *attr,
struct ibv_qp **qp_p);
void uct_ib_iface_fill_attr(uct_ib_iface_t *iface,
uct_ib_qp_attr_t *attr);
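/*
 * Typical QP creation flow (sketch; capacity values are illustrative).
 * uct_ib_iface_fill_attr() completes the attributes with interface-wide
 * settings such as the port, SRQ and inline thresholds before the verbs
 * object is created:
 *
 *   uct_ib_qp_attr_t attr = {0};
 *   struct ibv_qp *qp;
 *
 *   attr.qp_type          = iface->config.qp_type;
 *   attr.cap.max_send_wr  = tx_queue_len;
 *   attr.cap.max_recv_wr  = rx_queue_len;
 *   attr.cap.max_send_sge = sge_count;
 *   attr.cap.max_recv_sge = 1;
 *
 *   status = uct_ib_iface_create_qp(iface, &attr, &qp);
 */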
#define UCT_IB_IFACE_FMT \
"%s:%d"
#define UCT_IB_IFACE_ARG(_iface) \
uct_ib_device_name(uct_ib_iface_device(_iface)), (_iface)->config.port_num
#define UCT_IB_IFACE_VERBS_COMPLETION_ERR(_type, _iface, _i, _wc) \
    ucs_fatal("%s completion[%d] with error on %s/%p: %s, vendor_err 0x%x wr_id 0x%lx", \
              _type, _i, uct_ib_device_name(uct_ib_iface_device(_iface)), _iface, \
              uct_ib_wc_status_str(_wc[_i].status), _wc[_i].vendor_err, \
              _wc[_i].wr_id);
#define UCT_IB_IFACE_VERBS_FOREACH_RXWQE(_iface, _i, _hdr, _wc, _wc_count) \
    for (_i = 0; _i < _wc_count && ({ \
        if (ucs_unlikely(_wc[_i].status != IBV_WC_SUCCESS)) { \
            UCT_IB_IFACE_VERBS_COMPLETION_ERR("receive", _iface, _i, _wc); \
        } \
        _hdr = (typeof(_hdr))uct_ib_iface_recv_desc_hdr(_iface, \
                (uct_ib_iface_recv_desc_t *)(uintptr_t)_wc[_i].wr_id); \
        VALGRIND_MAKE_MEM_DEFINED(_hdr, _wc[_i].byte_len); \
        1; }); ++_i)
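/*
 * Typical RX polling loop built on this macro (sketch; uct_foo_hdr_t and
 * uct_foo_iface_handle_rx are illustrative, transport-specific names):
 *
 *   struct ibv_wc wc[POLL_BATCH];
 *   uct_foo_hdr_t *hdr;
 *   unsigned i;
 *   int num_wcs;
 *
 *   num_wcs = ibv_poll_cq(iface->cq[UCT_IB_DIR_RX], POLL_BATCH, wc);
 *   UCT_IB_IFACE_VERBS_FOREACH_RXWQE(iface, i, hdr, wc, num_wcs) {
 *       uct_foo_iface_handle_rx(iface, hdr, wc[i].byte_len);
 *   }
 */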
#define UCT_IB_MAX_ZCOPY_LOG_SGE(_iface) \
(uct_ib_iface_device(_iface)->max_zcopy_log_sge)
/**
 * Fill an ibv_sge array with the data described by a uct_iov_t array.
 * IOV entries with zero length are skipped, so sge[] never contains
 * zero-length elements.
 *
 * @return Number of elements filled in sge[]
 */
static UCS_F_ALWAYS_INLINE
size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov,
size_t iovcnt)
{
size_t iov_it, sge_it = 0;
for (iov_it = 0; iov_it < iovcnt; ++iov_it) {
sge[sge_it].length = uct_iov_get_length(&iov[iov_it]);
if (sge[sge_it].length > 0) {
sge[sge_it].addr = (uintptr_t)(iov[iov_it].buffer);
} else {
continue; /* to avoid zero length elements in sge */
}
if (iov[iov_it].memh == UCT_MEM_HANDLE_NULL) {
sge[sge_it].lkey = 0;
} else {
sge[sge_it].lkey = ((uct_ib_mem_t *)(iov[iov_it].memh))->lkey;
}
++sge_it;
}
return sge_it;
}
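/*
 * Example (sketch): building the SGE list for a zero-copy send. Note that
 * num_sge may legitimately be 0 when every IOV entry has zero length:
 *
 *   struct ibv_sge sge[UCT_IB_MAX_IOV];
 *   struct ibv_send_wr wr = {0};
 *
 *   wr.sg_list = sge;
 *   wr.num_sge = uct_ib_verbs_sge_fill_iov(sge, iov, iovcnt);
 */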
static UCS_F_ALWAYS_INLINE
size_t uct_ib_iface_get_max_iov(uct_ib_iface_t *iface)
{
return iface->config.max_iov;
}
static UCS_F_ALWAYS_INLINE
void uct_ib_iface_set_max_iov(uct_ib_iface_t *iface, size_t max_iov)
{
size_t min_iov_requested;
ucs_assert((ssize_t)max_iov > 0);
min_iov_requested = ucs_max(max_iov, 1UL); /* max_iov mustn't be 0 */
iface->config.max_iov = ucs_min(UCT_IB_MAX_IOV, min_iov_requested);
}
static UCS_F_ALWAYS_INLINE
void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
const union ibv_gid *gid,
struct ibv_ah_attr *ah_attr)
{
memset(ah_attr, 0, sizeof(*ah_attr));
ah_attr->sl = iface->config.sl;
ah_attr->src_path_bits = iface->path_bits[0];
ah_attr->dlid = lid | iface->path_bits[0];
ah_attr->port_num = iface->config.port_num;
ah_attr->grh.traffic_class = iface->config.traffic_class;
if (iface->config.force_global_addr ||
(iface->gid.global.subnet_prefix != gid->global.subnet_prefix)) {
ucs_assert_always(gid->global.interface_id != 0);
ah_attr->is_global = 1;
ah_attr->grh.dgid = *gid;
ah_attr->grh.sgid_index = iface->config.gid_index;
ah_attr->grh.hop_limit = iface->config.hop_limit;
} else {
ah_attr->is_global = 0;
}
}
static UCS_F_ALWAYS_INLINE
void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
struct ibv_ah_attr *ah_attr)
{
union ibv_gid gid;
uint16_t lid;
ucs_assert(!uct_ib_iface_is_roce(iface) ==
!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH));
uct_ib_address_unpack(ib_addr, &lid, &gid);
uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, &gid, ah_attr);
}
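/*
 * Together with uct_ib_iface_create_ah() (declared above), these helpers form
 * the common connect path (sketch):
 *
 *   struct ibv_ah_attr ah_attr;
 *   struct ibv_ah *ah;
 *
 *   uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, &ah_attr);
 *   status = uct_ib_iface_create_ah(iface, &ah_attr, &ah);
 */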
static UCS_F_ALWAYS_INLINE
size_t uct_ib_iface_hdr_size(size_t max_inline, size_t min_size)
{
return (size_t)ucs_max((ssize_t)(max_inline - min_size), 0);
}
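/* Example: uct_ib_iface_hdr_size(64, 16) yields 48 bytes of usable header
 * space; the result is 0 whenever max_inline < min_size, since the signed
 * cast prevents unsigned underflow. */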
static UCS_F_ALWAYS_INLINE void
uct_ib_fence_info_init(uct_ib_fence_info_t* fence)
{
fence->fence_beat = 0;
}
#endif