/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/
#ifndef UCT_IB_DEVICE_H
#define UCT_IB_DEVICE_H
#include "ib_verbs.h"
#include <uct/api/uct.h>
#include <uct/base/uct_iface.h>
#include <ucs/stats/stats.h>
#include <ucs/debug/assert.h>
#include <ucs/datastruct/khash.h>
#include <ucs/type/spinlock.h>
#include <endian.h>
#define UCT_IB_QPN_ORDER 24 /* How many bits can be an IB QP number */
#define UCT_IB_LRH_LEN 8 /* IB Local routing header */
#define UCT_IB_GRH_LEN 40 /* IB GLobal routing header */
#define UCT_IB_BTH_LEN 12 /* IB base transport header */
#define UCT_IB_ROCE_LEN 14 /* Ethernet header -
6B for Destination MAC +
6B for Source MAC + 2B Type (RoCE) */
#define UCT_IB_DETH_LEN 8 /* IB datagram header */
#define UCT_IB_RETH_LEN 16 /* IB RDMA header */
#define UCT_IB_ATOMIC_ETH_LEN 28 /* IB atomic header */
#define UCT_IB_AETH_LEN 4 /* IB ack */
#define UCT_IB_PAYLOAD_ALIGN 4 /* IB payload padding */
#define UCT_IB_ICRC_LEN 4 /* IB invariant crc footer */
#define UCT_IB_VCRC_LEN 2 /* IB variant crc footer */
#define UCT_IB_DELIM_LEN 2 /* IB wire delimiter */
#define UCT_IB_FDR_PACKET_GAP 64 /* Minimal FDR packet gap */
#define UCT_IB_MAX_MESSAGE_SIZE (2UL << 30) /* Maximal IB message size */
#define UCT_IB_PKEY_PARTITION_MASK 0x7fff /* IB partition number mask */
#define UCT_IB_PKEY_MEMBERSHIP_MASK 0x8000 /* Full/send-only member */
#define UCT_IB_DEV_MAX_PORTS 2
#define UCT_IB_FABRIC_TIME_MAX 32
#define UCT_IB_INVALID_RKEY 0xffffffffu
#define UCT_IB_KEY 0x1ee7a330
#define UCT_IB_LINK_LOCAL_PREFIX be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */
#define UCT_IB_SITE_LOCAL_PREFIX be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */
#define UCT_IB_DEVICE_SYSFS_FMT "/sys/class/infiniband/%s/device/%s"
enum {
UCT_IB_DEVICE_STAT_ASYNC_EVENT,
UCT_IB_DEVICE_STAT_LAST
};
enum {
UCT_IB_DEVICE_FLAG_MLX4_PRM = UCS_BIT(1), /* Device supports mlx4 PRM */
UCT_IB_DEVICE_FLAG_MLX5_PRM = UCS_BIT(2), /* Device supports mlx5 PRM */
UCT_IB_DEVICE_FLAG_MELLANOX = UCS_BIT(3), /* Mellanox device */
UCT_IB_DEVICE_FLAG_LINK_IB = UCS_BIT(5), /* Require only IB */
UCT_IB_DEVICE_FLAG_DC_V1 = UCS_BIT(6), /* Device supports DC ver 1 */
UCT_IB_DEVICE_FLAG_DC_V2 = UCS_BIT(7), /* Device supports DC ver 2 */
UCT_IB_DEVICE_FLAG_AV = UCS_BIT(8), /* Device supports compact AV */
UCT_IB_DEVICE_FLAG_DC = UCT_IB_DEVICE_FLAG_DC_V1 |
UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */
UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9),
};
/**
* Flags which specify which address fields are present
*/
enum {
UCT_IB_ADDRESS_FLAG_LID = UCS_BIT(0),
UCT_IB_ADDRESS_FLAG_IF_ID = UCS_BIT(1),
UCT_IB_ADDRESS_FLAG_SUBNET16 = UCS_BIT(2),
UCT_IB_ADDRESS_FLAG_SUBNET64 = UCS_BIT(3),
UCT_IB_ADDRESS_FLAG_GID = UCS_BIT(4),
UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB = UCS_BIT(5),
UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH = UCS_BIT(6)
};
/**
* IB network address
*/
typedef struct uct_ib_address {
uint8_t flags;
/* Following fields appear in this order (if specified by flags).
* The full gid always appears last:
* - uint16_t lid
* - uint64_t if_id
* - uint16_t subnet16
* - uint64_t subnet64
* For RoCE:
* - uint8_t gid[16]
*/
} UCS_S_PACKED uct_ib_address_t;
/**
* PCI identifier of a device
*/
typedef struct {
uint16_t vendor;
uint16_t device;
} uct_ib_pci_id_t;
/**
* IB device specification.
*/
typedef struct uct_ib_device_spec {
const char *name;
uct_ib_pci_id_t pci_id;
unsigned flags;
uint8_t priority;
} uct_ib_device_spec_t;
KHASH_TYPE(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*);
/**
* IB device (corresponds to HCA)
*/
typedef struct uct_ib_device {
struct ibv_context *ibv_context; /* Verbs context */
uct_ib_device_attr dev_attr; /* Cached device attributes */
uint8_t first_port; /* Number of first port (usually 1) */
uint8_t num_ports; /* Amount of physical ports */
ucs_sys_cpuset_t local_cpus; /* CPUs local to device */
int numa_node; /* NUMA node of the device */
int async_events; /* Whether async events are handled */
int max_zcopy_log_sge; /* Maximum sges log for zcopy am */
UCS_STATS_NODE_DECLARE(stats)
struct ibv_port_attr port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */
uct_ib_pci_id_t pci_id;
unsigned flags;
uint8_t atomic_arg_sizes;
uint8_t atomic_arg_sizes_be;
uint8_t ext_atomic_arg_sizes;
uint8_t ext_atomic_arg_sizes_be;
uint8_t pci_fadd_arg_sizes;
uint8_t pci_cswap_arg_sizes;
/* AH hash */
khash_t(uct_ib_ah) ah_hash;
ucs_spinlock_t ah_lock;
} uct_ib_device_t;
/**
* RoCE version priorities
*/
typedef struct uct_ib_roce_version_desc {
uint8_t roce_major;
uint8_t roce_minor;
sa_family_t address_family;
} uct_ib_roce_version_desc_t;
extern const double uct_ib_qp_rnr_time_ms[];
/**
* Check if a port on a device is active and supports the given flags.
*/
ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
unsigned flags);
/*
* Helper function to list IB transport resources.
*
* @param dev IB device.
* @param flags Transport requirements from IB device (see UCT_IB_RESOURCE_FLAG_xx)
* @param devices_p Filled with a pointer to an array of devices.
* @param num_devices_p Filled with the number of devices.
*/
ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
uct_tl_device_resource_t **devices_p,
unsigned *num_devices_p);
ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
struct ibv_device *ibv_device);
ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
struct ibv_device *ibv_device, int async_events
UCS_STATS_ARG(ucs_stats_node_t *stats_parent));
void uct_ib_device_cleanup(uct_ib_device_t *dev);
/**
* @return device specification.
*/
const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev);
/**
* Select the IB gid index to use.
*
* @param dev IB device.
* @param port_num Port number.
* @param md_config_index Gid index from the md configuration.
* @param ib_gid_index Filled with the selected gid index.
*/
ucs_status_t uct_ib_device_select_gid_index(uct_ib_device_t *dev,
uint8_t port_num,
size_t md_config_index,
uint8_t *ib_gid_index);
/**
* @return device name.
*/
const char *uct_ib_device_name(uct_ib_device_t *dev);
/**
* @return whether the port is InfiniBand
*/
int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num);
/**
* @return whether the port is RoCE
*/
int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num);
/**
* @return 1 if the gid_raw is 0, 0 otherwise.
*/
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw);
/**
* Convert time-in-seconds to IB fabric QP time value
*/
uint8_t uct_ib_to_qp_fabric_time(double time);
/**
* Convert time-in-seconds to IB fabric RNR time value
*/
uint8_t uct_ib_to_rnr_fabric_time(double time);
/**
* @return MTU in bytes.
*/
size_t uct_ib_mtu_value(enum ibv_mtu mtu);
/**
* Modify QP to a given state and check for error
*/
ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state);
/**
* find device mtu. This function can be used before ib
* interface is created.
*/
ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu);
ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
const char *resource_dev_name,
uint8_t *p_port_num);
size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev);
const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status);
ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
struct ibv_ah_attr *ah_attr,
struct ibv_pd *pd,
struct ibv_ah **ah_p);
void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev);
static inline struct ibv_port_attr*
uct_ib_device_port_attr(uct_ib_device_t *dev, uint8_t port_num)
{
return &dev->port_attr[port_num - dev->first_port];
}
static inline int uct_ib_device_has_pci_atomics(uct_ib_device_t *dev)
{
return !!((dev->pci_fadd_arg_sizes | dev->pci_cswap_arg_sizes) &
(sizeof(uint32_t) | sizeof(uint64_t)));
}
ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
unsigned gid_index, union ibv_gid *gid,
int *is_roce_v2);
int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
const union ibv_gid *gid,
uint8_t gid_index);
int uct_ib_get_cqe_size(int cqe_size_min);
static inline ucs_status_t uct_ib_poll_cq(struct ibv_cq *cq, unsigned *count, struct ibv_wc *wcs)
{
int ret;
ret = ibv_poll_cq(cq, *count, wcs);
if (ret <= 0) {
if (ucs_likely(ret == 0)) {
return UCS_ERR_NO_PROGRESS;
}
ucs_fatal("failed to poll receive CQ %d", ret);
}
*count = ret;
return UCS_OK;
}
#endif