/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
* Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED.
* See file LICENSE for terms.
*/
#include "ib_device.h"
#include "ib_md.h"
#include <ucs/arch/bitops.h>
#include <ucs/debug/memtrack.h>
#include <ucs/debug/log.h>
#include <ucs/async/async.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/string.h>
#include <ucs/sys/sys.h>
#include <sys/poll.h>
#include <sched.h>
/*
 * A GID together with the RoCE version reported for its GID table entry.
 * Filled by uct_ib_device_query_gid_info().
 */
typedef struct {
/* The GID value itself */
union ibv_gid gid;
/* RoCE version split into two numbers; this file stores 1.0, 1.5 or 2.0 */
struct {
uint8_t major;
uint8_t minor;
} roce_version;
} uct_ib_device_gid_info_t;
/* This table is according to "Encoding for RNR NAK Timer Field"
* in IBTA specification.
* The array index is the encoded timer value and the element is the
* corresponding delay in milliseconds. Note that index 0 holds the largest
* delay (655.36 ms); uct_ib_to_rnr_fabric_time() relies on this wrap-around. */
const double uct_ib_qp_rnr_time_ms[] = {
655.36, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.12,
0.16, 0.24, 0.32, 0.48, 0.64, 0.96, 1.28, 1.92,
2.56, 3.84, 5.12, 7.68, 10.24, 15.36, 20.48, 30.72,
40.96, 61.44, 81.92, 122.88, 163.84, 245.76, 327.68, 491.52
};
/*
 * Hash function for the address-handle cache.
 * Both GID and LID data take part in key generation (the LID identifies the
 * peer on IB, the GID on RoCE): the two 64-bit halves of the destination GID
 * and the destination LID are XOR-ed together and hashed as a 64-bit integer.
 */
static UCS_F_ALWAYS_INLINE
khint32_t uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)
{
    const union ibv_gid *dgid = &attr.grh.dgid;

    return kh_int64_hash_func(dgid->global.subnet_prefix ^
                              dgid->global.interface_id ^
                              attr.dlid);
}
/*
 * Key comparison for the address-handle cache: two attribute structures are
 * considered equal only when their whole memory images match.
 *
 * NOTE(review): a byte-wise comparison also covers any padding bytes inside
 * struct ibv_ah_attr - presumably callers zero the structure before filling
 * it; confirm.
 *
 * @return Non-zero if the attributes are identical, 0 otherwise.
 */
static UCS_F_ALWAYS_INLINE
int uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a, struct ibv_ah_attr b)
{
    return memcmp(&a, &b, sizeof(a)) == 0;
}
/*
 * Instantiate the hash table implementation named "uct_ib_ah", which is used
 * as the address-handle cache: keys are struct ibv_ah_attr (by value), values
 * are struct ibv_ah pointers, hashed and compared with the two helpers above.
 * NOTE(review): the literal 1 is presumably khash's "is map" flag (values are
 * stored, not just keys) - confirm against the KHASH_IMPL definition.
 */
KHASH_IMPL(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1,
uct_ib_kh_ah_hash_func, uct_ib_kh_ah_hash_equal)
/*
 * Statistics class of an IB device, compiled only when statistics are enabled.
 * It has an empty class name and one named counter, "async_event", which is
 * incremented by uct_ib_async_event_handler() for every async event received.
 */
#if ENABLE_STATS
static ucs_stats_class_t uct_ib_device_stats_class = {
.name = "",
.num_counters = UCT_IB_DEVICE_STAT_LAST,
.counter_names = {
[UCT_IB_DEVICE_STAT_ASYNC_EVENT] = "async_event"
}
};
#endif
/*
 * Built-in table of known devices, searched by uct_ib_device_spec() after the
 * user-configured list. Each entry holds, in order: a display name, the PCI
 * {vendor id, device id} pair used for matching, and a set of
 * UCT_IB_DEVICE_FLAG_xx capability flags.
 * NOTE(review): the trailing number is presumably a selection priority -
 * confirm against the uct_ib_device_spec_t definition.
 *
 * The "Generic HCA" entry carries PCI id {0, 0}, which is also what
 * uct_ib_device_get_ids() stores when the ids cannot be read from sysfs.
 * The table is terminated by an entry whose name is NULL.
 */
static uct_ib_device_spec_t uct_ib_builtin_device_specs[] = {
{"ConnectX-3", {0x15b3, 4099},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 10},
{"ConnectX-3 Pro", {0x15b3, 4103},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 11},
{"Connect-IB", {0x15b3, 4113},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V1, 20},
{"ConnectX-4", {0x15b3, 4115},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V1, 30},
{"ConnectX-4", {0x15b3, 4116},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V1, 29},
{"ConnectX-4 LX", {0x15b3, 4117},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V1, 28},
{"ConnectX-4 LX VF", {0x15b3, 4118},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V1, 28},
{"ConnectX-5", {0x15b3, 4119},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 38},
{"ConnectX-5", {0x15b3, 4121},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 40},
{"ConnectX-5", {0x15b3, 4120},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 39},
{"ConnectX-5", {0x15b3, 41682},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 37},
{"ConnectX-5", {0x15b3, 4122},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 36},
{"ConnectX-6", {0x15b3, 4123},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 50},
{"ConnectX-6 VF", {0x15b3, 4124},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 50},
{"ConnectX-6 DX", {0x15b3, 4125},
UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
UCT_IB_DEVICE_FLAG_DC_V2, 50},
{"Generic HCA", {0, 0}, 0, 0},
{NULL}
};
/*
 * Find out which CPUs and which NUMA node are local to an IB device, using
 * the device's sysfs entries.
 *
 * @param [in]  dev_name   Device name used to build the sysfs paths.
 * @param [out] cpu_mask   Set of CPUs parsed from the "local_cpus" file. If
 *                         the file cannot be read, all CPU_SETSIZE CPUs are
 *                         set.
 * @param [out] numa_node  NUMA node number read from sysfs, or -1 if it could
 *                         not be read.
 */
static void uct_ib_device_get_locality(const char *dev_name,
ucs_sys_cpuset_t *cpu_mask,
int *numa_node)
{
char *p, buf[ucs_max(CPU_SETSIZE, 10)];
ucs_status_t status;
ssize_t nread;
uint32_t word;
int base, k;
long n;
/* Read list of CPUs close to the device */
CPU_ZERO(cpu_mask);
nread = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_DEVICE_SYSFS_FMT,
dev_name, "local_cpus");
if (nread >= 0) {
/* NOTE(review): the terminator is written at a fixed offset, not at
 * 'nread' - presumably ucs_read_file() already terminates the data it
 * read; confirm. */
buf[CPU_SETSIZE - 1] = '\0';
base = 0;
/* The text is consumed from its end: each iteration takes the token
 * after the last comma, parses it as a hexadecimal word describing
 * CPUs [base, base + 31], and then cuts it off the string. */
do {
p = strrchr(buf, ',');
if (p == NULL) {
/* No comma left - the remaining text is the last word to parse */
p = buf;
} else if (*p == ',') {
/* Truncate the string at the comma and step to the token after it */
*(p++) = 0;
}
word = strtoul(p, 0, 16);
/* Bit k of the word stands for CPU number (base + k) */
for (k = 0; word; ++k, word >>= 1) {
if (word & 1) {
CPU_SET(base + k, cpu_mask);
}
}
base += 32;
/* Stop when the set is full or the first token has been processed */
} while ((base < CPU_SETSIZE) && (p != buf));
} else {
/* If affinity file is not present, treat all CPUs as local */
for (k = 0; k < CPU_SETSIZE; ++k) {
CPU_SET(k, cpu_mask);
}
}
/* Read NUMA node number */
status = ucs_read_file_number(&n, 1,
"/sys/class/infiniband/%s/device/numa_node",
dev_name);
*numa_node = (status == UCS_OK) ? n : -1;
}
/*
 * Handler registered on the device's async event file descriptor (see
 * uct_ib_device_init()). It fetches one pending IB async event, builds a
 * textual description of it, chooses a log level according to the event
 * type, updates the async-event statistics counter, logs the event and
 * finally acknowledges it.
 *
 * @param [in] fd   Not used by this function.
 * @param [in] arg  Pointer to the uct_ib_device_t the handler was registered
 *                  for.
 */
static void uct_ib_async_event_handler(int fd, void *arg)
{
uct_ib_device_t *dev = arg;
struct ibv_async_event event;
ucs_log_level_t level;
char event_info[200];
int ret;
ret = ibv_get_async_event(dev->ibv_context, &event);
if (ret != 0) {
/* EAGAIN is not reported: the descriptor is switched to non-blocking
 * mode in uct_ib_device_init(), so this presumably just means that no
 * event is pending */
if (errno != EAGAIN) {
ucs_warn("ibv_get_async_event() failed: %m");
}
return;
}
/* Each group of events below reads the member of event.element which
 * matches the kind of object the event refers to (CQ, QP, SRQ, port, DCT) */
switch (event.event_type) {
case IBV_EVENT_CQ_ERR:
snprintf(event_info, sizeof(event_info), "%s on CQ %p",
ibv_event_type_str(event.event_type), event.element.cq);
level = UCS_LOG_LEVEL_ERROR;
break;
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_QP_REQ_ERR:
case IBV_EVENT_QP_ACCESS_ERR:
case IBV_EVENT_COMM_EST:
case IBV_EVENT_SQ_DRAINED:
case IBV_EVENT_PATH_MIG:
case IBV_EVENT_PATH_MIG_ERR:
snprintf(event_info, sizeof(event_info), "%s on QPN 0x%x",
ibv_event_type_str(event.event_type), event.element.qp->qp_num);
level = UCS_LOG_LEVEL_ERROR;
break;
case IBV_EVENT_QP_LAST_WQE_REACHED:
snprintf(event_info, sizeof(event_info), "SRQ-attached QP 0x%x was flushed",
event.element.qp->qp_num);
level = UCS_LOG_LEVEL_DEBUG;
break;
case IBV_EVENT_SRQ_ERR:
level = UCS_LOG_LEVEL_ERROR;
snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
ibv_event_type_str(event.event_type), event.element.srq);
break;
case IBV_EVENT_SRQ_LIMIT_REACHED:
snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
ibv_event_type_str(event.event_type), event.element.srq);
level = UCS_LOG_LEVEL_DEBUG;
break;
case IBV_EVENT_DEVICE_FATAL:
case IBV_EVENT_PORT_ERR:
snprintf(event_info, sizeof(event_info), "%s on port %d",
ibv_event_type_str(event.event_type), event.element.port_num);
level = UCS_LOG_LEVEL_ERROR;
break;
/* Port state/configuration changes are reported as warnings only */
case IBV_EVENT_PORT_ACTIVE:
#if HAVE_DECL_IBV_EVENT_GID_CHANGE
case IBV_EVENT_GID_CHANGE:
#endif
case IBV_EVENT_LID_CHANGE:
case IBV_EVENT_PKEY_CHANGE:
case IBV_EVENT_SM_CHANGE:
case IBV_EVENT_CLIENT_REREGISTER:
snprintf(event_info, sizeof(event_info), "%s on port %d",
ibv_event_type_str(event.event_type), event.element.port_num);
level = UCS_LOG_LEVEL_WARN;
break;
#if HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
"DCT key violation", event.element.dct->dct_num);
level = UCS_LOG_LEVEL_ERROR;
break;
case IBV_EXP_EVENT_DCT_ACCESS_ERR:
/* Only this DCT event guards against a NULL dct pointer.
 * NOTE(review): the other two DCT cases dereference it unconditionally -
 * confirm whether it can be NULL for them as well. */
if (event.element.dct) {
snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
"DCT access error", event.element.dct->dct_num);
} else {
snprintf(event_info, sizeof(event_info), "%s on DCTN UNKNOWN",
"DCT access error");
}
level = UCS_LOG_LEVEL_ERROR;
break;
case IBV_EXP_EVENT_DCT_REQ_ERR:
snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
"DCT requester error", event.element.dct->dct_num);
level = UCS_LOG_LEVEL_ERROR;
break;
#endif
default:
/* Any other event: log its name and numeric value at info level */
snprintf(event_info, sizeof(event_info), "%s (%d)",
ibv_event_type_str(event.event_type), event.event_type);
level = UCS_LOG_LEVEL_INFO;
break;
};
UCS_STATS_UPDATE_COUNTER(dev->stats, UCT_IB_DEVICE_STAT_ASYNC_EVENT, +1);
ucs_log(level, "IB Async event on %s: %s", uct_ib_device_name(dev), event_info);
/* The event is acknowledged only after its fields have been used above */
ibv_ack_async_event(&event);
}
/*
 * Fill dev->pci_id with the PCI vendor and device ids read from the device's
 * "vendor" and "device" sysfs files. If either file cannot be read, both ids
 * are set to 0 and a warning is printed.
 *
 * @param [in,out] dev  Device whose pci_id field is updated.
 */
static void uct_ib_device_get_ids(uct_ib_device_t *dev)
{
    long vendor_id, device_id;
    ucs_status_t status;

    status = ucs_read_file_number(&vendor_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                                  uct_ib_device_name(dev), "vendor");
    if (status == UCS_OK) {
        /* the device id is looked up only after the vendor id was read */
        status = ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                                      uct_ib_device_name(dev), "device");
    }

    if (status != UCS_OK) {
        dev->pci_id.vendor = 0;
        dev->pci_id.device = 0;
        ucs_warn("%s: could not read device/vendor id from sysfs, "
                 "performance may be affected", uct_ib_device_name(dev));
        return;
    }

    dev->pci_id.vendor = vendor_id;
    dev->pci_id.device = device_id;
    ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev),
              dev->pci_id.vendor, dev->pci_id.device);
}
/*
 * Query device and port attributes and cache them in the device object.
 *
 * @param [in,out] dev         Device to fill; dev->ibv_context is used for
 *                             all the queries.
 * @param [in]     ibv_device  Verbs device, used for its node type and name.
 *
 * @return UCS_OK on success, the error of uct_ib_query_device() if it fails,
 *         UCS_ERR_UNSUPPORTED if the device has more than
 *         UCT_IB_DEV_MAX_PORTS ports, UCS_ERR_IO_ERROR if a port query fails.
 */
ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
                                 struct ibv_device *ibv_device)
{
    ucs_status_t status;
    uint8_t port_idx;
    int rc;

    status = uct_ib_query_device(dev->ibv_context, &dev->dev_attr);
    if (status != UCS_OK) {
        return status;
    }

    /* A switch exposes a single port numbered 0; every other node type
     * numbers its physical ports starting from 1 */
    if (ibv_device->node_type == IBV_NODE_SWITCH) {
        dev->first_port = 0;
        dev->num_ports  = 1;
    } else {
        dev->first_port = 1;
        dev->num_ports  = IBV_DEV_ATTR(dev, phys_port_cnt);
    }

    if (dev->num_ports > UCT_IB_DEV_MAX_PORTS) {
        ucs_error("%s has %d ports, but only up to %d are supported",
                  ibv_get_device_name(ibv_device), dev->num_ports,
                  UCT_IB_DEV_MAX_PORTS);
        return UCS_ERR_UNSUPPORTED;
    }

    /* Cache the attributes of every port; the array index is 0-based */
    port_idx = 0;
    while (port_idx < dev->num_ports) {
        rc = ibv_query_port(dev->ibv_context, port_idx + dev->first_port,
                            &dev->port_attr[port_idx]);
        if (rc != 0) {
            ucs_error("ibv_query_port() returned %d: %m", rc);
            return UCS_ERR_IO_ERROR;
        }
        ++port_idx;
    }

    uct_ib_device_get_ids(dev);
    return UCS_OK;
}
/*
 * Initialize the runtime state of a device object: CPU/NUMA locality,
 * statistics node, async event handling and the address-handle cache.
 * dev->ibv_context is dereferenced but not assigned here, so it has to be
 * set before this function is called.
 *
 * @param [in,out] dev           Device to initialize.
 * @param [in]     ibv_device    Verbs device, used for its name and node type.
 * @param [in]     async_events  Non-zero to register a handler for IB async
 *                               events on the device's async descriptor.
 * @param [in]     stats_parent  Parent statistics node (the argument is
 *                               supplied through UCS_STATS_ARG).
 *
 * @return UCS_OK on success, otherwise the status of the step that failed.
 */
ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
struct ibv_device *ibv_device, int async_events
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
ucs_status_t status;
dev->async_events = async_events;
uct_ib_device_get_locality(ibv_get_device_name(ibv_device), &dev->local_cpus,
&dev->numa_node);
status = UCS_STATS_NODE_ALLOC(&dev->stats, &uct_ib_device_stats_class,
stats_parent, "device");
if (status != UCS_OK) {
goto err;
}
/* The async descriptor is made non-blocking whether or not a handler is
 * registered below; uct_ib_async_event_handler() ignores EAGAIN */
status = ucs_sys_fcntl_modfl(dev->ibv_context->async_fd, O_NONBLOCK, 0);
if (status != UCS_OK) {
goto err_release_stats;
}
/* Register to IB async events */
if (dev->async_events) {
status = ucs_async_set_event_handler(UCS_ASYNC_THREAD_LOCK_TYPE,
dev->ibv_context->async_fd,
UCS_EVENT_SET_EVREAD,
uct_ib_async_event_handler, dev,
NULL);
if (status != UCS_OK) {
goto err_release_stats;
}
}
/* Address-handle cache and the lock which protects it */
kh_init_inplace(uct_ib_ah, &dev->ah_hash);
ucs_spinlock_init(&dev->ah_lock);
/* NOTE(review): dev->num_ports is only read here - presumably it was filled
 * by an earlier uct_ib_device_query() call; confirm the call order */
ucs_debug("initialized device '%s' (%s) with %d ports", uct_ib_device_name(dev),
ibv_node_type_str(ibv_device->node_type),
dev->num_ports);
return UCS_OK;
err_release_stats:
UCS_STATS_NODE_FREE(dev->stats);
err:
return status;
}
/*
 * Destroy every address handle stored in the device's address-handle cache.
 * The hash table itself is left untouched; it is released by
 * uct_ib_device_cleanup().
 *
 * @param [in] dev  Device whose cached address handles are destroyed.
 */
void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev)
{
    struct ibv_ah *ah;
    khiter_t iter;

    for (iter = kh_begin(&dev->ah_hash); iter != kh_end(&dev->ah_hash);
         ++iter) {
        /* skip buckets which do not hold an element */
        if (!kh_exist(&dev->ah_hash, iter)) {
            continue;
        }

        ah = kh_value(&dev->ah_hash, iter);
        ibv_destroy_ah(ah);
    }
}
/*
 * Release the resources set up by uct_ib_device_init(): the address-handle
 * hash table, its lock, the async event handler (if one was registered) and
 * the statistics node.
 *
 * @param [in] dev  Device to clean up.
 */
void uct_ib_device_cleanup(uct_ib_device_t *dev)
{
    ucs_status_t lock_status;

    ucs_debug("destroying ib device %s", uct_ib_device_name(dev));

    kh_destroy_inplace(uct_ib_ah, &dev->ah_hash);

    /* a failure to destroy the lock is reported, cleanup goes on anyway */
    lock_status = ucs_spinlock_destroy(&dev->ah_lock);
    if (UCS_OK != lock_status) {
        ucs_warn("ucs_spinlock_destroy() failed (%d)", lock_status);
    }

    /* the handler exists only if async events were requested at init time */
    if (dev->async_events) {
        ucs_async_remove_handler(dev->ibv_context->async_fd, 1);
    }

    UCS_STATS_NODE_FREE(dev->stats);
}
/*
 * Check whether a device specification describes the given device, i.e. both
 * the PCI vendor id and the PCI device id are equal.
 *
 * @return 1 if the specification matches the device, 0 otherwise.
 */
static inline int uct_ib_device_spec_match(uct_ib_device_t *dev,
                                           const uct_ib_device_spec_t *spec)
{
    if (spec->pci_id.vendor != dev->pci_id.vendor) {
        return 0;
    }

    return spec->pci_id.device == dev->pci_id.device;
}
/*
 * Find the specification (name, flags, etc.) which describes a device.
 * Specifications given in the configuration take precedence over the
 * built-in table.
 *
 * @param [in] dev  Device to look up; it must be embedded in a uct_ib_md_t.
 *
 * @return The matching specification. If the PCI id of the device is not
 *         known, the generic entry of the built-in table is returned, so the
 *         result (and its name) is never NULL.
 */
const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev)
{
    uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
    uct_ib_device_spec_t *spec;

    /* search through devices specified in the configuration */
    for (spec = md->custom_devices.specs;
         spec < md->custom_devices.specs + md->custom_devices.count; ++spec) {
        if (uct_ib_device_spec_match(dev, spec)) {
            return spec;
        }
    }

    /* search through built-in list of device specifications */
    for (spec = uct_ib_builtin_device_specs; spec->name != NULL; ++spec) {
        if (uct_ib_device_spec_match(dev, spec)) {
            return spec;
        }
    }

    /* No match was found and 'spec' points to the NULL-name terminator, which
     * must not be handed out because callers print spec->name. Return the
     * entry right before it, which holds the default settings for unknown
     * devices ("Generic HCA"). */
    ucs_assert(spec > uct_ib_builtin_device_specs);
    return spec - 1;
}
/*
 * Get the GID index to use on an IB port: the configured value, or
 * UCT_IB_MD_DEFAULT_GID_INDEX when the configuration is set to "auto".
 */
static size_t uct_ib_device_get_ib_gid_index(uct_ib_md_t *md)
{
    if (md->config.gid_index != UCS_ULUNITS_AUTO) {
        return md->config.gid_index;
    }

    return UCT_IB_MD_DEFAULT_GID_INDEX;
}
/*
 * Check whether the verbs device behind 'dev' uses the iWARP transport.
 *
 * @return Non-zero for an iWARP device, 0 otherwise.
 */
static int uct_ib_device_is_iwarp(uct_ib_device_t *dev)
{
    const struct ibv_device *ibv_dev = dev->ibv_context->device;

    return ibv_dev->transport_type == IBV_TRANSPORT_IWARP;
}
/*
 * Check whether a port of the device can be used with the requested
 * capabilities.
 *
 * @param [in] dev       Device to check; it must be embedded in a uct_ib_md_t.
 * @param [in] port_num  Port number, in the device's own port numbering.
 * @param [in] flags     Required UCT_IB_DEVICE_FLAG_xx capabilities.
 *
 * @return UCS_OK if the port is usable,
 *         UCS_ERR_NO_DEVICE if the port number is out of range,
 *         UCS_ERR_UNREACHABLE if the port is not active,
 *         UCS_ERR_UNSUPPORTED if the device is iWARP, a required capability
 *         is missing or the subnet prefix does not pass the filter,
 *         or the error returned by the GID query.
 */
ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
unsigned flags)
{
uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
const uct_ib_device_spec_t *dev_info;
uint8_t required_dev_flags;
ucs_status_t status;
union ibv_gid gid;
int is_roce_v2;
if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) {
return UCS_ERR_NO_DEVICE;
}
if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) {
ucs_trace("%s:%d is not active (state: %d)", uct_ib_device_name(dev),
port_num, uct_ib_device_port_attr(dev, port_num)->state);
return UCS_ERR_UNREACHABLE;
}
if (uct_ib_device_is_iwarp(dev)) {
/* TODO: enable it when support is ready */
ucs_debug("iWarp device %s is not supported", uct_ib_device_name(dev));
return UCS_ERR_UNSUPPORTED;
}
/* The caller may insist on an InfiniBand link layer */
if (!uct_ib_device_is_port_ib(dev, port_num) && (flags & UCT_IB_DEVICE_FLAG_LINK_IB)) {
ucs_debug("%s:%d is not IB link layer", uct_ib_device_name(dev),
port_num);
return UCS_ERR_UNSUPPORTED;
}
if (flags & UCT_IB_DEVICE_FLAG_DC) {
if (!IBV_DEVICE_HAS_DC(dev)) {
ucs_trace("%s:%d does not support DC", uct_ib_device_name(dev), port_num);
return UCS_ERR_UNSUPPORTED;
}
}
/* check generic device flags */
dev_info = uct_ib_device_spec(dev);
/* Only the MLX4/MLX5 PRM requirements are matched against the flags of the
 * device specification */
required_dev_flags = flags & (UCT_IB_DEVICE_FLAG_MLX4_PRM |
UCT_IB_DEVICE_FLAG_MLX5_PRM);
if (!ucs_test_all_flags(dev_info->flags, required_dev_flags)) {
ucs_trace("%s:%d (%s) does not support flags 0x%x", uct_ib_device_name(dev),
port_num, dev_info->name, required_dev_flags);
return UCS_ERR_UNSUPPORTED;
}
/* The subnet filter is applied to IB link-layer ports only */
if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) {
status = uct_ib_device_query_gid(dev, port_num,
uct_ib_device_get_ib_gid_index(md), &gid,
&is_roce_v2);
if (status) {
return status;
}
/* An IB port is not expected to report a RoCE v2 GID */
ucs_assert(is_roce_v2 == 0);
if (md->subnet_filter != gid.global.subnet_prefix) {
ucs_trace("%s:%d subnet_prefix does not match",
uct_ib_device_name(dev), port_num);
return UCS_ERR_UNSUPPORTED;
}
}
return UCS_OK;
}
/*
 * Check whether a raw address is an IPv4 multicast address encoded in IPv6
 * form: the first 32-bit word equals 0xff0e0000 (network byte order), and
 * both the second word and 'addr_last_bits' are zero.
 *
 * @param [in] raw             Address to test.
 * @param [in] addr_last_bits  Third address word XOR-ed with the expected
 *                             0x0000ffff pattern, as computed by the caller.
 *
 * @return 1 if the address matches the pattern, 0 otherwise.
 */
static int uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr *raw,
                                            const uint32_t addr_last_bits)
{
    /* IPv4 encoded multicast addresses */
    if (raw->s6_addr32[0] != htonl(0xff0e0000)) {
        return 0;
    }

    return (raw->s6_addr32[1] | addr_last_bits) == 0;
}
/*
 * Classify a GID as holding an IPv4 or an IPv6 address.
 *
 * @param [in] gid        GID to inspect.
 * @param [in] gid_index  Index of the GID; used for the debug message only.
 *
 * @return AF_INET if the GID has the IPv4-mapped layout (first two 32-bit
 *         words are zero and the third one is 0x0000ffff) or is an
 *         IPv4-encoded multicast address, AF_INET6 otherwise.
 */
static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_index)
{
    const struct in6_addr *raw    = (struct in6_addr *)gid->raw;
    const uint32_t addr_last_bits = raw->s6_addr32[2] ^ htonl(0x0000ffff);
    char p[128];
    int is_ipv4_mapped;

    ucs_debug("testing addr_family on gid index %d: %s",
              gid_index, inet_ntop(AF_INET6, gid, p, sizeof(p)));

    /* all the bits checked must be zero for an IPv4-mapped address */
    is_ipv4_mapped = ((raw->s6_addr32[0] | raw->s6_addr32[1]) |
                      addr_last_bits) == 0;
    if (is_ipv4_mapped ||
        uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) {
        return AF_INET;
    }

    return AF_INET6;
}
/*
 * Read a GID table entry and the RoCE version associated with it.
 * One of two implementations is selected at compile time: when
 * ibv_exp_query_gid_attr() is available, both the GID and its type come from
 * that call; otherwise the GID is read with ibv_query_gid() and its type is
 * parsed from the "gid_attrs/types" sysfs file of the port.
 *
 * @param [in]  dev        Device to query.
 * @param [in]  port_num   Port number.
 * @param [in]  gid_index  Index in the GID table of the port.
 * @param [out] info       Filled with the GID and its RoCE version.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR for an unknown GID type
 *         reported by the verbs call, UCS_ERR_INVALID_PARAM if the GID query
 *         fails or the sysfs GID type string is not recognized.
 */
static ucs_status_t
uct_ib_device_query_gid_info(uct_ib_device_t *dev, uint8_t port_num,
unsigned gid_index, uct_ib_device_gid_info_t *info)
{
int ret;
#if HAVE_DECL_IBV_EXP_QUERY_GID_ATTR
struct ibv_exp_gid_attr attr;
/* Ask for both the GID value and its type */
attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE | IBV_EXP_QUERY_GID_ATTR_GID;
ret = ibv_exp_query_gid_attr(dev->ibv_context, port_num, gid_index, &attr);
if (ret == 0) {
info->gid = attr.gid;
switch (attr.type) {
case IBV_EXP_IB_ROCE_V1_GID_TYPE:
info->roce_version.major = 1;
info->roce_version.minor = 0;
return UCS_OK;
case IBV_EXP_ROCE_V1_5_GID_TYPE:
info->roce_version.major = 1;
info->roce_version.minor = 5;
return UCS_OK;
case IBV_EXP_ROCE_V2_GID_TYPE:
info->roce_version.major = 2;
info->roce_version.minor = 0;
return UCS_OK;
default:
ucs_error("Invalid GID[%d] type on %s:%d: %d",
gid_index, uct_ib_device_name(dev), port_num, attr.type);
return UCS_ERR_IO_ERROR;
}
}
#else
#define UCT_IB_SYSFS_GID_TYPE_FMT \
"/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d"
char buf[16];
ret = ibv_query_gid(dev->ibv_context, port_num, gid_index, &info->gid);
if (ret == 0) {
ret = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_SYSFS_GID_TYPE_FMT,
uct_ib_device_name(dev), port_num, gid_index);
if (ret > 0) {
/* Only a prefix of the file content is compared */
if (!strncmp(buf, "IB/RoCE v1", 10)) {
info->roce_version.major = 1;
info->roce_version.minor = 0;
} else if (!strncmp(buf, "RoCE v2", 7)) {
info->roce_version.major = 2;
info->roce_version.minor = 0;
} else {
ucs_error("failed to parse gid type '%s' (dev=%s port=%d index=%d)",
buf, uct_ib_device_name(dev), port_num, gid_index);
return UCS_ERR_INVALID_PARAM;
}
} else {
/* The type file could not be read or is empty: fall back to version 1.0 */
info->roce_version.major = 1;
info->roce_version.minor = 0;
}
return UCS_OK;
}
#endif
/* Reached in both variants when the GID query call itself failed; note that
 * the message names ibv_query_gid() in either case */
ucs_error("ibv_query_gid(dev=%s port=%d index=%d) failed: %m",
uct_ib_device_name(dev), port_num, gid_index);
return UCS_ERR_INVALID_PARAM;
}
/*
 * Check whether a GID table entry of a RoCE port is operational, by trying to
 * create (and immediately destroy) an address handle which uses it.
 *
 * @param [in] dev        Device; it must be embedded in a uct_ib_md_t, whose
 *                        protection domain is used for the address handle.
 * @param [in] port_num   Port number; the port must be a RoCE port.
 * @param [in] gid        GID value; it is used as the destination GID.
 * @param [in] gid_index  Index of the GID; it is used as the source GID index.
 *
 * @return 1 if the address handle could be created, 0 otherwise.
 */
int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
                                      const union ibv_gid *gid,
                                      uint8_t gid_index)
{
    uct_ib_md_t *md;
    struct ibv_ah_attr ah_attr;
    struct ibv_ah *ah;

    ucs_assert(uct_ib_device_is_port_roce(dev, port_num));

    memset(&ah_attr, 0, sizeof(ah_attr));
    ah_attr.grh.dgid       = *gid;
    ah_attr.grh.sgid_index = gid_index;
    ah_attr.grh.hop_limit  = 255;
    ah_attr.is_global      = 1;
    ah_attr.port_num       = port_num;

    md = ucs_container_of(dev, uct_ib_md_t, dev);
    ah = ibv_create_ah(md->pd, &ah_attr);
    if (ah != NULL) {
        ibv_destroy_ah(ah);
        return 1;
    }

    return 0; /* gid entry is not operational */
}
/*
 * Select the GID index to use on a RoCE port.
 * The GID table is scanned once per preference, in this order: RoCE v2 with
 * an IPv4 address, RoCE v2 with IPv6, RoCE v1 with IPv4, RoCE v1 with IPv6.
 * The first operational entry which fits the current preference is selected.
 * If nothing fits, UCT_IB_MD_DEFAULT_GID_INDEX is used.
 *
 * @param [in]  dev        Device to query.
 * @param [in]  port_num   Port number.
 * @param [out] gid_index  Filled with the selected index; it is not written
 *                         if a GID query fails.
 *
 * @return UCS_OK, or the error of the GID query which failed.
 */
static ucs_status_t uct_ib_device_set_roce_gid_index(uct_ib_device_t *dev,
                                                     uint8_t port_num,
                                                     uint8_t *gid_index)
{
    static const uct_ib_roce_version_desc_t roce_prio[] = {
        {2, 0, AF_INET},
        {2, 0, AF_INET6},
        {1, 0, AF_INET},
        {1, 0, AF_INET6}
    };
    int gid_tbl_len         = uct_ib_device_port_attr(dev, port_num)->gid_tbl_len;
    ucs_status_t status     = UCS_OK;
    int priorities_arr_len  = ucs_static_array_size(roce_prio);
    const uct_ib_roce_version_desc_t *prio;
    uct_ib_device_gid_info_t gid_info;
    int i, prio_idx;

    /* search for matching GID table entries, according to the order defined
     * in priorities array
     */
    for (prio_idx = 0; prio_idx < priorities_arr_len; prio_idx++) {
        prio = &roce_prio[prio_idx];

        for (i = 0; i < gid_tbl_len; i++) {
            status = uct_ib_device_query_gid_info(dev, port_num, i, &gid_info);
            if (status != UCS_OK) {
                return status;
            }

            /* the checks go from the cheapest to the most expensive one */
            if ((prio->roce_major != gid_info.roce_version.major) ||
                (prio->roce_minor != gid_info.roce_version.minor)) {
                continue;
            }

            if (prio->address_family !=
                uct_ib_device_get_addr_family(&gid_info.gid, i)) {
                continue;
            }

            if (!uct_ib_device_test_roce_gid_index(dev, port_num,
                                                   &gid_info.gid, i)) {
                continue;
            }

            *gid_index = i;
            goto out_print;
        }
    }

    *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX;

out_print:
    ucs_debug("%s:%d using gid_index %d", uct_ib_device_name(dev), port_num,
              *gid_index);
    return status;
}
/*
 * Check whether the link layer of a port is InfiniBand.
 * If the verbs headers do not declare IBV_LINK_LAYER_INFINIBAND, every port
 * is reported as InfiniBand.
 *
 * @return Non-zero if the port is an InfiniBand port, 0 otherwise.
 */
int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num)
{
#if HAVE_DECL_IBV_LINK_LAYER_INFINIBAND
return uct_ib_device_port_attr(dev, port_num)->link_layer == IBV_LINK_LAYER_INFINIBAND;
#else
return 1;
#endif
}
/*
 * Check whether the link layer of a port is Ethernet, which this file treats
 * as "the port is a RoCE port".
 *
 * @return The value of IBV_PORT_IS_LINK_LAYER_ETHERNET() for the attributes
 *         of the port.
 */
int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num)
{
return IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_device_port_attr(dev, port_num));
}
/*
 * Decide which GID index to use on a port.
 *
 * @param [in]  dev              Device to query.
 * @param [in]  port_num         Port number.
 * @param [in]  md_config_index  Configured GID index, or UCS_ULUNITS_AUTO to
 *                               select one automatically.
 * @param [out] gid_index        Filled with the selected GID index.
 *
 * @return UCS_OK, or the error of the automatic selection on a RoCE port.
 */
ucs_status_t uct_ib_device_select_gid_index(uct_ib_device_t *dev,
                                            uint8_t port_num,
                                            size_t md_config_index,
                                            uint8_t *gid_index)
{
    /* an explicitly configured index is always taken as is */
    if (md_config_index != UCS_ULUNITS_AUTO) {
        *gid_index = md_config_index;
        return UCS_OK;
    }

    /* automatic mode: only RoCE ports need a search through the GID table */
    if (!uct_ib_device_is_port_roce(dev, port_num)) {
        *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX;
        return UCS_OK;
    }

    return uct_ib_device_set_roce_gid_index(dev, port_num, gid_index);
}
/*
 * Get the name of the verbs device behind 'dev', as reported by
 * ibv_get_device_name().
 */
const char *uct_ib_device_name(uct_ib_device_t *dev)
{
    struct ibv_device *ibv_dev = dev->ibv_context->device;

    return ibv_get_device_name(ibv_dev);
}
/*
 * Convert an MTU enumeration value to its size in bytes.
 * Any value other than the five known ones is a fatal error.
 *
 * @param [in] mtu  MTU enumeration value.
 *
 * @return MTU size in bytes: 256, 512, 1024, 2048 or 4096.
 */
size_t uct_ib_mtu_value(enum ibv_mtu mtu)
{
    size_t bytes = 0;

    switch (mtu) {
    case IBV_MTU_256:
        bytes = 256;
        break;
    case IBV_MTU_512:
        bytes = 512;
        break;
    case IBV_MTU_1024:
        bytes = 1024;
        break;
    case IBV_MTU_2048:
        bytes = 2048;
        break;
    case IBV_MTU_4096:
        bytes = 4096;
        break;
    default:
        ucs_fatal("Invalid MTU value (%d)", mtu);
    }

    return bytes;
}
/*
 * Convert a timeout to the encoded value used for QP timeouts: the base-2
 * logarithm of the time expressed in units of 4.096e-6, rounded to the
 * nearest integer.
 *
 * @param [in] time  Timeout; presumably in seconds, which makes the unit
 *                   4.096 usec - TODO confirm against callers.
 *
 * @return 1 if the logarithm is smaller than 1 (very small timeout), 0 if the
 *         rounded value reaches UCT_IB_FABRIC_TIME_MAX (meaning no timeout),
 *         otherwise the rounded value.
 */
uint8_t uct_ib_to_qp_fabric_time(double time)
{
    double exponent = log(time / 4.096e-6) / log(2.0);
    long rounded;

    if (exponent < 1) {
        return 1; /* Very small timeout */
    }

    rounded = (long)(exponent + 0.5);
    if (rounded >= UCT_IB_FABRIC_TIME_MAX) {
        return 0; /* No timeout */
    }

    return rounded;
}
/*
 * Convert a time to the index of the closest entry of the
 * uct_ib_qp_rnr_time_ms table.
 *
 * @param [in] time  Time to convert; it is multiplied by UCS_MSEC_PER_SEC to
 *                   get milliseconds.
 *
 * @return Table index of the selected delay. 0 is returned when the time is
 *         larger than every value examined; index 0 holds the maximal delay.
 */
uint8_t uct_ib_to_rnr_fabric_time(double time)
{
    double time_ms = time * UCS_MSEC_PER_SEC;
    uint8_t index, next_index;
    double avg_ms;

    for (index = 1; index < UCT_IB_FABRIC_TIME_MAX; index++) {
        /* the successor of the last index wraps around to entry 0 */
        next_index = (index + 1) % UCT_IB_FABRIC_TIME_MAX;
        if (time_ms > uct_ib_qp_rnr_time_ms[next_index]) {
            continue;
        }

        /* the time does not exceed the next entry: pick whichever of the two
         * adjacent entries lies on the same side of their midpoint */
        avg_ms = (uct_ib_qp_rnr_time_ms[index] +
                  uct_ib_qp_rnr_time_ms[next_index]) * 0.5;
        return (time_ms < avg_ms) ? index : next_index;
    }

    return 0; /* this is a special value that means the maximum value */
}
/*
 * Move a QP to the given state, modifying nothing but the state attribute.
 *
 * @param [in] qp     QP to modify.
 * @param [in] state  Target QP state.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if ibv_modify_qp() fails.
 */
ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state)
{
    struct ibv_qp_attr qp_attr;
    int rc;

    ucs_debug("modify QP 0x%x to state %d", qp->qp_num, state);

    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_attr.qp_state = state;

    rc = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
    if (rc == 0) {
        return UCS_OK;
    }

    ucs_warn("modify qp 0x%x to state %d failed: %m", qp->qp_num, state);
    return UCS_ERR_IO_ERROR;
}
/*
 * Build the list of transport device resources of an IB device: one entry,
 * named "<device>:<port>", for every port which passes
 * uct_ib_device_port_check() with the given flags.
 *
 * @param [in]  dev               Device whose ports are examined.
 * @param [in]  flags             Required UCT_IB_DEVICE_FLAG_xx capabilities.
 * @param [out] tl_devices_p      Filled with an array allocated by
 *                                ucs_calloc(); set on success only.
 * @param [out] num_tl_devices_p  Filled with the number of valid entries in
 *                                the array; set on success only.
 *
 * @return UCS_OK on success, UCS_ERR_NO_MEMORY if the allocation fails,
 *         UCS_ERR_NO_DEVICE if no port is compatible.
 */
ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
                                       uct_tl_device_resource_t **tl_devices_p,
                                       unsigned *num_tl_devices_p)
{
    uct_tl_device_resource_t *tl_devices, *entry;
    unsigned num_tl_devices = 0;
    ucs_status_t status;
    uint8_t port_num;

    /* The array has room for all the ports of the device, which may be more
     * than what ends up being used */
    tl_devices = ucs_calloc(dev->num_ports, sizeof(*tl_devices), "ib device resource");
    if (tl_devices == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    /* Add an entry for every port which supports the requested flags */
    for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports;
         ++port_num)
    {
        status = uct_ib_device_port_check(dev, port_num, flags);
        if (status != UCS_OK) {
            ucs_trace("%s:%d does not support flags 0x%x: %s",
                      uct_ib_device_name(dev), port_num, flags,
                      ucs_status_string(status));
            continue;
        }

        entry = &tl_devices[num_tl_devices++];
        ucs_snprintf_zero(entry->name, sizeof(entry->name),
                          "%s:%d", uct_ib_device_name(dev), port_num);
        entry->type = UCT_DEVICE_TYPE_NET;
    }

    if (num_tl_devices == 0) {
        ucs_debug("no compatible IB ports found for flags 0x%x", flags);
        ucs_free(tl_devices);
        return UCS_ERR_NO_DEVICE;
    }

    *num_tl_devices_p = num_tl_devices;
    *tl_devices_p     = tl_devices;
    return UCS_OK;
}
/*
 * Parse a resource name of the form "<device>:<port>" and validate it
 * against the given device.
 *
 * @param [in]  dev                Device the resource is expected to be on.
 * @param [in]  resource_dev_name  Resource name to parse.
 * @param [out] p_port_num         Filled with the port number; written on
 *                                 success only.
 *
 * @return UCS_OK on success. UCS_ERR_NO_DEVICE (and an error message) if the
 *         name has no ':' separator, the device part is not the name of
 *         'dev', the port part is not a plain decimal number, or the port is
 *         outside the port range of the device.
 */
ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
                                     const char *resource_dev_name,
                                     uint8_t *p_port_num)
{
    const char *ibdev_name, *port_str;
    unsigned long port_num;
    size_t devname_len;
    char *p;

    p = strrchr(resource_dev_name, ':');
    if (p == NULL) {
        goto err; /* Wrong device name format */
    }

    devname_len = p - resource_dev_name;
    ibdev_name  = uct_ib_device_name(dev);
    if ((strlen(ibdev_name) != devname_len) ||
        strncmp(ibdev_name, resource_dev_name, devname_len))
    {
        goto err; /* Device name is wrong */
    }

    /* The port must start with a decimal digit. This rejects an empty port
     * string as well as the leading whitespace and sign characters which
     * strtoul() would otherwise accept. */
    port_str = p + 1;
    if ((*port_str < '0') || (*port_str > '9')) {
        goto err; /* Failed to parse port number */
    }

    /* Parse as an integer, not as a floating point number, so that inputs
     * such as "1.0" or "1e0" are not taken for a valid port */
    port_num = strtoul(port_str, &p, 10);
    if (*p != '\0') {
        goto err; /* Failed to parse port number */
    }

    /* On overflow strtoul() returns ULONG_MAX, which fails the range check */
    if ((port_num < dev->first_port) ||
        (port_num >= (unsigned long)(dev->first_port + dev->num_ports))) {
        goto err; /* Port number out of range */
    }

    *p_port_num = port_num;
    return UCS_OK;

err:
    ucs_error("%s: failed to find port", resource_dev_name);
    return UCS_ERR_NO_DEVICE;
}
/*
 * Get the active MTU, in bytes, of the port named by a "<device>:<port>"
 * resource name.
 *
 * @param [in]  dev_name  Resource name of the port.
 * @param [in]  md        Memory domain; it must be a uct_ib_md_t.
 * @param [out] p_mtu     Filled with the MTU size; written on success only.
 *
 * @return UCS_OK, or the error of uct_ib_device_find_port().
 */
ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu)
{
    uct_ib_md_t *ib_md   = ucs_derived_of(md, uct_ib_md_t);
    uct_ib_device_t *dev = &ib_md->dev;
    ucs_status_t status;
    uint8_t port_num;

    status = uct_ib_device_find_port(dev, dev_name, &port_num);
    if (status == UCS_OK) {
        *p_mtu = uct_ib_mtu_value(uct_ib_device_port_attr(dev, port_num)->active_mtu);
    }

    return status;
}
/*
 * Check whether a raw 16-byte GID consists of zero bytes only.
 *
 * @param [in] gid_raw  Pointer to the 16 bytes of the GID. No particular
 *                      alignment is required.
 *
 * @return 1 if all 16 bytes are zero, 0 otherwise.
 */
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw)
{
    uint64_t halves[2];

    /* Copy the bytes instead of reading them through a casted uint64_t
     * pointer: the cast breaks the strict aliasing rule and may perform a
     * misaligned access when the buffer is not 8-byte aligned */
    memcpy(halves, gid_raw, sizeof(halves));
    return (halves[0] == 0) && (halves[1] == 0);
}
/*
 * Read a GID of a port and tell whether it is a RoCE v2 GID.
 *
 * @param [in]  dev         Device to query.
 * @param [in]  port_num    Port number.
 * @param [in]  gid_index   Index in the GID table of the port.
 * @param [out] gid         Filled with the GID; written on success only.
 * @param [out] is_roce_v2  Set to 1 if the port is a RoCE port and the major
 *                          RoCE version of the GID is at least 2, otherwise
 *                          to 0; written on success only.
 *
 * @return UCS_OK on success, the error of the GID query if it fails, or
 *         UCS_ERR_INVALID_ADDR if the GID is all zeros.
 */
ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
                                     unsigned gid_index, union ibv_gid *gid,
                                     int *is_roce_v2)
{
    uct_ib_device_gid_info_t gid_info;
    ucs_status_t status;
    int roce_v2 = 0;

    status = uct_ib_device_query_gid_info(dev, port_num, gid_index, &gid_info);
    if (status != UCS_OK) {
        return status;
    }

    /* an all-zero GID means that the table entry is not populated */
    if (uct_ib_device_is_gid_raw_empty(gid_info.gid.raw)) {
        ucs_error("Invalid gid[%d] on %s:%d", gid_index,
                  uct_ib_device_name(dev), port_num);
        return UCS_ERR_INVALID_ADDR;
    }

    *gid = gid_info.gid;

    /* the RoCE version is meaningful on RoCE ports only */
    if (uct_ib_device_is_port_roce(dev, port_num)) {
        roce_v2 = (gid_info.roce_version.major >= 2);
    }

    *is_roce_v2 = roce_v2;
    return UCS_OK;
}
/*
 * Get the maximal size of a memory region which can be registered with
 * on-demand paging (ODP) on the device.
 *
 * @param [in] dev  Device to check.
 *
 * @return 0 if ODP cannot be used: the verbs headers lack ODP capabilities,
 *         the process runs under valgrind, or the device does not report the
 *         required capabilities. Otherwise the maximal ODP region size
 *         reported by the device or, if that attribute is not available at
 *         compile time, 256 MB.
 */
size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev)
{
#if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS
const struct ibv_exp_device_attr *dev_attr = &dev->dev_attr;
/* UD needs ODP support for send only; RC needs send, write and read */
uint32_t required_ud_odp_caps = IBV_EXP_ODP_SUPPORT_SEND;
uint32_t required_rc_odp_caps = IBV_EXP_ODP_SUPPORT_SEND |
IBV_EXP_ODP_SUPPORT_WRITE |
IBV_EXP_ODP_SUPPORT_READ;
if (RUNNING_ON_VALGRIND ||
!IBV_EXP_HAVE_ODP(dev_attr) ||
!ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, rc), required_rc_odp_caps) ||
!ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, ud), required_ud_odp_caps))
{
return 0;
}
/* A device with DC must support for DC the same capabilities as for RC.
 * If the DC ODP capabilities are not available at compile time, the
 * condition below reduces to "the device has DC", so ODP is disabled on
 * every DC-capable device. */
if (IBV_DEVICE_HAS_DC(dev)
# if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS_PER_TRANSPORT_CAPS_DC_ODP_CAPS
&& !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, dc), required_rc_odp_caps)
# endif
)
{
return 0;
}
# if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE
return dev_attr->odp_mr_max_size;
# else
return 1ul << 28; /* Limit ODP to 256 MB by default */
# endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE */
#else
return 0;
#endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS */
}
/*
 * Get a textual description of a work completion status; this is a wrapper
 * around ibv_wc_status_str().
 */
const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status)
{
    const char *status_str = ibv_wc_status_str(wc_status);

    return status_str;
}
/*
 * Create an address handle. On failure, an error message which lists the
 * requested attributes is printed.
 *
 * @param [in]  dev      Not used by this function.
 * @param [in]  ah_attr  Attributes of the address handle.
 * @param [in]  pd       Protection domain to create the address handle on.
 * @param [out] ah_p     Filled with the new address handle; written on
 *                       success only.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_ADDR if ibv_create_ah() fails.
 */
static ucs_status_t uct_ib_device_create_ah(uct_ib_device_t *dev,
                                            struct ibv_ah_attr *ah_attr,
                                            struct ibv_pd *pd,
                                            struct ibv_ah **ah_p)
{
    struct ibv_ah *ah;
    char buf[128];
    char *p, *endp;

    ah = ibv_create_ah(pd, ah_attr);
    if (ah != NULL) {
        *ah_p = ah;
        return UCS_OK;
    }

    /* Describe the attributes piece by piece; 'p' always points to the end
     * of the text written so far */
    p    = buf;
    endp = buf + sizeof(buf);
    snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d",
             ah_attr->dlid, ah_attr->sl,
             ah_attr->port_num, ah_attr->src_path_bits);
    p += strlen(p);

    /* the global routing fields are printed only when they are in use */
    if (ah_attr->is_global) {
        snprintf(p, endp - p, " dgid=");
        p += strlen(p);
        inet_ntop(AF_INET6, &ah_attr->grh.dgid, p, endp - p);
        p += strlen(p);
        snprintf(p, endp - p, " sgid_index=%d traffic_class=%d",
                 ah_attr->grh.sgid_index, ah_attr->grh.traffic_class);
    }

    ucs_error("ibv_create_ah(%s) failed: %m", buf);
    return UCS_ERR_INVALID_ADDR;
}
/*
 * Get an address handle for the given attributes from the device's
 * address-handle cache, creating and caching a new one if the cache holds no
 * entry with the same attributes. The cache is accessed under dev->ah_lock.
 *
 * @param [in]  dev      Device which owns the cache.
 * @param [in]  ah_attr  Attributes of the address handle; used as the key.
 * @param [in]  pd       Protection domain used when a new address handle has
 *                       to be created.
 * @param [out] ah_p     Filled with the cached or the new address handle.
 *
 * @return UCS_OK on success, the error of the address handle creation, or
 *         UCS_ERR_NO_MEMORY if the new handle could not be stored in the
 *         cache (in which case it is destroyed again).
 */
ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
                                            struct ibv_ah_attr *ah_attr,
                                            struct ibv_pd *pd,
                                            struct ibv_ah **ah_p)
{
    ucs_status_t status = UCS_OK;
    khiter_t iter;
    int ret;

    ucs_spin_lock(&dev->ah_lock);

    /* reuse an existing AH which has the same attributes, if there is one */
    iter = kh_get(uct_ib_ah, &dev->ah_hash, *ah_attr);
    if (iter != kh_end(&dev->ah_hash)) {
        *ah_p = kh_value(&dev->ah_hash, iter);
        goto unlock;
    }

    /* not cached yet - create a new AH */
    status = uct_ib_device_create_ah(dev, ah_attr, pd, ah_p);
    if (status != UCS_OK) {
        goto unlock;
    }

    /* remember the new AH; if it cannot be stored, roll the creation back */
    iter = kh_put(uct_ib_ah, &dev->ah_hash, *ah_attr, &ret);
    if (iter == kh_end(&dev->ah_hash)) {
        ibv_destroy_ah(*ah_p);
        status = UCS_ERR_NO_MEMORY;
        goto unlock;
    }

    kh_value(&dev->ah_hash, iter) = *ah_p;

unlock:
    ucs_spin_unlock(&dev->ah_lock);
    return status;
}
/*
 * Select the CQE size to use.
 *
 * @param [in] cqe_size_min  Minimal CQE size required by the caller.
 *
 * @return The largest of cqe_size_min, the cache line size and 64, capped by
 *         the maximal CQE size. The maximum is 128, except on one specific
 *         aarch64 CPU model on Huawei boards, where it is 64.
 */
int uct_ib_get_cqe_size(int cqe_size_min)
{
/* The maximum is detected on the first call and cached; -1 means "not
 * detected yet" */
static int cqe_size_max = -1;
int cqe_size;
if (cqe_size_max == -1) {
#ifdef __aarch64__
char arm_board_vendor[128];
ucs_aarch64_cpuid_t cpuid;
ucs_aarch64_cpuid(&cpuid);
/* Start from an empty string: the result of the read below is not checked */
arm_board_vendor[0] = '\0';
ucs_read_file(arm_board_vendor, sizeof(arm_board_vendor), 1,
"/sys/devices/virtual/dmi/id/board_vendor");
ucs_debug("arm_board_vendor is '%s'", arm_board_vendor);
/* Use 64 only if the board vendor contains "Huawei" (case-insensitive) and
 * all the CPU id fields match the values below */
cqe_size_max = ((strcasestr(arm_board_vendor, "Huawei")) &&
(cpuid.implementer == 0x41) && (cpuid.architecture == 8) &&
(cpuid.variant == 0) && (cpuid.part == 0xd08) &&
(cpuid.revision == 2))
? 64 : 128;
#else
cqe_size_max = 128;
#endif
ucs_debug("max IB CQE size is %d", cqe_size_max);
}
/* Set cqe size according to inline size and cache line size. */
cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
cqe_size = ucs_max(cqe_size, 64); /* at least 64 */
cqe_size = ucs_min(cqe_size, cqe_size_max);
return cqe_size;
}