/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
 * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#include "ib_device.h"
#include "ib_md.h"

/* NOTE(review): the header names of the system/UCS includes below were missing
 * from the reviewed text and were restored from the symbols used in this file;
 * confirm the list against the build. */
#include <ucs/arch/bitops.h>
#include <ucs/debug/memtrack.h>
#include <ucs/debug/log.h>
#include <ucs/async/async.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/string.h>
#include <ucs/sys/sys.h>
#include <sys/poll.h>
#include <sched.h>


/* A GID table entry together with the RoCE version it is associated with */
typedef struct {
    union ibv_gid gid;
    struct {
        uint8_t   major;
        uint8_t   minor;
    } roce_version;
} uct_ib_device_gid_info_t;


/* This table is according to "Encoding for RNR NAK Timer Field"
 * in IBTA specification */
const double uct_ib_qp_rnr_time_ms[] = {
    655.36,  0.01,  0.02,   0.03,   0.04,   0.06,   0.08,   0.12,
      0.16,  0.24,  0.32,   0.48,   0.64,   0.96,   1.28,   1.92,
      2.56,  3.84,  5.12,   7.68,  10.24,  15.36,  20.48,  30.72,
     40.96, 61.44, 81.92, 122.88, 163.84, 245.76, 327.68, 491.52
};


/* use both gid + lid data for key generation (lid - ib based, gid - RoCE) */
static UCS_F_ALWAYS_INLINE
khint32_t uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)
{
    return kh_int64_hash_func(attr.grh.dgid.global.subnet_prefix ^
                              attr.grh.dgid.global.interface_id  ^
                              attr.dlid);
}

/* Two address handle attributes are the same key only if they are
 * byte-wise identical */
static UCS_F_ALWAYS_INLINE
int uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a, struct ibv_ah_attr b)
{
    return !memcmp(&a, &b, sizeof(a));
}

KHASH_IMPL(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1,
           uct_ib_kh_ah_hash_func, uct_ib_kh_ah_hash_equal)


#if ENABLE_STATS
static ucs_stats_class_t uct_ib_device_stats_class = {
    .name           = "",
    .num_counters   = UCT_IB_DEVICE_STAT_LAST,
    .counter_names  = {
        [UCT_IB_DEVICE_STAT_ASYNC_EVENT] = "async_event"
    }
};
#endif

/* Built-in device specifications, matched by PCI vendor/device id. The
 * "Generic HCA" entry is returned for devices which match nothing else, and
 * the entry with a NULL name terminates the table. */
static uct_ib_device_spec_t uct_ib_builtin_device_specs[] = {
  {"ConnectX-3", {0x15b3, 4099},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 10},
  {"ConnectX-3 Pro", {0x15b3, 4103},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 11},
  {"Connect-IB", {0x15b3, 4113},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 20},
  {"ConnectX-4", {0x15b3, 4115},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 30},
  {"ConnectX-4", {0x15b3, 4116},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 29},
  {"ConnectX-4 LX", {0x15b3, 4117},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-4 LX VF", {0x15b3, 4118},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-5", {0x15b3, 4119},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 38},
  {"ConnectX-5", {0x15b3, 4121},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 40},
  {"ConnectX-5", {0x15b3, 4120},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 39},
  {"ConnectX-5", {0x15b3, 41682},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 37},
  {"ConnectX-5", {0x15b3, 4122},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 36},
  {"ConnectX-6", {0x15b3, 4123},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 VF", {0x15b3, 4124},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 DX", {0x15b3, 4125},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"Generic HCA", {0, 0}, 0, 0},
  {NULL}
};


/*
 * Fill 'cpu_mask' with the CPUs listed in the device's "local_cpus" sysfs
 * file, and 'numa_node' with the device NUMA node (-1 if it cannot be read).
 * If "local_cpus" cannot be read, all CPUs are reported as local.
 */
static void uct_ib_device_get_locality(const char *dev_name,
                                       ucs_sys_cpuset_t *cpu_mask,
                                       int *numa_node)
{
    char *p, buf[ucs_max(CPU_SETSIZE, 10)];
    ucs_status_t status;
    ssize_t nread;
    uint32_t word;
    int base, k;
    long n;

    /* Read list of CPUs close to the device */
    CPU_ZERO(cpu_mask);
    nread = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_DEVICE_SYSFS_FMT,
                          dev_name, "local_cpus");
    if (nread >= 0) {
        buf[CPU_SETSIZE - 1] = '\0';
        base = 0;
        /* The file is a comma-separated list of 32-bit hex words; consume it
         * from the last word to the first, 32 CPU indexes per word */
        do {
            p = strrchr(buf, ',');
            if (p == NULL) {
                p = buf;
            } else if (*p == ',') {
                *(p++) = 0; /* cut the word off, so the next pass finds the
                               previous separator */
            }

            word = strtoul(p, 0, 16);
            for (k = 0; word; ++k, word >>= 1) {
                if (word & 1) {
                    CPU_SET(base + k, cpu_mask);
                }
            }
            base += 32;
        } while ((base < CPU_SETSIZE) && (p != buf));
    } else {
        /* If affinity file is not present, treat all CPUs as local */
        for (k = 0; k < CPU_SETSIZE; ++k) {
            CPU_SET(k, cpu_mask);
        }
    }

    /* Read NUMA node number */
    status = ucs_read_file_number(&n, 1,
                                  "/sys/class/infiniband/%s/device/numa_node",
                                  dev_name);
    *numa_node = (status == UCS_OK) ? n : -1;
}

/*
 * Async fd callback: fetch one async event from the device context, log it
 * with a level which depends on the event type, update the statistics counter
 * and acknowledge the event.
 *
 * @param fd   File descriptor which triggered the callback (unused).
 * @param arg  The uct_ib_device_t the handler was registered with.
 */
static void uct_ib_async_event_handler(int fd, void *arg)
{
    uct_ib_device_t *dev = arg;
    struct ibv_async_event event;
    ucs_log_level_t level;
    char event_info[200];
    int ret;

    ret = ibv_get_async_event(dev->ibv_context, &event);
    if (ret != 0) {
        /* EAGAIN is not reported: the fd is set to non-blocking mode by
         * uct_ib_device_init() */
        if (errno != EAGAIN) {
            ucs_warn("ibv_get_async_event() failed: %m");
        }
        return;
    }

    switch (event.event_type) {
    case IBV_EVENT_CQ_ERR:
        snprintf(event_info, sizeof(event_info), "%s on CQ %p",
                 ibv_event_type_str(event.event_type), event.element.cq);
        level = UCS_LOG_LEVEL_ERROR;
        break;
    case IBV_EVENT_QP_FATAL:
    case IBV_EVENT_QP_REQ_ERR:
    case IBV_EVENT_QP_ACCESS_ERR:
    case IBV_EVENT_COMM_EST:
    case IBV_EVENT_SQ_DRAINED:
    case IBV_EVENT_PATH_MIG:
    case IBV_EVENT_PATH_MIG_ERR:
        snprintf(event_info, sizeof(event_info), "%s on QPN 0x%x",
                 ibv_event_type_str(event.event_type),
                 event.element.qp->qp_num);
        level = UCS_LOG_LEVEL_ERROR;
        break;
    case IBV_EVENT_QP_LAST_WQE_REACHED:
        snprintf(event_info, sizeof(event_info),
                 "SRQ-attached QP 0x%x was flushed",
                 event.element.qp->qp_num);
        level = UCS_LOG_LEVEL_DEBUG;
        break;
    case IBV_EVENT_SRQ_ERR:
        level = UCS_LOG_LEVEL_ERROR;
        snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
                 ibv_event_type_str(event.event_type), event.element.srq);
        break;
    case IBV_EVENT_SRQ_LIMIT_REACHED:
        snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
                 ibv_event_type_str(event.event_type), event.element.srq);
        level = UCS_LOG_LEVEL_DEBUG;
        break;
    case IBV_EVENT_DEVICE_FATAL:
    case IBV_EVENT_PORT_ERR:
        snprintf(event_info, sizeof(event_info), "%s on port %d",
                 ibv_event_type_str(event.event_type),
                 event.element.port_num);
        level = UCS_LOG_LEVEL_ERROR;
        break;
    case IBV_EVENT_PORT_ACTIVE:
#if HAVE_DECL_IBV_EVENT_GID_CHANGE
    case IBV_EVENT_GID_CHANGE:
#endif
    case IBV_EVENT_LID_CHANGE:
    case IBV_EVENT_PKEY_CHANGE:
    case IBV_EVENT_SM_CHANGE:
    case IBV_EVENT_CLIENT_REREGISTER:
        snprintf(event_info, sizeof(event_info), "%s on port %d",
                 ibv_event_type_str(event.event_type),
                 event.element.port_num);
        level = UCS_LOG_LEVEL_WARN;
        break;
#if HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
    case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
        snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
                 "DCT key violation", event.element.dct->dct_num);
        level = UCS_LOG_LEVEL_ERROR;
        break;
    case IBV_EXP_EVENT_DCT_ACCESS_ERR:
        if (event.element.dct) {
            snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
                     "DCT access error", event.element.dct->dct_num);
        } else {
            snprintf(event_info, sizeof(event_info), "%s on DCTN UNKNOWN",
                     "DCT access error");
        }
        level = UCS_LOG_LEVEL_ERROR;
        break;
    case IBV_EXP_EVENT_DCT_REQ_ERR:
        snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
                 "DCT requester error", event.element.dct->dct_num);
        level = UCS_LOG_LEVEL_ERROR;
        break;
#endif
    default:
        snprintf(event_info, sizeof(event_info), "%s (%d)",
                 ibv_event_type_str(event.event_type), event.event_type);
        level = UCS_LOG_LEVEL_INFO;
        break;
    }

    UCS_STATS_UPDATE_COUNTER(dev->stats, UCT_IB_DEVICE_STAT_ASYNC_EVENT, +1);
    ucs_log(level, "IB Async event on %s: %s", uct_ib_device_name(dev),
            event_info);
    ibv_ack_async_event(&event);
}

/*
 * Read the PCI vendor and device ids of the device from sysfs into
 * dev->pci_id. If either cannot be read, both are set to 0 and a warning is
 * printed.
 */
static void uct_ib_device_get_ids(uct_ib_device_t *dev)
{
    long vendor_id, device_id;

    if ((ucs_read_file_number(&vendor_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                              uct_ib_device_name(dev), "vendor") == UCS_OK) &&
        (ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                              uct_ib_device_name(dev), "device") == UCS_OK)) {
        dev->pci_id.vendor = vendor_id;
        dev->pci_id.device = device_id;
        ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev),
                  dev->pci_id.vendor, dev->pci_id.device);
    } else {
        dev->pci_id.vendor = 0;
        dev->pci_id.device = 0;
        ucs_warn("%s: could not read device/vendor id from sysfs, "
                 "performance may be affected", uct_ib_device_name(dev));
    }
}

/*
 * Query device attributes, the attributes of all its ports, and its PCI ids.
 *
 * @return UCS_OK on success, the status of uct_ib_query_device() if it fails,
 *         UCS_ERR_UNSUPPORTED if the device has more than
 *         UCT_IB_DEV_MAX_PORTS ports, UCS_ERR_IO_ERROR if a port query fails.
 */
ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
                                 struct ibv_device *ibv_device)
{
    ucs_status_t status;
    uint8_t i;
    int ret;

    status = uct_ib_query_device(dev->ibv_context, &dev->dev_attr);
    if (status != UCS_OK) {
        return status;
    }

    /* Check device type */
    switch (ibv_device->node_type) {
    case IBV_NODE_SWITCH:
        dev->first_port = 0;
        dev->num_ports  = 1;
        break;
    case IBV_NODE_CA:
    default:
        dev->first_port = 1;
        dev->num_ports  = IBV_DEV_ATTR(dev, phys_port_cnt);
        break;
    }

    if (dev->num_ports > UCT_IB_DEV_MAX_PORTS) {
        ucs_error("%s has %d ports, but only up to %d are supported",
                  ibv_get_device_name(ibv_device), dev->num_ports,
                  UCT_IB_DEV_MAX_PORTS);
        return UCS_ERR_UNSUPPORTED;
    }

    /* Query all ports */
    for (i = 0; i < dev->num_ports; ++i) {
        ret = ibv_query_port(dev->ibv_context, i + dev->first_port,
                             &dev->port_attr[i]);
        if (ret != 0) {
            ucs_error("ibv_query_port() returned %d: %m", ret);
            return UCS_ERR_IO_ERROR;
        }
    }

    uct_ib_device_get_ids(dev);

    return UCS_OK;
}

/*
 * Initialize the device: read its locality, allocate the statistics node,
 * switch the async fd to non-blocking mode, optionally register the async
 * event handler, and initialize the address handle cache and its lock.
 *
 * @param async_events  If nonzero, register a handler for IB async events.
 *
 * @return UCS_OK, or the status of the step which failed. The statistics node
 *         is released on failure.
 */
ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
                                struct ibv_device *ibv_device, int async_events
                                UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
    ucs_status_t status;

    dev->async_events = async_events;

    uct_ib_device_get_locality(ibv_get_device_name(ibv_device),
                               &dev->local_cpus, &dev->numa_node);

    status = UCS_STATS_NODE_ALLOC(&dev->stats, &uct_ib_device_stats_class,
                                  stats_parent, "device");
    if (status != UCS_OK) {
        goto err;
    }

    status = ucs_sys_fcntl_modfl(dev->ibv_context->async_fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_release_stats;
    }

    /* Register to IB async events */
    if (dev->async_events) {
        status = ucs_async_set_event_handler(UCS_ASYNC_THREAD_LOCK_TYPE,
                                             dev->ibv_context->async_fd,
                                             UCS_EVENT_SET_EVREAD,
                                             uct_ib_async_event_handler, dev,
                                             NULL);
        if (status != UCS_OK) {
            goto err_release_stats;
        }
    }

    kh_init_inplace(uct_ib_ah, &dev->ah_hash);
    ucs_spinlock_init(&dev->ah_lock);

    ucs_debug("initialized device '%s' (%s) with %d ports",
              uct_ib_device_name(dev),
              ibv_node_type_str(ibv_device->node_type), dev->num_ports);
    return UCS_OK;

err_release_stats:
    UCS_STATS_NODE_FREE(dev->stats);
err:
    return status;
}

/* Destroy every address handle stored in the device cache. The hash table
 * itself is released by uct_ib_device_cleanup(). */
void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev)
{
    struct ibv_ah *ah;

    kh_foreach_value(&dev->ah_hash, ah, ibv_destroy_ah(ah));
}

/* Release the resources set up by uct_ib_device_init(): the address handle
 * hash, its lock, the async event handler (if registered) and the statistics
 * node. */
void uct_ib_device_cleanup(uct_ib_device_t *dev)
{
    ucs_status_t status;

    ucs_debug("destroying ib device %s", uct_ib_device_name(dev));

    kh_destroy_inplace(uct_ib_ah, &dev->ah_hash);

    status = ucs_spinlock_destroy(&dev->ah_lock);
    if (status != UCS_OK) {
        ucs_warn("ucs_spinlock_destroy() failed (%d)", status);
    }

    if (dev->async_events) {
        ucs_async_remove_handler(dev->ibv_context->async_fd, 1);
    }

    UCS_STATS_NODE_FREE(dev->stats);
}

/* Nonzero if the PCI vendor and device ids of 'spec' match those of 'dev' */
static inline int uct_ib_device_spec_match(uct_ib_device_t *dev,
                                           const uct_ib_device_spec_t *spec)
{
    return (spec->pci_id.vendor == dev->pci_id.vendor) &&
           (spec->pci_id.device == dev->pci_id.device);
}

/*
 * Find the specification of the device: the configured custom devices are
 * searched first, then the built-in table.
 *
 * @return The matching entry; never NULL (see the comment at the return
 *         statement for the no-match case).
 */
const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev)
{
    uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
    uct_ib_device_spec_t *spec;

    /* search through devices specified in the configuration */
    for (spec = md->custom_devices.specs;
         spec < md->custom_devices.specs + md->custom_devices.count; ++spec) {
        if (uct_ib_device_spec_match(dev, spec)) {
            return spec;
        }
    }

    /* search through built-in list of device specifications */
    spec = uct_ib_builtin_device_specs;
    while ((spec->name != NULL) && !uct_ib_device_spec_match(dev, spec)) {
        ++spec;
    }
    return spec; /* if no match is found, return the last entry, which contains
                    default settings for unknown devices */
}

/* GID index to use on an IB port: the configured one, or
 * UCT_IB_MD_DEFAULT_GID_INDEX if the configuration is "auto" */
static size_t uct_ib_device_get_ib_gid_index(uct_ib_md_t *md)
{
    if (md->config.gid_index == UCS_ULUNITS_AUTO) {
        return UCT_IB_MD_DEFAULT_GID_INDEX;
    } else {
        return md->config.gid_index;
    }
}

/* Nonzero if the device transport type is iWarp */
static int uct_ib_device_is_iwarp(uct_ib_device_t *dev)
{
    return dev->ibv_context->device->transport_type == IBV_TRANSPORT_IWARP;
}

/*
 * Check whether a port can be used with the requested flags.
 *
 * @param port_num  Port number, in the device's own numbering.
 * @param flags     UCT_IB_DEVICE_FLAG_xx requirements.
 *
 * @return UCS_OK if the port is usable; UCS_ERR_NO_DEVICE if the port number
 *         is out of range; UCS_ERR_UNREACHABLE if the port is not active;
 *         UCS_ERR_UNSUPPORTED if a requirement is not met or the subnet
 *         filter does not match; or the status of a failed GID query.
 */
ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
                                      unsigned flags)
{
    uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
    const uct_ib_device_spec_t *dev_info;
    uint8_t required_dev_flags;
    ucs_status_t status;
    union ibv_gid gid;
    int is_roce_v2;

    if (port_num < dev->first_port ||
        port_num >= dev->first_port + dev->num_ports) {
        return UCS_ERR_NO_DEVICE;
    }

    if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) {
        ucs_trace("%s:%d is not active (state: %d)", uct_ib_device_name(dev),
                  port_num, uct_ib_device_port_attr(dev, port_num)->state);
        return UCS_ERR_UNREACHABLE;
    }

    if (uct_ib_device_is_iwarp(dev)) {
        /* TODO: enable it when support is ready */
        ucs_debug("iWarp device %s is not supported", uct_ib_device_name(dev));
        return UCS_ERR_UNSUPPORTED;
    }

    if (!uct_ib_device_is_port_ib(dev, port_num) &&
        (flags & UCT_IB_DEVICE_FLAG_LINK_IB)) {
        ucs_debug("%s:%d is not IB link layer", uct_ib_device_name(dev),
                  port_num);
        return UCS_ERR_UNSUPPORTED;
    }

    if (flags & UCT_IB_DEVICE_FLAG_DC) {
        if (!IBV_DEVICE_HAS_DC(dev)) {
            ucs_trace("%s:%d does not support DC", uct_ib_device_name(dev),
                      port_num);
            return UCS_ERR_UNSUPPORTED;
        }
    }

    /* check generic device flags */
    dev_info           = uct_ib_device_spec(dev);
    required_dev_flags = flags & (UCT_IB_DEVICE_FLAG_MLX4_PRM |
                                  UCT_IB_DEVICE_FLAG_MLX5_PRM);
    if (!ucs_test_all_flags(dev_info->flags, required_dev_flags)) {
        ucs_trace("%s:%d (%s) does not support flags 0x%x",
                  uct_ib_device_name(dev), port_num, dev_info->name,
                  required_dev_flags);
        return UCS_ERR_UNSUPPORTED;
    }

    /* The subnet filter is applied to IB link layer ports only */
    if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) {
        status = uct_ib_device_query_gid(dev, port_num,
                                         uct_ib_device_get_ib_gid_index(md),
                                         &gid, &is_roce_v2);
        if (status != UCS_OK) {
            return status;
        }

        ucs_assert(is_roce_v2 == 0);
        if (md->subnet_filter != gid.global.subnet_prefix) {
            ucs_trace("%s:%d subnet_prefix does not match",
                      uct_ib_device_name(dev), port_num);
            return UCS_ERR_UNSUPPORTED;
        }
    }

    return UCS_OK;
}

static int uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr *raw,
                                            const uint32_t addr_last_bits)
{
    /* IPv4 encoded multicast addresses */
    return (raw->s6_addr32[0] == htonl(0xff0e0000)) &&
           !(raw->s6_addr32[1] | addr_last_bits);
}

/*
 * Classify a GID as AF_INET or AF_INET6.
 *
 * AF_INET is returned when the first two 32-bit words are zero and the third
 * equals htonl(0x0000ffff), or when the GID is an IPv4 encoded multicast
 * address; AF_INET6 otherwise.
 *
 * @param gid_index  Used for the debug message only.
 */
static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid,
                                                 int gid_index)
{
    const struct in6_addr *raw    = (struct in6_addr *)gid->raw;
    /* zero iff the third 32-bit word is 0x0000ffff in network order */
    const uint32_t addr_last_bits = raw->s6_addr32[2] ^ htonl(0x0000ffff);
    char p[128];

    ucs_debug("testing addr_family on gid index %d: %s", gid_index,
              inet_ntop(AF_INET6, gid, p, sizeof(p)));

    if (!((raw->s6_addr32[0] | raw->s6_addr32[1]) | addr_last_bits) ||
        uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) {
        return AF_INET;
    } else {
        return AF_INET6;
    }
}

/*
 * Read a GID table entry and the RoCE version associated with it.
 *
 * With ibv_exp_query_gid_attr() available, both are taken from the returned
 * attributes. Otherwise the GID is read by ibv_query_gid() and the version is
 * parsed from the sysfs "gid_attrs/types" file; if that file cannot be read,
 * version 1.0 is assumed.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR for an unknown GID type,
 *         UCS_ERR_INVALID_PARAM if the query fails or the type string cannot
 *         be parsed.
 */
static ucs_status_t
uct_ib_device_query_gid_info(uct_ib_device_t *dev, uint8_t port_num,
                             unsigned gid_index, uct_ib_device_gid_info_t *info)
{
    int ret;

#if HAVE_DECL_IBV_EXP_QUERY_GID_ATTR
    struct ibv_exp_gid_attr attr;

    attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE | IBV_EXP_QUERY_GID_ATTR_GID;
    ret = ibv_exp_query_gid_attr(dev->ibv_context, port_num, gid_index, &attr);
    if (ret == 0) {
        info->gid = attr.gid;
        switch (attr.type) {
        case IBV_EXP_IB_ROCE_V1_GID_TYPE:
            info->roce_version.major = 1;
            info->roce_version.minor = 0;
            return UCS_OK;
        case IBV_EXP_ROCE_V1_5_GID_TYPE:
            info->roce_version.major = 1;
            info->roce_version.minor = 5;
            return UCS_OK;
        case IBV_EXP_ROCE_V2_GID_TYPE:
            info->roce_version.major = 2;
            info->roce_version.minor = 0;
            return UCS_OK;
        default:
            ucs_error("Invalid GID[%d] type on %s:%d: %d", gid_index,
                      uct_ib_device_name(dev), port_num, attr.type);
            return UCS_ERR_IO_ERROR;
        }
    }
#else
#define UCT_IB_SYSFS_GID_TYPE_FMT \
    "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d"
    char buf[16];

    ret = ibv_query_gid(dev->ibv_context, port_num, gid_index, &info->gid);
    if (ret == 0) {
        ret = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_SYSFS_GID_TYPE_FMT,
                            uct_ib_device_name(dev), port_num, gid_index);
        if (ret > 0) {
            if (!strncmp(buf, "IB/RoCE v1", 10)) {
                info->roce_version.major = 1;
                info->roce_version.minor = 0;
            } else if (!strncmp(buf, "RoCE v2", 7)) {
                info->roce_version.major = 2;
                info->roce_version.minor = 0;
            } else {
                ucs_error("failed to parse gid type '%s' (dev=%s port=%d index=%d)",
                          buf, uct_ib_device_name(dev), port_num, gid_index);
                return UCS_ERR_INVALID_PARAM;
            }
        } else {
            /* the type file could not be read - fall back to version 1.0 */
            info->roce_version.major = 1;
            info->roce_version.minor = 0;
        }
        return UCS_OK;
    }
#endif

    /* reached only if the GID query itself failed */
    ucs_error("ibv_query_gid(dev=%s port=%d index=%d) failed: %m",
              uct_ib_device_name(dev), port_num, gid_index);
    return UCS_ERR_INVALID_PARAM;
}

/*
 * Test whether a RoCE GID table entry is usable, by creating (and immediately
 * destroying) an address handle which uses 'gid_index' as the source GID
 * index and 'gid' as the destination GID.
 *
 * @return 1 if the address handle could be created, 0 otherwise.
 */
int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
                                      const union ibv_gid *gid,
                                      uint8_t gid_index)
{
    struct ibv_ah_attr ah_attr;
    struct ibv_ah *ah;

    ucs_assert(uct_ib_device_is_port_roce(dev, port_num));

    memset(&ah_attr, 0, sizeof(ah_attr));
    ah_attr.port_num       = port_num;
    ah_attr.is_global      = 1;
    ah_attr.grh.dgid       = *gid;
    ah_attr.grh.sgid_index = gid_index;
    ah_attr.grh.hop_limit  = 255;

    ah = ibv_create_ah(ucs_container_of(dev, uct_ib_md_t, dev)->pd, &ah_attr);
    if (ah == NULL) {
        return 0; /* gid entry is not operational */
    }

    ibv_destroy_ah(ah);
    return 1;
}

/*
 * Select the GID index of a RoCE port: the first operational GID table entry
 * which matches the highest-priority (RoCE version, address family) pair.
 * If no entry matches, UCT_IB_MD_DEFAULT_GID_INDEX is selected.
 *
 * @return UCS_OK, or the status of a failed GID table query (in which case
 *         *gid_index is not set).
 */
static ucs_status_t
uct_ib_device_set_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
                                 uint8_t *gid_index)
{
    static const uct_ib_roce_version_desc_t roce_prio[] = {
        {2, 0, AF_INET},
        {2, 0, AF_INET6},
        {1, 0, AF_INET},
        {1, 0, AF_INET6}
    };
    int gid_tbl_len        = uct_ib_device_port_attr(dev, port_num)->gid_tbl_len;
    ucs_status_t status    = UCS_OK;
    int priorities_arr_len = ucs_static_array_size(roce_prio);
    uct_ib_device_gid_info_t gid_info;
    int i, prio_idx;

    /* search for matching GID table entries, according to the order defined
     * in priorities array */
    for (prio_idx = 0; prio_idx < priorities_arr_len; prio_idx++) {
        for (i = 0; i < gid_tbl_len; i++) {
            status = uct_ib_device_query_gid_info(dev, port_num, i, &gid_info);
            if (status != UCS_OK) {
                goto out;
            }

            if ((roce_prio[prio_idx].roce_major == gid_info.roce_version.major) &&
                (roce_prio[prio_idx].roce_minor == gid_info.roce_version.minor) &&
                (roce_prio[prio_idx].address_family ==
                 uct_ib_device_get_addr_family(&gid_info.gid, i)) &&
                uct_ib_device_test_roce_gid_index(dev, port_num, &gid_info.gid, i)) {
                *gid_index = i;
                goto out_print;
            }
        }
    }

    *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX;

out_print:
    ucs_debug("%s:%d using gid_index %d", uct_ib_device_name(dev), port_num,
              *gid_index);
out:
    return status;
}

/* Nonzero if the port link layer is InfiniBand. Always 1 when
 * IBV_LINK_LAYER_INFINIBAND is not declared. */
int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num)
{
#if HAVE_DECL_IBV_LINK_LAYER_INFINIBAND
    return uct_ib_device_port_attr(dev, port_num)->link_layer ==
           IBV_LINK_LAYER_INFINIBAND;
#else
    return 1;
#endif
}

/* Nonzero if the port link layer is Ethernet */
int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num)
{
    return IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_device_port_attr(dev, port_num));
}

/*
 * Select the GID index to use on a port.
 *
 * @param md_config_index  Configured GID index, or UCS_ULUNITS_AUTO to select
 *                         automatically.
 * @param gid_index        Filled with the selected index.
 *
 * @return UCS_OK, or the status of a failed RoCE GID table scan.
 */
ucs_status_t uct_ib_device_select_gid_index(uct_ib_device_t *dev,
                                            uint8_t port_num,
                                            size_t md_config_index,
                                            uint8_t *gid_index)
{
    ucs_status_t status = UCS_OK;

    if (md_config_index == UCS_ULUNITS_AUTO) {
        if (uct_ib_device_is_port_roce(dev, port_num)) {
            status = uct_ib_device_set_roce_gid_index(dev, port_num, gid_index);
        } else {
            *gid_index = UCT_IB_MD_DEFAULT_GID_INDEX;
        }
    } else {
        *gid_index = md_config_index;
    }

    return status;
}

/* Name of the verbs device behind 'dev' */
const char *uct_ib_device_name(uct_ib_device_t *dev)
{
    return ibv_get_device_name(dev->ibv_context->device);
}

/* Convert an MTU enumeration value to a size in bytes. An unknown value is
 * fatal. */
size_t uct_ib_mtu_value(enum ibv_mtu mtu)
{
    switch (mtu) {
    case IBV_MTU_256:
        return 256;
    case IBV_MTU_512:
        return 512;
    case IBV_MTU_1024:
        return 1024;
    case IBV_MTU_2048:
        return 2048;
    case IBV_MTU_4096:
        return 4096;
    }
    ucs_fatal("Invalid MTU value (%d)", mtu);
}

/*
 * Encode a timeout as the rounded base-2 logarithm of (time / 4.096e-6).
 *
 * @return 1 if the exponent is below 1, 0 ("no timeout") if the rounded
 *         exponent reaches UCT_IB_FABRIC_TIME_MAX, the rounded exponent
 *         otherwise.
 */
uint8_t uct_ib_to_qp_fabric_time(double time)
{
    double to;

    to = log(time / 4.096e-6) / log(2.0);
    if (to < 1) {
        return 1; /* Very small timeout */
    } else if ((long)(to + 0.5) >= UCT_IB_FABRIC_TIME_MAX) {
        return 0; /* No timeout */
    } else {
        return (long)(to + 0.5);
    }
}

/*
 * Find the index in uct_ib_qp_rnr_time_ms whose value is the closest to the
 * given time.
 *
 * @param time  Multiplied by UCS_MSEC_PER_SEC before being compared with the
 *              table, which is in milliseconds.
 *
 * @return Index in uct_ib_qp_rnr_time_ms; 0 selects the maximum value.
 */
uint8_t uct_ib_to_rnr_fabric_time(double time)
{
    double time_ms = time * UCS_MSEC_PER_SEC;
    uint8_t index, next_index;
    double avg_ms;

    for (index = 1; index < UCT_IB_FABRIC_TIME_MAX; index++) {
        /* the successor of the last index wraps around to entry 0 */
        next_index = (index + 1) % UCT_IB_FABRIC_TIME_MAX;

        if (time_ms <= uct_ib_qp_rnr_time_ms[next_index]) {
            avg_ms = (uct_ib_qp_rnr_time_ms[index] +
                      uct_ib_qp_rnr_time_ms[next_index]) * 0.5;

            if (time_ms < avg_ms) {
                /* closer to the lower of the two entries */
                return index;
            } else {
                /* closer to the higher of the two entries */
                return next_index;
            }
        }
    }

    return 0; /* this is a special value that means the maximum value */
}

/*
 * Move a QP to the given state, modifying only the state attribute.
 *
 * @return UCS_OK, or UCS_ERR_IO_ERROR if ibv_modify_qp() fails.
 */
ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state)
{
    struct ibv_qp_attr qp_attr;

    ucs_debug("modify QP 0x%x to state %d", qp->qp_num, state);
    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_attr.qp_state = state;
    if (ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE)) {
        ucs_warn("modify qp 0x%x to state %d failed: %m", qp->qp_num, state);
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}

/*
 * Build the list of transport device resources ("<device>:<port>") for the
 * ports which pass uct_ib_device_port_check() with the given flags.
 *
 * @param tl_devices_p      Filled with an array allocated by ucs_calloc(),
 *                          only on success.
 * @param num_tl_devices_p  Filled with the number of valid array entries.
 *
 * @return UCS_OK, UCS_ERR_NO_MEMORY if the allocation fails, or
 *         UCS_ERR_NO_DEVICE if no port is compatible.
 */
ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
                                       uct_tl_device_resource_t **tl_devices_p,
                                       unsigned *num_tl_devices_p)
{
    uct_tl_device_resource_t *tl_devices;
    unsigned num_tl_devices;
    ucs_status_t status;
    uint8_t port_num;

    /* Allocate resources array
     * We may allocate more memory than really required, but it's not so bad. */
    tl_devices = ucs_calloc(dev->num_ports, sizeof(*tl_devices),
                            "ib device resource");
    if (tl_devices == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    /* Fill port information */
    num_tl_devices = 0;
    for (port_num = dev->first_port;
         port_num < dev->first_port + dev->num_ports; ++port_num) {
        /* Check port capabilities */
        status = uct_ib_device_port_check(dev, port_num, flags);
        if (status != UCS_OK) {
            ucs_trace("%s:%d does not support flags 0x%x: %s",
                      uct_ib_device_name(dev), port_num, flags,
                      ucs_status_string(status));
            continue;
        }

        /* Save device information */
        ucs_snprintf_zero(tl_devices[num_tl_devices].name,
                          sizeof(tl_devices[num_tl_devices].name),
                          "%s:%d", uct_ib_device_name(dev), port_num);
        tl_devices[num_tl_devices].type = UCT_DEVICE_TYPE_NET;
        ++num_tl_devices;
    }

    if (num_tl_devices == 0) {
        ucs_debug("no compatible IB ports found for flags 0x%x", flags);
        status = UCS_ERR_NO_DEVICE;
        goto err_free;
    }

    *num_tl_devices_p = num_tl_devices;
    *tl_devices_p     = tl_devices;
    return UCS_OK;

err_free:
    ucs_free(tl_devices);
err:
    return status;
}

/*
 * Parse a resource name of the form "<device>:<port>" and return the port
 * number, provided that the device part is the name of 'dev' and the port is
 * within the port range of the device.
 *
 * @return UCS_OK, or UCS_ERR_NO_DEVICE (with an error message) if the name is
 *         malformed, refers to another device, or the port is out of range.
 */
ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
                                     const char *resource_dev_name,
                                     uint8_t *p_port_num)
{
    const char *ibdev_name, *port_str;
    unsigned long port_num;
    size_t devname_len;
    char *p;

    p = strrchr(resource_dev_name, ':');
    if (p == NULL) {
        goto err; /* Wrong device name format */
    }
    devname_len = p - resource_dev_name;

    ibdev_name = uct_ib_device_name(dev);
    if ((strlen(ibdev_name) != devname_len) ||
        strncmp(ibdev_name, resource_dev_name, devname_len)) {
        goto err; /* Device name is wrong */
    }

    /* Parse the port as a decimal integer. A floating-point parser must not
     * be used here: it would accept an empty string as 0 (a valid port number
     * on a switch), accept fractions/exponents/hex, and the conversion of a
     * negative or infinite result to an unsigned type is undefined. */
    port_str = p + 1;
    port_num = strtoul(port_str, &p, 10);
    if ((p == port_str) || (*p != '\0')) {
        goto err; /* Failed to parse port number */
    }

    /* Checked on the full-width value, before narrowing to uint8_t */
    if ((port_num < dev->first_port) ||
        (port_num >= (unsigned long)dev->first_port + dev->num_ports)) {
        goto err; /* Port number out of range */
    }

    *p_port_num = port_num;
    return UCS_OK;

err:
    ucs_error("%s: failed to find port", resource_dev_name);
    return UCS_ERR_NO_DEVICE;
}

/*
 * Get the active MTU, in bytes, of the port named by 'dev_name'
 * ("<device>:<port>") on the device of the given memory domain.
 *
 * @return UCS_OK, or the status of uct_ib_device_find_port().
 */
ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    uint8_t port_num;
    ucs_status_t status;

    status = uct_ib_device_find_port(dev, dev_name, &port_num);
    if (status != UCS_OK) {
        return status;
    }

    *p_mtu = uct_ib_mtu_value(uct_ib_device_port_attr(dev, port_num)->active_mtu);
    return UCS_OK;
}

/*
 * Nonzero if all 16 bytes of a raw GID are zero.
 *
 * The bytes are compared with memcmp() rather than read through a wider
 * integer pointer, so the function makes no assumption about the alignment
 * of 'gid_raw' and does not violate aliasing rules.
 */
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw)
{
    static const uint8_t empty_gid_raw[16] = {0};

    return memcmp(gid_raw, empty_gid_raw, sizeof(empty_gid_raw)) == 0;
}

/*
 * Read a GID table entry and report whether it is a RoCE v2 GID.
 *
 * @param gid         Filled with the GID.
 * @param is_roce_v2  Set to nonzero if the port is a RoCE port and the RoCE
 *                    major version of the entry is at least 2.
 *
 * @return UCS_OK, the status of a failed query, or UCS_ERR_INVALID_ADDR if
 *         the GID is all-zero.
 */
ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
                                     unsigned gid_index, union ibv_gid *gid,
                                     int *is_roce_v2)
{
    uct_ib_device_gid_info_t gid_info;
    ucs_status_t status;

    status = uct_ib_device_query_gid_info(dev, port_num, gid_index, &gid_info);
    if (status != UCS_OK) {
        return status;
    }

    if (uct_ib_device_is_gid_raw_empty(gid_info.gid.raw)) {
        ucs_error("Invalid gid[%d] on %s:%d", gid_index,
                  uct_ib_device_name(dev), port_num);
        return UCS_ERR_INVALID_ADDR;
    }

    *gid        = gid_info.gid;
    *is_roce_v2 = uct_ib_device_is_port_roce(dev, port_num) &&
                  (gid_info.roce_version.major >= 2);
    return UCS_OK;
}

/*
 * Maximal size of an on-demand-paging memory registration.
 *
 * @return 0 if ODP capabilities are not available at build time, when running
 *         on valgrind, or when the device lacks the required RC/UD (and, for
 *         DC-capable devices, DC) ODP capabilities. Otherwise, the maximal
 *         size reported by the device, or 256 MB if the device attributes do
 *         not carry that field.
 */
size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev)
{
#if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS
    const struct ibv_exp_device_attr *dev_attr = &dev->dev_attr;
    uint32_t required_ud_odp_caps = IBV_EXP_ODP_SUPPORT_SEND;
    uint32_t required_rc_odp_caps = IBV_EXP_ODP_SUPPORT_SEND |
                                    IBV_EXP_ODP_SUPPORT_WRITE |
                                    IBV_EXP_ODP_SUPPORT_READ;

    if (RUNNING_ON_VALGRIND ||
        !IBV_EXP_HAVE_ODP(dev_attr) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, rc), required_rc_odp_caps) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, ud), required_ud_odp_caps))
    {
        return 0;
    }

    /* A DC-capable device is rejected unless its DC ODP capabilities can be
     * checked and are sufficient */
    if (IBV_DEVICE_HAS_DC(dev)
#  if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS_PER_TRANSPORT_CAPS_DC_ODP_CAPS
        && !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, dc), required_rc_odp_caps)
#  endif
        )
    {
        return 0;
    }

#  if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE
    return dev_attr->odp_mr_max_size;
#  else
    return 1ul << 28; /* Limit ODP to 256 MB by default */
#  endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE */

#else
    return 0;
#endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS */
}

/* String description of a work completion status */
const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status)
{
    return ibv_wc_status_str(wc_status);
}

/*
 * Create an address handle. On failure, an error message which describes the
 * requested attributes is printed.
 *
 * @return UCS_OK, or UCS_ERR_INVALID_ADDR if ibv_create_ah() fails.
 */
static ucs_status_t uct_ib_device_create_ah(uct_ib_device_t *dev,
                                            struct ibv_ah_attr *ah_attr,
                                            struct ibv_pd *pd,
                                            struct ibv_ah **ah_p)
{
    char buf[128];
    char *p, *endp;
    struct ibv_ah *ah;

    ah = ibv_create_ah(pd, ah_attr);
    if (ah == NULL) {
        /* Build the attributes description piece by piece; 'p' always points
         * to the current end of the string inside 'buf' */
        p    = buf;
        endp = buf + sizeof(buf);
        snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d",
                 ah_attr->dlid, ah_attr->sl,
                 ah_attr->port_num, ah_attr->src_path_bits);
        p += strlen(p);

        if (ah_attr->is_global) {
            snprintf(p, endp - p, " dgid=");
            p += strlen(p);
            inet_ntop(AF_INET6, &ah_attr->grh.dgid, p, endp - p);
            p += strlen(p);
            snprintf(p, endp - p, " sgid_index=%d traffic_class=%d",
                     ah_attr->grh.sgid_index, ah_attr->grh.traffic_class);
        }

        ucs_error("ibv_create_ah(%s) failed: %m", buf);
        return UCS_ERR_INVALID_ADDR;
    }

    *ah_p = ah;
    return UCS_OK;
}

/*
 * Get an address handle with the given attributes from the device cache,
 * creating and caching it if it does not exist yet. The cache is accessed
 * under dev->ah_lock.
 *
 * @return UCS_OK, the status of uct_ib_device_create_ah(), or
 *         UCS_ERR_NO_MEMORY if the new handle could not be stored in the
 *         cache (the handle is destroyed in that case).
 */
ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
                                            struct ibv_ah_attr *ah_attr,
                                            struct ibv_pd *pd,
                                            struct ibv_ah **ah_p)
{
    ucs_status_t status = UCS_OK;
    khiter_t iter;
    int ret;

    ucs_spin_lock(&dev->ah_lock);

    /* looking for existing AH with same attributes */
    iter = kh_get(uct_ib_ah, &dev->ah_hash, *ah_attr);
    if (iter == kh_end(&dev->ah_hash)) {
        /* new AH */
        status = uct_ib_device_create_ah(dev, ah_attr, pd, ah_p);
        if (status != UCS_OK) {
            goto unlock;
        }

        /* store AH in hash */
        iter = kh_put(uct_ib_ah, &dev->ah_hash, *ah_attr, &ret);

        /* failed to store - rollback */
        if (iter == kh_end(&dev->ah_hash)) {
            ibv_destroy_ah(*ah_p);
            status = UCS_ERR_NO_MEMORY;
            goto unlock;
        }

        kh_value(&dev->ah_hash, iter) = *ah_p;
    } else {
        /* found existing AH */
        *ah_p = kh_value(&dev->ah_hash, iter);
    }

unlock:
    ucs_spin_unlock(&dev->ah_lock);
    return status;
}

/*
 * Select the CQE size for the requested minimal size: at least the cache line
 * size and 64 bytes, at most the maximal CQE size. The maximal size is 128,
 * except for the aarch64 board/CPU combination tested below, where it is 64;
 * it is computed on the first call and kept in a static variable.
 */
int uct_ib_get_cqe_size(int cqe_size_min)
{
    static int cqe_size_max = -1;
    int cqe_size;

    if (cqe_size_max == -1) {
#ifdef __aarch64__
        char arm_board_vendor[128];
        ucs_aarch64_cpuid_t cpuid;
        ucs_aarch64_cpuid(&cpuid);

        arm_board_vendor[0] = '\0';
        ucs_read_file(arm_board_vendor, sizeof(arm_board_vendor), 1,
                      "/sys/devices/virtual/dmi/id/board_vendor");
        ucs_debug("arm_board_vendor is '%s'", arm_board_vendor);

        cqe_size_max = ((strcasestr(arm_board_vendor, "Huawei")) &&
                        (cpuid.implementer == 0x41) && (cpuid.architecture == 8) &&
                        (cpuid.variant == 0)        && (cpuid.part == 0xd08)     &&
                        (cpuid.revision == 2))
                       ? 64 : 128;
#else
        cqe_size_max = 128;
#endif
        ucs_debug("max IB CQE size is %d", cqe_size_max);
    }

    /* Set cqe size according to inline size and cache line size. */
    cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
    cqe_size = ucs_max(cqe_size, 64); /* at least 64 */
    cqe_size = ucs_min(cqe_size, cqe_size_max);

    return cqe_size;
}