/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#include "ib_iface.h"
#include "ib_log.h"

#include <uct/base/uct_md.h>
#include <ucs/arch/bitops.h>
#include <ucs/arch/cpu.h>
#include <ucs/type/class.h>
#include <ucs/type/cpu_set.h>
#include <ucs/debug/log.h>
#include <ucs/time/time.h>
#include <ucs/memory/numa.h>
#include <string.h>
#include <stdlib.h>
#include <poll.h>


static UCS_CONFIG_DEFINE_ARRAY(path_bits_spec,
                               sizeof(ucs_range_spec_t),
                               UCS_CONFIG_TYPE_RANGE_SPEC);

const char *uct_ib_mtu_values[] = {
    [UCT_IB_MTU_DEFAULT]    = "default",
    [UCT_IB_MTU_512]        = "512",
    [UCT_IB_MTU_1024]       = "1024",
    [UCT_IB_MTU_2048]       = "2048",
    [UCT_IB_MTU_4096]       = "4096",
    [UCT_IB_MTU_LAST]       = NULL
};

enum {
    UCT_IB_ADDRESS_TYPE_LINK_LOCAL,
    UCT_IB_ADDRESS_TYPE_SITE_LOCAL,
    UCT_IB_ADDRESS_TYPE_GLOBAL,
    UCT_IB_ADDRESS_TYPE_ETH,
    UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_AUTO  = UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_LAST
};

static const char *uct_ib_iface_addr_types[] = {
   [UCT_IB_ADDRESS_TYPE_LINK_LOCAL] = "ib_local",
   [UCT_IB_ADDRESS_TYPE_SITE_LOCAL] = "ib_site_local",
   [UCT_IB_ADDRESS_TYPE_GLOBAL]     = "ib_global",
   [UCT_IB_ADDRESS_TYPE_ETH]        = "eth",
   [UCT_IB_IFACE_ADDRESS_TYPE_AUTO] = "auto",
   [UCT_IB_IFACE_ADDRESS_TYPE_LAST] = NULL
};

ucs_config_field_t uct_ib_iface_config_table[] = {
  {"", "", NULL,
   ucs_offsetof(uct_ib_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

  {"SEG_SIZE", "8192",
   "Size of bounce buffers used for post_send and post_recv.",
   ucs_offsetof(uct_ib_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_QUEUE_LEN", "256",
   "Length of send queue in the QP.",
   ucs_offsetof(uct_ib_iface_config_t, tx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_BATCH", "16",
   "Number of send WQEs to batch in one post-send list. Larger values reduce\n"
   "the CPU usage, but increase the latency and pipelining between sender and\n"
   "receiver.",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_POLL", "16",
   "Max number of receive completions to pick during TX poll",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"TX_MIN_INLINE", "64",
   "Bytes to reserve in send WQE for inline data. Messages which are small\n"
   "enough will be sent inline.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_inline), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_INLINE_RESP", "32",
   "Bytes to reserve in send WQE for inline response. Responses which are small\n"
   "enough, such as of atomic operations and small reads, will be received inline.",
   ucs_offsetof(uct_ib_iface_config_t, tx.inl_resp), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_MIN_SGE", "3",
   "Number of SG entries to reserve in the send WQE.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_sge), UCS_CONFIG_TYPE_UINT},

#if HAVE_DECL_IBV_EXP_CQ_MODERATION
  {"TX_EVENT_MOD_COUNT", "0",
   "Number of send completions for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"TX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate send event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},

  {"RX_EVENT_MOD_COUNT", "0",
   "Number of received messages for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"RX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate receive event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

  UCT_IFACE_MPOOL_CONFIG_FIELDS("TX_", -1, 1024, "send",
                                ucs_offsetof(uct_ib_iface_config_t, tx.mp),
      "\nAttention: Setting this param with value != -1 is a dangerous thing\n"
      "in RC/DC and could cause deadlock or performance degradation."),

  {"RX_QUEUE_LEN", "4096",
   "Length of receive queue in the QPs.",
   ucs_offsetof(uct_ib_iface_config_t, rx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_BATCH", "16",
   "How many post-receives to perform in one batch.",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_POLL", "16",
   "Max number of receive completions to pick during RX poll",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"RX_INLINE", "0",
   "Number of bytes to request for inline receive. If the maximal supported size\n"
   "is smaller, it will be used instead. If it is possible to support a larger\n"
   "size than requested with the same hardware resources, it will be used instead.",
   ucs_offsetof(uct_ib_iface_config_t, rx.inl), UCS_CONFIG_TYPE_MEMUNITS},

  UCT_IFACE_MPOOL_CONFIG_FIELDS("RX_", -1, 0, "receive",
                                ucs_offsetof(uct_ib_iface_config_t, rx.mp), ""),

  {"ADDR_TYPE", "auto",
   "Set the interface address type. \"auto\" mode detects the type according to\n"
   "link layer type and IB subnet prefix.\n"
   "Deprecated. To force use of global routing use IS_GLOBAL.",
   ucs_offsetof(uct_ib_iface_config_t, addr_type),
   UCS_CONFIG_TYPE_ENUM(uct_ib_iface_addr_types)},

  {"IS_GLOBAL", "n",
   "Force interface to use global routing.",
   ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL},

  {"SL", "0",
   "IB Service Level / RoCEv2 Ethernet Priority.\n",
   ucs_offsetof(uct_ib_iface_config_t, sl), UCS_CONFIG_TYPE_UINT},

  {"TRAFFIC_CLASS", "auto",
   "IB Traffic Class / RoCEv2 Differentiated Services Code Point (DSCP).\n"
   "\"auto\" option selects 106 on RoCEv2 and 0 otherwise.",
   ucs_offsetof(uct_ib_iface_config_t, traffic_class), UCS_CONFIG_TYPE_ULUNITS},

  {"HOP_LIMIT", "255",
   "IB Hop limit / RoCEv2 Time to Live. Should be between 0 and 255.\n",
   ucs_offsetof(uct_ib_iface_config_t, hop_limit), UCS_CONFIG_TYPE_UINT},

  {"LID_PATH_BITS", "0-17",
   "List of IB Path bits separated by comma (a,b,c) "
   "which will be the low portion of the LID, according to the LMC in the fabric.",
   ucs_offsetof(uct_ib_iface_config_t, lid_path_bits), UCS_CONFIG_TYPE_ARRAY(path_bits_spec)},

  {"PKEY", "auto",
   "Which pkey value to use. Should be between 0 and 0x7fff.\n"
   "\"auto\" option selects a first valid pkey value with full membership.",
   ucs_offsetof(uct_ib_iface_config_t, pkey_value), UCS_CONFIG_TYPE_HEX},

#if HAVE_IBV_EXP_RES_DOMAIN
  {"RESOURCE_DOMAIN", "y",
   "Enable multiple resource domains (experimental).",
   ucs_offsetof(uct_ib_iface_config_t, enable_res_domain), UCS_CONFIG_TYPE_BOOL},
#endif


  {NULL}
};

int uct_ib_iface_is_roce(uct_ib_iface_t *iface)
{
    return uct_ib_device_is_port_roce(uct_ib_iface_device(iface),
                                      iface->config.port_num);
}

int uct_ib_iface_is_ib(uct_ib_iface_t *iface)
{
    return uct_ib_device_is_port_ib(uct_ib_iface_device(iface),
                                    iface->config.port_num);
}

static void uct_ib_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh)
{
    uct_ib_iface_recv_desc_t *desc = obj;
    uct_ib_mem_t *ib_memh = memh;

    desc->lkey = ib_memh->lkey;
}

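/*
 * Size the receive mpool growth step so that a full receive queue can be
 * reposted without triggering an extra chunk allocation: grow by at least
 * 1024 elements, or by the queue length plus ~10% slack (e.g. with
 * RX_QUEUE_LEN=4096 the grow step is 4506), capped by the configured
 * maximum number of buffers.
 */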
ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config,
                                          const char *name, ucs_mpool_t *mp)
{
    unsigned grow;

    if (config->rx.queue_len < 1024) {
        grow = 1024;
    } else {
        /* Keep some free (+10%) elements to avoid mempool expansion */
        grow = ucs_min((int)(1.1 * config->rx.queue_len + 0.5),
                       config->rx.mp.max_bufs);
    }

    return uct_iface_mpool_init(&iface->super, mp,
                                iface->config.rx_payload_offset + iface->config.seg_size,
                                iface->config.rx_hdr_offset,
                                UCS_SYS_CACHE_LINE_SIZE,
                                &config->rx.mp, grow,
                                uct_ib_iface_recv_desc_init,
                                name);
}

void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc)
{
    uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc);
    void *ib_desc;

    ib_desc = UCS_PTR_BYTE_OFFSET(desc, -(ptrdiff_t)iface->config.rx_headroom_offset);
    ucs_mpool_put_inline(ib_desc);
}

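/*
 * Packed address wire layout, as produced by uct_ib_address_pack() below
 * (optional fields are present according to ib_addr->flags):
 *
 *   RoCE:       [flags][raw GID (16 bytes)]
 *   InfiniBand: [flags][LID (2 bytes)][GUID (8 bytes, optional)]
 *               [subnet prefix (2 bytes for site-local, 8 bytes for global,
 *                absent for the well-known link-local prefix)]
 */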
size_t uct_ib_address_size(const union ibv_gid *gid, unsigned pack_flags)
{
    size_t size = sizeof(uct_ib_address_t);

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
        /* Ethernet: address contains only raw GID */
        return size + sizeof(union ibv_gid);
    }

    /* InfiniBand: address always contains LID */
    size += sizeof(uint16_t); /* lid */

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
        /* Add GUID */
        UCS_STATIC_ASSERT(sizeof(gid->global.interface_id) == sizeof(uint64_t));
        size += sizeof(uint64_t);
    }

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
        if ((gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
                       UCT_IB_SITE_LOCAL_PREFIX) {
            /* 16-bit subnet prefix */
            size += sizeof(uint16_t);
        } else if (gid->global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
            /* 64-bit subnet prefix */
            size += sizeof(uint64_t);
        }
        /* Note: if subnet prefix is LINK_LOCAL, no need to pack it because
         * it's a well-known value defined by IB specification.
         */
    }

    return size;
}

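/*
 * A minimal usage sketch of the size/pack pair (the allocation scheme is up
 * to the caller; error handling omitted):
 *
 *   size_t len             = uct_ib_address_size(&gid, pack_flags);
 *   uct_ib_address_t *addr = ucs_malloc(len, "ib_addr");
 *   uct_ib_address_pack(&gid, lid, pack_flags, addr);
 */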
void uct_ib_address_pack(const union ibv_gid *gid, uint16_t lid,
                         unsigned pack_flags, uct_ib_address_t *ib_addr)
{
    void *ptr = ib_addr + 1;

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
        /* RoCE: the LID is not used; pack the raw GID and set the GID flag */
        ib_addr->flags = UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH |
                         UCT_IB_ADDRESS_FLAG_GID;
        memcpy(ptr, gid->raw, sizeof(gid->raw));
        return;
    }

    /* InfiniBand: the LID is always packed */
    ib_addr->flags   = UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB |
                       UCT_IB_ADDRESS_FLAG_LID;
    *(uint16_t*)ptr  = lid;
    ptr              = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint16_t));

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
        /* Pack GUID */
        ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_IF_ID;
        *(uint64_t*) ptr = gid->global.interface_id;
        ptr              = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint64_t));
    }

    if (pack_flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
        if ((gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
                                         UCT_IB_SITE_LOCAL_PREFIX) {
            /* Site-local */
            ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16;
            *(uint16_t*)ptr = gid->global.subnet_prefix >> 48;
        } else if (gid->global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
            /* Global */
            ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64;
            *(uint64_t*)ptr = gid->global.subnet_prefix;
        }
    }
}

static unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface)
{
    if (uct_ib_iface_is_roce(iface)) {
        /* pack Ethernet address */
        return UCT_IB_ADDRESS_PACK_FLAG_ETH;
    } else if (iface->config.force_global_addr) {
        /* pack full IB address */
        return UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX |
               UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;
    } else {
        /* pack only subnet prefix for reachability test */
        return UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
    }
}

size_t uct_ib_iface_address_size(uct_ib_iface_t *iface)
{
    return uct_ib_address_size(&iface->gid,
                               uct_ib_iface_address_pack_flags(iface));
}

void uct_ib_iface_address_pack(uct_ib_iface_t *iface, const union ibv_gid *gid,
                               uint16_t lid, uct_ib_address_t *ib_addr)
{
    uct_ib_address_pack(gid, lid, uct_ib_iface_address_pack_flags(iface),
                        ib_addr);
}

void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, uint16_t *lid,
                           union ibv_gid *gid)
{
    const void *ptr = ib_addr + 1;

    *lid                      = 0;

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID) {
        memcpy(gid->raw, ptr, sizeof(gid->raw));
        ucs_assert(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH);
        ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LID));
        return;
    }

    gid->global.subnet_prefix = UCT_IB_LINK_LOCAL_PREFIX; /* Default prefix */
    gid->global.interface_id  = 0;

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LID) {
        *lid = *(uint16_t*)ptr;
        ptr  = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint16_t));
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) {
        gid->global.interface_id = *(uint64_t*)ptr;
        ptr                      = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint64_t));
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) {
        gid->global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX |
                                    ((uint64_t) *(uint16_t*) ptr << 48);
        ptr                       = UCS_PTR_BYTE_OFFSET(ptr, sizeof(uint16_t));
        ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64));
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) {
        gid->global.subnet_prefix = *(uint64_t*) ptr;
    }
}

const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
                               size_t max)
{
    union ibv_gid gid;
    uint16_t lid;
    char *p, *endp;

    uct_ib_address_unpack(ib_addr, &lid, &gid);

    p    = buf;
    endp = buf + max;
    if (lid != 0) {
        snprintf(p, endp - p, "lid %d ", lid);
        p += strlen(p);
    }
    inet_ntop(AF_INET6, &gid, p, endp - p);

    return buf;
}

ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
                                             uct_device_addr_t *dev_addr)
{
    uct_ib_iface_t   *iface   = ucs_derived_of(tl_iface, uct_ib_iface_t);

    uct_ib_iface_address_pack(iface, &iface->gid, uct_ib_iface_port_attr(iface)->lid,
                              (void*)dev_addr);
    return UCS_OK;
}

int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr,
                              const uct_iface_addr_t *iface_addr)
{
    uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
    int is_local_eth = uct_ib_iface_is_roce(iface);
    const uct_ib_address_t *ib_addr = (const void*)dev_addr;
    union ibv_gid gid;
    uint16_t lid;

    uct_ib_address_unpack(ib_addr, &lid, &gid);

    if (!is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_IB)) {
        /* same subnet prefix */
        return gid.global.subnet_prefix == iface->gid.global.subnet_prefix;
    } else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
        /* a RoCE address must carry a GID and must not carry a LID */
        ucs_assert(ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID);
        ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LID));
        return 1;
    } else {
        /* local and remote have different link layers and therefore are unreachable */
        return 0;
    }
}

ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
                                    struct ibv_ah_attr *ah_attr,
                                    struct ibv_ah **ah_p)
{
    return uct_ib_device_create_ah_cached(uct_ib_iface_device(iface), ah_attr,
                                          uct_ib_iface_md(iface)->pd, ah_p);
}

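/*
 * A pkey with the membership bit set (UCT_IB_PKEY_MEMBERSHIP_MASK, the most
 * significant bit) denotes full membership; the lower 15 bits
 * (UCT_IB_PKEY_PARTITION_MASK) identify the partition. For example, 0xffff
 * is the default partition with full membership, while 0x7fff is the same
 * partition with limited (send-only) membership, which is skipped below.
 */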
static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
                                           const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev  = uct_ib_iface_device(iface);
    uint16_t pkey_tbl_len = uct_ib_iface_port_attr(iface)->pkey_tbl_len;
    int pkey_found        = 0;
    uint16_t pkey_index, port_pkey, pkey;

    if ((config->pkey_value != UCS_HEXUNITS_AUTO) &&
        (config->pkey_value > UCT_IB_PKEY_PARTITION_MASK)) {
        ucs_error("Requested pkey 0x%x is invalid, should be in the range 0..0x%x",
                  config->pkey_value, UCT_IB_PKEY_PARTITION_MASK);
        return UCS_ERR_INVALID_PARAM;
    }

    /* get the user's pkey value and find its index in the port's pkey table */
    for (pkey_index = 0; pkey_index < pkey_tbl_len; ++pkey_index) {
        /* get the pkey values from the port's pkeys table */
        if (ibv_query_pkey(dev->ibv_context, iface->config.port_num, pkey_index,
                           &port_pkey))
        {
            ucs_debug("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m",
                      UCT_IB_IFACE_ARG(iface), pkey_index);
            continue;
        }

        pkey = ntohs(port_pkey);
        if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
            /* if pkey is 0x0, just skip it without a trace message, because
             * 0x0 means that no real pkey is configured at this index */
            if (pkey) {
                ucs_trace("skipping send-only pkey[%d]=0x%x on "UCT_IB_IFACE_FMT,
                          pkey_index, pkey, UCT_IB_IFACE_ARG(iface));
            }
            continue;
        }

        /* take only the lower 15 bits for the comparison */
        if ((config->pkey_value == UCS_HEXUNITS_AUTO) ||
            ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey_value)) {
            iface->pkey_index = pkey_index;
            iface->pkey_value = pkey;
            pkey_found        = 1;
            break;
        }
    }

    if (!pkey_found) {
        if (config->pkey_value == UCS_HEXUNITS_AUTO) {
            ucs_error("There is no valid pkey with full membership on "
                      UCT_IB_IFACE_FMT, UCT_IB_IFACE_ARG(iface));
        } else {
            ucs_error("Unable to find specified pkey 0x%x on "UCT_IB_IFACE_FMT,
                      config->pkey_value, UCT_IB_IFACE_ARG(iface));
        }

        return UCS_ERR_INVALID_PARAM;
    }

    ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT, iface->pkey_index,
              iface->pkey_value, UCT_IB_IFACE_ARG(iface));
    return UCS_OK;
}

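/*
 * With LID Mask Control (LMC) set to N, each port answers to 2^N
 * consecutive LIDs, and the low N bits of the destination LID select the
 * fabric path. For example, with lmc=2 the usable path bits are 0..3;
 * larger configured values are skipped below (see the UCS_BIT(lmc) check).
 */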
static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config)
{
    unsigned i, j, num_path_bits;
    unsigned first, last;
    uint8_t lmc;
    int step;

    if (config->lid_path_bits.count == 0) {
        ucs_error("List of path bits must not be empty");
        return UCS_ERR_INVALID_PARAM;
    }

    /* count the number of lid_path_bits */
    num_path_bits = 0;
    for (i = 0; i < config->lid_path_bits.count; i++) {
        num_path_bits += 1 + abs((int)(config->lid_path_bits.ranges[i].first -
                                       config->lid_path_bits.ranges[i].last));
    }

    iface->path_bits = ucs_calloc(1, num_path_bits * sizeof(*iface->path_bits),
                                  "ib_path_bits");
    if (iface->path_bits == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    lmc = uct_ib_iface_port_attr(iface)->lmc;

    /* go over the list of values (ranges) for the lid_path_bits and set them */
    iface->path_bits_count = 0;
    for (i = 0; i < config->lid_path_bits.count; ++i) {

        first = config->lid_path_bits.ranges[i].first;
        last  = config->lid_path_bits.ranges[i].last;

        /* range of values or one value */
        if (first < last) {
            step = 1;
        } else {
            step = -1;
        }

        /* fill the value/s */
        for (j = first; j != (last + step); j += step) {
            if (j >= UCS_BIT(lmc)) {
                ucs_debug("Not using value %d for path_bits - must be < 2^lmc (lmc=%d)",
                          j, lmc);
                if (step == 1) {
                    break;
                } else {
                    continue;
                }
            }

            ucs_assert(iface->path_bits_count < num_path_bits);
            iface->path_bits[iface->path_bits_count] = j;
            iface->path_bits_count++;
        }
    }

    return UCS_OK;
}

void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr)
{
    attr->ibv.send_cq             = iface->cq[UCT_IB_DIR_TX];
    attr->ibv.recv_cq             = iface->cq[UCT_IB_DIR_RX];

    attr->ibv.srq                 = attr->srq;
    attr->ibv.cap                 = attr->cap;
    attr->ibv.qp_type             = (enum ibv_qp_type)attr->qp_type;
    attr->ibv.sq_sig_all          = attr->sq_sig_all;

#if HAVE_DECL_IBV_EXP_CREATE_QP
    if (!(attr->ibv.comp_mask & IBV_EXP_QP_INIT_ATTR_PD)) {
        attr->ibv.comp_mask       = IBV_EXP_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#elif HAVE_DECL_IBV_CREATE_QP_EX
    if (!(attr->ibv.comp_mask & IBV_QP_INIT_ATTR_PD)) {
        attr->ibv.comp_mask       = IBV_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#endif

    attr->port                    = iface->config.port_num;

    if (attr->qp_type == IBV_QPT_UD) {
        return;
    }

    /* MOFED requires this flag to enable IB-spec atomic operations */
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
    if (uct_ib_iface_device(iface)->dev_attr.exp_atomic_cap ==
                                     IBV_EXP_ATOMIC_HCA_REPLY_BE) {
        attr->ibv.comp_mask       |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
        attr->ibv.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY;
    }
#endif
}

ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
                                    uct_ib_qp_attr_t *attr,
                                    struct ibv_qp **qp_p)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_qp *qp;

    uct_ib_iface_fill_attr(iface, attr);

#if HAVE_DECL_IBV_EXP_CREATE_QP
    qp = ibv_exp_create_qp(dev->ibv_context, &attr->ibv);
#elif HAVE_DECL_IBV_CREATE_QP_EX
    qp = ibv_create_qp_ex(dev->ibv_context, &attr->ibv);
#else
    qp = ibv_create_qp(uct_ib_iface_md(iface)->pd, &attr->ibv);
#endif
    if (qp == NULL) {
        ucs_error("iface=%p: failed to create %s QP TX wr:%d sge:%d inl:%d RX wr:%d sge:%d inl %d: %m",
                  iface, uct_ib_qp_type_str(attr->qp_type),
                  attr->cap.max_send_wr, attr->cap.max_send_sge, attr->cap.max_inline_data,
                  attr->cap.max_recv_wr, attr->cap.max_recv_sge, attr->max_inl_recv);
        return UCS_ERR_IO_ERROR;
    }

    attr->cap  = attr->ibv.cap;
    *qp_p      = qp;

    ucs_debug("iface=%p: created %s QP 0x%x on %s:%d TX wr:%d sge:%d inl:%d RX wr:%d sge:%d inl %d",
              iface, uct_ib_qp_type_str(attr->qp_type), qp->qp_num,
              uct_ib_device_name(dev), iface->config.port_num,
              attr->cap.max_send_wr, attr->cap.max_send_sge, attr->cap.max_inline_data,
              attr->cap.max_recv_wr, attr->cap.max_recv_sge, attr->max_inl_recv);

    return UCS_OK;
}

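/*
 * Prefer the extended CQ creation API when available, since it allows
 * disabling CQ overrun checking for callers which manage CQ sizing
 * themselves; fall back to plain ibv_create_cq() when the provider does not
 * implement ibv_create_cq_ex() (ENOSYS).
 */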
ucs_status_t uct_ib_verbs_create_cq(struct ibv_context *context, int cqe,
                                    struct ibv_comp_channel *channel,
                                    int comp_vector, int ignore_overrun,
                                    size_t *inl, struct ibv_cq **cq_p)
{
    struct ibv_cq *cq;
#if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN
    struct ibv_cq_init_attr_ex cq_attr = {};

    cq_attr.cqe = cqe;
    cq_attr.channel = channel;
    cq_attr.comp_vector = comp_vector;
    if (ignore_overrun) {
        cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
        cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
    }

    cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(context, &cq_attr));
    if (!cq && (errno == ENOSYS))
#endif
    {
        *inl = 0;
        cq = ibv_create_cq(context, cqe, NULL, channel, comp_vector);
    }

    if (!cq) {
        ucs_error("ibv_create_cq(cqe=%d) failed: %m", cqe);
        return UCS_ERR_IO_ERROR;
    }

    *cq_p = cq;
    return UCS_OK;
}

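/*
 * Legacy mlx5 verbs read the CQE size from the MLX5_CQE_SIZE environment
 * variable at CQ creation time. When the mlx5dv cqe_size CQ attribute is
 * not available, the code below temporarily exports the desired value via
 * ibv_exp_setenv(), creates the CQ, and then removes the variable again so
 * other consumers of the same context are not affected.
 */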
static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length,
                                           size_t *inl, int preferred_cpu,
                                           int flags, struct ibv_cq **cq_p)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    ucs_status_t status;
#if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
    static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
    const char *cqe_size_env_value;
    size_t cqe_size = 64;
    size_t cqe_size_min;
    char cqe_size_buf[32];
    int env_var_added = 0;
    int ret;

    cqe_size_min       = (*inl > 32) ? 128 : 64;
    cqe_size_env_value = getenv(cqe_size_env_var);

    if (cqe_size_env_value != NULL) {
        cqe_size = atol(cqe_size_env_value);
        if (cqe_size < cqe_size_min) {
            ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
                      cqe_size_env_var, cqe_size, cqe_size_min, *inl);
            return UCS_ERR_INVALID_PARAM;
        }
    } else {
        cqe_size = uct_ib_get_cqe_size(cqe_size_min);
        snprintf(cqe_size_buf, sizeof(cqe_size_buf),"%zu", cqe_size);
        ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev), cqe_size_env_var,
                  cqe_size_buf);
        ret = ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, cqe_size_buf, 1);
        if (ret) {
            ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
                      cqe_size_buf);
            return UCS_ERR_INVALID_PARAM;
        }

        env_var_added = 1;
    }
#endif
    status = iface->ops->create_cq(dev->ibv_context, cq_length,
                                   iface->comp_channel, preferred_cpu,
                                   flags & UCT_IB_CQ_IGNORE_OVERRUN, inl, cq_p);

#if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
    *inl = cqe_size / 2;
    if (env_var_added) {
        /* if we created a new environment variable, remove it */
        ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var);
        if (ret) {
            ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
        }
    }
#endif
    return status;
}


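/*
 * CQ moderation semantics (a sketch, per the ibv_exp moderation API): a
 * completion event is generated after cq_count completions have arrived, or
 * after cq_period microseconds have elapsed since the first unreported
 * completion, whichever comes first. Setting a field to UINT16_MAX
 * effectively disables that trigger.
 */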
static ucs_status_t uct_ib_iface_set_moderation(struct ibv_cq *cq,
                                                unsigned count, double period_sec)
{
#if HAVE_DECL_IBV_EXP_CQ_MODERATION
    unsigned period = (unsigned)(period_sec * UCS_USEC_PER_SEC);

    if (count > UINT16_MAX) {
        ucs_error("CQ moderation count is too high: %u, max value: %u", count, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (count == 0) {
        /* if the count is 0 (the unchanged default value), set it to the
         * maximum possible value */
        count = UINT16_MAX;
    }

    if (period > UINT16_MAX) {
        ucs_error("CQ moderation period is too high: %u, max value: %uus", period, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (period == 0) {
        /* if the period is 0 (the unchanged default value), set it to the
         * maximum possible value, same behavior as for the count */
        period = UINT16_MAX;
    }

    if ((count < UINT16_MAX) || (period < UINT16_MAX)) {
        struct ibv_exp_cq_attr cq_attr = {
            .comp_mask            = IBV_EXP_CQ_ATTR_MODERATION,
            .moderation.cq_count  = (uint16_t)(count),
            .moderation.cq_period = (uint16_t)(period),
            .cq_cap_flags         = 0
        };
        if (ibv_exp_modify_cq(cq, &cq_attr, IBV_EXP_CQ_MODERATION)) {
            ucs_error("ibv_exp_modify_cq(count=%d, period=%d) failed: %m", count, period);
            return UCS_ERR_IO_ERROR;
        }
    }
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

    return UCS_OK;
}

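/*
 * Interface construction order (mirrored by the error-path unwinding at the
 * bottom of the function): base class -> pkey/GID/path-bits lookup ->
 * completion channel (set non-blocking) -> send CQ (+moderation) ->
 * receive CQ (+moderation) -> address scope and size selection.
 */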
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    const uct_ib_iface_config_t *config,
                    const uct_ib_iface_init_attr_t *init_attr)
{
    uct_ib_md_t *ib_md   = ucs_derived_of(md, uct_ib_md_t);
    uct_ib_device_t *dev = &ib_md->dev;
    size_t rx_headroom   = (params->field_mask &
                            UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ?
                           params->rx_headroom : 0;
    ucs_cpu_set_t cpu_mask;
    int preferred_cpu;
    ucs_status_t status;
    uint8_t port_num;
    size_t inl;

    if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) {
        return UCS_ERR_UNSUPPORTED;
    }

    if (params->field_mask & UCT_IFACE_PARAM_FIELD_CPU_MASK) {
        cpu_mask = params->cpu_mask;
    } else {
        memset(&cpu_mask, 0, sizeof(cpu_mask));
    }

    preferred_cpu = ucs_cpu_set_find_lcs(&cpu_mask);

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              params, &config->super
                              UCS_STATS_ARG(((params->field_mask &
                                              UCT_IFACE_PARAM_FIELD_STATS_ROOT) &&
                                             (params->stats_root != NULL)) ?
                                            params->stats_root :
                                            dev->stats)
                              UCS_STATS_ARG(params->mode.device.dev_name));

    status = uct_ib_device_find_port(dev, params->mode.device.dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops                       = ops;

    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_recv_desc_t) +
                                              rx_headroom,
                                              init_attr->rx_priv_len +
                                              init_attr->rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset -
                                      init_attr->rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      rx_headroom;
    self->config.seg_size           = init_attr->seg_size;
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->config.port_num           = port_num;
    self->config.sl                 = config->sl;
    self->config.hop_limit          = config->hop_limit;
    self->release_desc.cb           = uct_ib_iface_release_desc;
    self->config.enable_res_domain  = config->enable_res_domain;
    self->config.qp_type            = init_attr->qp_type;

    if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) {
        ucs_error("IB transports do not support multi-threaded worker");
        status = UCS_ERR_INVALID_PARAM;
        goto err;
    }

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_select_gid_index(dev, self->config.port_num,
                                            ib_md->config.gid_index,
                                            &self->config.gid_index);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_query_gid(dev, self->config.port_num,
                                     self->config.gid_index, &self->gid,
                                     &self->is_roce_v2);
    if (status != UCS_OK) {
        goto err;
    }

    if (config->traffic_class == UCS_ULUNITS_AUTO) {
        self->config.traffic_class = self->is_roce_v2 ?
                                     UCT_IB_DEFAULT_ROCEV2_DSCP : 0;
    } else {
        self->config.traffic_class = config->traffic_class;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_cleanup;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    inl = config->rx.inl;
    status = uct_ib_iface_create_cq(self, init_attr->tx_cq_len, &inl,
                                    preferred_cpu, init_attr->flags,
                                    &self->cq[UCT_IB_DIR_TX]);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }
    ucs_assert_always(inl <= UINT8_MAX);
    self->config.max_inl_resp = inl;

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_TX],
                                         config->tx.cq_moderation_count,
                                         config->tx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    inl = config->rx.inl;
    status = uct_ib_iface_create_cq(self, init_attr->rx_cq_len, &inl,
                                    preferred_cpu, init_attr->flags,
                                    &self->cq[UCT_IB_DIR_RX]);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_RX],
                                         config->rx.cq_moderation_count,
                                         config->rx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_recv_cq;
    }

    /* Address scope and size */
    if (uct_ib_iface_is_roce(self) || config->is_global ||
        /* check ADDR_TYPE for backward compatibility */
        (config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) ||
        (config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) {
        self->config.force_global_addr = 1;
    } else {
        self->config.force_global_addr = 0;
    }

    self->addr_size  = uct_ib_iface_address_size(self);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);

    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
err_destroy_send_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_cleanup:
    ucs_free(self->path_bits);
err:
    return status;
}

static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)
{
    int ret;

    ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
    if (ret != 0) {
        ucs_warn("ibv_destroy_cq(recv_cq) returned %d: %m", ret);
    }

    ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
    if (ret != 0) {
        ucs_warn("ibv_destroy_cq(send_cq) returned %d: %m", ret);
    }

    ret = ibv_destroy_comp_channel(self->comp_channel);
    if (ret != 0) {
        ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret);
    }

    ucs_free(self->path_bits);
}

UCS_CLASS_DEFINE(uct_ib_iface_t, uct_base_iface_t);

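/*
 * Builds a chain of receive work requests, each pointing to a fresh
 * descriptor from the mpool, so that one post call can repost a whole
 * batch. A minimal usage sketch (assuming an SRQ and a receive mpool owned
 * by a derived interface; error handling omitted):
 *
 *   uct_ib_recv_wr_t wrs[16];
 *   struct ibv_recv_wr *bad_wr;
 *   int count = uct_ib_iface_prepare_rx_wrs(iface, rx_mp, wrs, 16);
 *   if (count > 0) {
 *       ibv_post_srq_recv(srq, &wrs[0].ibwr, &bad_wr);
 *   }
 */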
int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
                                uct_ib_recv_wr_t *wrs, unsigned n)
{
    uct_ib_iface_recv_desc_t *desc;
    unsigned count;

    count = 0;
    while (count < n) {
        UCT_TL_IFACE_GET_RX_DESC(&iface->super, mp, desc, break);
        wrs[count].sg.addr   = (uintptr_t)uct_ib_iface_recv_desc_hdr(iface, desc);
        wrs[count].sg.length = iface->config.rx_payload_offset + iface->config.seg_size;
        wrs[count].sg.lkey   = desc->lkey;
        wrs[count].ibwr.num_sge = 1;
        wrs[count].ibwr.wr_id   = (uintptr_t)desc;
        wrs[count].ibwr.sg_list = &wrs[count].sg;
        wrs[count].ibwr.next    = &wrs[count + 1].ibwr;
        ++count;
    }

    if (count > 0) {
        wrs[count - 1].ibwr.next = NULL;
    }

    return count;
}

static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface,
                                                  double *latency)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    uct_ib_md_t *md      = uct_ib_iface_md(iface);
    ucs_sys_cpuset_t temp_cpu_mask, process_affinity;
#if HAVE_NUMA
    int distance, min_cpu_distance;
    int cpu, num_cpus;
#endif
    int ret;

    if (!md->config.prefer_nearest_device) {
        *latency = 0;
        return UCS_OK;
    }

    ret = ucs_sys_getaffinity(&process_affinity);
    if (ret) {
        ucs_error("sched_getaffinity() failed: %m");
        return UCS_ERR_INVALID_PARAM;
    }

#if HAVE_NUMA
    /* Try to estimate the extra device latency according to NUMA distance */
    if (dev->numa_node != -1) {
        min_cpu_distance = INT_MAX;
        num_cpus         = ucs_min(CPU_SETSIZE, numa_num_configured_cpus());
        for (cpu = 0; cpu < num_cpus; ++cpu) {
            if (!CPU_ISSET(cpu, &process_affinity)) {
                continue;
            }
            distance = numa_distance(ucs_numa_node_of_cpu(cpu), dev->numa_node);
            if (distance >= UCS_NUMA_MIN_DISTANCE) {
                min_cpu_distance = ucs_min(min_cpu_distance, distance);
            }
        }

        if (min_cpu_distance != INT_MAX) {
            /* set the extra latency to (numa_distance - 10) * 20nsec */
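            /* e.g. a typical remote-socket distance of 20 adds
             * (20 - 10) * 20 = 200 nsec */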
            *latency = (min_cpu_distance - UCS_NUMA_MIN_DISTANCE) * 20e-9;
            return UCS_OK;
        }
    }
#endif

    /* Estimate the extra device latency according to its local CPUs mask */
    CPU_AND(&temp_cpu_mask, &dev->local_cpus, &process_affinity);
    if (CPU_EQUAL(&process_affinity, &temp_cpu_mask)) {
        *latency = 0;
    } else {
        *latency = 200e-9;
    }
    return UCS_OK;
}

ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    uct_ib_md_t     *md  = uct_ib_iface_md(iface);
    static const unsigned ib_port_widths[] = {
        [0] = 1,
        [1] = 4,
        [2] = 8,
        [3] = 12,
        [4] = 16
    };
    uint8_t active_width, active_speed, active_mtu, width_idx;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;
    ucs_status_t status;
    double numa_latency;

    uct_base_iface_query(&iface->super, iface_attr);

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width. Validate before taking the log, since ucs_ilog2(0)
     * is undefined. */
    if ((active_width < 1) || !ucs_is_pow2(active_width) ||
        (ucs_ilog2(active_width) > 4))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_width);
        return UCS_ERR_IO_ERROR;
    }
    width_idx = ucs_ilog2(active_width);

    iface_attr->device_addr_len = iface->addr_size;

    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency.overhead = 5000e-9;
        signal_rate                  = 2.5e9;
        encoding                     = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency.overhead = 2500e-9;
        signal_rate                  = 5.0e9;
        encoding                     = 8.0/10.0;
        break;
    case 4:
        iface_attr->latency.overhead = 1300e-9;
        if (uct_ib_iface_is_roce(iface)) {
            /* 10/40g Eth  */
            signal_rate              = 10.3125e9;
            encoding                 = 64.0/66.0;
        } else {
            /* QDR */
            signal_rate              = 10.0e9;
            encoding                 = 8.0/10.0;
        }
        break;
    case 8: /* FDR10 */
        iface_attr->latency.overhead = 700e-9;
        signal_rate                  = 10.3125e9;
        encoding                     = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency.overhead = 700e-9;
        signal_rate                  = 14.0625e9;
        encoding                     = 64.0/66.0;
        break;
    case 32: /* EDR / 100g Eth */
        iface_attr->latency.overhead = 600e-9;
        signal_rate                  = 25.78125e9;
        encoding                     = 64.0/66.0;
        break;
    case 64: /* 50g Eth */
        iface_attr->latency.overhead = 600e-9;
        signal_rate                  = 25.78125e9 * 2;
        encoding                     = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_speed);
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_iface_get_numa_latency(iface, &numa_latency);
    if (status != UCS_OK) {
        return status;
    }

    iface_attr->latency.overhead += numa_latency;
    iface_attr->latency.growth    = 0;

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width                 = ib_port_widths[width_idx];
    wire_speed            = (width * signal_rate * encoding) / 8.0;
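    /* e.g. EDR x4: 4 lanes * 25.78125e9 * (64/66) / 8 = 12.5 GB/s */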

    /* Calculate packet overhead  */
    mtu                   = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu),
                                    iface->config.seg_size);

    extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len +  UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;

    if (uct_ib_iface_is_roce(iface)) {
        extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
        iface_attr->latency.overhead += 200e-9;
    } else {
        /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
        extra_pkt_len += UCT_IB_LRH_LEN;
    }

    iface_attr->bandwidth.shared    = ucs_min((wire_speed * mtu) / (mtu + extra_pkt_len), md->pci_bw);
    iface_attr->bandwidth.dedicated = 0;
    iface_attr->priority            = uct_ib_device_spec(dev)->priority;

    return UCS_OK;
}

ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p)
{
    uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
    *fd_p  = iface->comp_channel->fd;
    return UCS_OK;
}

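/*
 * Event-notification flow (a sketch of the intended usage):
 * 1. uct_ib_iface_pre_arm() drains and acks stale CQ events; if any were
 *    pending it returns UCS_ERR_BUSY, so the caller should poll the CQs
 *    instead of sleeping.
 * 2. uct_ib_iface_arm_cq() requests the next completion notification via
 *    ibv_req_notify_cq().
 * 3. The application then waits for the completion channel fd (see
 *    uct_ib_iface_event_fd_get() above) to become readable, e.g. via epoll.
 */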
ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface)
{
    int res, send_cq_count, recv_cq_count;
    struct ibv_cq *cq;
    void *cq_context;

    send_cq_count = 0;
    recv_cq_count = 0;
    do {
        res = ibv_get_cq_event(iface->comp_channel, &cq, &cq_context);
        if (0 == res) {
            if (iface->cq[UCT_IB_DIR_TX] == cq) {
                iface->ops->event_cq(iface, UCT_IB_DIR_TX);
                ++send_cq_count;
            }
            if (iface->cq[UCT_IB_DIR_RX] == cq) {
                iface->ops->event_cq(iface, UCT_IB_DIR_RX);
                ++recv_cq_count;
            }
        }
    } while (res == 0);

    if (errno != EAGAIN) {
        return UCS_ERR_IO_ERROR;
    }

    if (send_cq_count > 0) {
        ibv_ack_cq_events(iface->cq[UCT_IB_DIR_TX], send_cq_count);
    }

    if (recv_cq_count > 0) {
        ibv_ack_cq_events(iface->cq[UCT_IB_DIR_RX], recv_cq_count);
    }

    /* avoid re-arming the interface if any events exist */
    if ((send_cq_count > 0) || (recv_cq_count > 0)) {
        ucs_trace("arm_cq: got %d send and %d recv events, returning BUSY",
                  send_cq_count, recv_cq_count);
        return UCS_ERR_BUSY;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
                                 uct_ib_dir_t dir,
                                 int solicited_only)
{
    int ret;

    ret = ibv_req_notify_cq(iface->cq[dir], solicited_only);
    if (ret != 0) {
        ucs_error("ibv_req_notify_cq("UCT_IB_IFACE_FMT", %d, sol=%d) failed: %m",
                  UCT_IB_IFACE_ARG(iface), dir, solicited_only);
        return UCS_ERR_IO_ERROR;
    }
    return UCS_OK;
}