/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2018.  ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#include <uct/ib/mlx5/ib_mlx5.h>
#include <uct/ib/mlx5/ib_mlx5_log.h>
#include <uct/ib/mlx5/dv/ib_mlx5_dv.h>
#include <uct/ib/base/ib_device.h>
#include <uct/base/uct_md.h>
#include <ucs/arch/cpu.h>
#include <ucs/debug/log.h>

#include "rc_mlx5.inl"


enum {
    UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC,

    /* Tag Matching address. It additionally contains the QP number which
     * is used for hardware offloads. */
    UCT_RC_MLX5_IFACE_ADDR_TYPE_TM
};


/**
 * RC mlx5 interface configuration
 */
typedef struct uct_rc_mlx5_iface_config {
    uct_rc_iface_config_t             super;
    uct_rc_mlx5_iface_common_config_t rc_mlx5_common;
    /* TODO wc_mode, UAR mode SnB W/A... */
} uct_rc_mlx5_iface_config_t;


ucs_config_field_t uct_rc_mlx5_iface_config_table[] = {
  {"RC_", "", NULL,
   ucs_offsetof(uct_rc_mlx5_iface_config_t, super),
   UCS_CONFIG_TYPE_TABLE(uct_rc_iface_config_table)},

  {"RC_", "", NULL,
   ucs_offsetof(uct_rc_mlx5_iface_config_t, rc_mlx5_common),
   UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)},

  {NULL}
};


static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops;

#if ENABLE_STATS
ucs_stats_class_t uct_rc_mlx5_iface_stats_class = {
    .name = "mlx5",
    .num_counters = UCT_RC_MLX5_IFACE_STAT_LAST,
    .counter_names = {
     [UCT_RC_MLX5_IFACE_STAT_RX_INL_32] = "rx_inl_32",
     [UCT_RC_MLX5_IFACE_STAT_RX_INL_64] = "rx_inl_64"
    }
};
#endif

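/* Examine an RX CQE that was flagged as a potential error. An ODP-aborted
 * receive (RESP_ERR with the REMOTE_ABORTED syndrome and the ODP vendor
 * syndrome) is benign: the aborted SRQ segment is released and the CQ
 * consumer index is advanced. Any other error CQE is passed to the generic
 * mlx5 completion checker. */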
void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface,
                                           struct mlx5_cqe64 *cqe)
{
    uct_ib_mlx5_cq_t *cq      = &iface->cq[UCT_IB_DIR_RX];
    struct mlx5_err_cqe *ecqe = (void*)cqe;
    uct_ib_mlx5_srq_seg_t *seg;
    uint16_t wqe_ctr;

    ucs_memory_cpu_load_fence();

    if (((ecqe->op_own >> 4) == MLX5_CQE_RESP_ERR) &&
        (ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR) &&
        (ecqe->vendor_err_synd == UCT_IB_MLX5_CQE_VENDOR_SYND_ODP))
    {
        /* Release the aborted segment */
        wqe_ctr = ntohs(ecqe->wqe_counter);
        seg     = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr);
        ++cq->cq_ci;
        /* TODO: Check if ib_stride_index is valid for error CQE */
        uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, UCS_OK,
                                          iface->super.super.config.rx_headroom_offset,
                                          &iface->super.super.release_desc);
    } else {
        ucs_assert((ecqe->op_own >> 4) != MLX5_CQE_INVALID);
        uct_ib_mlx5_check_completion(&iface->super.super, cq, cqe);
    }
}

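/* Poll a single CQE from the TX CQ. The owning endpoint is looked up by the
 * QP number carried in the CQE, send resources are reclaimed up to the
 * hardware counter (hw_ci) reported by the CQE, and the endpoint's pending
 * operations get a chance to run via the arbiter. Returns the number of
 * completions processed (0 or 1). */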
static UCS_F_ALWAYS_INLINE unsigned
uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface)
{
    struct mlx5_cqe64 *cqe;
    uct_rc_mlx5_ep_t *ep;
    unsigned qp_num;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_TX]);
    if (cqe == NULL) {
        return 0;
    }

    UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num), uct_rc_mlx5_ep_t);
    ucs_assert(ep != NULL);
    hw_ci = ntohs(cqe->wqe_counter);
    ucs_trace_poll("rc_mlx5 iface %p tx_cqe: ep %p qpn 0x%x hw_ci %d", iface, ep,
                   qp_num, hw_ci);

    uct_rc_mlx5_common_update_tx_res(&iface->super, &ep->tx.wq, &ep->super.txqp,
                                     hw_ci);
    uct_rc_mlx5_txqp_process_tx_cqe(&ep->super.txqp, cqe, hw_ci);

    ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group);
    ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending, NULL);

    return 1;
}

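/* Generic progress routine: RX is polled first, and the TX CQ is polled only
 * when no RX completion was processed, so receives take priority over send
 * completions. */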
unsigned uct_rc_mlx5_iface_progress(void *arg)
{
    uct_rc_mlx5_iface_common_t *iface = arg;
    unsigned count;

    count = uct_rc_mlx5_iface_common_poll_rx(iface, UCT_RC_MLX5_POLL_FLAG_HAS_EP);
    if (count > 0) {
        return count;
    }
    return uct_rc_mlx5_iface_poll_tx(iface);
}

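/* Report interface capabilities. When device memory (DM) is available, the
 * DM segment length may extend the maximum inline sizes for short AM and
 * short PUT beyond the WQE-inline limits. */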
static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t);
    uct_rc_iface_t *rc_iface   = &iface->super;
    size_t max_am_inline       = UCT_IB_MLX5_AM_MAX_SHORT(0);
    size_t max_put_inline      = UCT_IB_MLX5_PUT_MAX_SHORT(0);
    ucs_status_t status;

#if HAVE_IBV_DM
    if (iface->dm.dm != NULL) {
        max_am_inline  = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_AM_MAX_SHORT(0));
        max_put_inline = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_PUT_MAX_SHORT(0));
    }
#endif

    status = uct_rc_iface_query(rc_iface, iface_attr,
                                max_put_inline,
                                max_am_inline,
                                UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(0),
                                UCT_IB_MLX5_AM_ZCOPY_MAX_IOV,
                                UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0),
                                sizeof(uct_rc_mlx5_hdr_t));
    if (status != UCS_OK) {
        return status;
    }

    uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, 0);
    iface_attr->latency.growth += 1e-9; /* 1 ns for each extra QP */
    iface_attr->ep_addr_len     = sizeof(uct_rc_mlx5_ep_address_t);
    iface_attr->iface_addr_len  = sizeof(uint8_t);
    return UCS_OK;
}

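/* Arm the CQ for events. With mlx5dv support the CQ is armed directly through
 * its doorbell record; otherwise the consumer index is synced back to verbs
 * and the generic arm path is used. */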
static ucs_status_t uct_rc_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface,
                                             uct_ib_dir_t dir,
                                             int solicited_only)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t);
#if HAVE_DECL_MLX5DV_INIT_OBJ
    return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited_only);
#else
    uct_ib_mlx5_update_cq_ci(iface->super.super.cq[dir],
                             iface->cq[dir].cq_ci);
    return uct_ib_iface_arm_cq(ib_iface, dir, solicited_only);
#endif
}

static void
uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg,
                                 ucs_status_t status)
{
    struct mlx5_cqe64  *cqe    = arg;
    uct_rc_iface_t     *iface  = ucs_derived_of(ib_iface, uct_rc_iface_t);
    unsigned           qp_num  = ntohl(cqe->sop_drop_qpn) &
                                 UCS_MASK(UCT_IB_QPN_ORDER);
    uct_rc_mlx5_ep_t   *ep     = ucs_derived_of(uct_rc_iface_lookup_ep(iface,
                                                                       qp_num),
                                                uct_rc_mlx5_ep_t);
    ucs_log_level_t    log_lvl = UCS_LOG_LEVEL_FATAL;
    uct_ib_mlx5_txwq_t txwq_copy;
    size_t             txwq_size;

    if (!ep) {
        return;
    }

    /* Create a copy of the RC txwq for completion error reporting, since the
     * QP will be released by set_ep_failed() */
    txwq_copy = ep->tx.wq;
    txwq_size = UCS_PTR_BYTE_DIFF(ep->tx.wq.qstart, ep->tx.wq.qend);
    txwq_copy.qstart = ucs_malloc(txwq_size, "rc_txwq_copy");
    if (txwq_copy.qstart != NULL) {
        memcpy(txwq_copy.qstart, ep->tx.wq.qstart, txwq_size);
        txwq_copy.qend = UCS_PTR_BYTE_OFFSET(txwq_copy.qstart, txwq_size);
    }

    if (uct_rc_mlx5_ep_handle_failure(ep, status) == UCS_OK) {
        log_lvl = ib_iface->super.config.failure_level;
    }

    uct_ib_mlx5_completion_with_err(ib_iface, arg,
                                    txwq_copy.qstart ? &txwq_copy : NULL,
                                    log_lvl);
    ucs_free(txwq_copy.qstart);
}

static void uct_rc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned flags)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t);

    if (flags & UCT_PROGRESS_RECV) {
        uct_rc_mlx5_iface_common_prepost_recvs(iface);
    }

    uct_base_iface_progress_enable_cb(&iface->super.super.super,
                                      iface->super.progress, flags);
}

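/* Create an RC QP using the best available mechanism: DEVX when the MD
 * supports it, mlx5dv_create_qp() when the DV API is present (optionally
 * requesting scatter-to-CQE), or the plain verbs path otherwise. On success,
 * common RC QP initialization is applied and, if the QP has send WRs, its TX
 * work queue is set up for direct WQE posting. */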
ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_txwq_t *txwq,
                                         uct_ib_qp_attr_t *attr)
{
    uct_ib_iface_t *ib_iface           = &iface->super.super;
    ucs_status_t status;
#if HAVE_DECL_MLX5DV_CREATE_QP
    uct_ib_mlx5_md_t *md               = ucs_derived_of(ib_iface->super.md,
                                                        uct_ib_mlx5_md_t);
    uct_ib_device_t *dev               = &md->super.dev;
    struct mlx5dv_qp_init_attr dv_attr = {};

    if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP) {
        return uct_ib_mlx5_devx_create_qp(ib_iface, qp, txwq, attr);
    }

    status = uct_ib_mlx5_iface_fill_attr(ib_iface, qp, attr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_iface_fill_attr(ib_iface, attr);
#if HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE
    dv_attr.comp_mask    = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
    dv_attr.create_flags = MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE;
#endif
    qp->verbs.qp = mlx5dv_create_qp(dev->ibv_context, &attr->ibv, &dv_attr);
    if (qp->verbs.qp == NULL) {
        ucs_error("mlx5dv_create_qp("UCT_IB_IFACE_FMT"): failed: %m",
                  UCT_IB_IFACE_ARG(ib_iface));
        status = UCS_ERR_IO_ERROR;
        goto err;
    }

    qp->qp_num = qp->verbs.qp->qp_num;
#else
    status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, attr);
    if (status != UCS_OK) {
        goto err;
    }
#endif

    status = uct_rc_iface_qp_init(&iface->super, qp->verbs.qp);
    if (status != UCS_OK) {
        goto err_destroy_qp;
    }

    if (attr->cap.max_send_wr) {
        status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker,
                                       iface->tx.mmio_mode, txwq,
                                       qp->verbs.qp);
        if (status != UCS_OK) {
            ucs_error("Failed to get mlx5 QP information");
            goto err_destory_qp;
        }
    }

    return UCS_OK;

err_destroy_qp:
    ibv_destroy_qp(qp->verbs.qp);
err:
    return status;
}

static UCS_F_MAYBE_UNUSED unsigned uct_rc_mlx5_iface_progress_tm(void *arg)
{
    uct_rc_mlx5_iface_common_t *iface = arg;
    unsigned count;

    count = uct_rc_mlx5_iface_common_poll_rx(iface,
                                             UCT_RC_MLX5_POLL_FLAG_HAS_EP |
                                             UCT_RC_MLX5_POLL_FLAG_TM);
    if (count > 0) {
        return count;
    }
    return uct_rc_mlx5_iface_poll_tx(iface);
}

#if IBV_HW_TM
static ucs_status_t uct_rc_mlx5_iface_tag_recv_zcopy(uct_iface_h tl_iface,
                                                     uct_tag_t tag,
                                                     uct_tag_t tag_mask,
                                                     const uct_iov_t *iov,
                                                     size_t iovcnt,
                                                     uct_tag_context_t *ctx)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t);

    return uct_rc_mlx5_iface_common_tag_recv(iface, tag, tag_mask, iov,
                                             iovcnt, ctx);
}

static ucs_status_t uct_rc_mlx5_iface_tag_recv_cancel(uct_iface_h tl_iface,
                                                      uct_tag_context_t *ctx,
                                                      int force)
{
   uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_common_t);

   return uct_rc_mlx5_iface_common_tag_recv_cancel(iface, ctx, force);
}
#endif

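/* Pre-initialization that must run before the superclass constructor: decide
 * whether HW tag-matching (TM) is enabled, wire up the unexpected eager/rndv
 * callbacks, size the RX CQ accordingly, and select the number of
 * Multi-Packet RQ strides. */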
static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface,
                                              uct_md_h tl_md,
                                              uct_rc_iface_common_config_t *rc_config,
                                              uct_rc_mlx5_iface_common_config_t *mlx5_config,
                                              const uct_iface_params_t *params,
                                              uct_ib_iface_init_attr_t *init_attr)
{
#if IBV_HW_TM
    uct_ib_mlx5_md_t *md              = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
    uct_ib_device_t UCS_V_UNUSED *dev = &md->super.dev;
    struct ibv_tmh tmh;
    int mtu;
    ucs_status_t status;

    iface->tm.enabled = mlx5_config->tm.enable && (init_attr->flags &
                                                   UCT_IB_TM_SUPPORTED);
    if (!iface->tm.enabled) {
        goto out_tm_disabled;
    }

    /* Compile-time check that struct ibv_tmh and uct_rc_mlx5_hdr_t are
     * wire-compatible for the no-tag protocol case.
     */
    UCS_STATIC_ASSERT(sizeof(tmh.opcode) == sizeof(((uct_rc_mlx5_hdr_t*)0)->tmh_opcode));
    UCS_STATIC_ASSERT(ucs_offsetof(struct ibv_tmh, opcode) ==
                      ucs_offsetof(uct_rc_mlx5_hdr_t, tmh_opcode));

    UCS_STATIC_ASSERT(sizeof(uct_rc_mlx5_ctx_priv_t) <= UCT_TAG_PRIV_LEN);

    iface->tm.eager_unexp.cb  = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB) ?
                                params->eager_cb : NULL;
    iface->tm.eager_unexp.arg = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG) ?
                                params->eager_arg : NULL;
    iface->tm.rndv_unexp.cb   = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB) ?
                                params->rndv_cb : NULL;
    iface->tm.rndv_unexp.arg  = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG) ?
                                params->rndv_arg : NULL;
    iface->tm.unexpected_cnt  = 0;
    iface->tm.num_outstanding = 0;
    iface->tm.num_tags        = ucs_min(IBV_DEVICE_TM_CAPS(dev, max_num_tags),
                                        mlx5_config->tm.list_size);

    /* There can be:
     * - up to rx.queue_len RX CQEs
     * - up to 3 CQEs for every posted tag: ADD, TM_CONSUMED and MSG_ARRIVED
     * - one SYNC CQE for every IBV_DEVICE_MAX_UNEXP_COUNT unexpected receives */
    UCS_STATIC_ASSERT(IBV_DEVICE_MAX_UNEXP_COUNT);
    init_attr->rx_cq_len     = rc_config->super.rx.queue_len + iface->tm.num_tags * 3 +
                               rc_config->super.rx.queue_len /
                               IBV_DEVICE_MAX_UNEXP_COUNT;
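    /* For example, with hypothetical values rx.queue_len = 4096,
     * num_tags = 64 and IBV_DEVICE_MAX_UNEXP_COUNT = 64, this yields
     * 4096 + 64 * 3 + 4096 / 64 = 4352 CQEs. */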
    init_attr->seg_size      = ucs_max(mlx5_config->tm.seg_size,
                                       rc_config->super.seg_size);
    iface->tm.mp.num_strides = 1;
    iface->tm.max_bcopy      = init_attr->seg_size;

    /* Multi-Packet XRQ initialization */
    if (!ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_MP_RQ       |
                                       UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ |
                                       UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP)) {
        return UCS_OK;
    }

    if ((mlx5_config->tm.mp_num_strides == UCS_ULUNITS_AUTO) ||
        (mlx5_config->tm.mp_num_strides == 1)) {
        return UCS_OK;
        /* TODO: make the following to be default when MP support is added to UCP
        iface->tm.mp.num_strides = UCS_BIT(IBV_DEVICE_MP_MIN_LOG_NUM_STRIDES); */
    } else if ((mlx5_config->tm.mp_num_strides != 8) &&
               (mlx5_config->tm.mp_num_strides != 16)) {
        ucs_error("invalid value of TM_NUM_STRIDES: %lu, must be 1, 8 or 16",
                  mlx5_config->tm.mp_num_strides);
        return UCS_ERR_INVALID_PARAM;
    }

    status = uct_ib_device_mtu(params->mode.device.dev_name, tl_md, &mtu);
    if (status != UCS_OK) {
        ucs_error("failed to get port MTU: %s", ucs_status_string(status));
        return UCS_ERR_IO_ERROR;
    }

    iface->tm.mp.num_strides = mlx5_config->tm.mp_num_strides;
    init_attr->seg_size      = mtu;

    return UCS_OK;

out_tm_disabled:
#else
    iface->tm.enabled        = 0;
#endif
    init_attr->rx_cq_len     = rc_config->super.rx.queue_len;
    init_attr->seg_size      = rc_config->super.seg_size;
    iface->tm.mp.num_strides = 1;

    return UCS_OK;
}

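/* Set up the receive side. With HW TM enabled, a tag-matching SRQ is created
 * either through DEVX or through the verbs TM API; otherwise a regular verbs
 * SRQ is created and wrapped for direct mlx5 access. The progress routine is
 * chosen to match. */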
static ucs_status_t
uct_rc_mlx5_iface_init_rx(uct_rc_iface_t *rc_iface,
                          const uct_rc_iface_common_config_t *rc_config)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface, uct_rc_mlx5_iface_common_t);
    uct_ib_mlx5_md_t *md = ucs_derived_of(rc_iface->super.super.md, uct_ib_mlx5_md_t);
    struct ibv_srq_init_attr_ex srq_attr = {};
    ucs_status_t status;

    if (UCT_RC_MLX5_TM_ENABLED(iface)) {
        if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ) {
            status = uct_rc_mlx5_devx_init_rx_tm(iface, rc_config, 0,
                                                 UCT_RC_RNDV_HDR_LEN);
        } else {
            status = uct_rc_mlx5_init_rx_tm(iface, rc_config, &srq_attr,
                                            UCT_RC_RNDV_HDR_LEN);
        }

        if (status != UCS_OK) {
            goto err;
        }

        iface->super.progress = uct_rc_mlx5_iface_progress_tm;
        return UCS_OK;
    }

    /* MP XRQ is supported with HW TM only */
    ucs_assert(iface->tm.mp.num_strides == 1);

    status = uct_rc_iface_init_rx(rc_iface, rc_config, &iface->rx.srq.verbs.srq);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_mlx5_srq_init(&iface->rx.srq, iface->rx.srq.verbs.srq,
                                  iface->super.super.config.seg_size,
                                  iface->tm.mp.num_strides);
    if (status != UCS_OK) {
        goto err_free_srq;
    }

    iface->rx.srq.type    = UCT_IB_MLX5_OBJ_TYPE_VERBS;
    iface->super.progress = uct_rc_mlx5_iface_progress;
    return UCS_OK;

err_free_srq:
    uct_rc_mlx5_destroy_srq(&iface->rx.srq);
err:
    return status;
}

static void uct_rc_mlx5_iface_cleanup_rx(uct_rc_iface_t *rc_iface)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface, uct_rc_mlx5_iface_common_t);

    uct_rc_mlx5_destroy_srq(&iface->rx.srq);
}

static void uct_rc_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface,
                                       uct_ib_dir_t dir)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface, uct_rc_mlx5_iface_common_t);

    iface->cq[dir].cq_sn++;
}

static uint8_t uct_rc_mlx5_iface_get_address_type(uct_iface_h tl_iface)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);

    return UCT_RC_MLX5_TM_ENABLED(iface) ?  UCT_RC_MLX5_IFACE_ADDR_TYPE_TM :
                                            UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC;
}

static ucs_status_t uct_rc_mlx5_iface_get_address(uct_iface_h tl_iface,
                                                  uct_iface_addr_t *addr)
{
    *(uint8_t*)addr = uct_rc_mlx5_iface_get_address_type(tl_iface);

    return UCS_OK;
}

int uct_rc_mlx5_iface_is_reachable(const uct_iface_h tl_iface,
                                   const uct_device_addr_t *dev_addr,
                                   const uct_iface_addr_t *iface_addr)
{
    uint8_t my_type = uct_rc_mlx5_iface_get_address_type(tl_iface);

    if ((iface_addr != NULL) && (my_type != *(uint8_t*)iface_addr)) {
        return 0;
    }

    return uct_ib_iface_is_reachable(tl_iface, dev_addr, iface_addr);
}


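/* Choose the SRQ topology. A cyclic SRQ (WQEs consumed in ring order, without
 * a linked free-list) is preferred when usable; otherwise the classic linked
 * list layout is used. The MP_RQ variants are selected when Multi-Packet RQ
 * is enabled. */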
static int uct_rc_mlx5_iface_srq_topo(uct_rc_mlx5_iface_common_t *iface,
                                      uct_md_h md,
                                      uct_rc_mlx5_iface_common_config_t *mlx5_config)
{
    uct_ib_mlx5_md_t *ib_md = ucs_derived_of(md, uct_ib_mlx5_md_t);

    /* Cyclic SRQ is supported with HW TM and DEVX only. */
    if (((mlx5_config->srq_topo == UCT_RC_MLX5_SRQ_TOPO_AUTO) ||
        (mlx5_config->srq_topo == UCT_RC_MLX5_SRQ_TOPO_CYCLIC)) &&
        UCT_RC_MLX5_TM_ENABLED(iface) &&
        (ib_md->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) {

        return UCT_RC_MLX5_MP_ENABLED(iface) ?
               UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ : UCT_IB_MLX5_SRQ_TOPO_CYCLIC;
    }

    return UCT_RC_MLX5_MP_ENABLED(iface) ?
           UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ : UCT_IB_MLX5_SRQ_TOPO_LIST;
}

UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t,
                    uct_rc_iface_ops_t *ops,
                    uct_md_h md, uct_worker_h worker,
                    const uct_iface_params_t *params,
                    uct_rc_iface_common_config_t *rc_config,
                    uct_rc_mlx5_iface_common_config_t *mlx5_config,
                    uct_ib_iface_init_attr_t *init_attr)
{
    uct_ib_device_t *dev;
    ucs_status_t status;

    status = uct_rc_mlx5_iface_preinit(self, md, rc_config, mlx5_config,
                                       params, init_attr);
    if (status != UCS_OK) {
        return status;
    }

    self->rx.srq.type                = UCT_IB_MLX5_OBJ_TYPE_LAST;
    self->rx.srq.topo                = uct_rc_mlx5_iface_srq_topo(self, md, mlx5_config);
    self->tm.cmd_wq.super.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST;
    init_attr->rx_hdr_len            = UCT_RC_MLX5_MP_ENABLED(self) ?
                                       0 : sizeof(uct_rc_mlx5_hdr_t);

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, md, worker, params,
                              rc_config, init_attr);

    dev                       = uct_ib_iface_device(&self->super.super);
    self->tx.mmio_mode        = mlx5_config->super.mmio_mode;
    self->tx.bb_max           = ucs_min(mlx5_config->tx_max_bb, UINT16_MAX);
    self->tm.am_desc.super.cb = uct_rc_mlx5_release_desc;

    if (!UCT_RC_MLX5_MP_ENABLED(self)) {
        self->tm.am_desc.offset = self->super.super.config.rx_headroom_offset;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX],
                                &self->cq[UCT_IB_DIR_TX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX],
                                &self->cq[UCT_IB_DIR_RX]);
    if (status != UCS_OK) {
        return status;
    }

    status = UCS_STATS_NODE_ALLOC(&self->stats, &uct_rc_mlx5_iface_stats_class,
                                  self->super.stats);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_rc_mlx5_iface_common_tag_init(self);
    if (status != UCS_OK) {
        goto cleanup_stats;
    }

    status = uct_rc_mlx5_iface_common_dm_init(self, &self->super,
                                              &mlx5_config->super);
    if (status != UCS_OK) {
        goto cleanup_tm;
    }

    self->super.config.fence_mode  = (uct_rc_fence_mode_t)rc_config->fence_mode;
    self->super.rx.srq.quota       = self->rx.srq.mask + 1;
    self->super.config.exp_backoff = mlx5_config->exp_backoff;

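    /* Select WQE fence flags according to the requested fence mode: WEAK
     * fences atomics only, STRONG fences both atomics and RDMA writes, and
     * NONE disables fencing. AUTO picks WEAK or NONE depending on whether the
     * device supports PCI atomics. */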
    if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_WEAK) ||
        ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) &&
         uct_ib_device_has_pci_atomics(dev))) {
        self->config.atomic_fence_flag = UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE;
        self->config.put_fence_flag    = 0;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_WEAK;
    } else if (rc_config->fence_mode == UCT_RC_FENCE_MODE_STRONG) {
        self->config.atomic_fence_flag = UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER;
        self->config.put_fence_flag    = UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_STRONG;
    } else if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_NONE) ||
               ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) &&
                !uct_ib_device_has_pci_atomics(dev))) {
        self->config.atomic_fence_flag = 0;
        self->config.put_fence_flag    = 0;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_NONE;
    } else {
        ucs_error("incorrect fence value: %d", self->super.config.fence_mode);
        status = UCS_ERR_INVALID_PARAM;
        goto cleanup_tm;
    }

    /* By default set to something that is always in cache */
    self->rx.pref_ptr = self;

    status = uct_iface_mpool_init(&self->super.super.super,
                                  &self->tx.atomic_desc_mp,
                                  sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE,
                                  sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE,
                                  UCS_SYS_CACHE_LINE_SIZE,
                                  &rc_config->super.tx.mp,
                                  self->super.config.tx_qp_len,
                                  uct_rc_iface_send_desc_init,
                                  "rc_mlx5_atomic_desc");
    if (status != UCS_OK) {
        goto cleanup_dm;
    }

    /* For little-endian atomic reply, override the default functions, to still
     * treat the response as big-endian when it arrives in the CQE.
     */
    if (!(uct_ib_iface_device(&self->super.super)->atomic_arg_sizes_be & sizeof(uint64_t))) {
        self->super.config.atomic64_handler     = uct_rc_mlx5_common_atomic64_le_handler;
    }
    if (!(uct_ib_iface_device(&self->super.super)->ext_atomic_arg_sizes_be & sizeof(uint32_t))) {
        self->super.config.atomic32_ext_handler = uct_rc_mlx5_common_atomic32_le_handler;
    }
    if (!(uct_ib_iface_device(&self->super.super)->ext_atomic_arg_sizes_be & sizeof(uint64_t))) {
        self->super.config.atomic64_ext_handler = uct_rc_mlx5_common_atomic64_le_handler;
    }

    return UCS_OK;

cleanup_dm:
    uct_rc_mlx5_iface_common_dm_cleanup(self);
cleanup_tm:
    uct_rc_mlx5_iface_common_tag_cleanup(self);
cleanup_stats:
    UCS_STATS_NODE_FREE(self->stats);
    return status;
}

static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_common_t)
{
    ucs_mpool_cleanup(&self->tx.atomic_desc_mp, 1);
    uct_rc_mlx5_iface_common_dm_cleanup(self);
    uct_rc_mlx5_iface_common_tag_cleanup(self);
    UCS_STATS_NODE_FREE(self->stats);
}

UCS_CLASS_DEFINE(uct_rc_mlx5_iface_common_t, uct_rc_iface_t);

typedef struct {
    uct_rc_mlx5_iface_common_t  super;
} uct_rc_mlx5_iface_t;

UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t,
                    uct_md_h tl_md, uct_worker_h worker,
                    const uct_iface_params_t *params,
                    const uct_iface_config_t *tl_config)
{
    uct_rc_mlx5_iface_config_t *config = ucs_derived_of(tl_config,
                                                        uct_rc_mlx5_iface_config_t);
    uct_ib_mlx5_md_t UCS_V_UNUSED *md  = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
    uct_ib_iface_init_attr_t init_attr = {};
    ucs_status_t status;

    init_attr.fc_req_size = sizeof(uct_rc_fc_request_t);
    init_attr.flags       = UCT_IB_CQ_IGNORE_OVERRUN;
    init_attr.rx_hdr_len  = sizeof(uct_rc_mlx5_hdr_t);
    init_attr.tx_cq_len   = config->super.tx_cq_len;
    init_attr.qp_type     = IBV_QPT_RC;

    if (IBV_DEVICE_TM_FLAGS(&md->super.dev)) {
        init_attr.flags  |= UCT_IB_TM_SUPPORTED;
    }

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, &uct_rc_mlx5_iface_ops,
                              tl_md, worker, params, &config->super.super,
                              &config->rc_mlx5_common, &init_attr);

    self->super.super.config.tx_moderation = ucs_min(config->super.tx_cq_moderation,
                                                     self->super.tx.bb_max / 4);

    status = uct_rc_init_fc_thresh(&config->super, &self->super.super);
    if (status != UCS_OK) {
        return status;
    }

    /* Set max_iov for put_zcopy and get_zcopy */
    uct_ib_iface_set_max_iov(&self->super.super.super,
                             (UCT_IB_MLX5_MAX_SEND_WQE_SIZE -
                             sizeof(struct mlx5_wqe_raddr_seg) -
                             sizeof(struct mlx5_wqe_ctrl_seg)) /
                             sizeof(struct mlx5_wqe_data_seg));
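    /* For example, assuming a (hypothetical) 1 KB maximum send WQE and
     * 16-byte ctrl/raddr/data segments, this allows
     * (1024 - 16 - 16) / 16 = 62 IOV entries. */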

    return UCS_OK;
}

static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_t)
{
    uct_base_iface_progress_disable(&self->super.super.super.super.super,
                                    UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
}

UCS_CLASS_DEFINE(uct_rc_mlx5_iface_t, uct_rc_mlx5_iface_common_t);

static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_mlx5_iface_t, uct_iface_t, uct_md_h,
                                 uct_worker_h, const uct_iface_params_t*,
                                 const uct_iface_config_t*);

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_mlx5_iface_t, uct_iface_t);

static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = {
    {
    {
    .ep_put_short             = uct_rc_mlx5_ep_put_short,
    .ep_put_bcopy             = uct_rc_mlx5_ep_put_bcopy,
    .ep_put_zcopy             = uct_rc_mlx5_ep_put_zcopy,
    .ep_get_bcopy             = uct_rc_mlx5_ep_get_bcopy,
    .ep_get_zcopy             = uct_rc_mlx5_ep_get_zcopy,
    .ep_am_short              = uct_rc_mlx5_ep_am_short,
    .ep_am_bcopy              = uct_rc_mlx5_ep_am_bcopy,
    .ep_am_zcopy              = uct_rc_mlx5_ep_am_zcopy,
    .ep_atomic_cswap64        = uct_rc_mlx5_ep_atomic_cswap64,
    .ep_atomic_cswap32        = uct_rc_mlx5_ep_atomic_cswap32,
    .ep_atomic64_post         = uct_rc_mlx5_ep_atomic64_post,
    .ep_atomic32_post         = uct_rc_mlx5_ep_atomic32_post,
    .ep_atomic64_fetch        = uct_rc_mlx5_ep_atomic64_fetch,
    .ep_atomic32_fetch        = uct_rc_mlx5_ep_atomic32_fetch,
    .ep_pending_add           = uct_rc_ep_pending_add,
    .ep_pending_purge         = uct_rc_ep_pending_purge,
    .ep_flush                 = uct_rc_mlx5_ep_flush,
    .ep_fence                 = uct_rc_mlx5_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_rc_mlx5_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_ep_t),
    .ep_get_address           = uct_rc_mlx5_ep_get_address,
    .ep_connect_to_ep         = uct_rc_mlx5_ep_connect_to_ep,
#if IBV_HW_TM
    .ep_tag_eager_short       = uct_rc_mlx5_ep_tag_eager_short,
    .ep_tag_eager_bcopy       = uct_rc_mlx5_ep_tag_eager_bcopy,
    .ep_tag_eager_zcopy       = uct_rc_mlx5_ep_tag_eager_zcopy,
    .ep_tag_rndv_zcopy        = uct_rc_mlx5_ep_tag_rndv_zcopy,
    .ep_tag_rndv_request      = uct_rc_mlx5_ep_tag_rndv_request,
    .ep_tag_rndv_cancel       = uct_rc_mlx5_ep_tag_rndv_cancel,
    .iface_tag_recv_zcopy     = uct_rc_mlx5_iface_tag_recv_zcopy,
    .iface_tag_recv_cancel    = uct_rc_mlx5_iface_tag_recv_cancel,
#endif
    .iface_flush              = uct_rc_iface_flush,
    .iface_fence              = uct_rc_iface_fence,
    .iface_progress_enable    = uct_rc_mlx5_iface_progress_enable,
    .iface_progress_disable   = uct_base_iface_progress_disable,
    .iface_progress           = uct_rc_iface_do_progress,
    .iface_event_fd_get       = uct_ib_iface_event_fd_get,
    .iface_event_arm          = uct_rc_iface_event_arm,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_iface_t),
    .iface_query              = uct_rc_mlx5_iface_query,
    .iface_get_address        = uct_rc_mlx5_iface_get_address,
    .iface_get_device_address = uct_ib_iface_get_device_address,
    .iface_is_reachable       = uct_rc_mlx5_iface_is_reachable
    },
    .create_cq                = uct_ib_mlx5_create_cq,
    .arm_cq                   = uct_rc_mlx5_iface_arm_cq,
    .event_cq                 = uct_rc_mlx5_iface_event_cq,
    .handle_failure           = uct_rc_mlx5_iface_handle_failure,
    .set_ep_failed            = uct_rc_mlx5_ep_set_failed,
    },
    .init_rx                  = uct_rc_mlx5_iface_init_rx,
    .cleanup_rx               = uct_rc_mlx5_iface_cleanup_rx,
    .fc_ctrl                  = uct_rc_mlx5_ep_fc_ctrl,
    .fc_handler               = uct_rc_iface_fc_handler,
};

static ucs_status_t
uct_rc_mlx5_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
                             unsigned *num_tl_devices_p)
{
    uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t);
    int flags;

    flags = UCT_IB_DEVICE_FLAG_MLX5_PRM |
            (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB);
    return uct_ib_device_query_ports(&ib_md->dev, flags, tl_devices_p,
                                     num_tl_devices_p);
}

UCT_TL_DEFINE(&uct_ib_component, rc_mlx5, uct_rc_mlx5_query_tl_devices,
              uct_rc_mlx5_iface_t, "RC_MLX5_", uct_rc_mlx5_iface_config_table,
              uct_rc_mlx5_iface_config_t);