/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#include
#include
#include
#include
#include
#include
#include
#include

#include "rc_mlx5.inl"


enum {
    UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC,

    /* Tag Matching address. It additionally contains QP number which
     * is used for hardware offloads. */
    UCT_RC_MLX5_IFACE_ADDR_TYPE_TM
};


/**
 * RC mlx5 interface configuration
 */
typedef struct uct_rc_mlx5_iface_config {
    uct_rc_iface_config_t             super;
    uct_rc_mlx5_iface_common_config_t rc_mlx5_common;
    /* TODO wc_mode, UAR mode SnB W/A... */
} uct_rc_mlx5_iface_config_t;


ucs_config_field_t uct_rc_mlx5_iface_config_table[] = {
  {"RC_", "", NULL,
   ucs_offsetof(uct_rc_mlx5_iface_config_t, super),
   UCS_CONFIG_TYPE_TABLE(uct_rc_iface_config_table)},

  {"RC_", "", NULL,
   ucs_offsetof(uct_rc_mlx5_iface_config_t, rc_mlx5_common),
   UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)},

  {NULL}
};

static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops;

#if ENABLE_STATS
ucs_stats_class_t uct_rc_mlx5_iface_stats_class = {
    .name          = "mlx5",
    .num_counters  = UCT_RC_MLX5_IFACE_STAT_LAST,
    .counter_names = {
        [UCT_RC_MLX5_IFACE_STAT_RX_INL_32] = "rx_inl_32",
        [UCT_RC_MLX5_IFACE_STAT_RX_INL_64] = "rx_inl_64"
    }
};
#endif

void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface,
                                           struct mlx5_cqe64 *cqe)
{
    uct_ib_mlx5_cq_t *cq      = &iface->cq[UCT_IB_DIR_RX];
    struct mlx5_err_cqe *ecqe = (void*)cqe;
    uct_ib_mlx5_srq_seg_t *seg;
    uint16_t wqe_ctr;

    ucs_memory_cpu_load_fence();

    if (((ecqe->op_own >> 4) == MLX5_CQE_RESP_ERR) &&
        (ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR) &&
        (ecqe->vendor_err_synd == UCT_IB_MLX5_CQE_VENDOR_SYND_ODP)) {
        /* Release the aborted segment */
        wqe_ctr = ntohs(ecqe->wqe_counter);
        seg     = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr);
        ++cq->cq_ci;
        /* TODO: Check if ib_stride_index valid for error CQE */
        uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, UCS_OK,
                                          iface->super.super.config.rx_headroom_offset,
                                          &iface->super.super.release_desc);
    } else {
        ucs_assert((ecqe->op_own >> 4) != MLX5_CQE_INVALID);
        uct_ib_mlx5_check_completion(&iface->super.super, cq, cqe);
    }
}

static UCS_F_ALWAYS_INLINE unsigned
uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface)
{
    struct mlx5_cqe64 *cqe;
    uct_rc_mlx5_ep_t *ep;
    unsigned qp_num;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_TX]);
    if (cqe == NULL) {
        return 0;
    }

    UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    /* Find the endpoint whose QP produced this completion, release its TX
     * resources and dispatch its pending queue */
    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    ep     = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num),
                            uct_rc_mlx5_ep_t);
    ucs_assert(ep != NULL);

    hw_ci = ntohs(cqe->wqe_counter);
    ucs_trace_poll("rc_mlx5 iface %p tx_cqe: ep %p qpn 0x%x hw_ci %d",
                   iface, ep, qp_num, hw_ci);

    uct_rc_mlx5_common_update_tx_res(&iface->super, &ep->tx.wq, &ep->super.txqp,
                                     hw_ci);
    uct_rc_mlx5_txqp_process_tx_cqe(&ep->super.txqp, cqe, hw_ci);

    ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group);
    ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending,
                         NULL);

    return 1;
}

unsigned uct_rc_mlx5_iface_progress(void *arg)
{
    uct_rc_mlx5_iface_common_t *iface = arg;
    unsigned count;

    count = uct_rc_mlx5_iface_common_poll_rx(iface, UCT_RC_MLX5_POLL_FLAG_HAS_EP);
    if (count > 0) {
        return count;
    }
    return uct_rc_mlx5_iface_poll_tx(iface);
}
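
/* Reports the interface capabilities to UCT: the generic RC attributes are
 * queried first and then adjusted for mlx5 limits. Inline (short) thresholds
 * follow the mlx5 WQE limits and, when device memory is available
 * (HAVE_IBV_DM), are extended to the DM segment length. */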
static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface,
                                            uct_iface_attr_t *iface_attr)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);
    uct_rc_iface_t *rc_iface          = &iface->super;
    size_t max_am_inline              = UCT_IB_MLX5_AM_MAX_SHORT(0);
    size_t max_put_inline             = UCT_IB_MLX5_PUT_MAX_SHORT(0);
    ucs_status_t status;

#if HAVE_IBV_DM
    if (iface->dm.dm != NULL) {
        max_am_inline  = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_AM_MAX_SHORT(0));
        max_put_inline = ucs_max(iface->dm.dm->seg_len, UCT_IB_MLX5_PUT_MAX_SHORT(0));
    }
#endif

    status = uct_rc_iface_query(rc_iface, iface_attr,
                                max_put_inline,
                                max_am_inline,
                                UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(0),
                                UCT_IB_MLX5_AM_ZCOPY_MAX_IOV,
                                UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0),
                                sizeof(uct_rc_mlx5_hdr_t));
    if (status != UCS_OK) {
        return status;
    }

    uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, 0);
    iface_attr->latency.growth += 1e-9; /* 1 ns per each extra QP */
    iface_attr->ep_addr_len     = sizeof(uct_rc_mlx5_ep_address_t);
    iface_attr->iface_addr_len  = sizeof(uint8_t);
    return UCS_OK;
}

static ucs_status_t uct_rc_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface,
                                             uct_ib_dir_t dir,
                                             int solicited_only)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface,
                                                       uct_rc_mlx5_iface_common_t);
#if HAVE_DECL_MLX5DV_INIT_OBJ
    return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited_only);
#else
    uct_ib_mlx5_update_cq_ci(iface->super.super.cq[dir], iface->cq[dir].cq_ci);
    return uct_ib_iface_arm_cq(ib_iface, dir, solicited_only);
#endif
}

static void uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface,
                                             void *arg, ucs_status_t status)
{
    struct mlx5_cqe64 *cqe = arg;
    uct_rc_iface_t *iface  = ucs_derived_of(ib_iface, uct_rc_iface_t);
    unsigned qp_num        = ntohl(cqe->sop_drop_qpn) &
                             UCS_MASK(UCT_IB_QPN_ORDER);
    uct_rc_mlx5_ep_t *ep   = ucs_derived_of(uct_rc_iface_lookup_ep(iface, qp_num),
                                            uct_rc_mlx5_ep_t);
    ucs_log_level_t log_lvl = UCS_LOG_LEVEL_FATAL;
    uct_ib_mlx5_txwq_t txwq_copy;
    size_t txwq_size;

    if (!ep) {
        return;
    }

    /* Create a copy of RC txwq for completion error reporting, since the QP
     * would be released by set_ep_failed() */
    txwq_copy        = ep->tx.wq;
    txwq_size        = UCS_PTR_BYTE_DIFF(ep->tx.wq.qstart, ep->tx.wq.qend);
    txwq_copy.qstart = ucs_malloc(txwq_size, "rc_txwq_copy");
    if (txwq_copy.qstart != NULL) {
        memcpy(txwq_copy.qstart, ep->tx.wq.qstart, txwq_size);
        txwq_copy.qend = UCS_PTR_BYTE_OFFSET(txwq_copy.qstart, txwq_size);
    }

    if (uct_rc_mlx5_ep_handle_failure(ep, status) == UCS_OK) {
        log_lvl = ib_iface->super.config.failure_level;
    }

    uct_ib_mlx5_completion_with_err(ib_iface, arg,
                                    txwq_copy.qstart ? &txwq_copy : NULL,
                                    log_lvl);

    ucs_free(txwq_copy.qstart);
}

static void uct_rc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned flags)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);

    if (flags & UCT_PROGRESS_RECV) {
        uct_rc_mlx5_iface_common_prepost_recvs(iface);
    }

    uct_base_iface_progress_enable_cb(&iface->super.super.super,
                                      iface->super.progress, flags);
}
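
/* QP creation helper. Depending on the build options and device capabilities
 * there are three paths: DEVX object creation when
 * UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP is set, mlx5dv_create_qp() when
 * HAVE_DECL_MLX5DV_CREATE_QP is defined, or the plain verbs fallback.
 * When the QP has send resources, the send work queue (txwq) is also set up
 * for direct WQE posting. */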
ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_txwq_t *txwq,
                                         uct_ib_qp_attr_t *attr)
{
    uct_ib_iface_t *ib_iface = &iface->super.super;
    ucs_status_t status;
#if HAVE_DECL_MLX5DV_CREATE_QP
    uct_ib_mlx5_md_t *md = ucs_derived_of(ib_iface->super.md, uct_ib_mlx5_md_t);
    uct_ib_device_t *dev = &md->super.dev;
    struct mlx5dv_qp_init_attr dv_attr = {};

    if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP) {
        return uct_ib_mlx5_devx_create_qp(ib_iface, qp, txwq, attr);
    }

    status = uct_ib_mlx5_iface_fill_attr(ib_iface, qp, attr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_iface_fill_attr(ib_iface, attr);
#if HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE
    dv_attr.comp_mask    = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
    dv_attr.create_flags = MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE;
#endif
    qp->verbs.qp = mlx5dv_create_qp(dev->ibv_context, &attr->ibv, &dv_attr);
    if (qp->verbs.qp == NULL) {
        ucs_error("mlx5dv_create_qp("UCT_IB_IFACE_FMT"): failed: %m",
                  UCT_IB_IFACE_ARG(ib_iface));
        status = UCS_ERR_IO_ERROR;
        goto err;
    }

    qp->qp_num = qp->verbs.qp->qp_num;
#else
    status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, attr);
    if (status != UCS_OK) {
        goto err;
    }
#endif

    status = uct_rc_iface_qp_init(&iface->super, qp->verbs.qp);
    if (status != UCS_OK) {
        goto err_destroy_qp;
    }

    if (attr->cap.max_send_wr) {
        status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker,
                                       iface->tx.mmio_mode, txwq,
                                       qp->verbs.qp);
        if (status != UCS_OK) {
            ucs_error("Failed to get mlx5 QP information");
            goto err_destroy_qp;
        }
    }

    return UCS_OK;

err_destroy_qp:
    ibv_destroy_qp(qp->verbs.qp);
err:
    return status;
}

static UCS_F_MAYBE_UNUSED unsigned uct_rc_mlx5_iface_progress_tm(void *arg)
{
    uct_rc_mlx5_iface_common_t *iface = arg;
    unsigned count;

    count = uct_rc_mlx5_iface_common_poll_rx(iface,
                                             UCT_RC_MLX5_POLL_FLAG_HAS_EP |
                                             UCT_RC_MLX5_POLL_FLAG_TM);
    if (count > 0) {
        return count;
    }
    return uct_rc_mlx5_iface_poll_tx(iface);
}

#if IBV_HW_TM
static ucs_status_t uct_rc_mlx5_iface_tag_recv_zcopy(uct_iface_h tl_iface,
                                                     uct_tag_t tag,
                                                     uct_tag_t tag_mask,
                                                     const uct_iov_t *iov,
                                                     size_t iovcnt,
                                                     uct_tag_context_t *ctx)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);

    return uct_rc_mlx5_iface_common_tag_recv(iface, tag, tag_mask, iov,
                                             iovcnt, ctx);
}

static ucs_status_t uct_rc_mlx5_iface_tag_recv_cancel(uct_iface_h tl_iface,
                                                      uct_tag_context_t *ctx,
                                                      int force)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);

    return uct_rc_mlx5_iface_common_tag_recv_cancel(iface, ctx, force);
}
#endif
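
/* Pre-initialization step that runs before the base RC/IB interface
 * constructor: it decides whether hardware tag matching (TM) is enabled,
 * sizes the RX CQ (see the CQE accounting comment inside), and selects the
 * multi-packet XRQ stride count. The results are returned through init_attr.
 *
 * Worked example of the CQ sizing with hypothetical values (not taken from
 * any particular device): rx.queue_len = 4096, num_tags = 64,
 * IBV_DEVICE_MAX_UNEXP_COUNT = 64 gives
 *     rx_cq_len = 4096 + 64 * 3 + 4096 / 64 = 4352 CQEs. */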
static ucs_status_t
uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, uct_md_h tl_md,
                          uct_rc_iface_common_config_t *rc_config,
                          uct_rc_mlx5_iface_common_config_t *mlx5_config,
                          const uct_iface_params_t *params,
                          uct_ib_iface_init_attr_t *init_attr)
{
#if IBV_HW_TM
    uct_ib_mlx5_md_t *md               = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
    uct_ib_device_t UCS_V_UNUSED *dev  = &md->super.dev;
    struct ibv_tmh tmh;
    int mtu;
    ucs_status_t status;

    iface->tm.enabled = mlx5_config->tm.enable &&
                        (init_attr->flags & UCT_IB_TM_SUPPORTED);
    if (!iface->tm.enabled) {
        goto out_tm_disabled;
    }

    /* Compile-time check that TMH and uct_rc_mlx5_hdr_t are wire-compatible
     * for the case of no-tag protocol. */
    UCS_STATIC_ASSERT(sizeof(tmh.opcode) ==
                      sizeof(((uct_rc_mlx5_hdr_t*)0)->tmh_opcode));
    UCS_STATIC_ASSERT(ucs_offsetof(struct ibv_tmh, opcode) ==
                      ucs_offsetof(uct_rc_mlx5_hdr_t, tmh_opcode));

    UCS_STATIC_ASSERT(sizeof(uct_rc_mlx5_ctx_priv_t) <= UCT_TAG_PRIV_LEN);

    iface->tm.eager_unexp.cb  = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB) ?
                                params->eager_cb : NULL;
    iface->tm.eager_unexp.arg = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG) ?
                                params->eager_arg : NULL;
    iface->tm.rndv_unexp.cb   = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB) ?
                                params->rndv_cb : NULL;
    iface->tm.rndv_unexp.arg  = (params->field_mask &
                                 UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG) ?
                                params->rndv_arg : NULL;
    iface->tm.unexpected_cnt  = 0;
    iface->tm.num_outstanding = 0;
    iface->tm.num_tags        = ucs_min(IBV_DEVICE_TM_CAPS(dev, max_num_tags),
                                        mlx5_config->tm.list_size);

    /* There can be:
     * - up to rx.queue_len RX CQEs
     * - up to 3 CQEs for every posted tag: ADD, TM_CONSUMED and MSG_ARRIVED
     * - one SYNC CQE per every IBV_DEVICE_MAX_UNEXP_COUNT unexpected receives */
    UCS_STATIC_ASSERT(IBV_DEVICE_MAX_UNEXP_COUNT);
    init_attr->rx_cq_len = rc_config->super.rx.queue_len +
                           iface->tm.num_tags * 3 +
                           rc_config->super.rx.queue_len /
                           IBV_DEVICE_MAX_UNEXP_COUNT;
    init_attr->seg_size  = ucs_max(mlx5_config->tm.seg_size,
                                   rc_config->super.seg_size);
    iface->tm.mp.num_strides = 1;
    iface->tm.max_bcopy      = init_attr->seg_size;

    /* Multi-Packet XRQ initialization */
    if (!ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_MP_RQ       |
                                       UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ |
                                       UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP)) {
        return UCS_OK;
    }

    if ((mlx5_config->tm.mp_num_strides == UCS_ULUNITS_AUTO) ||
        (mlx5_config->tm.mp_num_strides == 1)) {
        return UCS_OK;
        /* TODO: make the following to be default when MP support is added to UCP
         * iface->tm.mp.num_strides = UCS_BIT(IBV_DEVICE_MP_MIN_LOG_NUM_STRIDES);
         */
    } else if ((mlx5_config->tm.mp_num_strides != 8) &&
               (mlx5_config->tm.mp_num_strides != 16)) {
        ucs_error("invalid value of TM_NUM_STRIDES: %lu, must be 1, 8 or 16",
                  mlx5_config->tm.mp_num_strides);
        return UCS_ERR_INVALID_PARAM;
    }

    status = uct_ib_device_mtu(params->mode.device.dev_name, tl_md, &mtu);
    if (status != UCS_OK) {
        ucs_error("failed to get port MTU: %s", ucs_status_string(status));
        return UCS_ERR_IO_ERROR;
    }

    iface->tm.mp.num_strides = mlx5_config->tm.mp_num_strides;
    init_attr->seg_size      = mtu;

    return UCS_OK;

out_tm_disabled:
#else
    iface->tm.enabled        = 0;
#endif
    init_attr->rx_cq_len     = rc_config->super.rx.queue_len;
    init_attr->seg_size      = rc_config->super.seg_size;
    iface->tm.mp.num_strides = 1;

    return UCS_OK;
}
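
/* RX path initialization: when hardware TM is enabled, the tag-matching XRQ
 * is created either through DEVX or through the verbs TM-SRQ path and the
 * TM-aware progress routine is installed; otherwise a regular SRQ is created
 * and wrapped for direct mlx5 access by uct_ib_mlx5_srq_init(). */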
static ucs_status_t
uct_rc_mlx5_iface_init_rx(uct_rc_iface_t *rc_iface,
                          const uct_rc_iface_common_config_t *rc_config)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface,
                                                       uct_rc_mlx5_iface_common_t);
    uct_ib_mlx5_md_t *md              = ucs_derived_of(rc_iface->super.super.md,
                                                       uct_ib_mlx5_md_t);
    struct ibv_srq_init_attr_ex srq_attr = {};
    ucs_status_t status;

    if (UCT_RC_MLX5_TM_ENABLED(iface)) {
        if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ) {
            status = uct_rc_mlx5_devx_init_rx_tm(iface, rc_config, 0,
                                                 UCT_RC_RNDV_HDR_LEN);
        } else {
            status = uct_rc_mlx5_init_rx_tm(iface, rc_config, &srq_attr,
                                            UCT_RC_RNDV_HDR_LEN);
        }

        if (status != UCS_OK) {
            goto err;
        }

        iface->super.progress = uct_rc_mlx5_iface_progress_tm;
        return UCS_OK;
    }

    /* MP XRQ is supported with HW TM only */
    ucs_assert(iface->tm.mp.num_strides == 1);

    status = uct_rc_iface_init_rx(rc_iface, rc_config,
                                  &iface->rx.srq.verbs.srq);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_mlx5_srq_init(&iface->rx.srq, iface->rx.srq.verbs.srq,
                                  iface->super.super.config.seg_size,
                                  iface->tm.mp.num_strides);
    if (status != UCS_OK) {
        goto err_free_srq;
    }

    iface->rx.srq.type    = UCT_IB_MLX5_OBJ_TYPE_VERBS;
    iface->super.progress = uct_rc_mlx5_iface_progress;
    return UCS_OK;

err_free_srq:
    uct_rc_mlx5_destroy_srq(&iface->rx.srq);
err:
    return status;
}

static void uct_rc_mlx5_iface_cleanup_rx(uct_rc_iface_t *rc_iface)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(rc_iface,
                                                       uct_rc_mlx5_iface_common_t);

    uct_rc_mlx5_destroy_srq(&iface->rx.srq);
}

static void uct_rc_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface,
                                       uct_ib_dir_t dir)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ib_iface,
                                                       uct_rc_mlx5_iface_common_t);

    iface->cq[dir].cq_sn++;
}

static uint8_t uct_rc_mlx5_iface_get_address_type(uct_iface_h tl_iface)
{
    uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_iface,
                                                       uct_rc_mlx5_iface_common_t);

    return UCT_RC_MLX5_TM_ENABLED(iface) ? UCT_RC_MLX5_IFACE_ADDR_TYPE_TM :
                                           UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC;
}

static ucs_status_t uct_rc_mlx5_iface_get_address(uct_iface_h tl_iface,
                                                  uct_iface_addr_t *addr)
{
    *(uint8_t*)addr = uct_rc_mlx5_iface_get_address_type(tl_iface);
    return UCS_OK;
}

int uct_rc_mlx5_iface_is_reachable(const uct_iface_h tl_iface,
                                   const uct_device_addr_t *dev_addr,
                                   const uct_iface_addr_t *iface_addr)
{
    uint8_t my_type = uct_rc_mlx5_iface_get_address_type(tl_iface);

    if ((iface_addr != NULL) && (my_type != *(uint8_t*)iface_addr)) {
        return 0;
    }

    return uct_ib_iface_is_reachable(tl_iface, dev_addr, iface_addr);
}

static int uct_rc_mlx5_iface_srq_topo(uct_rc_mlx5_iface_common_t *iface,
                                      uct_md_h md,
                                      uct_rc_mlx5_iface_common_config_t *mlx5_config)
{
    uct_ib_mlx5_md_t *ib_md = ucs_derived_of(md, uct_ib_mlx5_md_t);

    /* Cyclic SRQ is supported with HW TM and DEVX only. */
    if (((mlx5_config->srq_topo == UCT_RC_MLX5_SRQ_TOPO_AUTO) ||
         (mlx5_config->srq_topo == UCT_RC_MLX5_SRQ_TOPO_CYCLIC)) &&
        UCT_RC_MLX5_TM_ENABLED(iface) &&
        (ib_md->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) {
        return UCT_RC_MLX5_MP_ENABLED(iface) ?
               UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ :
               UCT_IB_MLX5_SRQ_TOPO_CYCLIC;
    }

    return UCT_RC_MLX5_MP_ENABLED(iface) ?
           UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ : UCT_IB_MLX5_SRQ_TOPO_LIST;
}
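
/* Constructor of the common mlx5 RC-style interface base class. It runs
 * preinit before the RC super constructor (init_attr depends on the TM
 * configuration), obtains direct mlx5 CQ handles, initializes tag-matching
 * and device-memory resources, and maps the configured fence mode to WQE
 * control flags: WEAK sets the fence bit on atomics only, STRONG applies
 * strong ordering to both atomics and PUTs, NONE sets no flags. */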
UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t,
                    uct_rc_iface_ops_t *ops, uct_md_h md, uct_worker_h worker,
                    const uct_iface_params_t *params,
                    uct_rc_iface_common_config_t *rc_config,
                    uct_rc_mlx5_iface_common_config_t *mlx5_config,
                    uct_ib_iface_init_attr_t *init_attr)
{
    uct_ib_device_t *dev;
    ucs_status_t status;

    status = uct_rc_mlx5_iface_preinit(self, md, rc_config, mlx5_config,
                                       params, init_attr);
    if (status != UCS_OK) {
        return status;
    }

    self->rx.srq.type                = UCT_IB_MLX5_OBJ_TYPE_LAST;
    self->rx.srq.topo                = uct_rc_mlx5_iface_srq_topo(self, md,
                                                                  mlx5_config);
    self->tm.cmd_wq.super.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST;
    init_attr->rx_hdr_len            = UCT_RC_MLX5_MP_ENABLED(self) ?
                                       0 : sizeof(uct_rc_mlx5_hdr_t);

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, md, worker, params,
                              rc_config, init_attr);

    dev                       = uct_ib_iface_device(&self->super.super);
    self->tx.mmio_mode        = mlx5_config->super.mmio_mode;
    self->tx.bb_max           = ucs_min(mlx5_config->tx_max_bb, UINT16_MAX);
    self->tm.am_desc.super.cb = uct_rc_mlx5_release_desc;

    if (!UCT_RC_MLX5_MP_ENABLED(self)) {
        self->tm.am_desc.offset = self->super.super.config.rx_headroom_offset;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX],
                                &self->cq[UCT_IB_DIR_TX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX],
                                &self->cq[UCT_IB_DIR_RX]);
    if (status != UCS_OK) {
        return status;
    }

    status = UCS_STATS_NODE_ALLOC(&self->stats, &uct_rc_mlx5_iface_stats_class,
                                  self->super.stats);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_rc_mlx5_iface_common_tag_init(self);
    if (status != UCS_OK) {
        goto cleanup_stats;
    }

    status = uct_rc_mlx5_iface_common_dm_init(self, &self->super,
                                              &mlx5_config->super);
    if (status != UCS_OK) {
        goto cleanup_tm;
    }

    self->super.config.fence_mode  = (uct_rc_fence_mode_t)rc_config->fence_mode;
    self->super.rx.srq.quota       = self->rx.srq.mask + 1;
    self->super.config.exp_backoff = mlx5_config->exp_backoff;

    if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_WEAK) ||
        ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) &&
         uct_ib_device_has_pci_atomics(dev))) {
        self->config.atomic_fence_flag = UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE;
        self->config.put_fence_flag    = 0;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_WEAK;
    } else if (rc_config->fence_mode == UCT_RC_FENCE_MODE_STRONG) {
        self->config.atomic_fence_flag = UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER;
        self->config.put_fence_flag    = UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_STRONG;
    } else if ((rc_config->fence_mode == UCT_RC_FENCE_MODE_NONE) ||
               ((rc_config->fence_mode == UCT_RC_FENCE_MODE_AUTO) &&
                !uct_ib_device_has_pci_atomics(dev))) {
        self->config.atomic_fence_flag = 0;
        self->config.put_fence_flag    = 0;
        self->super.config.fence_mode  = UCT_RC_FENCE_MODE_NONE;
    } else {
        ucs_error("incorrect fence value: %d", self->super.config.fence_mode);
        status = UCS_ERR_INVALID_PARAM;
        goto cleanup_tm;
    }

    /* By default set to something that is always in cache */
    self->rx.pref_ptr = self;

    status = uct_iface_mpool_init(&self->super.super.super,
                                  &self->tx.atomic_desc_mp,
                                  sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE,
                                  sizeof(uct_rc_iface_send_desc_t) + UCT_IB_MAX_ATOMIC_SIZE,
                                  UCS_SYS_CACHE_LINE_SIZE,
                                  &rc_config->super.tx.mp,
                                  self->super.config.tx_qp_len,
                                  uct_rc_iface_send_desc_init,
                                  "rc_mlx5_atomic_desc");
    if (status != UCS_OK) {
        goto cleanup_dm;
    }

    /* For little-endian atomic reply, override the default functions, to still
     * treat the response as big-endian when it arrives in the CQE. */
    if (!(uct_ib_iface_device(&self->super.super)->atomic_arg_sizes_be &
          sizeof(uint64_t))) {
        self->super.config.atomic64_handler = uct_rc_mlx5_common_atomic64_le_handler;
    }
    if (!(uct_ib_iface_device(&self->super.super)->ext_atomic_arg_sizes_be &
          sizeof(uint32_t))) {
        self->super.config.atomic32_ext_handler = uct_rc_mlx5_common_atomic32_le_handler;
    }
    if (!(uct_ib_iface_device(&self->super.super)->ext_atomic_arg_sizes_be &
          sizeof(uint64_t))) {
        self->super.config.atomic64_ext_handler = uct_rc_mlx5_common_atomic64_le_handler;
    }

    return UCS_OK;

cleanup_dm:
    uct_rc_mlx5_iface_common_dm_cleanup(self);
cleanup_tm:
    uct_rc_mlx5_iface_common_tag_cleanup(self);
cleanup_stats:
    UCS_STATS_NODE_FREE(self->stats);
    return status;
}

static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_common_t)
{
    ucs_mpool_cleanup(&self->tx.atomic_desc_mp, 1);
    uct_rc_mlx5_iface_common_dm_cleanup(self);
    uct_rc_mlx5_iface_common_tag_cleanup(self);
    UCS_STATS_NODE_FREE(self->stats);
}

UCS_CLASS_DEFINE(uct_rc_mlx5_iface_common_t, uct_rc_iface_t);

typedef struct {
    uct_rc_mlx5_iface_common_t super;
} uct_rc_mlx5_iface_t;

UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t,
                    uct_md_h tl_md, uct_worker_h worker,
                    const uct_iface_params_t *params,
                    const uct_iface_config_t *tl_config)
{
    uct_rc_mlx5_iface_config_t *config = ucs_derived_of(tl_config,
                                                        uct_rc_mlx5_iface_config_t);
    uct_ib_mlx5_md_t UCS_V_UNUSED *md  = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
    uct_ib_iface_init_attr_t init_attr = {};
    ucs_status_t status;

    init_attr.fc_req_size = sizeof(uct_rc_fc_request_t);
    init_attr.flags       = UCT_IB_CQ_IGNORE_OVERRUN;
    init_attr.rx_hdr_len  = sizeof(uct_rc_mlx5_hdr_t);
    init_attr.tx_cq_len   = config->super.tx_cq_len;
    init_attr.qp_type     = IBV_QPT_RC;

    if (IBV_DEVICE_TM_FLAGS(&md->super.dev)) {
        init_attr.flags |= UCT_IB_TM_SUPPORTED;
    }

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t,
                              &uct_rc_mlx5_iface_ops, tl_md, worker, params,
                              &config->super.super, &config->rc_mlx5_common,
                              &init_attr);

    self->super.super.config.tx_moderation = ucs_min(config->super.tx_cq_moderation,
                                                     self->super.tx.bb_max / 4);

    status = uct_rc_init_fc_thresh(&config->super, &self->super.super);
    if (status != UCS_OK) {
        return status;
    }

    /* Set max_iov for put_zcopy and get_zcopy */
    uct_ib_iface_set_max_iov(&self->super.super.super,
                             (UCT_IB_MLX5_MAX_SEND_WQE_SIZE -
                              sizeof(struct mlx5_wqe_raddr_seg) -
                              sizeof(struct mlx5_wqe_ctrl_seg)) /
                             sizeof(struct mlx5_wqe_data_seg));

    return UCS_OK;
}

static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_t)
{
    uct_base_iface_progress_disable(&self->super.super.super.super.super,
                                    UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
}

UCS_CLASS_DEFINE(uct_rc_mlx5_iface_t, uct_rc_mlx5_iface_common_t);

static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_mlx5_iface_t, uct_iface_t, uct_md_h,
                                 uct_worker_h, const uct_iface_params_t*,
                                 const uct_iface_config_t*);

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_mlx5_iface_t, uct_iface_t);
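
/* Virtual function table of the rc_mlx5 transport: mlx5 fast-path
 * implementations for data transfer and atomics, generic RC code for the
 * pending queue and flow control, and hardware tag-matching entries compiled
 * in only when IBV_HW_TM is available. */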
static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = {
    {
    {
    .ep_put_short             = uct_rc_mlx5_ep_put_short,
    .ep_put_bcopy             = uct_rc_mlx5_ep_put_bcopy,
    .ep_put_zcopy             = uct_rc_mlx5_ep_put_zcopy,
    .ep_get_bcopy             = uct_rc_mlx5_ep_get_bcopy,
    .ep_get_zcopy             = uct_rc_mlx5_ep_get_zcopy,
    .ep_am_short              = uct_rc_mlx5_ep_am_short,
    .ep_am_bcopy              = uct_rc_mlx5_ep_am_bcopy,
    .ep_am_zcopy              = uct_rc_mlx5_ep_am_zcopy,
    .ep_atomic_cswap64        = uct_rc_mlx5_ep_atomic_cswap64,
    .ep_atomic_cswap32        = uct_rc_mlx5_ep_atomic_cswap32,
    .ep_atomic64_post         = uct_rc_mlx5_ep_atomic64_post,
    .ep_atomic32_post         = uct_rc_mlx5_ep_atomic32_post,
    .ep_atomic64_fetch        = uct_rc_mlx5_ep_atomic64_fetch,
    .ep_atomic32_fetch        = uct_rc_mlx5_ep_atomic32_fetch,
    .ep_pending_add           = uct_rc_ep_pending_add,
    .ep_pending_purge         = uct_rc_ep_pending_purge,
    .ep_flush                 = uct_rc_mlx5_ep_flush,
    .ep_fence                 = uct_rc_mlx5_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_rc_mlx5_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_ep_t),
    .ep_get_address           = uct_rc_mlx5_ep_get_address,
    .ep_connect_to_ep         = uct_rc_mlx5_ep_connect_to_ep,
#if IBV_HW_TM
    .ep_tag_eager_short       = uct_rc_mlx5_ep_tag_eager_short,
    .ep_tag_eager_bcopy       = uct_rc_mlx5_ep_tag_eager_bcopy,
    .ep_tag_eager_zcopy       = uct_rc_mlx5_ep_tag_eager_zcopy,
    .ep_tag_rndv_zcopy        = uct_rc_mlx5_ep_tag_rndv_zcopy,
    .ep_tag_rndv_request      = uct_rc_mlx5_ep_tag_rndv_request,
    .ep_tag_rndv_cancel       = uct_rc_mlx5_ep_tag_rndv_cancel,
    .iface_tag_recv_zcopy     = uct_rc_mlx5_iface_tag_recv_zcopy,
    .iface_tag_recv_cancel    = uct_rc_mlx5_iface_tag_recv_cancel,
#endif
    .iface_flush              = uct_rc_iface_flush,
    .iface_fence              = uct_rc_iface_fence,
    .iface_progress_enable    = uct_rc_mlx5_iface_progress_enable,
    .iface_progress_disable   = uct_base_iface_progress_disable,
    .iface_progress           = uct_rc_iface_do_progress,
    .iface_event_fd_get       = uct_ib_iface_event_fd_get,
    .iface_event_arm          = uct_rc_iface_event_arm,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_iface_t),
    .iface_query              = uct_rc_mlx5_iface_query,
    .iface_get_address        = uct_rc_mlx5_iface_get_address,
    .iface_get_device_address = uct_ib_iface_get_device_address,
    .iface_is_reachable       = uct_rc_mlx5_iface_is_reachable
    },
    .create_cq                = uct_ib_mlx5_create_cq,
    .arm_cq                   = uct_rc_mlx5_iface_arm_cq,
    .event_cq                 = uct_rc_mlx5_iface_event_cq,
    .handle_failure           = uct_rc_mlx5_iface_handle_failure,
    .set_ep_failed            = uct_rc_mlx5_ep_set_failed,
    },
    .init_rx                  = uct_rc_mlx5_iface_init_rx,
    .cleanup_rx               = uct_rc_mlx5_iface_cleanup_rx,
    .fc_ctrl                  = uct_rc_mlx5_ep_fc_ctrl,
    .fc_handler               = uct_rc_iface_fc_handler,
};

static ucs_status_t
uct_rc_mlx5_query_tl_devices(uct_md_h md,
                             uct_tl_device_resource_t **tl_devices_p,
                             unsigned *num_tl_devices_p)
{
    uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t);
    int flags;

    flags = UCT_IB_DEVICE_FLAG_MLX5_PRM |
            (ib_md->config.eth_pause ? 0 : UCT_IB_DEVICE_FLAG_LINK_IB);
    return uct_ib_device_query_ports(&ib_md->dev, flags, tl_devices_p,
                                     num_tl_devices_p);
}

UCT_TL_DEFINE(&uct_ib_component, rc_mlx5, uct_rc_mlx5_query_tl_devices,
              uct_rc_mlx5_iface_t, "RC_MLX5_", uct_rc_mlx5_iface_config_table,
              uct_rc_mlx5_iface_config_t);
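
/* UCT_TL_DEFINE above registers this transport as "rc_mlx5" under the IB
 * component, with "RC_MLX5_" as its configuration prefix; in a typical UCX
 * build this means the entries of uct_rc_mlx5_iface_config_table are read
 * from UCX_RC_MLX5_* environment variables (assumption based on the usual
 * UCS configuration prefix). */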