/**
 * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#include "ib_mlx5.h"
#include "ib_mlx5_ifc.h"

#include <ucs/arch/bitops.h>
#include <ucs/profile/profile.h>

typedef struct {
    struct mlx5dv_devx_obj *atomic_dvmr;
    int                    mr_num;
    size_t                 length;
    struct ibv_mr          *mrs[];
} uct_ib_mlx5_ksm_data_t;

typedef struct uct_ib_mlx5_mem {
    uct_ib_mem_t               super;
    union {
        struct ibv_mr          *mr;
#if HAVE_DEVX
        struct mlx5dv_devx_obj *dvmr;
    };
    union {
        struct mlx5dv_devx_obj *atomic_dvmr;
        uct_ib_mlx5_ksm_data_t *ksm_data;
#endif
    };
} uct_ib_mlx5_mem_t;

typedef struct uct_ib_mlx5_dbrec_page {
    struct mlx5dv_devx_umem *mem;
} uct_ib_mlx5_dbrec_page_t;


static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address,
                                        size_t length, uint64_t access,
                                        uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    ucs_status_t status;

    status = uct_ib_reg_mr(md->pd, address, length, access, &memh->mr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_memh_init_from_mr(&memh->super, memh->mr);
    return UCS_OK;
}

static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md,
                                          uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);

    return uct_ib_dereg_mr(memh->mr);
}

static ucs_status_t uct_ib_mlx5_mem_prefetch(uct_ib_md_t *md,
                                             uct_ib_mem_t *ib_memh,
                                             void *addr, size_t length)
{
#if HAVE_DECL_IBV_ADVISE_MR
    struct ibv_sge sg_list;
    int ret;

    if (!(ib_memh->flags & UCT_IB_MEM_FLAG_ODP)) {
        return UCS_OK;
    }

    ucs_debug("memh %p prefetch %p length %zu", ib_memh, addr, length);

    sg_list.lkey   = ib_memh->lkey;
    sg_list.addr   = (uintptr_t)addr;
    sg_list.length = length;

    ret = UCS_PROFILE_CALL(ibv_advise_mr, md->pd,
                           IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
                           IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
    if (ret) {
        ucs_error("ibv_advise_mr(addr=%p length=%zu) returned %d: %m",
                  addr, length, ret);
        return UCS_ERR_IO_ERROR;
    }
#endif
    return UCS_OK;
}

static int uct_ib_mlx5_has_roce_port(uct_ib_device_t *dev)
{
    int port_num;

    for (port_num = dev->first_port;
         port_num < dev->first_port + dev->num_ports;
         port_num++) {
        if (uct_ib_device_is_port_roce(dev, port_num)) {
            return 1;
        }
    }

    return 0;
}

#if HAVE_DEVX

static size_t uct_ib_mlx5_calc_mkey_inlen(int list_size)
{
    return UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) +
           UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size;
}

static ucs_status_t uct_ib_mlx5_alloc_mkey_inbox(int list_size, char **in_p)
{
    size_t inlen;
    char *in;

    inlen = uct_ib_mlx5_calc_mkey_inlen(list_size);
    in    = ucs_calloc(1, inlen, "mkey mailbox");
    if (in == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    *in_p = in;
    return UCS_OK;
}
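/* Create an indirect KSM mkey over a caller-filled KLM list in 'in'. The mkey
 * covers [addr, addr + length) with fixed-size entries of 'entity_size' bytes;
 * on success it returns the DEVX object and the resulting mkey value, built
 * from the returned mkey index and the low byte of the address. */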
static ucs_status_t uct_ib_mlx5_devx_reg_ksm(uct_ib_mlx5_md_t *md,
                                             intptr_t addr, size_t length,
                                             int list_size, size_t entity_size,
                                             char *in,
                                             struct mlx5dv_devx_obj **mr_p,
                                             uint32_t *mkey)
{
    char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {};
    struct mlx5dv_pd dvpd                                = {};
    struct mlx5dv_obj dv                                 = {};
    struct mlx5dv_devx_obj *mr;
    void *mkc;

    dv.pd.in  = md->super.pd;
    dv.pd.out = &dvpd;
    mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD);

    UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY);
    mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
    UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM);
    UCT_IB_MLX5DV_SET(mkc, mkc, a, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn);
    UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size);
    UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(entity_size));
    UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff);
    UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, addr & 0xff);
    UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr);
    UCT_IB_MLX5DV_SET64(mkc, mkc, len, length);
    UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, list_size);

    mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in,
                                uct_ib_mlx5_calc_mkey_inlen(list_size),
                                out, sizeof(out));
    if (mr == NULL) {
        ucs_debug("mlx5dv_devx_obj_create(CREATE_MKEY, mode=KSM) failed, syndrome %x: %m",
                  UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome));
        return UCS_ERR_UNSUPPORTED;
    }

    *mr_p = mr;
    *mkey = (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) |
            (addr & 0xff);

    return UCS_OK;
}

static ucs_status_t uct_ib_mlx5_devx_reg_ksm_data(uct_ib_mlx5_md_t *md,
                                                  uct_ib_mlx5_ksm_data_t *ksm_data,
                                                  size_t length, off_t off,
                                                  struct mlx5dv_devx_obj **mr_p,
                                                  uint32_t *mkey)
{
    ucs_status_t status;
    char *in;
    void *klm;
    int i;

    status = uct_ib_mlx5_alloc_mkey_inbox(ksm_data->mr_num, &in);
    if (status != UCS_OK) {
        return UCS_ERR_NO_MEMORY;
    }

    klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
    for (i = 0; i < ksm_data->mr_num; i++) {
        UCT_IB_MLX5DV_SET64(klm, klm, address, (intptr_t)ksm_data->mrs[i]->addr);
        UCT_IB_MLX5DV_SET(klm, klm, byte_count, ksm_data->mrs[i]->length);
        UCT_IB_MLX5DV_SET(klm, klm, mkey, ksm_data->mrs[i]->lkey);
        klm = UCS_PTR_BYTE_OFFSET(klm, UCT_IB_MLX5DV_ST_SZ_BYTES(klm));
    }

    status = uct_ib_mlx5_devx_reg_ksm(md, (intptr_t)ksm_data->mrs[0]->addr + off,
                                      length, ksm_data->mr_num,
                                      ksm_data->mrs[0]->length, in, mr_p, mkey);
    ucs_free(in);
    return status;
}

static ucs_status_t uct_ib_mlx5_devx_reg_atomic_key(uct_ib_md_t *ibmd,
                                                    uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    uct_ib_mlx5_md_t *md    = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    off_t offset            = uct_ib_md_atomic_offset(uct_ib_mlx5_md_get_atomic_mr_id(md));
    struct ibv_mr *mr       = memh->mr;
    size_t reg_length, length;
    ucs_status_t status;
    int list_size, i;
    void *klm;
    char *in;
    intptr_t addr;

    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) {
        return UCS_ERR_UNSUPPORTED;
    }

    if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) {
        return uct_ib_mlx5_devx_reg_ksm_data(md, memh->ksm_data, memh->mr->length,
                                             offset, &memh->ksm_data->atomic_dvmr,
                                             &memh->super.atomic_rkey);
    }

    reg_length = UCT_IB_MD_MAX_MR_SIZE;
    addr       = (intptr_t)mr->addr & ~(reg_length - 1);
    length     = mr->length + (intptr_t)mr->addr - addr;
    list_size  = ucs_div_round_up(length, reg_length);

    status = uct_ib_mlx5_alloc_mkey_inbox(list_size, &in);
    if (status != UCS_OK) {
        return status;
    }

    klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
    for (i = 0; i < list_size; i++) {
        if (i == list_size - 1) {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, length % reg_length);
        } else {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length);
        }
        UCT_IB_MLX5DV_SET(klm, klm, mkey, mr->lkey);
        UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length));
        klm = UCS_PTR_BYTE_OFFSET(klm, UCT_IB_MLX5DV_ST_SZ_BYTES(klm));
    }

    status = uct_ib_mlx5_devx_reg_ksm(md, addr + offset, length, list_size,
                                      reg_length, in, &memh->atomic_dvmr,
                                      &memh->super.atomic_rkey);
    if (status != UCS_OK) {
        if (status == UCS_ERR_UNSUPPORTED) {
            md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM;
        }
        goto out;
    }

    ucs_debug("KSM registered memory %p..%p offset 0x%lx on %s rkey 0x%x",
              mr->addr, UCS_PTR_BYTE_OFFSET(mr->addr, mr->length), offset,
              uct_ib_device_name(&md->super.dev), memh->super.atomic_rkey);
out:
    ucs_free(in);
    return status;
}

static ucs_status_t
uct_ib_mlx5_devx_dereg_atomic_key(uct_ib_md_t *ibmd, uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    int ret;

    ret = mlx5dv_devx_obj_destroy(memh->atomic_dvmr);
    if (ret != 0) {
        ucs_error("mlx5dv_devx_obj_destroy(MKEY, ATOMIC KSM) failed: %m");
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}

static ucs_status_t uct_ib_mlx5_devx_reg_multithreaded(uct_ib_md_t *ibmd,
                                                       void *address,
                                                       size_t length,
                                                       uint64_t access,
                                                       uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    uct_ib_mlx5_md_t *md    = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    size_t chunk            = md->super.config.mt_reg_chunk;
    uct_ib_mlx5_ksm_data_t *ksm_data;
    size_t ksm_data_size;
    ucs_status_t status;
    int mr_num;

    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM) ||
        !(md->flags & UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS)) {
        return UCS_ERR_UNSUPPORTED;
    }

    mr_num        = ucs_div_round_up(length, chunk);
    ksm_data_size = (mr_num * sizeof(*ksm_data->mrs)) + sizeof(*ksm_data);
    ksm_data      = ucs_calloc(1, ksm_data_size, "ksm_data");
    if (!ksm_data) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    ucs_trace("multithreaded register memory %p..%p chunks %d",
              address, UCS_PTR_BYTE_OFFSET(address, length), mr_num);

    ksm_data->mr_num = mr_num;
    status = uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length,
                                                    access, chunk, ksm_data->mrs);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_mlx5_devx_reg_ksm_data(md, ksm_data, length, 0,
                                           &memh->dvmr, &memh->super.lkey);
    if (status != UCS_OK) {
        goto err_dereg;
    }

    ksm_data->length = length;
    memh->ksm_data   = ksm_data;
    memh->super.rkey = memh->super.lkey;
    return UCS_OK;

err_dereg:
    uct_ib_md_handle_mr_list_multithreaded(ibmd, address, length,
                                           UCT_IB_MEM_DEREG, chunk,
                                           ksm_data->mrs);
err:
    ucs_free(ksm_data);
    return status;
}

static ucs_status_t uct_ib_mlx5_devx_dereg_multithreaded(uct_ib_md_t *ibmd,
                                                         uct_ib_mem_t *ib_memh)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    size_t chunk            = ibmd->config.mt_reg_chunk;
    ucs_status_t s, status  = UCS_OK;
    int ret;

    if (memh->super.flags & UCT_IB_MEM_FLAG_ATOMIC_MR) {
        ret = mlx5dv_devx_obj_destroy(memh->ksm_data->atomic_dvmr);
        if (ret != 0) {
            ucs_error("mlx5dv_devx_obj_destroy(MKEY, ATOMIC) failed: %m");
            status = UCS_ERR_IO_ERROR;
        }
    }

    s = uct_ib_md_handle_mr_list_multithreaded(ibmd, 0, memh->ksm_data->length,
                                               UCT_IB_MEM_DEREG, chunk,
                                               memh->ksm_data->mrs);
    if (s == UCS_ERR_UNSUPPORTED) {
        s = uct_ib_dereg_mrs(memh->ksm_data->mrs, memh->ksm_data->mr_num);
        if (s != UCS_OK) {
            status = s;
        }
    } else if (s != UCS_OK) {
        status = s;
    }

    ret = mlx5dv_devx_obj_destroy(memh->dvmr);
    if (ret != 0) {
        ucs_error("mlx5dv_devx_obj_destroy(MKEY, KSM) failed: %m");
        status = UCS_ERR_IO_ERROR;
    }

    ucs_free(memh->ksm_data);
    return status;
}
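/* Doorbell records are served from a memory pool: each chunk is a page-aligned
 * buffer registered with the device as a DEVX umem, and every object records
 * the umem id and its byte offset within that registered page. */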
static ucs_status_t uct_ib_mlx5_add_page(ucs_mpool_t *mp, size_t *size_p,
                                         void **page_p)
{
    uct_ib_mlx5_md_t *md = ucs_container_of(mp, uct_ib_mlx5_md_t, dbrec_pool);
    uintptr_t ps         = ucs_get_page_size();
    uct_ib_mlx5_dbrec_page_t *page;
    size_t size = ucs_align_up(*size_p + sizeof(*page), ps);
    int ret;

    ret = ucs_posix_memalign((void **)&page, ps, size, "devx dbrec");
    if (ret != 0) {
        goto err;
    }

    page->mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, page, size, 0);
    if (page->mem == NULL) {
        goto err_free;
    }

    *size_p = size;
    *page_p = page + 1;
    return UCS_OK;

err_free:
    ucs_free(page);
err:
    return UCS_ERR_IO_ERROR;
}

static void uct_ib_mlx5_init_dbrec(ucs_mpool_t *mp, void *obj, void *chunk)
{
    uct_ib_mlx5_dbrec_page_t *page = (uct_ib_mlx5_dbrec_page_t*)chunk - 1;
    uct_ib_mlx5_dbrec_t *dbrec     = obj;

    dbrec->mem_id = page->mem->umem_id;
    dbrec->offset = UCS_PTR_BYTE_DIFF(chunk, obj) + sizeof(*page);
}

static void uct_ib_mlx5_free_page(ucs_mpool_t *mp, void *chunk)
{
    uct_ib_mlx5_dbrec_page_t *page = (uct_ib_mlx5_dbrec_page_t*)chunk - 1;
    mlx5dv_devx_umem_dereg(page->mem);
    ucs_free(page);
}

static ucs_mpool_ops_t uct_ib_mlx5_dbrec_ops = {
    .chunk_alloc   = uct_ib_mlx5_add_page,
    .chunk_release = uct_ib_mlx5_free_page,
    .obj_init      = uct_ib_mlx5_init_dbrec,
    .obj_cleanup   = NULL
};

static UCS_F_MAYBE_UNUSED ucs_status_t
uct_ib_mlx5_devx_check_odp(uct_ib_mlx5_md_t *md,
                           const uct_ib_md_config_t *md_config, void *cap)
{
    char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {};
    char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)]   = {};
    void *odp;
    int ret;

    if (md_config->devx_objs & UCS_BIT(UCT_IB_DEVX_OBJ_RCQP)) {
        ucs_debug("%s: disable ODP because it's not supported for DevX QP",
                  uct_ib_device_name(&md->super.dev));
        goto no_odp;
    }

    if (uct_ib_mlx5_has_roce_port(&md->super.dev)) {
        ucs_debug("%s: disable ODP on RoCE",
                  uct_ib_device_name(&md->super.dev));
        goto no_odp;
    }

    if (!UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, pg)) {
        goto no_odp;
    }

    odp = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability);
    UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode,
                      UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP);
    UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod,
                      UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
                      (UCT_IB_MLX5_CAP_ODP << 1));
    ret = mlx5dv_devx_general_cmd(md->super.dev.ibv_context, in, sizeof(in),
                                  out, sizeof(out));
    if (ret != 0) {
        ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP, ODP) failed: %m");
        return UCS_ERR_IO_ERROR;
    }

    if (!UCT_IB_MLX5DV_GET(odp_cap, odp, ud_odp_caps.send) ||
        !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.send) ||
        !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.write) ||
        !UCT_IB_MLX5DV_GET(odp_cap, odp, rc_odp_caps.read)) {
        goto no_odp;
    }

    if ((md->super.dev.flags & UCT_IB_DEVICE_FLAG_DC) &&
        (!UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.send) ||
         !UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.write) ||
         !UCT_IB_MLX5DV_GET(odp_cap, odp, dc_odp_caps.read))) {
        goto no_odp;
    }

    if (md->super.config.odp.max_size == UCS_MEMUNITS_AUTO) {
        if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_extended_translation_offset)) {
            md->super.config.odp.max_size = 1ul << 55;
        } else {
            md->super.config.odp.max_size = 1ul << 28;
        }
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, fixed_buffer_size) &&
        UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, null_mkey) &&
        UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_extended_translation_offset)) {
        md->super.dev.flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT;
    }

    return UCS_OK;

no_odp:
    md->super.config.odp.max_size = 0;
    return UCS_OK;
}

static struct ibv_context *
uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device,
                             struct mlx5dv_context_attr *dv_attr)
{
    struct ibv_context *ctx;
    struct ibv_cq *cq;

    ctx = mlx5dv_open_device(ibv_device, dv_attr);
    if (ctx == NULL) {
        return NULL;
    }

    cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
    if (cq == NULL) {
        ibv_close_device(ctx);
        return NULL;
    }

    ibv_destroy_cq(cq);
    return ctx;
}

static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops;
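/* Open a memory domain on a DEVX-capable device: query the general, ODP and
 * atomic HCA capabilities, set the corresponding device and MD flags, and set
 * up the doorbell-record pool and the page-sized zero buffer registered as a
 * DEVX umem. */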
static ucs_status_t
uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
                         const uct_ib_md_config_t *md_config,
                         uct_ib_md_t **p_md)
{
    char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {};
    char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)]   = {};
    struct mlx5dv_context_attr dv_attr = {};
    ucs_status_t status = UCS_OK;
    struct ibv_context *ctx;
    uct_ib_device_t *dev;
    uct_ib_mlx5_md_t *md;
    void *cap;
    int ret;

#if HAVE_DECL_MLX5DV_IS_SUPPORTED
    if (!mlx5dv_is_supported(ibv_device)) {
        return UCS_ERR_UNSUPPORTED;
    }
#endif

    if (md_config->devx == UCS_NO) {
        return UCS_ERR_UNSUPPORTED;
    }

    dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX;
    ctx = uct_ib_mlx5_devx_open_device(ibv_device, &dv_attr);
    if (ctx == NULL) {
        if (md_config->devx == UCS_YES) {
            status = UCS_ERR_IO_ERROR;
            ucs_error("DEVX requested but not supported by %s",
                      ibv_get_device_name(ibv_device));
        } else {
            status = UCS_ERR_UNSUPPORTED;
            ucs_debug("mlx5dv_open_device(%s) failed: %m",
                      ibv_get_device_name(ibv_device));
        }
        goto err;
    }

    md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md");
    if (md == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context;
    }

    dev              = &md->super.dev;
    dev->ibv_context = ctx;
    md->super.config = md_config->ext;

    status = uct_ib_device_query(dev, ibv_device);
    if (status != UCS_OK) {
        goto err_free;
    }

    cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability);
    UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode,
                      UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP);
    UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod,
                      UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
                      (UCT_IB_MLX5_CAP_GENERAL << 1));
    ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out));
    if (ret != 0) {
        if ((errno == EPERM) || (errno == EPROTONOSUPPORT) ||
            (errno == EOPNOTSUPP)) {
            status = UCS_ERR_UNSUPPORTED;
            ucs_debug("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m");
        } else {
            ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m");
            status = UCS_ERR_IO_ERROR;
        }
        goto err_free;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, log_max_msg) !=
        UCT_IB_MLX5_LOG_MAX_MSG_SIZE) {
        status = UCS_ERR_UNSUPPORTED;
        ucs_debug("Unexpected QUERY_HCA_CAP.log_max_msg %d\n",
                  UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, log_max_msg));
        goto err_free;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, dct)) {
        dev->flags |= UCT_IB_DEVICE_FLAG_DC;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, rndv_offload_dc)) {
        md->flags |= UCT_IB_MLX5_MD_FLAG_DC_TM;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, compact_address_vector)) {
        dev->flags |= UCT_IB_DEVICE_FLAG_AV;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, fixed_buffer_size)) {
        md->flags |= UCT_IB_MLX5_MD_FLAG_KSM;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, ext_stride_num_range)) {
        /* TODO: check if need to check for XRQ (not RQ) MP support */
        md->flags |= UCT_IB_MLX5_MD_FLAG_MP_RQ;
    }

    if (!UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, umr_modify_atomic_disabled)) {
        md->flags |= UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS;
    }

    status = uct_ib_mlx5_devx_check_odp(md, md_config, cap);
    if (status != UCS_OK) {
        goto err_free;
    }

    if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, atomic)) {
        int ops = UCT_IB_MLX5_ATOMIC_OPS_CMP_SWAP |
                  UCT_IB_MLX5_ATOMIC_OPS_FETCH_ADD;
        uint8_t arg_size;
        int cap_ops, mode8b;

        UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod,
                          UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
                          (UCT_IB_MLX5_CAP_ATOMIC << 1));
        ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out));
        if (ret != 0) {
            ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP, ATOMIC) failed: %m");
            status = UCS_ERR_IO_ERROR;
            goto err_free;
        }

        arg_size = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_size_qp);
        cap_ops  = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_operations);
        mode8b   = UCT_IB_MLX5DV_GET(atomic_caps, cap, atomic_req_8B_endianness_mode);

        if ((cap_ops & ops) == ops) {
            dev->atomic_arg_sizes = sizeof(uint64_t);
            if (!mode8b) {
                dev->atomic_arg_sizes_be = sizeof(uint64_t);
            }
        }

        ops |= UCT_IB_MLX5_ATOMIC_OPS_MASKED_CMP_SWAP |
               UCT_IB_MLX5_ATOMIC_OPS_MASKED_FETCH_ADD;

        arg_size &= UCT_IB_MLX5DV_GET(query_hca_cap_out, out,
                                      capability.atomic_caps.atomic_size_dc);

        if ((cap_ops & ops) == ops) {
            dev->ext_atomic_arg_sizes = arg_size;
            if (mode8b) {
                arg_size &= ~(sizeof(uint64_t));
            }
            dev->ext_atomic_arg_sizes_be = arg_size;
        }

        dev->pci_fadd_arg_sizes  = UCT_IB_MLX5DV_GET(atomic_caps, cap,
                                                     fetch_add_pci_atomic) << 2;
        dev->pci_cswap_arg_sizes = UCT_IB_MLX5DV_GET(atomic_caps, cap,
                                                     compare_swap_pci_atomic) << 2;
    }

    md->super.ops = &uct_ib_mlx5_devx_md_ops;
    status = uct_ib_md_open_common(&md->super, ibv_device, md_config);
    if (status != UCS_OK) {
        goto err_free;
    }

    ucs_spinlock_init(&md->dbrec_lock);
    status = ucs_mpool_init(&md->dbrec_pool, 0,
                            sizeof(uct_ib_mlx5_dbrec_t), 0,
                            UCS_SYS_CACHE_LINE_SIZE,
                            ucs_get_page_size() / UCS_SYS_CACHE_LINE_SIZE - 1,
                            UINT_MAX, &uct_ib_mlx5_dbrec_ops, "devx dbrec");
    if (status != UCS_OK) {
        goto err_free;
    }

    ret = ucs_posix_memalign(&md->zero_buf, ucs_get_page_size(),
                             ucs_get_page_size(), "zero umem");
    if (ret != 0) {
        ucs_error("failed to allocate zero buffer: %m");
        goto err_release_dbrec;
    }

    md->zero_mem = mlx5dv_devx_umem_reg(dev->ibv_context, md->zero_buf,
                                        ucs_get_page_size(), 0);
    if (!md->zero_mem) {
        ucs_error("mlx5dv_devx_umem_reg() zero umem failed: %m");
        goto err_free_zero_buf;
    }

    dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM;
    md->flags  |= UCT_IB_MLX5_MD_FLAG_DEVX;
    md->flags  |= UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(md_config->devx_objs);
    *p_md = &md->super;
    return status;

err_free_zero_buf:
    ucs_free(md->zero_buf);
err_release_dbrec:
    ucs_mpool_cleanup(&md->dbrec_pool, 1);
err_free:
    ucs_free(md);
err_free_context:
    ibv_close_device(ctx);
err:
    return status;
}

void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd)
{
    uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    ucs_status_t status;

    mlx5dv_devx_umem_dereg(md->zero_mem);
    ucs_free(md->zero_buf);
    ucs_mpool_cleanup(&md->dbrec_pool, 1);
    status = ucs_spinlock_destroy(&md->dbrec_lock);
    if (status != UCS_OK) {
        ucs_warn("ucs_spinlock_destroy() failed (%d)", status);
    }
}

static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = {
    .open                = uct_ib_mlx5_devx_md_open,
    .cleanup             = uct_ib_mlx5_devx_md_cleanup,
    .memh_struct_size    = sizeof(uct_ib_mlx5_mem_t),
    .reg_key             = uct_ib_mlx5_reg_key,
    .dereg_key           = uct_ib_mlx5_dereg_key,
    .reg_atomic_key      = uct_ib_mlx5_devx_reg_atomic_key,
    .dereg_atomic_key    = uct_ib_mlx5_devx_dereg_atomic_key,
    .reg_multithreaded   = uct_ib_mlx5_devx_reg_multithreaded,
    .dereg_multithreaded = uct_ib_mlx5_devx_dereg_multithreaded,
    .mem_prefetch        = uct_ib_mlx5_mem_prefetch,
};

UCT_IB_MD_OPS(uct_ib_mlx5_devx_md_ops, 2);

#endif
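/* Probe for DC transport support by creating a throw-away DCT QP and moving it
 * through INIT and RTR; if both transitions succeed the device is marked as
 * DC-capable. All temporary verbs objects are destroyed before returning. */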
static ucs_status_t uct_ib_mlx5dv_check_dc(uct_ib_device_t *dev)
{
    ucs_status_t status = UCS_OK;
#if HAVE_DC_DV
    struct ibv_srq_init_attr srq_attr  = {};
    struct ibv_context *ctx            = dev->ibv_context;
    struct ibv_qp_init_attr_ex qp_attr = {};
    struct mlx5dv_qp_init_attr dv_attr = {};
    struct ibv_qp_attr attr            = {};
    struct ibv_srq *srq;
    struct ibv_pd *pd;
    struct ibv_cq *cq;
    struct ibv_qp *qp;
    int ret;

    ucs_debug("checking for DC support on %s", uct_ib_device_name(dev));

    pd = ibv_alloc_pd(ctx);
    if (pd == NULL) {
        ucs_error("ibv_alloc_pd() failed: %m");
        return UCS_ERR_IO_ERROR;
    }

    cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
    if (cq == NULL) {
        ucs_error("ibv_create_cq() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_cq;
    }

    srq_attr.attr.max_sge = 1;
    srq_attr.attr.max_wr  = 1;
    srq = ibv_create_srq(pd, &srq_attr);
    if (srq == NULL) {
        ucs_error("ibv_create_srq() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_srq;
    }

    qp_attr.send_cq   = cq;
    qp_attr.recv_cq   = cq;
    qp_attr.qp_type   = IBV_QPT_DRIVER;
    qp_attr.comp_mask = IBV_QP_INIT_ATTR_PD;
    qp_attr.pd        = pd;
    qp_attr.srq       = srq;

    dv_attr.comp_mask                   = MLX5DV_QP_INIT_ATTR_MASK_DC;
    dv_attr.dc_init_attr.dc_type        = MLX5DV_DCTYPE_DCT;
    dv_attr.dc_init_attr.dct_access_key = UCT_IB_KEY;

    /* create DCT qp successful means DC is supported */
    qp = mlx5dv_create_qp(ctx, &qp_attr, &dv_attr);
    if (qp == NULL) {
        ucs_debug("failed to create DCT on %s: %m", uct_ib_device_name(dev));
        goto err_qp;
    }

    attr.qp_state        = IBV_QPS_INIT;
    attr.port_num        = 1;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE |
                           IBV_ACCESS_REMOTE_READ |
                           IBV_ACCESS_REMOTE_ATOMIC;
    ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE |
                                   IBV_QP_PKEY_INDEX |
                                   IBV_QP_PORT |
                                   IBV_QP_ACCESS_FLAGS);
    if (ret != 0) {
        ucs_debug("failed to ibv_modify_qp(DCT, INIT) on %s: %m",
                  uct_ib_device_name(dev));
        goto err;
    }

    /* always set global address parameters, in case the port is RoCE or SRIOV */
    attr.qp_state                  = IBV_QPS_RTR;
    attr.min_rnr_timer             = 1;
    attr.path_mtu                  = IBV_MTU_256;
    attr.ah_attr.port_num          = 1;
    attr.ah_attr.sl                = 0;
    attr.ah_attr.is_global         = 1;
    attr.ah_attr.grh.hop_limit     = 1;
    attr.ah_attr.grh.traffic_class = 0;
    attr.ah_attr.grh.sgid_index    = 0;

    ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE |
                                   IBV_QP_MIN_RNR_TIMER |
                                   IBV_QP_AV |
                                   IBV_QP_PATH_MTU);
    if (ret == 0) {
        ucs_debug("DC is supported on %s", uct_ib_device_name(dev));
        dev->flags |= UCT_IB_DEVICE_FLAG_DC;
    } else {
        ucs_debug("failed to ibv_modify_qp(DCT, RTR) on %s: %m",
                  uct_ib_device_name(dev));
    }

err:
    uct_ib_destroy_qp(qp);
err_qp:
    uct_ib_destroy_srq(srq);
err_srq:
    ibv_destroy_cq(cq);
err_cq:
    ibv_dealloc_pd(pd);
#endif
    return status;
}

static uct_ib_md_ops_t uct_ib_mlx5_md_ops;

static ucs_status_t uct_ib_mlx5dv_md_open(struct ibv_device *ibv_device,
                                          const uct_ib_md_config_t *md_config,
                                          uct_ib_md_t **p_md)
{
    ucs_status_t status = UCS_OK;
    struct ibv_context *ctx;
    uct_ib_device_t *dev;
    uct_ib_mlx5_md_t *md;

#if HAVE_DECL_MLX5DV_IS_SUPPORTED
    if (!mlx5dv_is_supported(ibv_device)) {
        return UCS_ERR_UNSUPPORTED;
    }
#endif

    ctx = ibv_open_device(ibv_device);
    if (ctx == NULL) {
        ucs_debug("ibv_open_device(%s) failed: %m",
                  ibv_get_device_name(ibv_device));
        status = UCS_ERR_UNSUPPORTED;
        goto err;
    }

    md = ucs_calloc(1, sizeof(*md), "ib_mlx5_md");
    if (md == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context;
    }

    dev              = &md->super.dev;
    dev->ibv_context = ctx;
    md->super.config = md_config->ext;

    status = uct_ib_device_query(dev, ibv_device);
    if (status != UCS_OK) {
        goto err_free;
    }

    if (!(uct_ib_device_spec(dev)->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM)) {
        status = UCS_ERR_UNSUPPORTED;
        goto err_free;
    }

    if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr) &&
        !uct_ib_mlx5_has_roce_port(dev)) {
        dev->flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT;
    }

    if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr)) {
        dev->atomic_arg_sizes = sizeof(uint64_t);
    }

    status = uct_ib_mlx5dv_check_dc(dev);
    if (status != UCS_OK) {
        goto err_free;
    }

    md->super.ops = &uct_ib_mlx5_md_ops;
    status = uct_ib_md_open_common(&md->super, ibv_device, md_config);
    if (status != UCS_OK) {
        goto err_free;
    }

    dev->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM;
    /* cppcheck-suppress autoVariables */
    *p_md = &md->super;
    return UCS_OK;

err_free:
    ucs_free(md);
err_free_context:
    ibv_close_device(ctx);
err:
    return status;
}

static uct_ib_md_ops_t uct_ib_mlx5_md_ops = {
    .open                = uct_ib_mlx5dv_md_open,
    .cleanup             = (uct_ib_md_cleanup_func_t)ucs_empty_function,
    .memh_struct_size    = sizeof(uct_ib_mlx5_mem_t),
    .reg_key             = uct_ib_mlx5_reg_key,
    .dereg_key           = uct_ib_mlx5_dereg_key,
    .reg_atomic_key      = (uct_ib_md_reg_atomic_key_func_t)ucs_empty_function_return_unsupported,
    .dereg_atomic_key    = (uct_ib_md_dereg_atomic_key_func_t)ucs_empty_function_return_unsupported,
    .reg_multithreaded   = (uct_ib_md_reg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .mem_prefetch        = uct_ib_mlx5_mem_prefetch,
};

UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1);