/**
 * Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED.
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#include "ugni_rdma_ep.h"
#include "ugni_rdma_iface.h"
/* NOTE(review): the header names of the three includes below are missing in
 * this copy of the file; restore them from the project's version control. */
#include
#include
#include

/*
 * Configuration table of the uGNI RDMA interface: the generic interface
 * options (with a transport-specific default allocator order) followed by the
 * memory pool options of the "RDMA" pool.
 */
static ucs_config_field_t uct_ugni_rdma_iface_config_table[] = {
    /* This tuning controls the allocation priorities for bouncing buffers */
    { "", "ALLOC=huge,mmap,heap", NULL,
      ucs_offsetof(uct_ugni_rdma_iface_config_t, super),
      UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

    UCT_IFACE_MPOOL_CONFIG_FIELDS("RDMA", -1, 0, "rdma",
                                  ucs_offsetof(uct_ugni_rdma_iface_config_t, mpool),
                                  "\nAttention: Setting this param with value != -1 is a dangerous thing\n"
                                  "and could cause deadlock or performance degradation."),

    {NULL}
};

/*
 * Report the capabilities and performance estimates of the RDMA interface.
 *
 * The base layer fills iface_attr first; this function then sets the
 * put/get/am limits, address lengths, capability flags, supported atomic
 * operations and the latency/bandwidth estimates.
 *
 * @param tl_iface    Interface handle (a uct_ugni_rdma_iface_t).
 * @param iface_attr  Attribute structure to fill.
 *
 * @return UCS_OK always.
 */
static ucs_status_t uct_ugni_rdma_iface_query(uct_iface_h tl_iface,
                                              uct_iface_attr_t *iface_attr)
{
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_rdma_iface_t);

    uct_base_iface_query(&iface->super.super, iface_attr);

    iface_attr->cap.put.max_short       = iface->config.fma_seg_size;
    iface_attr->cap.put.max_bcopy       = iface->config.fma_seg_size;
    iface_attr->cap.put.min_zcopy       = 0;
    iface_attr->cap.put.max_zcopy       = iface->config.rdma_max_size;
    iface_attr->cap.put.opt_zcopy_align = 1;
    iface_attr->cap.put.align_mtu       = iface_attr->cap.put.opt_zcopy_align;
    iface_attr->cap.put.max_iov         = 1;

    iface_attr->cap.get.max_bcopy       = iface->config.fma_seg_size - 8; /* alignment offset 4 (addr)+ 4 (len)*/
    iface_attr->cap.get.min_zcopy       = 0;
    iface_attr->cap.get.max_zcopy       = iface->config.rdma_max_size;
    iface_attr->cap.get.opt_zcopy_align = 1;
    iface_attr->cap.get.align_mtu       = iface_attr->cap.get.opt_zcopy_align;
    iface_attr->cap.get.max_iov         = 1;

    iface_attr->cap.am.max_iov          = 1;
    iface_attr->cap.am.opt_zcopy_align  = 1;
    iface_attr->cap.am.align_mtu        = iface_attr->cap.am.opt_zcopy_align;

    iface_attr->device_addr_len         = sizeof(uct_devaddr_ugni_t);
    iface_attr->iface_addr_len          = sizeof(uct_sockaddr_ugni_t);
    iface_attr->ep_addr_len             = 0;
    iface_attr->max_conn_priv           = 0;
    iface_attr->cap.flags               = UCT_IFACE_FLAG_PUT_SHORT |
                                          UCT_IFACE_FLAG_PUT_BCOPY |
                                          UCT_IFACE_FLAG_PUT_ZCOPY |
                                          UCT_IFACE_FLAG_GET_BCOPY |
                                          UCT_IFACE_FLAG_GET_ZCOPY |
                                          UCT_IFACE_FLAG_CONNECT_TO_IFACE |
                                          UCT_IFACE_FLAG_ATOMIC_DEVICE |
                                          UCT_IFACE_FLAG_PENDING;

    iface_attr->cap.atomic64.op_flags   = UCS_BIT(UCT_ATOMIC_OP_ADD)|
                                          UCS_BIT(UCT_ATOMIC_OP_AND) |
                                          UCS_BIT(UCT_ATOMIC_OP_OR) |
                                          UCS_BIT(UCT_ATOMIC_OP_XOR);
    iface_attr->cap.atomic64.fop_flags  = UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                          UCS_BIT(UCT_ATOMIC_OP_AND) |
                                          UCS_BIT(UCT_ATOMIC_OP_OR) |
                                          UCS_BIT(UCT_ATOMIC_OP_XOR) |
                                          UCS_BIT(UCT_ATOMIC_OP_SWAP) |
                                          UCS_BIT(UCT_ATOMIC_OP_CSWAP);

    if (uct_ugni_check_device_type(&iface->super, GNI_DEVICE_ARIES)) {
        /* NOTE(review): PUT_SHORT and the 64-bit SWAP bit are already set
         * unconditionally above, so the next two statements do not change
         * anything; only the 32-bit atomics are specific to this branch. */
        iface_attr->cap.flags              |= UCT_IFACE_FLAG_PUT_SHORT;
        iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_SWAP);
        iface_attr->cap.atomic32.op_flags  |= UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                              UCS_BIT(UCT_ATOMIC_OP_AND) |
                                              UCS_BIT(UCT_ATOMIC_OP_OR) |
                                              UCS_BIT(UCT_ATOMIC_OP_XOR);
        iface_attr->cap.atomic32.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                              UCS_BIT(UCT_ATOMIC_OP_AND) |
                                              UCS_BIT(UCT_ATOMIC_OP_OR) |
                                              UCS_BIT(UCT_ATOMIC_OP_XOR) |
                                              UCS_BIT(UCT_ATOMIC_OP_SWAP) |
                                              UCS_BIT(UCT_ATOMIC_OP_CSWAP);
    }

    iface_attr->overhead            = 80e-9; /* 80 ns */
    iface_attr->latency.overhead    = 900e-9; /* 900 ns */
    iface_attr->latency.growth      = 0;
    iface_attr->bandwidth.dedicated = 6911 * pow(1024,2); /* bytes */
    iface_attr->bandwidth.shared    = 0;
    iface_attr->priority            = 0;
    return UCS_OK;
}

/*
 * Memory pool object initializer: zero the base descriptor and make
 * ucs_mpool_put its release callback.
 *
 * @param mp     Unused.
 * @param obj    Object to initialize (a uct_ugni_base_desc_t).
 * @param chunk  Unused.
 */
void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk)
{
    uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj;

    /* zero base descriptor */
    memset(base, 0 , sizeof(*base));
    base->free_cb = ucs_mpool_put;
}

/*
 * Descriptor initializer for pools backed by registered memory: performs the
 * base initialization and then stores the memory handle in the descriptor.
 *
 * @param iface  Unused.
 * @param obj    Object to initialize (a uct_ugni_base_desc_t).
 * @param memh   Memory handle; read as a gni_mem_handle_t.
 */
void uct_ugni_base_desc_key_init(uct_iface_h iface, void *obj, uct_mem_h memh)
{
    uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *)obj;

    /* call base initialization */
    uct_ugni_base_desc_init(NULL, obj, NULL);
    /* set local keys */
    base->desc.local_mem_hndl = *(gni_mem_handle_t *)memh;
}

/*
 * Progress callback: drain the local completion queue and then dispatch the
 * pending queue.
 *
 * For every completion the descriptor's completion callback (if any) is
 * invoked with UCS_OK, the descriptor is released through its free_cb, the
 * count of outstanding operations is decremented and the descriptor's flush
 * group is notified.
 *
 * @param arg  The interface (a uct_ugni_iface_t).
 *
 * @return Number of completions processed by this call.
 */
unsigned uct_ugni_progress(void *arg)
{
    gni_cq_entry_t event_data = 0;
    gni_post_descriptor_t *event_post_desc_ptr;
    uct_ugni_base_desc_t *desc;
    uct_ugni_iface_t * iface = (uct_ugni_iface_t *)arg;
    gni_return_t ugni_rc;
    unsigned count = 0;
    void *flush_group;

    while (1) {
        /* CQ polling and descriptor lookup are done under the CDM lock */
        uct_ugni_cdm_lock(&iface->cdm);
        ugni_rc = GNI_CqGetEvent(iface->local_cq, &event_data);
        if (GNI_RC_NOT_DONE == ugni_rc) {
            /* nothing more to process */
            uct_ugni_cdm_unlock(&iface->cdm);
            break;
        }
        if ((GNI_RC_SUCCESS != ugni_rc && !event_data) || GNI_CQ_OVERRUN(event_data)) {
            uct_ugni_cdm_unlock(&iface->cdm);
            ucs_error("GNI_CqGetEvent failed. Error status %s %d ",
                      gni_err_str[ugni_rc], ugni_rc);
            return count;
        }

        ugni_rc = GNI_GetCompleted(iface->local_cq, event_data, &event_post_desc_ptr);
        uct_ugni_cdm_unlock(&iface->cdm);
        /* NOTE(review): GNI_RC_TRANSACTION_ERROR is not treated as a failure
         * here and the completion below is still reported with UCS_OK --
         * confirm that this is intended. */
        if (GNI_RC_SUCCESS != ugni_rc && GNI_RC_TRANSACTION_ERROR != ugni_rc) {
            ucs_error("GNI_GetCompleted failed. Error status %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return count;
        }

        desc = (uct_ugni_base_desc_t *)event_post_desc_ptr;
        ucs_trace_async("Completion received on %p", desc);

        if (NULL != desc->comp_cb) {
            uct_invoke_completion(desc->comp_cb, UCS_OK);
        }

        /* Fetch the flush group before the descriptor is released, so that
         * the descriptor is not touched after free_cb() has run. */
        flush_group = desc->flush_group;
        desc->free_cb(desc);
        iface->outstanding--;
        uct_ugni_check_flush(flush_group);
        ++count;
    }

    /* have a go a processing the pending queue */
    ucs_arbiter_dispatch(&iface->arbiter, 1, uct_ugni_ep_process_pending, NULL);
    return count;
}

/*
 * Class cleanup: unregister the progress callback from the worker and release
 * the five descriptor pools created by the init function.
 */
static UCS_CLASS_CLEANUP_FUNC(uct_ugni_rdma_iface_t)
{
    uct_worker_progress_remove(self->super.super.worker, &self->super.super.prog);
    ucs_mpool_cleanup(&self->free_desc_get_buffer, 1);
    ucs_mpool_cleanup(&self->free_desc_get, 1);
    ucs_mpool_cleanup(&self->free_desc_famo, 1);
    ucs_mpool_cleanup(&self->free_desc_buffer, 1);
    ucs_mpool_cleanup(&self->free_desc, 1);
}

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ugni_rdma_iface_t, uct_iface_t);

/* Operations table selected for GNI_DEVICE_ARIES devices */
static uct_iface_ops_t uct_ugni_aries_rdma_iface_ops = {
    .ep_put_short             = uct_ugni_ep_put_short,
    .ep_put_bcopy             = uct_ugni_ep_put_bcopy,
    .ep_put_zcopy             = uct_ugni_ep_put_zcopy,
    .ep_get_bcopy             = uct_ugni_ep_get_bcopy,
    .ep_get_zcopy             = uct_ugni_ep_get_zcopy,
    .ep_am_short              = uct_ugni_ep_am_short,
    .ep_atomic_cswap64        = uct_ugni_ep_atomic_cswap64,
    .ep_atomic_cswap32        = uct_ugni_ep_atomic_cswap32,
    .ep_atomic64_post         = uct_ugni_ep_atomic64_post,
    .ep_atomic32_post         = uct_ugni_ep_atomic32_post,
    .ep_atomic64_fetch        = uct_ugni_ep_atomic64_fetch,
    .ep_atomic32_fetch        = uct_ugni_ep_atomic32_fetch,
    .ep_pending_add           = uct_ugni_ep_pending_add,
    .ep_pending_purge         = uct_ugni_ep_pending_purge,
    .ep_flush                 = uct_ugni_ep_flush,
    .ep_fence                 = uct_base_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_ugni_rdma_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_ep_t),
    .iface_flush              = uct_ugni_iface_flush,
    .iface_fence              = uct_base_iface_fence,
    .iface_progress_enable    = ucs_empty_function,
    .iface_progress_disable   = ucs_empty_function,
    .iface_progress           = (void*)uct_ugni_progress,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_rdma_iface_t),
    .iface_query              = uct_ugni_rdma_iface_query,
    .iface_get_device_address = uct_ugni_iface_get_dev_address,
    .iface_get_address        = uct_ugni_iface_get_address,
    .iface_is_reachable       = uct_ugni_iface_is_reachable
};

/*
 * Operations table selected for GNI_DEVICE_GEMINI devices. Compared to the
 * Aries table it has no 32-bit cswap and no atomic post/fetch entries.
 */
static uct_iface_ops_t uct_ugni_gemini_rdma_iface_ops = {
    .ep_put_short             = uct_ugni_ep_put_short,
    .ep_put_bcopy             = uct_ugni_ep_put_bcopy,
    .ep_put_zcopy             = uct_ugni_ep_put_zcopy,
    .ep_get_bcopy             = uct_ugni_ep_get_bcopy,
    .ep_get_zcopy             = uct_ugni_ep_get_zcopy,
    .ep_am_short              = uct_ugni_ep_am_short,
    .ep_atomic_cswap64        = uct_ugni_ep_atomic_cswap64,
    .ep_pending_add           = uct_ugni_ep_pending_add,
    .ep_pending_purge         = uct_ugni_ep_pending_purge,
    .ep_flush                 = uct_ugni_ep_flush,
    .ep_fence                 = uct_base_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_ugni_rdma_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_ep_t),
    .iface_flush              = uct_ugni_iface_flush,
    .iface_fence              = uct_base_iface_fence,
    .iface_progress_enable    = ucs_empty_function,
    .iface_progress_disable   = ucs_empty_function,
    .iface_progress           = (void*)uct_ugni_progress,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_rdma_iface_t),
    .iface_query              = uct_ugni_rdma_iface_query,
    .iface_get_device_address = uct_ugni_iface_get_dev_address,
    .iface_get_address        = uct_ugni_iface_get_address,
    .iface_is_reachable       = uct_ugni_iface_is_reachable
};

/* Pool operations for descriptor pools that need no memory registration */
static ucs_mpool_ops_t uct_ugni_rdma_desc_mpool_ops = {
    .chunk_alloc   = ucs_mpool_hugetlb_malloc,
    .chunk_release = ucs_mpool_hugetlb_free,
    .obj_init      = uct_ugni_base_desc_init,
    .obj_cleanup   = NULL
};

/*
 * Pick the operations table matching the device type.
 *
 * @param dev  Device descriptor; must not be NULL.
 *
 * @return The Gemini or Aries operations table, or NULL (after logging an
 *         error) for any other device type.
 */
static uct_iface_ops_t *uct_ugni_rdma_choose_ops_by_device(uct_ugni_device_t *dev)
{
    switch(dev->type) {
    case GNI_DEVICE_GEMINI:
        return &uct_ugni_gemini_rdma_iface_ops;
    case GNI_DEVICE_ARIES:
        return &uct_ugni_aries_rdma_iface_ops;
    default:
        ucs_error("Unexpected device found in uct_ugni_rdma_choose_ops_by_device. "
                  "unexpected device type %s", dev->type_name);
        return NULL;
    }
}

/*
 * Class constructor of the RDMA interface.
 *
 * Looks up the device named in the interface parameters, selects the matching
 * operations table, initializes the base uGNI interface, creates the five
 * descriptor pools and registers uct_ugni_progress with the worker.
 *
 * @param md         Memory domain, forwarded to the base interface.
 * @param worker     Worker, forwarded to the base interface.
 * @param params     Interface parameters; mode.device.dev_name selects the device.
 * @param tl_config  Configuration (a uct_ugni_rdma_iface_config_t).
 *
 * @return UCS_OK on success; UCS_ERR_NO_DEVICE if the device is not found or
 *         its type is not supported; otherwise the status of the pool
 *         initialization that failed.
 */
static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_rdma_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_rdma_iface_config_t);
    ucs_status_t status;
    uct_ugni_device_t *dev;
    uct_iface_ops_t *ops;

    /* The two checks below run before the base interface is initialized, so
     * they return directly: the 'exit' label cleans up self->super, which
     * holds nothing valid yet at this point. */
    dev = uct_ugni_device_by_name(params->mode.device.dev_name);
    if (NULL == dev) {
        /* guard against a failed lookup; dev is dereferenced below */
        ucs_error("No device was found");
        ucs_error("Failed to activate interface");
        return UCS_ERR_NO_DEVICE;
    }

    ops = uct_ugni_rdma_choose_ops_by_device(dev);
    if (NULL == ops) {
        ucs_error("Failed to activate interface");
        return UCS_ERR_NO_DEVICE;
    }

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, params, ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.fma_seg_size  = UCT_UGNI_MAX_FMA;
    self->config.rdma_max_size = UCT_UGNI_MAX_RDMA;

    status = ucs_mpool_init(&self->free_desc,
                            0,
                            sizeof(uct_ugni_base_desc_t),
                            0,                            /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128,                          /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_desc_get,
                            0,
                            sizeof(uct_ugni_rdma_fetch_desc_t),
                            0,                            /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128 ,                         /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-GET-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc;
    }

    status = ucs_mpool_init(&self->free_desc_buffer,
                            0,
                            sizeof(uct_ugni_base_desc_t) + self->config.fma_seg_size,
                            sizeof(uct_ugni_base_desc_t), /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128 ,                         /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-DESC-BUFFER");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc_get;
    }

    status = uct_iface_mpool_init(&self->super.super,
                                  &self->free_desc_famo,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) + 8,
                                  sizeof(uct_ugni_rdma_fetch_desc_t),/* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                                  &config->mpool,               /* mpool config */
                                  128 ,                         /* grow */
                                  uct_ugni_base_desc_key_init,  /* memory/key init */
                                  "UGNI-DESC-FAMO");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_buffer;
    }

    status = uct_iface_mpool_init(&self->super.super,
                                  &self->free_desc_get_buffer,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) +
                                  self->config.fma_seg_size,
                                  sizeof(uct_ugni_rdma_fetch_desc_t), /* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                                  &config->mpool,               /* mpool config */
                                  128 ,                         /* grow */
                                  uct_ugni_base_desc_key_init,  /* memory/key init */
                                  "UGNI-DESC-GET");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_famo;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * rdma layer so each ugni layer will have own progress */
    uct_worker_progress_add_safe(self->super.super.worker, uct_ugni_progress,
                                 self, &self->super.super.prog);
    return UCS_OK;

    /* Unwind in reverse order of creation; each label releases the pools
     * created before the step that failed. */
clean_famo:
    ucs_mpool_cleanup(&self->free_desc_famo, 1);
clean_buffer:
    ucs_mpool_cleanup(&self->free_desc_buffer, 1);
clean_desc_get:
    ucs_mpool_cleanup(&self->free_desc_get, 1);
clean_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
exit:
    uct_ugni_cleanup_base_iface(&self->super);
    ucs_error("Failed to activate interface");
    return status;
}
/* Class definition: uct_ugni_rdma_iface_t, with uct_ugni_iface_t given as its
 * parent class. */
UCS_CLASS_DEFINE(uct_ugni_rdma_iface_t, uct_ugni_iface_t);

/* Constructor wrapper for uct_ugni_rdma_iface_t, exposed as a uct_iface_t and
 * taking (md, worker, interface params, interface config). */
UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_rdma_iface_t, uct_iface_t,
                          uct_md_h, uct_worker_h,
                          const uct_iface_params_t*,
                          const uct_iface_config_t*);

/* Transport definition: "ugni_rdma" under uct_ugni_component, using the
 * "UGNI_RDMA_" configuration prefix and the configuration table of this
 * interface. */
UCT_TL_DEFINE(&uct_ugni_component, ugni_rdma,
              uct_ugni_query_devices,
              uct_ugni_rdma_iface_t,
              "UGNI_RDMA_",
              uct_ugni_rdma_iface_config_table,
              uct_ugni_rdma_iface_config_t);