Blob Blame History Raw
/**
 * Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED.
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#include "ugni_rdma_ep.h"
#include "ugni_rdma_iface.h"
#include <uct/ugni/base/ugni_def.h>
#include <uct/ugni/base/ugni_md.h>
#include <uct/ugni/base/ugni_device.h>

static ucs_config_field_t uct_ugni_rdma_iface_config_table[] = {
    /* This tuning controls the allocation priorities for bouncing buffers */
    { "", "ALLOC=huge,mmap,heap", NULL,
    ucs_offsetof(uct_ugni_rdma_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

    UCT_IFACE_MPOOL_CONFIG_FIELDS("RDMA", -1, 0, "rdma",
                                  ucs_offsetof(uct_ugni_rdma_iface_config_t, mpool),
                                  "\nAttention: Setting this param with value != -1 is a dangerous thing\n"
                                  "and could cause deadlock or performance degradation."),

    {NULL}
};

static ucs_status_t uct_ugni_rdma_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
{
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_rdma_iface_t);

    uct_base_iface_query(&iface->super.super, iface_attr);

    iface_attr->cap.put.max_short       = iface->config.fma_seg_size;
    iface_attr->cap.put.max_bcopy       = iface->config.fma_seg_size;
    iface_attr->cap.put.min_zcopy       = 0;
    iface_attr->cap.put.max_zcopy       = iface->config.rdma_max_size;
    iface_attr->cap.put.opt_zcopy_align = 1;
    iface_attr->cap.put.align_mtu       = iface_attr->cap.put.opt_zcopy_align;
    iface_attr->cap.put.max_iov         = 1;

    iface_attr->cap.get.max_bcopy       = iface->config.fma_seg_size - 8; /* alignment offset 4 (addr)+ 4 (len)*/
    iface_attr->cap.get.min_zcopy       = 0;
    iface_attr->cap.get.max_zcopy       = iface->config.rdma_max_size;
    iface_attr->cap.get.opt_zcopy_align = 1;
    iface_attr->cap.get.align_mtu       = iface_attr->cap.get.opt_zcopy_align;
    iface_attr->cap.get.max_iov         = 1;

    iface_attr->cap.am.max_iov         = 1;
    iface_attr->cap.am.opt_zcopy_align = 1;
    iface_attr->cap.am.align_mtu       = iface_attr->cap.am.opt_zcopy_align;

    iface_attr->device_addr_len        = sizeof(uct_devaddr_ugni_t);
    iface_attr->iface_addr_len         = sizeof(uct_sockaddr_ugni_t);
    iface_attr->ep_addr_len            = 0;
    iface_attr->max_conn_priv          = 0;
    iface_attr->cap.flags              = UCT_IFACE_FLAG_PUT_SHORT |
                                         UCT_IFACE_FLAG_PUT_BCOPY |
                                         UCT_IFACE_FLAG_PUT_ZCOPY |
                                         UCT_IFACE_FLAG_GET_BCOPY      |
                                         UCT_IFACE_FLAG_GET_ZCOPY      |
                                         UCT_IFACE_FLAG_CONNECT_TO_IFACE |
                                         UCT_IFACE_FLAG_ATOMIC_DEVICE |
                                         UCT_IFACE_FLAG_PENDING;

    iface_attr->cap.atomic64.op_flags  = UCS_BIT(UCT_ATOMIC_OP_ADD)|
                                         UCS_BIT(UCT_ATOMIC_OP_AND) |
                                         UCS_BIT(UCT_ATOMIC_OP_OR)  |
                                         UCS_BIT(UCT_ATOMIC_OP_XOR);

    iface_attr->cap.atomic64.fop_flags = UCS_BIT(UCT_ATOMIC_OP_ADD)    |
                                         UCS_BIT(UCT_ATOMIC_OP_AND)   |
                                         UCS_BIT(UCT_ATOMIC_OP_OR)    |
                                         UCS_BIT(UCT_ATOMIC_OP_XOR)   |
                                         UCS_BIT(UCT_ATOMIC_OP_SWAP)  |
                                         UCS_BIT(UCT_ATOMIC_OP_CSWAP);


    if (uct_ugni_check_device_type(&iface->super, GNI_DEVICE_ARIES)) {
        iface_attr->cap.flags              |= UCT_IFACE_FLAG_PUT_SHORT;

        iface_attr->cap.atomic64.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_SWAP);
        iface_attr->cap.atomic32.op_flags  |= UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                              UCS_BIT(UCT_ATOMIC_OP_AND) |
                                              UCS_BIT(UCT_ATOMIC_OP_OR)  |
                                              UCS_BIT(UCT_ATOMIC_OP_XOR);
        iface_attr->cap.atomic32.fop_flags |= UCS_BIT(UCT_ATOMIC_OP_ADD)   |
                                              UCS_BIT(UCT_ATOMIC_OP_AND)   |
                                              UCS_BIT(UCT_ATOMIC_OP_OR)    |
                                              UCS_BIT(UCT_ATOMIC_OP_XOR)   |
                                              UCS_BIT(UCT_ATOMIC_OP_SWAP)  |
                                              UCS_BIT(UCT_ATOMIC_OP_CSWAP);
    }
    iface_attr->overhead               = 80e-9; /* 80 ns */
    iface_attr->latency.overhead       = 900e-9; /* 900 ns */
    iface_attr->latency.growth         = 0;
    iface_attr->bandwidth.dedicated    = 6911 * pow(1024,2); /* bytes */
    iface_attr->bandwidth.shared       = 0;
    iface_attr->priority               = 0;

    return UCS_OK;
}

void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk)
{
  uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj;
  /* zero base descriptor */
  memset(base, 0 , sizeof(*base));
  base->free_cb = ucs_mpool_put;
}

void uct_ugni_base_desc_key_init(uct_iface_h iface, void *obj, uct_mem_h memh)
{
  uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *)obj;
  /* call base initialization */
  uct_ugni_base_desc_init(NULL, obj, NULL);
  /* set local keys */
  base->desc.local_mem_hndl = *(gni_mem_handle_t *)memh;
}

unsigned uct_ugni_progress(void *arg)
{
    gni_cq_entry_t  event_data = 0;
    gni_post_descriptor_t *event_post_desc_ptr;
    uct_ugni_base_desc_t *desc;
    uct_ugni_iface_t * iface = (uct_ugni_iface_t *)arg;
    gni_return_t ugni_rc;
    unsigned count = 0;

    while (1) {
        uct_ugni_cdm_lock(&iface->cdm);
        ugni_rc = GNI_CqGetEvent(iface->local_cq, &event_data);
        if (GNI_RC_NOT_DONE == ugni_rc) {
            uct_ugni_cdm_unlock(&iface->cdm);
            break;
        }
        if ((GNI_RC_SUCCESS != ugni_rc && !event_data) || GNI_CQ_OVERRUN(event_data)) {
            uct_ugni_cdm_unlock(&iface->cdm);
            ucs_error("GNI_CqGetEvent falied. Error status %s %d ",
                      gni_err_str[ugni_rc], ugni_rc);
            return count;
        }

        ugni_rc = GNI_GetCompleted(iface->local_cq, event_data, &event_post_desc_ptr);
        uct_ugni_cdm_unlock(&iface->cdm);
        if (GNI_RC_SUCCESS != ugni_rc && GNI_RC_TRANSACTION_ERROR != ugni_rc) {
            ucs_error("GNI_GetCompleted falied. Error status %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return count;
        }

        desc = (uct_ugni_base_desc_t *)event_post_desc_ptr;
        ucs_trace_async("Completion received on %p", desc);
        if (NULL != desc->comp_cb) {
            uct_invoke_completion(desc->comp_cb, UCS_OK);
        }
        desc->free_cb(desc);
        iface->outstanding--;
        uct_ugni_check_flush(desc->flush_group);
        ++count;
    }
    /* have a go a processing the pending queue */
    ucs_arbiter_dispatch(&iface->arbiter, 1, uct_ugni_ep_process_pending, NULL);
    return count;
}

static UCS_CLASS_CLEANUP_FUNC(uct_ugni_rdma_iface_t)
{
    uct_worker_progress_remove(self->super.super.worker, &self->super.super.prog);
    ucs_mpool_cleanup(&self->free_desc_get_buffer, 1);
    ucs_mpool_cleanup(&self->free_desc_get, 1);
    ucs_mpool_cleanup(&self->free_desc_famo, 1);
    ucs_mpool_cleanup(&self->free_desc_buffer, 1);
    ucs_mpool_cleanup(&self->free_desc, 1);
}

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ugni_rdma_iface_t, uct_iface_t);

static uct_iface_ops_t uct_ugni_aries_rdma_iface_ops = {
    .ep_put_short             = uct_ugni_ep_put_short,
    .ep_put_bcopy             = uct_ugni_ep_put_bcopy,
    .ep_put_zcopy             = uct_ugni_ep_put_zcopy,
    .ep_get_bcopy             = uct_ugni_ep_get_bcopy,
    .ep_get_zcopy             = uct_ugni_ep_get_zcopy,
    .ep_am_short              = uct_ugni_ep_am_short,
    .ep_atomic_cswap64        = uct_ugni_ep_atomic_cswap64,
    .ep_atomic_cswap32        = uct_ugni_ep_atomic_cswap32,
    .ep_atomic64_post         = uct_ugni_ep_atomic64_post,
    .ep_atomic32_post         = uct_ugni_ep_atomic32_post,
    .ep_atomic64_fetch        = uct_ugni_ep_atomic64_fetch,
    .ep_atomic32_fetch        = uct_ugni_ep_atomic32_fetch,
    .ep_pending_add           = uct_ugni_ep_pending_add,
    .ep_pending_purge         = uct_ugni_ep_pending_purge,
    .ep_flush                 = uct_ugni_ep_flush,
    .ep_fence                 = uct_base_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_ugni_rdma_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_ep_t),
    .iface_flush              = uct_ugni_iface_flush,
    .iface_fence              = uct_base_iface_fence,
    .iface_progress_enable    = ucs_empty_function,
    .iface_progress_disable   = ucs_empty_function,
    .iface_progress           = (void*)uct_ugni_progress,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_rdma_iface_t),
    .iface_query              = uct_ugni_rdma_iface_query,
    .iface_get_device_address = uct_ugni_iface_get_dev_address,
    .iface_get_address        = uct_ugni_iface_get_address,
    .iface_is_reachable       = uct_ugni_iface_is_reachable
};

static uct_iface_ops_t uct_ugni_gemini_rdma_iface_ops = {
    .ep_put_short             = uct_ugni_ep_put_short,
    .ep_put_bcopy             = uct_ugni_ep_put_bcopy,
    .ep_put_zcopy             = uct_ugni_ep_put_zcopy,
    .ep_get_bcopy             = uct_ugni_ep_get_bcopy,
    .ep_get_zcopy             = uct_ugni_ep_get_zcopy,
    .ep_am_short              = uct_ugni_ep_am_short,
    .ep_atomic_cswap64        = uct_ugni_ep_atomic_cswap64,
    .ep_pending_add           = uct_ugni_ep_pending_add,
    .ep_pending_purge         = uct_ugni_ep_pending_purge,
    .ep_flush                 = uct_ugni_ep_flush,
    .ep_fence                 = uct_base_ep_fence,
    .ep_create                = UCS_CLASS_NEW_FUNC_NAME(uct_ugni_rdma_ep_t),
    .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_ep_t),
    .iface_flush              = uct_ugni_iface_flush,
    .iface_fence              = uct_base_iface_fence,
    .iface_progress_enable    = ucs_empty_function,
    .iface_progress_disable   = ucs_empty_function,
    .iface_progress           = (void*)uct_ugni_progress,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ugni_rdma_iface_t),
    .iface_query              = uct_ugni_rdma_iface_query,
    .iface_get_device_address = uct_ugni_iface_get_dev_address,
    .iface_get_address        = uct_ugni_iface_get_address,
    .iface_is_reachable       = uct_ugni_iface_is_reachable
};

static ucs_mpool_ops_t uct_ugni_rdma_desc_mpool_ops = {
    .chunk_alloc   = ucs_mpool_hugetlb_malloc,
    .chunk_release = ucs_mpool_hugetlb_free,
    .obj_init      = uct_ugni_base_desc_init,
    .obj_cleanup   = NULL
};

static uct_iface_ops_t *uct_ugni_rdma_choose_ops_by_device(uct_ugni_device_t *dev)
{
    switch(dev->type) {
    case GNI_DEVICE_GEMINI:
        return &uct_ugni_gemini_rdma_iface_ops;
    case GNI_DEVICE_ARIES:
        return &uct_ugni_aries_rdma_iface_ops;
    default:
        ucs_error("Unexpected device found in uct_ugni_rdma_choose_ops_by_device."
                  "unexpected device type %s", dev->type_name);
        return NULL;
    }
}

static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_rdma_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_rdma_iface_config_t);
    ucs_status_t status;
    uct_ugni_device_t *dev = uct_ugni_device_by_name(params->mode.device.dev_name);
    uct_iface_ops_t *ops;

    ops = uct_ugni_rdma_choose_ops_by_device(dev);
    if (NULL == ops) {
        status = UCS_ERR_NO_DEVICE;
        goto exit;
    }
    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, params, ops,
                              &config->super UCS_STATS_ARG(NULL));
    /* Setting initial configuration */
    self->config.fma_seg_size  = UCT_UGNI_MAX_FMA;
    self->config.rdma_max_size = UCT_UGNI_MAX_RDMA;

    status = ucs_mpool_init(&self->free_desc,
                            0,
                            sizeof(uct_ugni_base_desc_t),
                            0,                            /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128,                          /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_desc_get,
                            0,
                            sizeof(uct_ugni_rdma_fetch_desc_t),
                            0,                            /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128 ,                         /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-GET-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc;
    }

    status = ucs_mpool_init(&self->free_desc_buffer,
                            0,
                            sizeof(uct_ugni_base_desc_t) + self->config.fma_seg_size,
                            sizeof(uct_ugni_base_desc_t), /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128 ,                         /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops,
                            "UGNI-DESC-BUFFER");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc_get;
    }

    status = uct_iface_mpool_init(&self->super.super,
                                  &self->free_desc_famo,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) + 8,
                                  sizeof(uct_ugni_rdma_fetch_desc_t),/* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                                  &config->mpool,               /* mpool config */
                                  128 ,                         /* grow */
                                  uct_ugni_base_desc_key_init,  /* memory/key init */
                                  "UGNI-DESC-FAMO");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_buffer;
    }

    status = uct_iface_mpool_init(&self->super.super,
                                  &self->free_desc_get_buffer,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) +
                                  self->config.fma_seg_size,
                                  sizeof(uct_ugni_rdma_fetch_desc_t), /* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                                  &config->mpool,               /* mpool config */
                                  128 ,                         /* grow */
                                  uct_ugni_base_desc_key_init,  /* memory/key init */
                                  "UGNI-DESC-GET");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_famo;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * rdma layer so each ugni layer will have own progress */
    uct_worker_progress_add_safe(self->super.super.worker, uct_ugni_progress, self,
                                 &self->super.super.prog);
    return UCS_OK;

clean_famo:
    ucs_mpool_cleanup(&self->free_desc_famo, 1);
clean_buffer:
    ucs_mpool_cleanup(&self->free_desc_buffer, 1);
clean_desc_get:
    ucs_mpool_cleanup(&self->free_desc_get, 1);
clean_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
exit:
    uct_ugni_cleanup_base_iface(&self->super);
    ucs_error("Failed to activate interface");
    return status;
}

UCS_CLASS_DEFINE(uct_ugni_rdma_iface_t, uct_ugni_iface_t);
UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_rdma_iface_t, uct_iface_t, uct_md_h,
                          uct_worker_h, const uct_iface_params_t*,
                          const uct_iface_config_t*);

UCT_TL_DEFINE(&uct_ugni_component, ugni_rdma, uct_ugni_query_devices,
              uct_ugni_rdma_iface_t, "UGNI_RDMA_",
              uct_ugni_rdma_iface_config_table, uct_ugni_rdma_iface_config_t);