Blob Blame History Raw
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/**
 * Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED.
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
 * Copyright (c) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED.
 * Copyright (c) Triad National Security, LLC. 2018. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/arch/atomic.h>
#include <ucs/sys/string.h>
#include <pmi.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/**
 * @brief Static information about UGNI job
 *
 * This is static information about Cray's job.
 * The information is static and does not change since job launch.
 * Therefore, the information is only fetched once.
 */
typedef struct uct_ugni_job_info {
    uint8_t             ptag;                           /**< Protection tag, read from PMI_GNI_PTAG */
    uint32_t            cookie;                         /**< Unique identifier generated by the PMI system, read from PMI_GNI_COOKIE */
    int                 num_devices;                    /**< Number of devices; -1 until the device list has been initialized */
    uct_ugni_device_t   devices[UCT_UGNI_MAX_DEVICES];  /**< Array of devices */
    int                 initialized;                    /**< Non-zero once ptag and cookie have been fetched */
} uct_ugni_job_info_t;

/* Single per-process instance of the job information. num_devices starts at
 * -1, which init_device_list() treats as "device list not initialized yet";
 * every other field starts zeroed. */
static uct_ugni_job_info_t job_info = {
    .num_devices        = -1,
};

/* Counter folded into every communication-domain identifier created by
 * uct_ugni_create_cdm(), which advances it with ucs_atomic_fadd32(). */
uint32_t ugni_domain_counter = 0;

/**
 * Describe a UGNI device as a transport-layer device resource.
 *
 * @param [in]  dev        Device whose full name (fname) is exported.
 * @param [out] tl_device  Resource descriptor to fill: receives the device
 *                         name and is marked as a network device.
 */
void uct_ugni_device_get_resource(uct_ugni_device_t *dev,
                                  uct_tl_device_resource_t *tl_device)
{
    tl_device->type = UCT_DEVICE_TYPE_NET;
    ucs_snprintf_zero(tl_device->name, sizeof(tl_device->name), "%s",
                      dev->fname);
}

/**
 * Build the list of transport-layer device resources for all known UGNI
 * devices.
 *
 * @param [in]  md                Memory domain (not used by this function).
 * @param [out] tl_devices_p      Receives a newly allocated array of device
 *                                resources, or NULL when there are none or on
 *                                failure.
 * @param [out] num_tl_devices_p  Receives the number of entries in the array
 *                                (0 when there are none or on failure).
 *
 * @return UCS_OK on success (including the "no devices" case),
 *         UCS_ERR_NO_DEVICE when the device list was never initialized,
 *         UCS_ERR_NO_MEMORY when the array cannot be allocated.
 */
ucs_status_t uct_ugni_query_devices(uct_md_h md,
                                    uct_tl_device_resource_t **tl_devices_p,
                                    unsigned *num_tl_devices_p)
{
    uct_tl_device_resource_t *resources;
    int num_devices = job_info.num_devices;
    int i;

    /* Outputs are always defined, whatever path is taken below */
    *tl_devices_p     = NULL;
    *num_tl_devices_p = 0;

    if (num_devices < 0) {
        /* -1 marks a device list that was never initialized; do not let the
         * negative count reach the allocator as a huge unsigned size */
        ucs_error("UGNI device list is not initialized");
        return UCS_ERR_NO_DEVICE;
    }

    if (0 == num_devices) {
        /* An empty list is not an error; avoid relying on what a zero-sized
         * allocation returns */
        return UCS_OK;
    }

    resources = ucs_calloc(num_devices, sizeof(*resources), "resource desc");
    if (NULL == resources) {
        ucs_error("Failed to allocate memory");
        return UCS_ERR_NO_MEMORY;
    }

    for (i = 0; i < num_devices; i++) {
        uct_ugni_device_get_resource(&job_info.devices[i], &resources[i]);
    }

    *tl_devices_p     = resources;
    *num_tl_devices_p = (unsigned)num_devices;

    return UCS_OK;
}

/**
 * Read the job cookie from the PMI_GNI_COOKIE environment variable.
 *
 * The variable is a ':'-separated list; the first non-empty entry is used.
 * The environment string is parsed in place and never modified.
 *
 * @param [out] cookie  Receives the parsed cookie on success.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when the variable is missing,
 *         holds no entry, or the entry is not a number that fits 32 bits.
 */
static ucs_status_t get_cookie(uint32_t *cookie)
{
    const char *cookie_str;
    char       *end;
    long long   value;

    cookie_str = getenv("PMI_GNI_COOKIE");
    if (NULL == cookie_str) {
        ucs_error("getenv PMI_GNI_COOKIE failed");
        return UCS_ERR_IO_ERROR;
    }

    /* Skip leading separators; an all-separator or empty value has no token */
    cookie_str += strspn(cookie_str, ":");
    if ('\0' == *cookie_str) {
        ucs_error("Failed to read PMI_GNI_COOKIE token");
        return UCS_ERR_IO_ERROR;
    }

    /* Parsing stops at the next ':' by itself, so the string does not need to
     * be split. Negative spellings that fit a signed 32-bit value are still
     * accepted and wrap to the same unsigned value as before. */
    errno = 0;
    value = strtoll(cookie_str, &end, 10);
    if ((end == cookie_str) || (ERANGE == errno) ||
        (value < INT32_MIN) || (value > UINT32_MAX)) {
        ucs_error("Invalid PMI_GNI_COOKIE value: %s", cookie_str);
        return UCS_ERR_IO_ERROR;
    }

    *cookie = (uint32_t)value;
    return UCS_OK;
}

/**
 * Read the protection tag from the PMI_GNI_PTAG environment variable.
 *
 * The variable is a ':'-separated list; the first non-empty entry is used.
 * The environment string is parsed in place and never modified.
 *
 * @param [out] ptag  Receives the parsed protection tag on success.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when the variable is missing,
 *         holds no entry, or the entry is not a number in the 8-bit range.
 */
static ucs_status_t get_ptag(uint8_t *ptag)
{
    const char *ptag_str;
    char       *end;
    long        value;

    ptag_str = getenv("PMI_GNI_PTAG");
    if (NULL == ptag_str) {
        ucs_error("getenv PMI_GNI_PTAG failed");
        return UCS_ERR_IO_ERROR;
    }

    /* Skip leading separators; an all-separator or empty value has no token */
    ptag_str += strspn(ptag_str, ":");
    if ('\0' == *ptag_str) {
        ucs_error("Failed to read PMI_GNI_PTAG token");
        return UCS_ERR_IO_ERROR;
    }

    /* Parsing stops at the next ':' by itself. Reject values that would be
     * silently truncated when stored in the 8-bit tag. */
    errno = 0;
    value = strtol(ptag_str, &end, 10);
    if ((end == ptag_str) || (ERANGE == errno) ||
        (value < 0) || (value > UINT8_MAX)) {
        ucs_error("Invalid PMI_GNI_PTAG value: %s", ptag_str);
        return UCS_ERR_IO_ERROR;
    }

    *ptag = (uint8_t)value;
    return UCS_OK;
}

/**
 * Fetch the protection tag and cookie of the job into job_info.
 *
 * The work is done once: after a successful call job_info.initialized is set
 * and later calls return immediately. When PMI_GNI_COOKIE is not present in
 * the environment, PMI_Init() is called before the values are read.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when PMI_Init() fails, or the
 *         status reported by get_ptag() / get_cookie().
 */
static ucs_status_t uct_ugni_fetch_pmi(void)
{
    ucs_status_t status;
    int spawned = 0;
    int rc;

    if (job_info.initialized) {
        return UCS_OK;
    }

    if (NULL == getenv("PMI_GNI_COOKIE")) {
        /* Fetch information from Cray's PMI if needed */
        rc = PMI_Init(&spawned);
        if (PMI_SUCCESS != rc) {
            ucs_error("PMI_Init failed, Error status: %d", rc);
            return UCS_ERR_IO_ERROR;
        }
        ucs_debug("PMI spawned %d", spawned);
    }

    /* UCS statuses are kept in their own variable rather than sharing the
     * PMI return-code integer */
    status = get_ptag(&job_info.ptag);
    if (UCS_OK != status) {
        ucs_error("get_ptag failed, Error status: %d", (int)status);
        return status;
    }
    ucs_debug("PMI ptag %d", job_info.ptag);

    status = get_cookie(&job_info.cookie);
    if (UCS_OK != status) {
        ucs_error("get_cookie failed, Error status: %d", (int)status);
        return status;
    }
    /* The cookie is unsigned 32-bit: print it as such so that large values
     * are not shown as negative numbers */
    ucs_debug("PMI cookie %u", (unsigned)job_info.cookie);

    /* Context and domain is activated */
    job_info.initialized = 1;
    ucs_debug("UGNI job info was activated");
    return UCS_OK;
}

/**
 * Return the process-wide job information, fetching it on first use.
 *
 * @return Pointer to the static job information, or NULL when the PMI data
 *         could not be fetched.
 */
static uct_ugni_job_info_t *uct_ugni_get_job_info()
{
    if (uct_ugni_fetch_pmi() == UCS_OK) {
        return &job_info;
    }

    ucs_error("Could not fetch PMI info.");
    return NULL;
}

/**
 * Discover the local UGNI devices and populate the job information with them.
 *
 * The discovered count is committed to the job information only when the
 * whole discovery succeeds (or when there are no devices at all). After a
 * failure the list stays marked as not initialized, so no caller can observe
 * a device count that does not match the populated entries.
 *
 * @return UCS_OK when the list is (or already was) initialized, including the
 *         case of zero devices; UCS_ERR_IO_ERROR when the PMI information is
 *         unavailable; UCS_ERR_NO_DEVICE when a GNI query or a device
 *         initialization fails; UCS_ERR_UNSUPPORTED when more devices are
 *         found than the device array can hold; UCS_ERR_NO_MEMORY on
 *         allocation failure.
 */
ucs_status_t init_device_list()
{
    ucs_status_t status;
    int i, num_active_devices;
    int num_devices = 0;
    int *dev_ids;
    gni_return_t ugni_rc;
    uct_ugni_job_info_t *inf;

    inf = uct_ugni_get_job_info();
    if (NULL == inf) {
        ucs_error("Unable to get Cray PMI info");
        return UCS_ERR_IO_ERROR;
    }

    /* check if devices were already initialized */
    if (-1 != inf->num_devices) {
        ucs_debug("The device list is already initialized");
        return UCS_OK;
    }

    /* Query into a local so that a failure below cannot leave a stale count
     * behind in the shared job information */
    ugni_rc = GNI_GetNumLocalDevices(&num_devices);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_GetNumLocalDevices failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    if (num_devices < 0) {
        ucs_error("GNI_GetNumLocalDevices returned an invalid device count (%d)",
                  num_devices);
        return UCS_ERR_NO_DEVICE;
    }

    if (0 == num_devices) {
        ucs_debug("UGNI No device found");
        inf->num_devices = 0;
        return UCS_OK;
    }

    /* The device array holds exactly UCT_UGNI_MAX_DEVICES entries, so only a
     * count strictly above it is unsupported */
    if (num_devices > UCT_UGNI_MAX_DEVICES) {
        ucs_error("UGNI, number of discovered devices (%d) " \
                  "is above the maximum supported devices (%d)",
                  num_devices, UCT_UGNI_MAX_DEVICES);
        return UCS_ERR_UNSUPPORTED;
    }

    dev_ids = ucs_calloc(num_devices, sizeof(*dev_ids), "ugni device ids");
    if (NULL == dev_ids) {
        ucs_error("Failed to allocate memory");
        return UCS_ERR_NO_MEMORY;
    }

    ugni_rc = GNI_GetLocalDeviceIds(num_devices, dev_ids);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_GetLocalDeviceIds failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        status = UCS_ERR_NO_DEVICE;
        goto out_free_ids;
    }

    num_active_devices = 0;
    for (i = 0; i < num_devices; i++) {
        status = uct_ugni_device_create(dev_ids[i], num_active_devices,
                                        &inf->devices[i]);
        if (status != UCS_OK) {
            ucs_warn("Failed to initialize ugni device %d (%s), ignoring it",
                     i, ucs_status_string(status));
        } else {
            ++num_active_devices;
        }
    }

    if (num_active_devices != num_devices) {
        ucs_warn("Error in detection devices");
        status = UCS_ERR_NO_DEVICE;
        goto out_free_ids;
    }

    /* Everything succeeded: publish the count */
    inf->num_devices = num_devices;
    status           = UCS_OK;
    ucs_debug("Initialized UGNI component with %d devices", inf->num_devices);

out_free_ids:
    ucs_free(dev_ids);
    return status;
}

/**
 * Look up a UGNI device by its full name.
 *
 * @param [in] dev_name  Name to search for; compared against the fname of
 *                       every known device.
 *
 * @return Pointer to the matching device inside the job information, or NULL
 *         when dev_name is NULL or no device carries that name.
 */
uct_ugni_device_t *uct_ugni_device_by_name(const char *dev_name)
{
    uct_ugni_device_t *dev;
    int dev_index;

    if (NULL == dev_name) {
        ucs_error("Bad parameter. Device name is set to NULL");
        return NULL;
    }

    /* The index is signed like the device count: with an unsigned index a
     * count of -1 (list not initialized) would be compared as a huge value
     * and the loop would walk past the end of the device array */
    for (dev_index = 0; dev_index < job_info.num_devices; ++dev_index) {
        dev = &job_info.devices[dev_index];
        if (0 == strcmp(dev_name, dev->fname)) {
            ucs_debug("Device found: %s", dev_name);
            return dev;
        }
    }

    /* Device not found */
    ucs_error("Cannot find: %s", dev_name);
    return NULL;
}

/**
 * Locate the next non-empty token of a ':'-separated list.
 *
 * @param [in] str  Position inside the list to start scanning from.
 *
 * @return Pointer to the first character of the next token, or NULL when only
 *         separators (or nothing) remain. The list itself is never modified.
 */
static const char *uct_ugni_pmi_next_token(const char *str)
{
    str += strspn(str, ":");
    return ('\0' == *str) ? NULL : str;
}

/**
 * Determine the NIC address of a device.
 *
 * When PMI_GNI_DEV_ID is not set, the address and CPU id are queried with
 * GNI_CdmGetNicAddress() and the CPU id is added to the device CPU mask.
 * Otherwise the device id is looked up in the ':'-separated PMI_GNI_DEV_ID
 * list and the address is taken from the entry of PMI_GNI_LOC_ADDR whose
 * position equals that id. Both environment strings are scanned in place and
 * left unmodified, so later lookups for other devices see the full lists.
 *
 * @param [in,out] dev_p  Device whose device_id is used for the lookup and
 *                        whose address is filled in.
 *
 * @return UCS_OK on success, UCS_ERR_NO_DEVICE when the GNI query fails or
 *         the device cannot be resolved from the PMI environment.
 */
static ucs_status_t get_nic_address(uct_ugni_device_t *dev_p)
{
    const char   *pmi_env;
    const char   *token;
    gni_return_t  ugni_rc;
    int           alps_dev_id = -1;
    int           alps_addr   = -1;
    int           found;
    int           i;

    pmi_env = getenv("PMI_GNI_DEV_ID");
    if (NULL == pmi_env) {
        ugni_rc = GNI_CdmGetNicAddress(dev_p->device_id, &dev_p->address,
                                       &dev_p->cpu_id);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmGetNicAddress failed, device %d, Error status: %s %d",
                      dev_p->device_id, gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_NO_DEVICE;
        }
        CPU_SET(dev_p->cpu_id, &(dev_p->cpu_mask));
        ucs_debug("(GNI) NIC address: %d", dev_p->address);
        return UCS_OK;
    }

    /* Find this device's id in the list. An explicit flag is needed: the last
     * parsed id alone cannot tell "matched" from "ran out of entries". */
    found = 0;
    for (token = uct_ugni_pmi_next_token(pmi_env); NULL != token;
         token = uct_ugni_pmi_next_token(token + strcspn(token, ":"))) {
        /* Number parsing stops at the ':' that ends the token */
        alps_dev_id = (int)strtol(token, NULL, 10);
        if (alps_dev_id == dev_p->device_id) {
            found = 1;
            break;
        }
    }
    if (!found) {
        ucs_error("Device %d is not listed in PMI_GNI_DEV_ID",
                  dev_p->device_id);
        return UCS_ERR_NO_DEVICE;
    }

    pmi_env = getenv("PMI_GNI_LOC_ADDR");
    if (NULL == pmi_env) {
        ucs_error("getenv PMI_GNI_LOC_ADDR failed");
        return UCS_ERR_NO_DEVICE;
    }

    /* Pick the address entry whose position equals the device id */
    found = 0;
    i     = 0;
    for (token = uct_ugni_pmi_next_token(pmi_env); NULL != token;
         token = uct_ugni_pmi_next_token(token + strcspn(token, ":")), ++i) {
        if (i == alps_dev_id) {
            alps_addr = (int)strtol(token, NULL, 10);
            found     = 1;
            break;
        }
    }
    if (!found) {
        ucs_error("PMI_GNI_LOC_ADDR has no entry for device %d", alps_dev_id);
        return UCS_ERR_NO_DEVICE;
    }

    dev_p->address = alps_addr;
    ucs_debug("(PMI) NIC address: %d", dev_p->address);
    return UCS_OK;
}

/**
 * Initialize a UGNI device descriptor.
 *
 * Stores the device id, resolves the NIC address, queries the device type and
 * derives the type name and the full name ("<type name>:<index>").
 *
 * @param [in]  dev_id  GNI device id.
 * @param [in]  index   Index appended to the type name to form the full name.
 * @param [out] dev_p   Device descriptor to fill.
 *
 * @return UCS_OK on success, the status of get_nic_address() when the address
 *         cannot be resolved, UCS_ERR_NO_DEVICE when the type query fails.
 */
ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *dev_p)
{
    const char *type_str;
    gni_return_t gni_status;
    ucs_status_t status;

    dev_p->device_id = (uint32_t)dev_id;

    status = get_nic_address(dev_p);
    if (status != UCS_OK) {
        ucs_error("Failed to get NIC address");
        return status;
    }

    gni_status = GNI_GetDeviceType(&dev_p->type);
    if (gni_status != GNI_RC_SUCCESS) {
        ucs_error("GNI_GetDeviceType failed, device %d, Error status: %s %d",
                  dev_id, gni_err_str[gni_status], gni_status);
        return UCS_ERR_NO_DEVICE;
    }

    /* Map the device type to its printable name */
    if (dev_p->type == GNI_DEVICE_GEMINI) {
        type_str = "GEMINI";
    } else if (dev_p->type == GNI_DEVICE_ARIES) {
        type_str = "ARIES";
    } else {
        type_str = "UNKNOWN";
    }

    ucs_snprintf_zero(dev_p->type_name, sizeof(dev_p->type_name), "%s",
                      type_str);
    ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d",
                      dev_p->type_name, index);

    return UCS_OK;
}

/**
 * Release a UGNI device descriptor.
 *
 * Currently a no-op: nothing is done with the device.
 *
 * @param [in] dev  Device to release (unused).
 */
void uct_ugni_device_destroy(uct_ugni_device_t *dev)
{
    (void)dev;
}

/**
 * Export the device address of an interface.
 *
 * @param [in]  tl_iface  Interface, expected to be a uct_ugni_iface_t.
 * @param [out] addr      Buffer interpreted as uct_devaddr_ugni_t; its
 *                        nic_addr is set to the address of the interface's
 *                        device.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr)
{
    uct_ugni_iface_t *ugni_iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
    uct_ugni_device_t *device    = uct_ugni_iface_device(ugni_iface);
    uct_devaddr_ugni_t *out_addr = (uct_devaddr_ugni_t *)addr;

    out_addr->nic_addr = device->address;
    return UCS_OK;
}

/**
 * Return the exponent of the smallest power of two that is not below a value.
 *
 * The value is examined as a 32-bit pattern. If it is exactly a power of two
 * the position of its single set bit is returned; otherwise the position of
 * its highest set bit plus one. Examples: 1 -> 0, 2 -> 1, 3 -> 2, 32768 -> 15.
 *
 * @param [in] value  Value to examine.
 *
 * @return The exponent described above, or 0 when no bit is set.
 */
static int uct_ugni_next_power_of_two_inclusive (int value)
{
    /* Work on an unsigned copy: shifting into the sign bit of a signed int
     * is undefined behaviour */
    uint32_t bits = (uint32_t)value;
    int bit;

    for (bit = 31 ; bit >= 0 ; --bit) {
        uint32_t mask = UINT32_C(1) << bit;

        if (bits & mask) {
            /* highest set bit found: exact power of two or round up */
            return (bits == mask) ? bit : bit + 1;
        }
    }

    return 0;
}

/**
 * Create a communication domain (CDM), attach it to a device and initialize
 * its lock.
 *
 * The domain identifier is built from the process id shifted into the bits
 * left free by the system's pid range, plus a per-process counter, with the
 * highest bit always set.
 *
 * On any failure after the CDM was created it is destroyed again, so the
 * caller never has to clean up a partially constructed domain.
 *
 * @param [out] cdm          Domain descriptor to fill.
 * @param [in]  device       Device to attach the domain to.
 * @param [in]  thread_mode  Thread mode recorded in the domain.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when the job information is
 *         unavailable, UCS_ERR_NO_DEVICE when creating or attaching the
 *         domain fails, or the status of uct_ugni_cdm_init_lock().
 */
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode)
{
    const int default_pid_max = 32768;
    uct_ugni_job_info_t *info;
    int modes;
    gni_return_t ugni_rc;
    ucs_status_t status;
    int pid_max = default_pid_max, free_bits;
    uint32_t pid, counter;
    FILE *fh;

    info = uct_ugni_get_job_info();
    if (NULL == info) {
        return UCS_ERR_IO_ERROR;
    }

    fh = fopen ("/proc/sys/kernel/pid_max", "r");
    if (NULL != fh) {
        /* Fall back to the default when the file cannot be parsed or holds a
         * value that would make the shift below meaningless */
        if ((1 != fscanf (fh, "%d", &pid_max)) || (pid_max < 2)) {
            pid_max = default_pid_max;
        }
        fclose (fh);
    }

    /* determine how many free bits we have in the PID space (10 (64-bit) or more (32-bit)) */
    free_bits = 31 - (uct_ugni_next_power_of_two_inclusive (pid_max) - 1);

    cdm->thread_mode = thread_mode;
    cdm->dev = device;

    /* The pid is shifted as an unsigned value: the shifted pid may reach the
     * top bit, which is an overflow for a signed int */
    pid     = (uint32_t)getpid ();
    counter = ucs_atomic_fadd32(&ugni_domain_counter, 1);

    /* don't collide with the btl/ugni CDM space if used in the same process. this is done by setting the
     * highest bit in the CDM identifier */
    cdm->domain_id = 0x80000000ul | ((pid << free_bits) + counter);
    /* Log the counter value that went into the id, not the shared counter,
     * which has already moved on */
    ucs_debug("Creating new command domain with id 0x%08x (0x80000000ul | ((%d << %d) + %u))",
              cdm->domain_id, (int)pid, free_bits, (unsigned)counter);
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
        GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL | GNI_CDM_MODE_FMA_SHARED;
    ugni_rc = GNI_CdmCreate(cdm->domain_id, info->ptag, info->cookie,
                            modes, &cdm->cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
                            &cdm->address, &cdm->nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed, Error status: %s\n"
                  "Created domain 0x%08x",
                  gni_err_str[ugni_rc], cdm->domain_id);
        /* Destroy the handle directly: uct_ugni_destroy_cdm() would also
         * destroy the lock, which has not been initialized yet */
        GNI_CdmDestroy(cdm->cdm_handle);
        return UCS_ERR_NO_DEVICE;
    }

    status = uct_ugni_cdm_init_lock(cdm);
    if (UCS_OK != status) {
        ucs_error("Couldn't initalize CDM lock.");
        /* Do not leak the domain that was just created */
        GNI_CdmDestroy(cdm->cdm_handle);
        return status;
    }

    ucs_debug("Made ugni cdm. nic_addr = %i domain_id = 0x%08x", device->address, cdm->domain_id);
    return UCS_OK;
}

/**
 * Create the communication domain used by the memory domain.
 *
 * The domain is attached to the first device of the job information and uses
 * the multi-threaded mode.
 *
 * @param [out] cdm  Domain descriptor to fill.
 *
 * @return Status of uct_ugni_create_cdm().
 */
ucs_status_t uct_ugni_create_md_cdm(uct_ugni_cdm_t *cdm)
{
    uct_ugni_device_t *first_device = &job_info.devices[0];

    return uct_ugni_create_cdm(cdm, first_device, UCS_THREAD_MODE_MULTI);
}

/**
 * Destroy a communication domain.
 *
 * The domain lock is destroyed first, then the GNI handle.
 *
 * @param [in] cdm  Domain to destroy.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when GNI_CdmDestroy() fails.
 */
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
    gni_return_t rc;

    uct_ugni_cdm_destroy_lock(cdm);

    ucs_trace_func("cdm=%p", cdm);
    rc = GNI_CdmDestroy(cdm->cdm_handle);
    if (rc == GNI_RC_SUCCESS) {
        return UCS_OK;
    }

    ucs_error("GNI_CdmDestroy error status: %s (%d)",
             gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}

/**
 * Create a non-blocking completion queue on the NIC of a domain.
 *
 * NOTE(review): cq_size is accepted but not used; UCT_UGNI_LOCAL_CQ is what
 * gets passed to GNI_CqCreate() - confirm this is intended.
 *
 * @param [out] cq       Receives the completion queue handle.
 * @param [in]  cq_size  Requested size (currently unused).
 * @param [in]  cdm      Domain whose NIC handle the queue is created on.
 *
 * @return UCS_OK on success, UCS_ERR_NO_DEVICE when GNI_CqCreate() fails.
 */
ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm)
{
    gni_return_t rc = GNI_CqCreate(cdm->nic_handle, UCT_UGNI_LOCAL_CQ, 0,
                                   GNI_CQ_NOBLOCK, NULL, NULL, cq);

    if (rc == GNI_RC_SUCCESS) {
        return UCS_OK;
    }

    ucs_error("GNI_CqCreate failed, Error status: %s %d",
              gni_err_str[rc], rc);
    return UCS_ERR_NO_DEVICE;
}

/**
 * Destroy a completion queue.
 *
 * @param [in] cq   Completion queue handle to destroy.
 * @param [in] cdm  Domain the queue belongs to (not used by this function).
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when GNI_CqDestroy() fails
 *         (reported as a warning).
 */
ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm)
{
    gni_return_t rc = GNI_CqDestroy(cq);

    if (rc == GNI_RC_SUCCESS) {
        return UCS_OK;
    }

    ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
             gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}