|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
This file is provided under a dual BSD/GPLv2 license. When using or
|
|
Packit |
961e70 |
redistributing this file, you may do so under either license.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
GPL LICENSE SUMMARY
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
Copyright(c) 2015 Intel Corporation.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
This program is free software; you can redistribute it and/or modify
|
|
Packit |
961e70 |
it under the terms of version 2 of the GNU General Public License as
|
|
Packit |
961e70 |
published by the Free Software Foundation.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
This program is distributed in the hope that it will be useful, but
|
|
Packit |
961e70 |
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
961e70 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Packit |
961e70 |
General Public License for more details.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
Contact Information:
|
|
Packit |
961e70 |
Intel Corporation, www.intel.com
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
BSD LICENSE
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
Copyright(c) 2015 Intel Corporation.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
Redistribution and use in source and binary forms, with or without
|
|
Packit |
961e70 |
modification, are permitted provided that the following conditions
|
|
Packit |
961e70 |
are met:
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
* Redistributions of source code must retain the above copyright
|
|
Packit |
961e70 |
notice, this list of conditions and the following disclaimer.
|
|
Packit |
961e70 |
* Redistributions in binary form must reproduce the above copyright
|
|
Packit |
961e70 |
notice, this list of conditions and the following disclaimer in
|
|
Packit |
961e70 |
the documentation and/or other materials provided with the
|
|
Packit |
961e70 |
distribution.
|
|
Packit |
961e70 |
* Neither the name of Intel Corporation nor the names of its
|
|
Packit |
961e70 |
contributors may be used to endorse or promote products derived
|
|
Packit |
961e70 |
from this software without specific prior written permission.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
Packit |
961e70 |
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
Packit |
961e70 |
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
Packit |
961e70 |
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
Packit |
961e70 |
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
Packit |
961e70 |
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
Packit |
961e70 |
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
Packit |
961e70 |
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
Packit |
961e70 |
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
Packit |
961e70 |
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
Packit |
961e70 |
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "psm_user.h"
#include "psm2_hal.h"
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static int psmi_get_hfi_selection_algorithm(void);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Turn interrupt-driven polling on or off for a context.
 *
 * context: context whose psm_hw_ctxt is handed to the HAL.
 * enable:  non-zero to enable, zero to disable.
 *
 * The current state is tracked by the PSM_HAL_PSMI_RUNTIME_INTR_ENABLED
 * software status flag.  When the request already matches that flag nothing
 * is done.  Otherwise the HAL is asked for PSMI_HAL_POLL_TYPE_URGENT polling
 * (enable) or poll type 0 (disable), and the flag is updated only if the HAL
 * call succeeds.
 *
 * Returns PSM2_OK on success or when no change was needed, and
 * PSM2_EP_NO_RESOURCES when the HAL rejects the new poll type.
 */
psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable)
{
	const int want_off = !enable;
	const int currently_off =
		!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);
	int poll_type = want_off ? 0 : PSMI_HAL_POLL_TYPE_URGENT;

	/* Requested state is already in effect. */
	if (want_off == currently_off)
		return PSM2_OK;

	if (psmi_hal_poll_type(poll_type, context->psm_hw_ctxt) != 0)
		return PSM2_EP_NO_RESOURCES;

	/* Record the new state only after the HAL accepted it. */
	if (want_off)
		psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);
	else
		psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);

	return PSM2_OK;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Report whether the PSM_HAL_PSMI_RUNTIME_INTR_ENABLED software status flag
 * is currently set.  The context argument is accepted for interface symmetry
 * but is not consulted.
 */
int psmi_context_interrupt_isenabled(psmi_context_t *context)
{
	int intr_enabled =
		psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);

	return intr_enabled;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Returns 1 when, for every active unit that reports a positive number of
 * contexts and a positive number of free contexts, the two numbers are equal;
 * returns 0 as soon as one such unit has fewer free contexts than contexts.
 * A result of 1 is taken as an indication that no jobs are currently running.
 *
 * Units that are inactive, or that report a non-positive count for either
 * value, are skipped.
 * NOTE(review): a unit reporting 0 free contexts is skipped as well, so a
 * fully occupied unit does not make this return 0 -- confirm that is intended.
 *
 * This check is inherently racy: several processes may perform it
 * concurrently, and the observation is made before the decision about which
 * context to assign, which in turn is made before the context is actually
 * assigned.  Once a context is assigned, the number of free contexts seen
 * here changes.
 */
static int psmi_all_active_units_have_max_freecontexts(int nunits)
{
	int unit;

	for (unit = 0; unit < nunits; unit++) {
		int free_ctxts, total_ctxts;

		if (psmi_hal_get_unit_active(unit) <= 0)
			continue;

		free_ctxts = psmi_hal_get_num_free_contexts(unit);
		total_ctxts = psmi_hal_get_num_contexts(unit);

		/* Only units with positive counts take part in the comparison. */
		if (free_ctxts <= 0 || total_ctxts <= 0)
			continue;

		if (free_ctxts != total_ctxts)
			return 0;
	}

	return 1;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Returns the non-negative integer value of an environment variable.
 *
 * env: name of the environment variable to read.
 *
 * The value is parsed as a base-10 integer; as with atoi(), leading
 * whitespace is skipped and parsing stops at the first non-digit, so "12abc"
 * yields 12.  0 is returned when the variable is unset or empty, when it does
 * not start with a number, when the number is negative, or when it does not
 * fit in an int.  (The previous atoi()-based code had undefined behavior for
 * out-of-range input.)
 *
 * errno is left as the caller had it.
 */
static int psmi_get_envvar(const char *env)
{
	const char *env_val = getenv(env);
	char *end = NULL;
	int saved_errno;
	long val;

	if (env_val == NULL || *env_val == '\0')
		return 0;

	/* strtol reports overflow through errno, so clear it first. */
	saved_errno = errno;
	errno = 0;
	val = strtol(env_val, &end, 10);

	if (end == env_val || errno == ERANGE || val < 0 || val > INT_MAX)
		val = 0;

	errno = saved_errno;
	return (int)val;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* returns the 8-bit hash value of an uuid. */
|
|
Packit |
961e70 |
static inline
|
|
Packit |
961e70 |
uint8_t
|
|
Packit |
961e70 |
psmi_get_uuid_hash(psm2_uuid_t const uuid)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
int i;
|
|
Packit |
961e70 |
uint8_t hashed_uuid = 0;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
for (i=0; i < sizeof(psm2_uuid_t); ++i)
|
|
Packit |
961e70 |
hashed_uuid ^= *((uint8_t const *)uuid + i);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return hashed_uuid;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Returns the NUMA node of the CPU the calling thread is running on, as
 * reported by sched_getcpu() and numa_node_of_cpu().
 *
 * Returns -EINVAL when either lookup yields a negative value.
 */
int psmi_get_current_proc_location()
{
	int cpu = sched_getcpu();
	int numa_node;

	if (cpu < 0)
		return -EINVAL;

	numa_node = numa_node_of_cpu(cpu);

	return (numa_node < 0) ? -EINVAL : numa_node;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static void
|
|
Packit |
961e70 |
psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start,
|
|
Packit |
961e70 |
long *unit_end, int nunits)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
/* if the number of ranks on the host is 1 and ... */
|
|
Packit |
961e70 |
if ((psmi_get_envvar("MPI_LOCALNRANKS") == 1) &&
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* All of the active units have free contexts equal the
|
|
Packit |
961e70 |
* number of contexts.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
psmi_all_active_units_have_max_freecontexts(nunits)) {
|
|
Packit |
961e70 |
/* we start looking at unit 0, and end at nunits-1: */
|
|
Packit |
961e70 |
*unit_start = 0;
|
|
Packit |
961e70 |
*unit_end = nunits - 1;
|
|
Packit |
961e70 |
} else {
|
|
Packit |
961e70 |
/* else, we are going to look at:
|
|
Packit |
961e70 |
(a hash of the job key plus the local rank id) mod nunits. */
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
|
|
Packit |
961e70 |
psmi_get_uuid_hash(job_key)) % nunits;
|
|
Packit |
961e70 |
if (*unit_start > 0)
|
|
Packit |
961e70 |
*unit_end = *unit_start - 1;
|
|
Packit |
961e70 |
else
|
|
Packit |
961e70 |
*unit_end = nunits-1;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static int
|
|
Packit |
961e70 |
psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
int shm_fd, ret;
|
|
Packit |
961e70 |
int first_to_create = 0;
|
|
Packit |
961e70 |
size_t shm_name_len = 256;
|
|
Packit |
961e70 |
shared_affinity_ptr = NULL;
|
|
Packit |
961e70 |
affinity_shm_name = NULL;
|
|
Packit |
961e70 |
affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_assert_always(affinity_shm_name != NULL);
|
|
Packit |
961e70 |
snprintf(affinity_shm_name, shm_name_len,
|
|
Packit |
961e70 |
AFFINITY_SHM_BASENAME".%d",
|
|
Packit |
961e70 |
psmi_get_uuid_hash(job_key));
|
|
Packit |
961e70 |
shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL,
|
|
Packit |
961e70 |
S_IRUSR | S_IWUSR);
|
|
Packit |
961e70 |
if ((shm_fd < 0) && (errno == EEXIST)) {
|
|
Packit |
961e70 |
shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR);
|
|
Packit |
961e70 |
if (shm_fd < 0) {
|
|
Packit |
961e70 |
_HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n",
|
|
Packit |
961e70 |
affinity_shm_name, errno);
|
|
Packit |
961e70 |
return shm_fd;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
} else if (shm_fd > 0) {
|
|
Packit |
961e70 |
first_to_create = 1;
|
|
Packit |
961e70 |
} else {
|
|
Packit |
961e70 |
_HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n",
|
|
Packit |
961e70 |
affinity_shm_name, errno);
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE);
|
|
Packit Service |
7ed5cc |
if ( ret < 0 ) {
|
|
Packit Service |
7ed5cc |
_HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n",
|
|
Packit Service |
7ed5cc |
affinity_shm_name, errno);
|
|
Packit Service |
7ed5cc |
if (shm_fd >= 0) close(shm_fd);
|
|
Packit |
961e70 |
return ret;
|
|
Packit Service |
7ed5cc |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE,
|
|
Packit |
961e70 |
MAP_SHARED, shm_fd, 0);
|
|
Packit |
961e70 |
if (shared_affinity_ptr == MAP_FAILED) {
|
|
Packit |
961e70 |
_HFI_VDBG("Cannot mmap affinity shared memory. errno=%d\n",
|
|
Packit |
961e70 |
errno);
|
|
Packit |
961e70 |
close(shm_fd);
|
|
Packit |
961e70 |
return -1;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
close(shm_fd);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_affinity_shared_file_opened = 1;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (first_to_create) {
|
|
Packit |
961e70 |
_HFI_VDBG("Creating shm to store HFI affinity per socket\n");
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* Once shm object is initialized, unlock others to be able to
|
|
Packit |
961e70 |
* use it.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
} else {
|
|
Packit |
961e70 |
_HFI_VDBG("Opening shm object to read/write HFI affinity per socket\n");
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* Start critical section to increment reference count when creating
|
|
Packit |
961e70 |
* or opening shm object. Decrement of ref count will be done before
|
|
Packit |
961e70 |
* closing the shm.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
|
|
Packit |
961e70 |
_HFI_VDBG("Could not enter critical section to update shm refcount\n");
|
|
Packit |
961e70 |
return -1;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* End critical section */
|
|
Packit |
961e70 |
psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return 0;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* Spread HFI selection between units if we find more than one within a socket.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
static void
|
|
Packit |
961e70 |
psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id,
|
|
Packit |
961e70 |
int *saved_hfis, int found, psm2_uuid_t const job_key)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
int ret, shm_location;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* Take affinity lock and open shared memory region to be able to
|
|
Packit |
961e70 |
* accurately determine which HFI to pick for this process. If any
|
|
Packit |
961e70 |
* issues, bail by picking first known HFI.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
if (!psmi_affinity_semaphore_open)
|
|
Packit |
961e70 |
goto spread_hfi_fallback;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
ret = psmi_create_and_open_affinity_shm(job_key);
|
|
Packit |
961e70 |
if (ret < 0)
|
|
Packit |
961e70 |
goto spread_hfi_fallback;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id;
|
|
Packit |
961e70 |
if (shm_location > AFFINITY_SHMEMSIZE)
|
|
Packit |
961e70 |
goto spread_hfi_fallback;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Start critical section to read/write shm object */
|
|
Packit |
961e70 |
if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
|
|
Packit |
961e70 |
_HFI_VDBG("Could not enter critical section to update HFI index\n");
|
|
Packit |
961e70 |
goto spread_hfi_fallback;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
*unit_start = *unit_end = shared_affinity_ptr[shm_location];
|
|
Packit |
961e70 |
shared_affinity_ptr[shm_location] =
|
|
Packit |
961e70 |
(shared_affinity_ptr[shm_location] + 1) % found;
|
|
Packit |
961e70 |
_HFI_VDBG("Selected HFI index= %ld, Next HFI=%ld, node = %d, local rank=%d, found=%d.\n",
|
|
Packit |
961e70 |
*unit_start, shared_affinity_ptr[shm_location], node_id,
|
|
Packit |
961e70 |
psmi_get_envvar("MPI_LOCALRANKID"), found);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* End Critical Section */
|
|
Packit |
961e70 |
psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
spread_hfi_fallback:
|
|
Packit |
961e70 |
*unit_start = *unit_end = saved_hfis[0];
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static void
|
|
Packit |
961e70 |
psmi_create_affinity_semaphores(psm2_uuid_t const job_key)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
int ret;
|
|
Packit |
961e70 |
sem_affinity_shm_rw_name = NULL;
|
|
Packit |
961e70 |
size_t sem_len = 256;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* If already opened, no need to do anything else.
|
|
Packit |
961e70 |
* This could be true for Multi-EP cases where a different thread has
|
|
Packit |
961e70 |
* already created the semaphores. We don't need separate locks here as
|
|
Packit |
961e70 |
* we are protected by the overall "psmi_creation_lock" which each
|
|
Packit |
961e70 |
* thread will take in psm2_ep_open()
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
if (psmi_affinity_semaphore_open)
|
|
Packit |
961e70 |
return;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len);
|
|
Packit |
961e70 |
psmi_assert_always(sem_affinity_shm_rw_name != NULL);
|
|
Packit |
961e70 |
snprintf(sem_affinity_shm_rw_name, sem_len,
|
|
Packit |
961e70 |
SEM_AFFINITY_SHM_RW_BASENAME".%d",
|
|
Packit |
961e70 |
psmi_get_uuid_hash(job_key));
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name,
|
|
Packit |
961e70 |
S_IRUSR | S_IWUSR, 0);
|
|
Packit |
961e70 |
if (ret) {
|
|
Packit |
961e70 |
_HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n",
|
|
Packit |
961e70 |
sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
sem_close(sem_affinity_shm_rw);
|
|
Packit |
961e70 |
psmi_free(sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
sem_affinity_shm_rw_name = NULL;
|
|
Packit |
961e70 |
return;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
_HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n",
|
|
Packit |
961e70 |
sem_affinity_shm_rw_name);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_affinity_semaphore_open = 1;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static
|
|
Packit |
961e70 |
psm2_error_t
|
|
Packit |
961e70 |
psmi_compute_start_and_end_unit(long unit_param,int nunitsactive,int nunits,
|
|
Packit |
961e70 |
psm2_uuid_t const job_key,
|
|
Packit |
961e70 |
long *unit_start,long *unit_end)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
unsigned short hfi_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS;
|
|
Packit |
961e70 |
int node_id, unit_id, found = 0;
|
|
Packit |
961e70 |
int saved_hfis[nunits];
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* if the user did not set HFI_UNIT then ... */
|
|
Packit |
961e70 |
if (unit_param == HFI_UNIT_ID_ANY)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
/* Get the actual selection algorithm from the environment: */
|
|
Packit |
961e70 |
hfi_sel_alg = psmi_get_hfi_selection_algorithm();
|
|
Packit |
961e70 |
/* If round-robin is selection algorithm and ... */
|
|
Packit |
961e70 |
if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) &&
|
|
Packit |
961e70 |
/* there are more than 1 active units then ... */
|
|
Packit |
961e70 |
(nunitsactive > 1))
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* Pick first HFI we find on same root complex
|
|
Packit |
961e70 |
* as current task. If none found, fall back to
|
|
Packit |
961e70 |
* load-balancing algorithm.
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
node_id = psmi_get_current_proc_location();
|
|
Packit |
961e70 |
if (node_id >= 0) {
|
|
Packit |
961e70 |
for (unit_id = 0; unit_id < nunits; unit_id++) {
|
|
Packit |
961e70 |
if (psmi_hal_get_unit_active(unit_id) <= 0)
|
|
Packit |
961e70 |
continue;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
int node_id_i;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (!psmi_hal_get_node_id(unit_id, &node_id_i)) {
|
|
Packit |
961e70 |
if (node_id_i == node_id) {
|
|
Packit |
961e70 |
saved_hfis[found] = unit_id;
|
|
Packit |
961e70 |
found++;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (found > 1) {
|
|
Packit |
961e70 |
psmi_create_affinity_semaphores(job_key);
|
|
Packit |
961e70 |
psmi_spread_hfi_within_socket(unit_start, unit_end,
|
|
Packit |
961e70 |
node_id, saved_hfis,
|
|
Packit |
961e70 |
found, job_key);
|
|
Packit |
961e70 |
} else if (found == 1) {
|
|
Packit |
961e70 |
*unit_start = *unit_end = saved_hfis[0];
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (node_id < 0 || !found) {
|
|
Packit |
961e70 |
psmi_spread_hfi_selection(job_key, unit_start,
|
|
Packit |
961e70 |
unit_end, nunits);
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
} else if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) &&
|
|
Packit |
961e70 |
(nunitsactive > 1)) {
|
|
Packit |
961e70 |
psmi_spread_hfi_selection(job_key, unit_start,
|
|
Packit |
961e70 |
unit_end, nunits);
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
else {
|
|
Packit |
961e70 |
*unit_start = 0;
|
|
Packit |
961e70 |
*unit_end = nunits - 1;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
} else if (unit_param >= 0) {
|
|
Packit |
961e70 |
/* the user specified HFI_UNIT, we use it. */
|
|
Packit |
961e70 |
*unit_start = *unit_end = unit_param;
|
|
Packit |
961e70 |
} else {
|
|
Packit |
961e70 |
psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
|
|
Packit |
961e70 |
"PSM2 can't open unit: %ld for reading and writing",
|
|
Packit |
961e70 |
unit_param);
|
|
Packit |
961e70 |
return PSM2_EP_DEVICE_FAILURE;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return PSM2_OK;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psm2_error_t
|
|
Packit |
961e70 |
psmi_context_open(const psm2_ep_t ep, long unit_param, long port,
|
|
Packit |
961e70 |
psm2_uuid_t const job_key, int64_t timeout_ns,
|
|
Packit |
961e70 |
psmi_context_t *context)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev;
|
|
Packit |
961e70 |
psm2_error_t err = PSM2_OK;
|
|
Packit |
961e70 |
int nunits = psmi_hal_get_num_units(), nunitsactive=0;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* If shared contexts are enabled, try our best to schedule processes
|
|
Packit |
961e70 |
* across one or many devices
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* if no units, then no joy. */
|
|
Packit |
961e70 |
if (nunits <= 0)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
|
|
Packit |
961e70 |
"PSM2 no hfi units are available");
|
|
Packit |
961e70 |
goto ret;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Calculate the number of active units: */
|
|
Packit |
961e70 |
for (unit_id=0;unit_id < nunits;unit_id++)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
if (psmi_hal_get_unit_active(unit_id) > 0)
|
|
Packit |
961e70 |
nunitsactive++;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
/* if no active units, then no joy. */
|
|
Packit |
961e70 |
if (nunitsactive == 0)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
|
|
Packit |
961e70 |
"PSM2 no hfi units are active");
|
|
Packit |
961e70 |
goto ret;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
if (timeout_ns > 0)
|
|
Packit |
961e70 |
open_timeout = (long)(timeout_ns / MSEC_ULL);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
unit_start = 0; unit_end = nunits - 1;
|
|
Packit |
961e70 |
err = psmi_compute_start_and_end_unit(unit_param, nunitsactive,
|
|
Packit |
961e70 |
nunits, job_key,
|
|
Packit |
961e70 |
&unit_start, &unit_end);
|
|
Packit |
961e70 |
if (err != PSM2_OK)
|
|
Packit |
961e70 |
return err;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* this is the start of a loop that starts at unit_start and goes to unit_end.
|
|
Packit |
961e70 |
but note that the way the loop computes the loop control variable is by
|
|
Packit |
961e70 |
an expression involving the mod operator. */
|
|
Packit |
961e70 |
int success = 0;
|
|
Packit |
961e70 |
unit_id_prev = unit_id = unit_start;
|
|
Packit |
961e70 |
do
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
/* close previous opened unit fd before attempting open of current unit. */
|
|
Packit |
961e70 |
if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0) {
|
|
Packit |
961e70 |
psmi_hal_close_context(&context->psm_hw_ctxt);
|
|
Packit |
961e70 |
context->psm_hw_ctxt = 0;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* if the unit_id is not active, go to next one. */
|
|
Packit |
961e70 |
if (psmi_hal_get_unit_active(unit_id) <= 0) {
|
|
Packit |
961e70 |
unit_id_prev = unit_id;
|
|
Packit |
961e70 |
unit_id = (unit_id + 1) % nunits;
|
|
Packit |
961e70 |
continue;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* open this unit. */
|
|
Packit |
961e70 |
int rv = psmi_hal_context_open(unit_id, port, open_timeout,
|
|
Packit |
961e70 |
ep, job_key, context,
|
|
Packit |
961e70 |
psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED),
|
|
Packit |
961e70 |
HAL_CONTEXT_OPEN_RETRY_MAX);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* go to next unit if failed to open. */
|
|
Packit |
961e70 |
if (rv || context->psm_hw_ctxt == NULL) {
|
|
Packit |
961e70 |
unit_id_prev = unit_id;
|
|
Packit |
961e70 |
unit_id = (unit_id + 1) % nunits;
|
|
Packit |
961e70 |
continue;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
success = 1;
|
|
Packit |
961e70 |
break;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
} while (unit_id_prev != unit_end);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (!success)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
|
|
Packit |
961e70 |
"PSM2 can't open hfi unit: %ld",unit_param);
|
|
Packit |
961e70 |
goto bail;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
context->ep = (psm2_ep_t) ep;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Check backward compatibility bits here and save the info */
|
|
Packit |
961e70 |
if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT))
|
|
Packit Service |
7ed5cc |
{
|
|
Packit Service |
7ed5cc |
#ifdef PSM_CUDA
|
|
Packit |
961e70 |
is_driver_gpudirect_enabled = 1;
|
|
Packit Service |
7ed5cc |
#else
|
|
Packit Service |
7ed5cc |
psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: "
|
|
Packit Service |
7ed5cc |
"CUDA version of hfi1 driver is loaded with non-CUDA version of "
|
|
Packit Service |
7ed5cc |
"psm2 library.\n");
|
|
Packit Service |
7ed5cc |
#endif
|
|
Packit Service |
7ed5cc |
}
|
|
Packit Service |
7ed5cc |
#ifdef PSM_CUDA
|
|
Packit Service |
7ed5cc |
else
|
|
Packit Service |
7ed5cc |
fprintf(stderr,"WARNING: running CUDA version of libpsm2 with non CUDA version of hfi1 driver.\n");
|
|
Packit |
961e70 |
#endif
|
|
Packit |
961e70 |
_HFI_VDBG("hfi_userinit() passed.\n");
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Fetch hw parameters from HAL (that were obtained during opening the context above. */
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
int lid = psmi_hal_get_lid(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
ep->unit_id = psmi_hal_get_unit_id(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
ep->portnum = psmi_hal_get_port_num(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
ep->gid_lo = psmi_hal_get_gid_lo(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
ep->gid_hi = psmi_hal_get_gid_hi(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
int ctxt = psmi_hal_get_context(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
int subctxt = psmi_hal_get_subctxt(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
uint32_t hfi_type = psmi_hal_get_hfi_type(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
context->ep = (psm2_ep_t) ep;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Construct epid for this Endpoint */
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
switch (PSMI_EPID_VERSION) {
|
|
Packit |
961e70 |
case PSMI_EPID_V1:
|
|
Packit |
961e70 |
context->epid = PSMI_EPID_PACK_V1(lid, ctxt,
|
|
Packit |
961e70 |
subctxt,
|
|
Packit |
961e70 |
ep->unit_id,
|
|
Packit |
961e70 |
PSMI_EPID_VERSION, 0x3ffffff);
|
|
Packit |
961e70 |
break;
|
|
Packit |
961e70 |
case PSMI_EPID_V2:
|
|
Packit |
961e70 |
context->epid = PSMI_EPID_PACK_V2(lid, ctxt,
|
|
Packit |
961e70 |
subctxt,
|
|
Packit |
961e70 |
PSMI_EPID_IPS_SHM, /*Not a only-shm epid */
|
|
Packit |
961e70 |
PSMI_EPID_VERSION, ep->gid_hi);
|
|
Packit |
961e70 |
break;
|
|
Packit |
961e70 |
default:
|
|
Packit |
961e70 |
/* Epid version is greater than max supportd version. */
|
|
Packit |
961e70 |
psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
|
|
Packit |
961e70 |
break;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
_HFI_VDBG
|
|
Packit |
961e70 |
("construct epid: lid %d ctxt %d subctxt %d hcatype %d mtu %d\n",
|
|
Packit |
961e70 |
lid, ctxt,
|
|
Packit |
961e70 |
subctxt, hfi_type, ep->mtu);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
goto ret;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
bail:
|
|
Packit |
961e70 |
_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
|
|
Packit |
961e70 |
if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0)
|
|
Packit |
961e70 |
psmi_hal_close_context(&context->psm_hw_ctxt);
|
|
Packit |
961e70 |
ret:
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
_HFI_VDBG("psmi_context_open() return %d\n", err);
|
|
Packit |
961e70 |
return err;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
 * Close the hardware context of a PSM context, if it has an open descriptor
 * (psmi_hal_get_fd() > 0).  Always returns PSM2_OK.
 */
psm2_error_t psmi_context_close(psmi_context_t *context)
{
	/* Nothing is open: there is nothing to close. */
	if (psmi_hal_get_fd(context->psm_hw_ctxt) <= 0)
		return PSM2_OK;

	psmi_hal_close_context(&context->psm_hw_ctxt);
	return PSM2_OK;
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/*
|
|
Packit |
961e70 |
* This function works whether a context is initialized or not in a psm2_ep.
|
|
Packit |
961e70 |
*
|
|
Packit |
961e70 |
* Returns one of
|
|
Packit |
961e70 |
*
|
|
Packit |
961e70 |
* PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
|
|
Packit |
961e70 |
* PSM2_OK_NO_PROGRESS: Cable pulled
|
|
Packit |
961e70 |
* PSM2_EP_NO_NETWORK: No network, no lid, ...
|
|
Packit |
961e70 |
* PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
|
|
Packit |
961e70 |
* The message follows the per-port status
|
|
Packit |
961e70 |
* As of 7322-ready driver, need to check port-specific qword for IB
|
|
Packit |
961e70 |
* as well as older unit-only. For now, we don't have the port interface
|
|
Packit |
961e70 |
* defined, so just check port 0 qword for spi_status
|
|
Packit |
961e70 |
*/
|
|
Packit |
961e70 |
psm2_error_t psmi_context_check_status(const psmi_context_t *contexti)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
psm2_error_t err = PSM2_OK;
|
|
Packit |
961e70 |
psmi_context_t *context = (psmi_context_t *) contexti;
|
|
Packit |
961e70 |
char *errmsg = NULL;
|
|
Packit |
961e70 |
uint64_t status = psmi_hal_get_hw_status(context->psm_hw_ctxt);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* Fatal chip-related errors */
|
|
Packit |
961e70 |
if (!(status & PSM_HAL_HW_STATUS_CHIP_PRESENT) ||
|
|
Packit |
961e70 |
!(status & PSM_HAL_HW_STATUS_INITTED) ||
|
|
Packit |
961e70 |
(status & PSM_HAL_HW_STATUS_HWERROR)) {
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
err = PSM2_EP_DEVICE_FAILURE;
|
|
Packit |
961e70 |
if (err != context->status_lasterr) { /* report once */
|
|
Packit |
961e70 |
volatile char *errmsg_sp="no err msg";
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_hal_get_hw_status_freezemsg(&errmsg_sp,
|
|
Packit |
961e70 |
context->psm_hw_ctxt);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (*errmsg_sp)
|
|
Packit |
961e70 |
psmi_handle_error(context->ep, err,
|
|
Packit |
961e70 |
"Hardware problem: %s",
|
|
Packit |
961e70 |
errmsg_sp);
|
|
Packit |
961e70 |
else {
|
|
Packit |
961e70 |
if (status & PSM_HAL_HW_STATUS_HWERROR)
|
|
Packit |
961e70 |
errmsg = "Hardware error";
|
|
Packit |
961e70 |
else
|
|
Packit |
961e70 |
errmsg = "Hardware not found";
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_handle_error(context->ep, err,
|
|
Packit |
961e70 |
"%s", errmsg);
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
/* Fatal network-related errors with timeout: */
|
|
Packit |
961e70 |
else if (!(status & PSM_HAL_HW_STATUS_IB_CONF) ||
|
|
Packit |
961e70 |
!(status & PSM_HAL_HW_STATUS_IB_READY)) {
|
|
Packit |
961e70 |
err = PSM2_EP_NO_NETWORK;
|
|
Packit |
961e70 |
if (err != context->status_lasterr) { /* report once */
|
|
Packit |
961e70 |
context->networkLostTime = time(NULL);
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
else
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
time_t now = time(NULL);
|
|
Packit |
961e70 |
static const double seventySeconds = 70.0;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* The linkup time duration for a system should allow the time needed
|
|
Packit |
961e70 |
to complete 3 LNI passes which is:
|
|
Packit |
961e70 |
50 seconds for a passive copper channel
|
|
Packit |
961e70 |
65 seconds for optical channel.
|
|
Packit |
961e70 |
(we add 5 seconds of margin.) */
|
|
Packit |
961e70 |
if (difftime(now,context->networkLostTime) > seventySeconds)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
volatile char *errmsg_sp="no err msg";
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_hal_get_hw_status_freezemsg(&errmsg_sp,
|
|
Packit |
961e70 |
context->psm_hw_ctxt);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
psmi_handle_error(context->ep, err, "%s",
|
|
Packit |
961e70 |
*errmsg_sp ? errmsg_sp :
|
|
Packit |
961e70 |
"Network down");
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (err == PSM2_OK && context->status_lasterr != PSM2_OK)
|
|
Packit |
961e70 |
context->status_lasterr = PSM2_OK; /* clear error */
|
|
Packit |
961e70 |
else if (err != PSM2_OK)
|
|
Packit |
961e70 |
context->status_lasterr = err; /* record error */
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return err;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
static
|
|
Packit |
961e70 |
int psmi_get_hfi_selection_algorithm(void)
|
|
Packit |
961e70 |
{
|
|
Packit |
961e70 |
union psmi_envvar_val env_hfi1_alg;
|
|
Packit |
961e70 |
int hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
/* If a specific unit is set in the environment, use that one. */
|
|
Packit |
961e70 |
psmi_getenv("HFI_SELECTION_ALG",
|
|
Packit |
961e70 |
"HFI Device Selection Algorithm to use. Round Robin (Default) "
|
|
Packit |
961e70 |
", Packed or Round Robin All.",
|
|
Packit |
961e70 |
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
|
|
Packit |
961e70 |
(union psmi_envvar_val)"Round Robin", &env_hfi1_alg);
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin"))
|
|
Packit |
961e70 |
hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
|
|
Packit |
961e70 |
else if (!strcasecmp(env_hfi1_alg.e_str, "Packed"))
|
|
Packit |
961e70 |
hfi1_alg = PSMI_UNIT_SEL_ALG_WITHIN;
|
|
Packit |
961e70 |
else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All"))
|
|
Packit |
961e70 |
hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL;
|
|
Packit |
961e70 |
else {
|
|
Packit |
961e70 |
_HFI_ERROR
|
|
Packit |
961e70 |
("Unknown HFI selection algorithm %s. Defaulting to Round Robin "
|
|
Packit |
961e70 |
"allocation of HFIs.\n", env_hfi1_alg.e_str);
|
|
Packit |
961e70 |
hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
|
|
Packit |
961e70 |
}
|
|
Packit |
961e70 |
|
|
Packit |
961e70 |
return hfi1_alg;
|
|
Packit |
961e70 |
}
|