/* begin_generated_IBM_copyright_prolog                             */
/*                                                                  */
/* This is an automatically generated copyright prolog.             */
/* After initializing,  DO NOT MODIFY OR MOVE                       */
/*  --------------------------------------------------------------- */
/* Licensed Materials - Property of IBM                             */
/* Blue Gene/Q 5765-PER 5765-PRP                                    */
/*                                                                  */
/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
/* US Government Users Restricted Rights -                          */
/* Use, duplication, or disclosure restricted                       */
/* by GSA ADP Schedule Contract with IBM Corp.                      */
/*                                                                  */
/*  --------------------------------------------------------------- */
/*                                                                  */
/* end_generated_IBM_copyright_prolog                               */
/*  (C)Copyright IBM Corp.  2007, 2011  */
/**
 * \file src/mpid_init.c
 * \brief Normal job startup code
 */

#include <stdlib.h>
#include <string.h>

#include <mpidimpl.h>
#include "mpidi_platform.h"
#include "onesided/mpidi_onesided.h"

#include "mpidi_util.h"

#ifdef DYNAMIC_TASKING
#define PAMIX_CLIENT_DYNAMIC_TASKING 1032
#define PAMIX_CLIENT_WORLD_TASKS     1033
#define MAX_JOBID_LEN                1024
int     world_rank;
int     world_size;
extern int (*mp_world_exiting_handler)(int);
extern int _mpi_world_exiting_handler(int);
#endif
int mpidi_dynamic_tasking = 0;

#if TOKEN_FLOW_CONTROL
  extern int MPIDI_mm_init(int,uint *,unsigned long *);
  extern int MPIDI_tfctrl_enabled;
#endif

#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
  pami_extension_t pe_extension;
#endif

pami_client_t    MPIDI_Client;
pami_context_t   MPIDI_Context[MPIDI_MAX_CONTEXTS];

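/**
 * \brief Process-wide configuration and tuning values for the PAMI device,
 *        initialized here to their compile-time defaults.
 */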
MPIDI_Process_t  MPIDI_Process = {
  .verbose               = 0,
  .statistics            = 0,

#if (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY_PER_OBJECT)
  .avail_contexts        = MPIDI_MAX_CONTEXTS,
  .async_progress = {
    .active              = 0,
    .mode                = ASYNC_PROGRESS_MODE_DEFAULT,
  },
  .perobj = {
    .context_post = {
      .requested         = (ASYNC_PROGRESS_MODE_DEFAULT == ASYNC_PROGRESS_MODE_LOCKED),
      .active            = 0,
    },
  },
#else
  .avail_contexts        = 1,
  .async_progress = {
    .active              = 0,
    .mode                = ASYNC_PROGRESS_MODE_DISABLED,
  },
  .perobj = {
    .context_post = {
      .requested         = 0,
      .active            = 0,
    },
  },
#endif
  .pt2pt = {
    .limits = {
      .application = {
        .eager = {
          .remote        = MPIDI_EAGER_LIMIT,
          .local         = MPIDI_EAGER_LIMIT_LOCAL,
        },
        .immediate = {
          .remote        = MPIDI_SHORT_LIMIT,
          .local         = MPIDI_SHORT_LIMIT,
        },
      },
      .internal = {
        .eager = {
          .remote        = MPIDI_EAGER_LIMIT,
          .local         = MPIDI_EAGER_LIMIT_LOCAL,
        },
        .immediate = {
          .remote        = MPIDI_SHORT_LIMIT,
          .local         = MPIDI_SHORT_LIMIT,
        },
      },
    },
  },
  .disable_internal_eager_scale = MPIDI_DISABLE_INTERNAL_EAGER_SCALE,
#if TOKEN_FLOW_CONTROL
  .mp_buf_mem          = BUFFER_MEM_DEFAULT,
  .mp_buf_mem_max      = BUFFER_MEM_DEFAULT,
  .is_token_flow_control_on = 0,
#endif
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
  .mp_infolevel          = 0,
  .mp_statistics         = 0,
  .mp_printenv           = 0,
#endif
#ifdef QUEUE_BINARY_SEARCH_SUPPORT
  .queue_binary_search_support_on = 0,
#endif
#if CUDA_AWARE_SUPPORT
  .cuda_aware_support_on = 0,
#endif
  .rma_pending           = 1000,
  .shmem_pt2pt           = 1,
  .smp_detect            = MPIDI_SMP_DETECT_DEFAULT,
  .optimized = {
    .collectives         = MPIDI_OPTIMIZED_COLLECTIVE_DEFAULT,
    .subcomms            = 1,
    .select_colls        = 2,
    .memory              = 0,
    .num_requests        = 16,
  },

  .mpir_nbc              = 1,
  .numTasks              = 0,
  .typed_onesided        = 0,
};


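/**
 * \brief Static description of one point-to-point dispatch: its receive
 *        callback, dispatch id, minimum required immediate-send size, and
 *        dispatch hints.
 */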
struct protocol_t
{
  pami_dispatch_p2p_function func;
  size_t                     dispatch;
  size_t                     immediate_min;
  pami_dispatch_hint_t       options;
};
static struct
{
  struct protocol_t Short;
  struct protocol_t ShortSync;
  struct protocol_t Eager;
  struct protocol_t RVZ;
  struct protocol_t Cancel;
  struct protocol_t Control;
  struct protocol_t WinCtrl;
  struct protocol_t WinAccum;
  struct protocol_t RVZ_zerobyte;
  struct protocol_t WinGetAccum;
  struct protocol_t WinGetAccumAck;
  struct protocol_t WinAtomic;
  struct protocol_t WinAtomicAck;
#ifdef DYNAMIC_TASKING
  struct protocol_t Dyntask;
  struct protocol_t Dyntask_disconnect;
#endif
} proto_list = {
  .Short = {
    .func = MPIDI_RecvShortAsyncCB,
    .dispatch = MPIDI_Protocols_Short,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .ShortSync = {
    .func = MPIDI_RecvShortSyncCB,
    .dispatch = MPIDI_Protocols_ShortSync,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .Eager = {
    .func = MPIDI_RecvCB,
    .dispatch = MPIDI_Protocols_Eager,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_contiguous = PAMI_HINT_ENABLE,
      .recv_copy =       PAMI_HINT_ENABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .RVZ = {
    .func = MPIDI_RecvRzvCB,
    .dispatch = MPIDI_Protocols_RVZ,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgEnvelope),
  },
  .Cancel = {
    .func = MPIDI_ControlCB,
    .dispatch = MPIDI_Protocols_Cancel,
    .options = {
      .consistency     = PAMI_HINT_ENABLE,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .Control = {
    .func = MPIDI_ControlCB,
    .dispatch = MPIDI_Protocols_Control,
    .options = {
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .WinCtrl = {
    .func = MPIDI_WinControlCB,
    .dispatch = MPIDI_Protocols_WinCtrl,
    .options = {
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_Win_control_t),
  },
  .WinAccum = {
    .func = MPIDI_WinAccumCB,
    .dispatch = MPIDI_Protocols_WinAccum,
    .options = {
      .consistency     = PAMI_HINT_ENABLE,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .RVZ_zerobyte = {
    .func = MPIDI_RecvRzvCB_zerobyte,
    .dispatch = MPIDI_Protocols_RVZ_zerobyte,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgEnvelope),
  },
  .WinGetAccum = {
    .func = MPIDI_WinGetAccumCB,
    .dispatch = MPIDI_Protocols_WinGetAccum,
    .options = {
      .consistency    = PAMI_HINT_ENABLE,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_Win_GetAccMsgInfo),
  },
  .WinGetAccumAck = {
    .func = MPIDI_WinGetAccumAckCB,
    .dispatch = MPIDI_Protocols_WinGetAccumAck,
    .options = {
      .consistency    = PAMI_HINT_ENABLE,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_Win_GetAccMsgInfo),
  },
  .WinAtomic = {
    .func = MPIDI_WinAtomicCB,
    .dispatch = MPIDI_Protocols_WinAtomic,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_AtomicHeader_t),
  },
  .WinAtomicAck = {
    .func = MPIDI_WinAtomicAckCB,
    .dispatch = MPIDI_Protocols_WinAtomicAck,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_AtomicHeader_t),
  },
#ifdef DYNAMIC_TASKING
  .Dyntask = {
    .func = MPIDI_Recvfrom_remote_world,
    .dispatch = MPIDI_Protocols_Dyntask,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
  .Dyntask_disconnect = {
    .func = MPIDI_Recvfrom_remote_world_disconnect,
    .dispatch = MPIDI_Protocols_Dyntask_disconnect,
    .options = {
      .consistency     = USE_PAMI_CONSISTENCY,
      .long_header     = PAMI_HINT_DISABLE,
      .recv_immediate  = PAMI_HINT_ENABLE,
      .use_rdma        = PAMI_HINT_DISABLE,
    },
    .immediate_min     = sizeof(MPIDI_MsgInfo),
  },
#endif
};


#undef FUNCNAME
#define FUNCNAME split_type
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
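/**
 * \brief Implementation of MPI_Comm_split_type for this device.
 *
 * For MPI_COMM_TYPE_SHARED the node id of the calling rank is used as the
 * split color, so each resulting communicator contains only ranks on the
 * same node; any other split type uses MPI_UNDEFINED.
 */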
static int split_type(MPID_Comm * comm_ptr, int stype, int key,
                      MPID_Info *info_ptr, MPID_Comm ** newcomm_ptr)
{
    MPID_Node_id_t id;
    int nid;
    int mpi_errno = MPI_SUCCESS;

    mpi_errno = MPID_Get_node_id(comm_ptr, comm_ptr->rank, &id);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    nid = (stype == MPI_COMM_TYPE_SHARED) ? id : MPI_UNDEFINED;
    mpi_errno = MPIR_Comm_split_impl(comm_ptr, nid, key, newcomm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

  fn_exit:
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

static MPID_CommOps comm_fns = {
    split_type
};


/* ------------------------------ */
/* Collective selection extension */
/* ------------------------------ */
pami_extension_t MPIDI_Collsel_extension;
advisor_t        MPIDI_Collsel_advisor;
advisor_table_t  MPIDI_Collsel_advisor_table;
advisor_params_t MPIDI_Collsel_advisor_params;
char            *MPIDI_Collsel_output_file;
pami_extension_collsel_advise MPIDI_Pamix_collsel_advise;
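/**
 * \brief Create and initialize the PAMI client used by MPICH.
 *
 * Creates the "MPI" PAMI client (optionally configured for memory-optimized
 * collectives), determines the task id and task count, and, when dynamic
 * tasking is compiled in, queries the PE_dyn_task extension to discover
 * whether this job uses dynamic tasking.  Finally, the internal eager and
 * immediate limits are disabled when the job size reaches
 * PAMID_DISABLE_INTERNAL_EAGER_TASK_LIMIT.
 *
 * \param[out] rank                  Task id of this process
 * \param[out] size                  Number of tasks in the job
 * \param[out] mpidi_dynamic_tasking Non-zero if dynamic tasking is active
 * \param[out] world_tasks           Task list reported by the dynamic tasking query
 */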
static void
MPIDI_PAMI_client_init(int* rank, int* size, int* mpidi_dynamic_tasking, char **world_tasks)
{
  /* ------------------------------------ */
  /*  Initialize the MPICH->PAMI Client  */
  /* ------------------------------------ */
  pami_result_t        rc = PAMI_ERROR;
  
  pami_configuration_t config[2];
  size_t numconfigs = 0;

  /* Set the status for memory optimized collectives */
  {
    char* env = getenv("PAMID_COLLECTIVES_MEMORY_OPTIMIZED");
    if (env != NULL)
      MPIDI_atoi(env,&MPIDI_Process.optimized.memory);
  }

#ifdef HAVE_PAMI_CLIENT_NONCONTIG
  config[0].name = PAMI_CLIENT_NONCONTIG;
  if(MPIDI_Process.optimized.memory & MPID_OPT_LVL_NONCONTIG) 
    config[0].value.intval = 0; // Disable non-contig; pamid doesn't use PAMI for non-contiguous data collectives, so save the memory.
  else
    config[0].value.intval = 1; // Enable non-contig; even though pamid doesn't use PAMI for non-contiguous data collectives,
                                // we may still want those collectives for other reasons.
  ++numconfigs;
#endif
#ifdef HAVE_PAMI_CLIENT_MEMORY_OPTIMIZE
  if(MPIDI_Process.optimized.memory) 
  {
    config[numconfigs].name = PAMI_CLIENT_MEMORY_OPTIMIZE;
    config[numconfigs].value.intval = MPIDI_Process.optimized.memory;
    ++numconfigs;
  }
#endif

  rc = PAMI_Client_create("MPI", &MPIDI_Client, config, numconfigs);
  MPID_assert_always(rc == PAMI_SUCCESS);
  PAMIX_Initialize(MPIDI_Client);


  *mpidi_dynamic_tasking=0;
#ifdef DYNAMIC_TASKING
  *world_tasks = NULL;
  pami_result_t status = PAMI_ERROR;

  typedef pami_result_t (*dyn_task_query_fn) (
             pami_client_t          client,
             pami_configuration_t   config[],
             size_t                 num_configs);
  dyn_task_query_fn  dyn_task_query = NULL;

  pami_extension_t extension;
  status = PAMI_Extension_open (MPIDI_Client, "PE_dyn_task", &extension);
  if(status != PAMI_SUCCESS)
  {
    TRACE_ERR("Error. The PE_dyn_task extension is not implemented. result = %d\n", status);
  }

  dyn_task_query =  (dyn_task_query_fn) PAMI_Extension_symbol(extension, "query");
  if (dyn_task_query == (void*)NULL) {
    TRACE_ERR("Err: the Dynamic Tasking extension function dyn_task_query is not implememted.\n");

  } else {
    pami_configuration_t config2[] =
    {
       {PAMI_CLIENT_TASK_ID, -1},
       {PAMI_CLIENT_NUM_TASKS, -1},
       {(pami_attribute_name_t)PAMIX_CLIENT_DYNAMIC_TASKING},
       {(pami_attribute_name_t)PAMIX_CLIENT_WORLD_TASKS},
    };

    dyn_task_query(MPIDI_Client, config2, 4);
    TRACE_ERR("dyn_task_query: task_id %d num_tasks %d dynamic_tasking %d world_tasks %s\n",
              config2[0].value.intval,
              config2[1].value.intval,
              config2[2].value.intval,
              config2[3].value.chararray);
    *rank = world_rank = config2[0].value.intval;
    *size = world_size = config2[1].value.intval;
    *mpidi_dynamic_tasking  = config2[2].value.intval;
    *world_tasks = config2[3].value.chararray;
  }

  status = PAMI_Extension_close (extension);
  if(status != PAMI_SUCCESS)
  {
    TRACE_ERR("Error. The PE_dyn_task extension could not be closed. result = %d\n", status);
  }
#endif

  if(*mpidi_dynamic_tasking == 0) {
     /* ---------------------------------- */
     /*  Get my rank and the process size  */
     /* ---------------------------------- */
     *rank = PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_TASK_ID  ).value.intval;
     MPIR_Process.comm_world->rank = *rank; /* Set the rank early to make tracing better */
     *size = PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_NUM_TASKS).value.intval;
  }

  /* --------------------------------------------------------------- */
  /* Determine if the eager point-to-point protocol for internal mpi */
  /* operations should be disabled.                                  */
  /* --------------------------------------------------------------- */
  {
    char * env = getenv("PAMID_DISABLE_INTERNAL_EAGER_TASK_LIMIT");
    if (env != NULL)
      {
        size_t n = strlen(env);
        char * tmp = (char *) MPIU_Malloc(n+1);
        strncpy(tmp,env,n);
        tmp[n]=0;

        MPIDI_atoi(tmp, &MPIDI_Process.disable_internal_eager_scale);

        MPIU_Free(tmp);
      }

    if (MPIDI_Process.disable_internal_eager_scale <= *size)
      {
        MPIDI_Process.pt2pt.limits.internal.eager.remote     = 0;
        MPIDI_Process.pt2pt.limits.internal.eager.local      = 0;
        MPIDI_Process.pt2pt.limits.internal.immediate.remote = 0;
        MPIDI_Process.pt2pt.limits.internal.immediate.local  = 0;
      }
  }
}

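/**
 * \brief Open the PAMI "EXT_collsel" extension and set up collective selection.
 *
 * In TUNE mode the collsel advisor is initialized so a selection table can be
 * generated later.  In ALL mode a previously generated table is loaded (from
 * MP_COLLECTIVE_SELECTION_FILE, or "pami_tune_results.xml" by default) and
 * the collectives it covers are recorded in
 * MPIDI_Process.optimized.auto_select_colls.  On any failure, automatic
 * collective selection is disabled.
 */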
void MPIDI_Init_collsel_extension()
{
  pami_result_t status = PAMI_ERROR;
  status = PAMI_Extension_open (MPIDI_Client, "EXT_collsel", &MPIDI_Collsel_extension);
  if(status == PAMI_SUCCESS)
  {
    if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_TUNE)
    {
      advisor_configuration_t configuration[1];
      pami_extension_collsel_init pamix_collsel_init =
         (pami_extension_collsel_init) PAMI_Extension_symbol (MPIDI_Collsel_extension, "Collsel_init");
      status = pamix_collsel_init (MPIDI_Client, configuration, 1, &MPIDI_Context[0], 1, &MPIDI_Collsel_advisor);
      if(status != PAMI_SUCCESS)
      {
        fprintf (stderr, "Error. The collsel_init failed. result = %d\n", status);
        MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
      }

    }
    else if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_ALL)
    {
      pami_extension_collsel_initialized pamix_collsel_initialized =
         (pami_extension_collsel_initialized) PAMI_Extension_symbol(MPIDI_Collsel_extension,
                                                                    "Collsel_initialized");
      if(pamix_collsel_initialized(MPIDI_Client, &MPIDI_Collsel_advisor) == 1)
      {
        char *collselfile;
        collselfile = getenv("MP_COLLECTIVE_SELECTION_FILE");
        pami_extension_collsel_table_load pamix_collsel_table_load =
           (pami_extension_collsel_table_load) PAMI_Extension_symbol(MPIDI_Collsel_extension,
                                                                       "Collsel_table_load");
        if(collselfile != NULL)
          status = pamix_collsel_table_load(MPIDI_Collsel_advisor, collselfile, &MPIDI_Collsel_advisor_table);
        else
          status = pamix_collsel_table_load(MPIDI_Collsel_advisor, "pami_tune_results.xml", &MPIDI_Collsel_advisor_table);
          if (status == PAMI_SUCCESS)
          {
            pami_xfer_type_t *collsel_collectives = NULL;
            unsigned          num_collectives;
            pami_extension_collsel_get_collectives pamix_collsel_get_collectives =
               (pami_extension_collsel_get_collectives) PAMI_Extension_symbol(MPIDI_Collsel_extension,
                                                                              "Collsel_get_collectives");
            status = pamix_collsel_get_collectives(MPIDI_Collsel_advisor_table, &collsel_collectives, &num_collectives);
            MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
            if(collsel_collectives != NULL)
            {
              unsigned i = 0;
              for(i = 0; i < num_collectives; i++)
              {
                switch(collsel_collectives[i])
                {
                  case PAMI_XFER_BROADCAST:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_BCAST;
                    break;
                  case PAMI_XFER_ALLREDUCE:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_ALLREDUCE;
                    break;
                  case PAMI_XFER_REDUCE:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_REDUCE;
                    break;
                  case PAMI_XFER_ALLGATHER:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_ALLGATHER;
                    break;
                  case PAMI_XFER_ALLGATHERV:
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
                    if(MPIDI_Process.mp_infolevel >= 1)
                      fprintf(stderr,"WARNING: MPICH (collective selection) doesn't support ALLGATHERV, only ALLGATHERV_INT is supported\n");
#endif
                    break;
                  case PAMI_XFER_ALLGATHERV_INT:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_ALLGATHERV;
                    break;
                  case PAMI_XFER_SCATTER:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_SCATTER;
                    break;
                  case PAMI_XFER_SCATTERV:
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
                    if(MPIDI_Process.mp_infolevel >= 1)
                      fprintf(stderr,"WARNING: MPICH (collective selection) doesn't support SCATTERV, only SCATTERV_INT is supported\n");
#endif
                    break;
                  case PAMI_XFER_SCATTERV_INT:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_SCATTERV;
                    break;
                  case PAMI_XFER_GATHER:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_GATHER;
                    break;
                  case PAMI_XFER_GATHERV:
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
                    if(MPIDI_Process.mp_infolevel >= 1)
                      fprintf(stderr,"WARNING: MPICH (collective selection) doesn't support GATHERV, only GATHERV_INT is supported\n");
#endif
                    break;
                  case PAMI_XFER_GATHERV_INT:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_GATHERV;
                    break;
                  case PAMI_XFER_BARRIER:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_BARRIER;
                    break;
                  case PAMI_XFER_ALLTOALL:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_ALLTOALL;
                    break;
                  case PAMI_XFER_ALLTOALLV:
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
                    if(MPIDI_Process.mp_infolevel >= 1)
                      fprintf(stderr,"WARNING: MPICH (collective selection) doesn't support ALLTOALLV, only ALLTOALLV_INT is supported\n");
#endif
                    break;
                  case PAMI_XFER_ALLTOALLV_INT:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_ALLTOALLV;
                    break;
                  case PAMI_XFER_SCAN:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_SCAN;
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_EXSCAN;
                    break;
                  case PAMI_XFER_REDUCE_SCATTER:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_REDUCE_SCATTER;
                    break;
                  default:
                    MPIDI_Process.optimized.auto_select_colls |= MPID_AUTO_SELECT_COLLS_NONE;
                }
              }
              MPIU_Free(collsel_collectives);
            }
          }
          else
          {
            fprintf (stderr, "Error. Collsel_table_load failed. result = %d\n", status);
            MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
          }
      }
      else
        MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
    }
    else
      PAMI_Extension_close(MPIDI_Collsel_extension);
  }
  else
    MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;

#ifndef __BGQ__
  // If collective selection ends up disabled and neither FCA nor CUDA
  // collectives are required, disable the PAMI collectives altogether.
  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_NONE &&
     MPIDI_Process.optimized.collectives != MPID_COLL_FCA &&
     MPIDI_Process.optimized.collectives != MPID_COLL_CUDA)
    MPIDI_Process.optimized.collectives = MPID_COLL_OFF;
#endif
}

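/**
 * \brief Generate the collective selection table (pami_tune benchmark mode).
 *
 * Registers the external geometry callbacks with the collsel extension and
 * invokes Collsel_table_generate to write the results to
 * MPIDI_Collsel_output_file.
 */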
void MPIDI_Collsel_table_generate()
{
  external_geometry_ops_t external_ops;
  external_ops.geometry_create     = MPIDI_Comm_create_from_pami_geom;
  external_ops.geometry_destroy    = MPIDI_Comm_destroy_external;
  external_ops.register_algorithms = MPIDI_Register_algorithms_ext;
  pami_result_t status = PAMI_SUCCESS;
  pami_extension_collsel_table_generate pamix_collsel_table_generate =
    (pami_extension_collsel_table_generate) PAMI_Extension_symbol (MPIDI_Collsel_extension, "Collsel_table_generate");

  status = pamix_collsel_table_generate (MPIDI_Collsel_advisor, MPIDI_Collsel_output_file, &MPIDI_Collsel_advisor_params, &external_ops, 1);
  if(status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error. The collsel_table_generate failed. result = %d\n", status);
  }

}


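/**
 * \brief Determine how many PAMI contexts to use and create them.
 *
 * Validates the async progress configuration, clamps the requested context
 * count to what PAMI and MPICH allow (rounded down to a power of two in
 * 'per object' lock mode, exactly one context in 'global' lock mode),
 * allocates the out-of-order and trace bookkeeping arrays when enabled, and
 * creates the contexts with PAMI_Context_createv before opening the
 * collective selection and PE extensions where applicable.
 */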
static void
MPIDI_PAMI_context_init(int* threading, int *size)
{
#ifdef TRACE_ON
  int requested_thread_level;
  requested_thread_level = *threading;
#endif
  int  numTasks;

#if (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY_PER_OBJECT)
  /*
   * ASYNC_PROGRESS_MODE_LOCKED requires context post because the async thread
   * will hold the context lock indefinitely; the only option for an application
   * thread to interact with the context is to use context post. See discussion
   * in src/mpid/pamid/src/mpid_progress.h for more information.
   *
   * There are three possible resolutions for the situation when context post is
   * disabled and async progess mode is 'locked':
   *  1. abort
   *  2. silently enable context post
   *  3. silently demote async progress mode to ASYNC_PROGRESS_MODE_TRIGGER
   *
   * For now this configuration is considered erroneous and mpi will abort.
   */
  if (MPIDI_Process.async_progress.mode == ASYNC_PROGRESS_MODE_LOCKED &&
      MPIDI_Process.perobj.context_post.requested == 0)
    MPID_Abort (NULL, 0, 1, "'locking' async progress requires context post");

#else /* MPICH_THREAD_GRANULARITY != MPICH_THREAD_GRANULARITY_PER_OBJECT */
  /*
   * ASYNC_PROGRESS_MODE_LOCKED is not applicable in the "global lock" thread
   * mode. See discussion in src/mpid/pamid/src/mpid_progress.h for more
   * information.
   *
   * This configuration is considered erroneous and mpi will abort.
   */
  if (MPIDI_Process.async_progress.mode == ASYNC_PROGRESS_MODE_LOCKED)
    MPID_Abort (NULL, 0, 1, "'locking' async progress not applicable");
#endif

  /* ----------------------------------
   *  Figure out the context situation
   * ---------------------------------- */
#if (MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY_PER_OBJECT)

  /* Limit the number of requested contexts by the maximum number of contexts
   * allowed.  The default number of requested contexts depends on the mpich
   * lock mode, 'global' or 'perobj', and may be changed before this point
   * by an environment variable.
   */
  if (MPIDI_Process.avail_contexts > MPIDI_MAX_CONTEXTS)
    MPIDI_Process.avail_contexts = MPIDI_MAX_CONTEXTS;

  unsigned same = PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_CONST_CONTEXTS).value.intval;
  if (same)
    {
      /* Determine the maximum number of contexts supported; limit the number of
       * requested contexts by this value.
       */
      unsigned possible_contexts = PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_NUM_CONTEXTS).value.intval;
      TRACE_ERR("PAMI allows up to %u contexts; MPICH allows up to %u\n",
                possible_contexts, MPIDI_Process.avail_contexts);
      if (MPIDI_Process.avail_contexts > possible_contexts)
        MPIDI_Process.avail_contexts = possible_contexts;
    }
  else
    {
      /* If PAMI didn't give all nodes the same number of contexts, all bets
       * are off for now */
      MPIDI_Process.avail_contexts = 1;
    }

  /* The number of contexts must be a power-of-two, as required by the
   * MPIDI_Context_hash() function. Decrement until we hit a power-of-two */
  while(MPIDI_Process.avail_contexts & (MPIDI_Process.avail_contexts-1))
    --MPIDI_Process.avail_contexts;
  MPID_assert_always(MPIDI_Process.avail_contexts);

#else /* (MPICH_THREAD_GRANULARITY != MPICH_THREAD_GRANULARITY_PER_OBJECT) */

  /* Only a single context is supported in the 'global' mpich lock mode.
   *
   * A multi-context application will always perform better with the
   * 'per object' mpich lock mode - regardless of whether async progress is
   * enabled or not. This is because all threads, application and async
   * progress, must acquire the single global lock which effectively serializes
   * the threads and negates any benefit of multiple contexts.
   *
   * This single context limitation removes code and greatly simplifies logic.
   */
  MPIDI_Process.avail_contexts = 1;

#endif

  TRACE_ERR ("Thread-level=%d, requested=%d\n", *threading, requested_thread_level);

  MPIDI_Process.numTasks= numTasks = PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_NUM_TASKS).value.intval;
#ifdef OUT_OF_ORDER_HANDLING
  MPIDI_In_cntr = MPIU_Calloc0(numTasks, MPIDI_In_cntr_t);
  if(MPIDI_In_cntr == NULL)
    MPID_abort();
  MPIDI_Out_cntr = MPIU_Calloc0(numTasks, MPIDI_Out_cntr_t);
  if(MPIDI_Out_cntr == NULL)
    MPID_abort();
  memset((void *) MPIDI_In_cntr, 0, numTasks * sizeof(MPIDI_In_cntr_t));
  memset((void *) MPIDI_Out_cntr, 0, numTasks * sizeof(MPIDI_Out_cntr_t));
#endif


#ifdef MPIDI_TRACE
      int i;
      MPIDI_Trace_buf = MPIU_Calloc0(numTasks, MPIDI_Trace_buf_t);
      if(MPIDI_Trace_buf == NULL) MPID_abort();
      memset((void *) MPIDI_Trace_buf, 0, numTasks * sizeof(MPIDI_Trace_buf_t));
      for (i=0; i < numTasks; i++) {
          MPIDI_Trace_buf[i].R=MPIU_Calloc0(N_MSGS, recv_status);
          if (MPIDI_Trace_buf[i].R==NULL) MPID_abort();
          MPIDI_Trace_buf[i].PR=MPIU_Calloc0(N_MSGS, posted_recv);
          if (MPIDI_Trace_buf[i].PR ==NULL) MPID_abort();
          MPIDI_Trace_buf[i].S=MPIU_Calloc0(N_MSGS, send_status);
          if (MPIDI_Trace_buf[i].S ==NULL) MPID_abort();
      }
#endif

  /* ----------------------------------- */
  /*  Create the communication contexts  */
  /* ----------------------------------- */
  TRACE_ERR("Creating %d contexts\n", MPIDI_Process.avail_contexts);
  pami_result_t rc = PAMI_ERROR;
  pami_configuration_t config[3];
  int  cfgval=0;
  config[cfgval].name = PAMI_CLIENT_CONST_CONTEXTS;
  config[cfgval].value.intval = 1;
  cfgval++;
#ifndef HAVE_ERROR_CHECKING
#ifdef OUT_OF_ORDER_HANDLING
  /* disable parameter checking in PAMI - fast library only */
  config[cfgval].name = PAMI_CONTEXT_CHECK_PARAM;
  config[cfgval].value.intval = 0;
  cfgval++;
#endif
#endif
  rc = PAMI_Context_createv(MPIDI_Client, config, cfgval, MPIDI_Context, MPIDI_Process.avail_contexts);

  MPID_assert_always(rc == PAMI_SUCCESS);

  /* --------------------------------------------- */
  /* Get collective selection advisor and cache it */
  /* --------------------------------------------- */
  /* The contexts are created, i.e. the collective selection extension is
     initialized in PAMI.  If not in TUNE mode, the advisor can be obtained
     now; in TUNE mode, collsel is initialized here and the table is
     generated later.  This is not supported on BGQ.
  */
#ifndef __BGQ__
  MPIDI_Init_collsel_extension();
#endif

#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
  MPIDI_open_pe_extension();
#endif
}


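/**
 * \brief Register a single point-to-point dispatch on every context.
 *
 * Sets the receive callback and dispatch hints for one protocol on all
 * available contexts, queries the resulting immediate-send maximum, and
 * asserts that it is at least the protocol's required minimum.  If
 * immediate_max is non-NULL it is lowered to the smallest value seen so far,
 * giving the caller the minimum across protocols.
 *
 * \param[in]     dispatch      Dispatch id; must match proto->dispatch
 * \param[in,out] proto         Protocol descriptor from proto_list
 * \param[in,out] immediate_max Running minimum of the immediate-send limit, or NULL
 */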
static void
MPIDI_PAMI_dispath_set(size_t              dispatch,
                       struct protocol_t * proto,
                       unsigned          * immediate_max)
{
  size_t im_max = 0;
  pami_dispatch_callback_function Recv = {.p2p = proto->func};
  MPID_assert_always(dispatch == proto->dispatch);

  if (MPIDI_Process.shmem_pt2pt == 0)
    proto->options.use_shmem = PAMI_HINT_DISABLE;

  PAMIX_Dispatch_set(MPIDI_Context,
                     MPIDI_Process.avail_contexts,
                     proto->dispatch,
                     Recv,
                     proto->options,
                     &im_max);
  TRACE_ERR("Immediate-max query:  dispatch=%zu  got=%zu  required=%zu\n",
            dispatch, im_max, proto->immediate_min);
  MPID_assert_always(proto->immediate_min <= im_max);
  if ((immediate_max != NULL) && (im_max < *immediate_max))
    *immediate_max = im_max;
}


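/**
 * \brief Register all point-to-point and one-sided dispatch functions.
 *
 * Sets up every protocol in proto_list, then clamps the application and
 * internal immediate (short) limits to the smallest immediate-send maximum
 * reported for the two short protocols, minus the message header.  When
 * token flow control is enabled, the per-task token counters are also
 * allocated and initialized.
 */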
static void
MPIDI_PAMI_dispath_init()
{
#ifdef OUT_OF_ORDER_HANDLING
  {
    pami_configuration_t config;
    pami_result_t        rc = PAMI_ERROR;

    memset(&config, 0, sizeof(config));
    config.name = PAMI_DISPATCH_SEND_IMMEDIATE_MAX;
    rc = PAMI_Dispatch_query(MPIDI_Context[0], (size_t)0, &config, 1);
    if ( rc == PAMI_SUCCESS )
      {
        TRACE_ERR("PAMI_DISPATCH_SEND_IMMEDIATE_MAX=%d.\n", config.value.intval, rc);
        MPIDI_Process.pt2pt.limits_array[2] = config.value.intval;
      }
    else
      {
        TRACE_ERR(" Attention: PAMI_Client_query(DISPATCH_SEND_IMMEDIATE_MAX=%d) rc=%d\n", config.name, rc);
        MPIDI_Process.pt2pt.limits_array[2] = 256;
      }

    MPIDI_Process.pt2pt.limits_array[3] = MPIDI_Process.pt2pt.limits_array[2];
    MPIDI_Process.pt2pt.limits_array[6] = MPIDI_Process.pt2pt.limits_array[2];
    MPIDI_Process.pt2pt.limits_array[7] = MPIDI_Process.pt2pt.limits_array[2];
  }
#endif
  /* ------------------------------------ */
  /*  Set up the communication protocols  */
  /* ------------------------------------ */
  unsigned send_immediate_max_bytes = (unsigned) -1;
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Short,     &proto_list.Short,     &send_immediate_max_bytes);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_ShortSync, &proto_list.ShortSync, &send_immediate_max_bytes);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Eager,     &proto_list.Eager,     NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_RVZ,       &proto_list.RVZ,       NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Cancel,    &proto_list.Cancel,    NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Control,   &proto_list.Control,   NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinCtrl,   &proto_list.WinCtrl,   NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinAccum,  &proto_list.WinAccum,  NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_RVZ_zerobyte, &proto_list.RVZ_zerobyte, NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinGetAccum, &proto_list.WinGetAccum, NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinGetAccumAck, &proto_list.WinGetAccumAck, NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinAtomic, &proto_list.WinAtomic,   NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_WinAtomicAck, &proto_list.WinAtomicAck,   NULL);

#ifdef DYNAMIC_TASKING
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Dyntask,   &proto_list.Dyntask,  NULL);
  MPIDI_PAMI_dispath_set(MPIDI_Protocols_Dyntask_disconnect,   &proto_list.Dyntask_disconnect,  NULL);
#endif

  /*
   * The first two protocols are our short protocols: they use
   * PAMI_Send_immediate() exclusively.  We get the short limit twice
   * because they could be different.
   *
   * - The returned value is the max amount of header+data.  We have
   *     to remove the header size.
   *
   * - We need to add one back, since we don't use "=" in the
   *     comparison.  We use "if (size < short_limit) ...".
   *
   * - We use the min of the results just to be safe.
   */
  send_immediate_max_bytes -= (sizeof(MPIDI_MsgInfo) - 1);

  if (MPIDI_Process.pt2pt.limits.application.immediate.remote > send_immediate_max_bytes)
    MPIDI_Process.pt2pt.limits.application.immediate.remote = send_immediate_max_bytes;

  if (MPIDI_Process.pt2pt.limits.application.immediate.local > send_immediate_max_bytes)
    MPIDI_Process.pt2pt.limits.application.immediate.local = send_immediate_max_bytes;

  if (MPIDI_Process.pt2pt.limits.internal.immediate.remote > send_immediate_max_bytes)
    MPIDI_Process.pt2pt.limits.internal.immediate.remote = send_immediate_max_bytes;

  if (MPIDI_Process.pt2pt.limits.internal.immediate.local > send_immediate_max_bytes)
    MPIDI_Process.pt2pt.limits.internal.immediate.local = send_immediate_max_bytes;

  if (TOKEN_FLOW_CONTROL_ON)
    {
#if TOKEN_FLOW_CONTROL
      int i;
      MPIDI_mm_init(MPIDI_Process.numTasks, &MPIDI_Process.pt2pt.limits.application.eager.remote, &MPIDI_Process.mp_buf_mem);
      MPIDI_Token_cntr = MPIU_Calloc0(MPIDI_Process.numTasks, MPIDI_Token_cntr_t);
      memset((void *) MPIDI_Token_cntr, 0, (sizeof(MPIDI_Token_cntr_t) * MPIDI_Process.numTasks));
      for (i=0; i < MPIDI_Process.numTasks; i++)
        {
          MPIDI_Token_cntr[i].tokens = MPIDI_tfctrl_enabled;
        }
#else
      MPID_assert_always(0);
#endif
    }
}



extern char **environ;
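/**
 * \brief Print every environment variable whose name begins with the given
 *        prefix (case-insensitive), e.g. "PAMID_".
 */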
static void
printEnvVars(char *type)
{
   printf("The following %s* environment variables were specified:\n", type);
   char **env;
   for(env = environ; *env != 0 ; env++)
   {
      if(!strncasecmp(*env, type, strlen(type)))
        printf("  %s\n", *env);
   }
}


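/**
 * \brief Initialize the PAMI contexts and dispatches, then report the
 *        configuration.
 *
 * When verbose output is requested, rank 0 prints the MPIDI_Process
 * settings, the MPI thread level, and the relevant environment variables;
 * with MPIDI_BANNER it also prints the library banner.
 */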
static void
MPIDI_PAMI_init(int* rank, int* size, int* threading)
{
  MPIDI_PAMI_context_init(threading, size);


  MPIDI_PAMI_dispath_init();


  if ( (*rank == 0) && (MPIDI_Process.verbose >= MPIDI_VERBOSE_SUMMARY_0) )
    {
      printf("MPIDI_Process.*\n"
             "  verbose               : %u\n"
             "  statistics            : %u\n"
             "  contexts              : %u\n"
             "  async_progress        : %u\n"
             "  context_post          : %u\n"
             "  pt2pt.limits\n"
             "    application\n"
             "      eager\n"
             "        remote, local   : %u, %u\n"
             "      short\n"
             "        remote, local   : %u, %u\n"
             "    internal\n"
             "      eager\n"
             "        remote, local   : %u, %u\n"
             "      short\n"
             "        remote, local   : %u, %u\n"
             "  rma_pending           : %u\n"
             "  shmem_pt2pt           : %u\n"
             "  disable_internal_eager_scale : %u\n"
#if TOKEN_FLOW_CONTROL
             "  mp_buf_mem               : %u\n"
             "  mp_buf_mem_max           : %u\n"
             "  is_token_flow_control_on : %u\n"
#endif
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
             "  mp_infolevel : %u\n"
             "  mp_statistics: %u\n"
             "  mp_printenv  : %u\n"
             "  mp_interrupts: %u\n"
#endif
#ifdef QUEUE_BINARY_SEARCH_SUPPORT
             "  queue_binary_search_support_on : %u\n"
#endif
             "  optimized.collectives : %u\n"
             "  optimized.select_colls: %u\n"
             "  optimized.subcomms    : %u\n"
             "  optimized.memory      : %u\n"
             "  optimized.num_requests: %u\n"
             "  mpir_nbc              : %u\n" 
             "  numTasks              : %u\n",
             "  typed_onesided        : %u\n",
             MPIDI_Process.verbose,
             MPIDI_Process.statistics,
             MPIDI_Process.avail_contexts,
             MPIDI_Process.async_progress.mode,
             MPIDI_Process.perobj.context_post.requested,
             MPIDI_Process.pt2pt.limits_array[0],
             MPIDI_Process.pt2pt.limits_array[1],
             MPIDI_Process.pt2pt.limits_array[2],
             MPIDI_Process.pt2pt.limits_array[3],
             MPIDI_Process.pt2pt.limits_array[4],
             MPIDI_Process.pt2pt.limits_array[5],
             MPIDI_Process.pt2pt.limits_array[6],
             MPIDI_Process.pt2pt.limits_array[7],
             MPIDI_Process.rma_pending,
             MPIDI_Process.shmem_pt2pt,
             MPIDI_Process.disable_internal_eager_scale,
#if TOKEN_FLOW_CONTROL
             MPIDI_Process.mp_buf_mem,
             MPIDI_Process.mp_buf_mem_max,
             MPIDI_Process.is_token_flow_control_on,
#endif
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
             MPIDI_Process.mp_infolevel,
             MPIDI_Process.mp_statistics,
             MPIDI_Process.mp_printenv,
             (MPIDI_Process.async_progress.mode != ASYNC_PROGRESS_MODE_DISABLED),
#endif
#ifdef QUEUE_BINARY_SEARCH_SUPPORT
             MPIDI_Process.queue_binary_search_support_on,
#endif
             MPIDI_Process.optimized.collectives,
             MPIDI_Process.optimized.select_colls,
             MPIDI_Process.optimized.subcomms,
             MPIDI_Process.optimized.memory,
             MPIDI_Process.optimized.num_requests,
             MPIDI_Process.mpir_nbc, 
             MPIDI_Process.numTasks,
             MPIDI_Process.typed_onesided);
      switch (*threading)
        {
          case MPI_THREAD_MULTIPLE:
            printf("mpi thread level        : 'MPI_THREAD_MULTIPLE'\n");
            break;
          case MPI_THREAD_SERIALIZED:
            printf("mpi thread level        : 'MPI_THREAD_SERIALIZED'\n");
            break;
          case MPI_THREAD_FUNNELED:
            printf("mpi thread level        : 'MPI_THREAD_FUNNELED'\n");
            break;
          case MPI_THREAD_SINGLE:
            printf("mpi thread level        : 'MPI_THREAD_SINGLE'\n");
            break;
        }
      printf("MPICH_THREAD_GRANULARITY : '%s'\n",
             (MPICH_THREAD_GRANULARITY==MPICH_THREAD_GRANULARITY_PER_OBJECT)?"per object":"global");
#ifdef ASSERT_LEVEL
      printf("ASSERT_LEVEL            : %d\n", ASSERT_LEVEL);
#else
      printf("ASSERT_LEVEL            : not defined\n");
#endif
#ifdef MPICH_LIBDIR
      printf("MPICH_LIBDIR           : %s\n", MPICH_LIBDIR);
#else
      printf("MPICH_LIBDIR           : not defined\n");
#endif
      printEnvVars("MPICH_");
      printEnvVars("PAMID_");
      printEnvVars("PAMI_");
      printEnvVars("COMMAGENT_");
      printEnvVars("MUSPI_");
      printEnvVars("BG_");
    }
#ifdef MPIDI_BANNER
  if ((*rank == 0) && (MPIDI_Process.mp_infolevel >=1)) {
       char* buf = (char *) MPIU_Malloc(160);
       int   rc  = MPIDI_Banner(buf);
       if ( rc == 0 )
            fprintf(stderr, "%s\n", buf);
       else
            TRACE_ERR("mpid_banner() return code=%d task %d",rc,*rank);
       MPIU_Free(buf);
  }
#endif
}

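/**
 * \brief Build the virtual connection reference tables for MPI_COMM_SELF
 *        and MPI_COMM_WORLD.
 *
 * Without dynamic tasking the task ids are simply 0..size-1.  With dynamic
 * tasking they come from the world_tasks string and the process group
 * table, and a reference is taken on each process group entry.
 */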
#ifndef DYNAMIC_TASKING
static void
MPIDI_VCRT_init(int rank, int size)
#else
static void
MPIDI_VCRT_init(int rank, int size, char *world_tasks, MPIDI_PG_t *pg)
#endif
{
  int i, rc;
  MPID_Comm * comm;
#ifdef DYNAMIC_TASKING
  int p, mpi_errno=0;
  char *world_tasks_save,*cp;
  char *pg_id;
#endif

  /* ------------------------------- */
  /* Initialize MPI_COMM_SELF object */
  /* ------------------------------- */
  comm = MPIR_Process.comm_self;
  comm->rank = 0;
  comm->remote_size = comm->local_size = 1;
  rc = MPID_VCRT_Create(comm->remote_size, &comm->vcrt);
  MPID_assert_always(rc == MPI_SUCCESS);
  rc = MPID_VCRT_Get_ptr(comm->vcrt, &comm->vcr);
  MPID_assert_always(rc == MPI_SUCCESS);
  comm->vcr[0]->taskid= PAMIX_Client_query(MPIDI_Client, PAMI_CLIENT_TASK_ID  ).value.intval;

#ifdef DYNAMIC_TASKING
  if(mpidi_dynamic_tasking) {
    comm->vcr[0]->pg=pg->vct[rank].pg;
    comm->vcr[0]->pg_rank=pg->vct[rank].pg_rank;
    pg->vct[rank].taskid = comm->vcr[0]->taskid;
    if(comm->vcr[0]->pg) {
      TRACE_ERR("Adding ref for comm=%x vcr=%x pg=%x\n", comm, comm->vcr[0], comm->vcr[0]->pg);
      MPIDI_PG_add_ref(comm->vcr[0]->pg);
    }
    comm->local_vcr = comm->vcr;
  }
 
#endif

  /* -------------------------------- */
  /* Initialize MPI_COMM_WORLD object */
  /* -------------------------------- */
  comm = MPIR_Process.comm_world;
  comm->rank = rank;
  comm->remote_size = comm->local_size = size;
  rc = MPID_VCRT_Create(comm->remote_size, &comm->vcrt);
  MPID_assert_always(rc == MPI_SUCCESS);
  rc = MPID_VCRT_Get_ptr(comm->vcrt, &comm->vcr);
  MPID_assert_always(rc == MPI_SUCCESS);

#ifdef DYNAMIC_TASKING
  if(mpidi_dynamic_tasking) {
    i=0;
    world_tasks_save = MPIU_Strdup(world_tasks);
    if(world_tasks != NULL) {
      comm->vcr[0]->taskid = atoi(strtok(world_tasks, ":"));
      while( (cp=strtok(NULL, ":")) != NULL) {
        comm->vcr[++i]->taskid= atoi(cp);
      }
    }
    MPIU_Free(world_tasks_save);

        /* This memory will be freed by the PG_Destroy if there is an error */
        pg_id = MPIU_Malloc(MAX_JOBID_LEN);

        mpi_errno = PMI2_Job_GetId(pg_id, MAX_JOBID_LEN);
        TRACE_ERR("PMI2_Job_GetId - pg_id=%s\n", pg_id);

    /* Initialize the connection table on COMM_WORLD from the process group's
       connection table */
    for (p = 0; p < comm->local_size; p++)
    {
	  comm->vcr[p]->pg=pg->vct[p].pg;
          comm->vcr[p]->pg_rank=pg->vct[p].pg_rank;
          pg->vct[p].taskid = comm->vcr[p]->taskid;
	  if(comm->vcr[p]->pg) {
            TRACE_ERR("Adding ref for comm=%x vcr=%x pg=%x\n", comm, comm->vcr[p], comm->vcr[p]->pg);
            MPIDI_PG_add_ref(comm->vcr[p]->pg);
	  }
       /* MPID_VCR_Dup(&pg->vct[p], &(comm->vcr[p]));*/
	  TRACE_ERR("comm->vcr[%d]->pg->id=%s comm->vcr[%d]->pg_rank=%d\n", p, comm->vcr[p]->pg->id, p, comm->vcr[p]->pg_rank);
	  TRACE_ERR("TASKID -- comm->vcr[%d]=%d\n", p, comm->vcr[p]->taskid);
    }

   comm->local_vcr = comm->vcr;
  }else {
	for (i=0; i<size; i++) {
	  comm->vcr[i]->taskid = i;
	  TRACE_ERR("comm->vcr[%d]=%d\n", i, comm->vcr[i]->taskid);
        }
	TRACE_ERR("MP_I_WORLD_TASKS not SET\n");
  }
#else
  for (i=0; i<size; i++) {
    comm->vcr[i]->taskid = i;
    TRACE_ERR("comm->vcr[%d]=%d\n", i, comm->vcr[i]->taskid);
  }
#endif
}


/**
 * \brief Initialize MPICH at ADI level.
 * \param[in,out] argc Unused
 * \param[in,out] argv Unused
 * \param[in]     requested The thread model requested by the user.
 * \param[out]    provided  The thread model provided to user.  It is the same as requested, except in VNM.
 * \param[out]    has_args  Set to TRUE
 * \param[out]    has_env   Set to TRUE
 * \return MPI_SUCCESS
 */
int MPID_Init(int * argc,
              char *** argv,
              int   requested,
              int * provided,
              int * has_args,
              int * has_env)
{
  int rank, size;
#ifdef DYNAMIC_TASKING
  int has_parent=0;
  MPIDI_PG_t * pg=NULL;
  int pg_rank=-1;
  int pg_size;
  int appnum,mpi_errno;
  MPID_Comm * comm;
  int i,j;
  pami_configuration_t config;
  int world_size;
#endif
  char *world_tasks;
  pami_result_t rc;

  /* Override split_type */
  MPID_Comm_fns = &comm_fns;

  /* ------------------------------------------------------------------------------- */
  /*  Initialize the pami client to get the process rank; needed for env var output. */
  /* ------------------------------------------------------------------------------- */
  MPIDI_PAMI_client_init(&rank, &size, &mpidi_dynamic_tasking, &world_tasks);
  TRACE_OUT("after MPIDI_PAMI_client_init rank=%d size=%d mpidi_dynamic_tasking=%d\n", rank, size, mpidi_dynamic_tasking);

  /* ------------------------------------ */
  /*  Get new defaults from the Env Vars  */
  /* ------------------------------------ */
  MPIDI_Env_setup(rank, requested);

  /* ----------------------------- */
  /* Initialize messager           */
  /* ----------------------------- */
  if ( (MPIDI_Process.async_progress.mode == ASYNC_PROGRESS_MODE_TRIGGER) || mpidi_dynamic_tasking)
  {
    /* The 'trigger' async progress mode requires MPI_THREAD_MULTIPLE.
     * Silently promote the thread level.
     *
     * See discussion in src/mpid/pamid/src/mpid_progress.h for more
     * information.
     */
    *provided = MPI_THREAD_MULTIPLE;
  }
  else
  {
    *provided = requested;
  }
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
   if (requested != MPI_THREAD_MULTIPLE)
       mpich_env->single_thread=1;
#endif
  MPIDI_PAMI_init(&rank, &size, provided);

#ifdef DYNAMIC_TASKING
  if (mpidi_dynamic_tasking) {

    /*
     * Perform PMI initialization
     */
    mpi_errno = MPIDI_InitPG( argc, argv,
			      has_args, has_env, &has_parent, &pg_rank, &pg );
    if (mpi_errno) {
	TRACE_ERR("MPIDI_InitPG returned with mpi_errno=%d\n", mpi_errno);
    }

    /* FIXME: Why are pg_size and pg_rank handled differently? */
    pg_size = MPIDI_PG_Get_size(pg);

    TRACE_ERR("MPID_Init - pg_size=%d\n", pg_size);
    MPIDI_Process.my_pg = pg;  /* brad : this is rework for shared memories
				* because they need this set earlier
                                * for getting the business card
                                */
    MPIDI_Process.my_pg_rank = pg_rank;

  }
#endif

  /* ------------------------- */
  /* initialize request queues */
  /* ------------------------- */
  MPIDI_Recvq_init();

  /* -------------------------------------- */
  /* Fill in some hardware structure fields */
  /* -------------------------------------- */
  extern void MPIX_Init();
  MPIX_Init();

  /* ------------------------------- */
  /* Set process attributes          */
  /* ------------------------------- */
  MPIR_Process.attrs.tag_ub = INT_MAX;
  MPIR_Process.attrs.wtime_is_global = 1;
  MPIR_Process.attrs.io   = MPI_ANY_SOURCE;


  /* ------------------------------- */
  /* Initialize communicator objects */
  /* ------------------------------- */
#ifndef DYNAMIC_TASKING
  MPIDI_VCRT_init(rank, size);
#else
  MPIDI_VCRT_init(rank, size, world_tasks, pg);
#endif

  /* ------------------------------- */
  /* Setup optimized communicators   */
  /* ------------------------------- */
  TRACE_ERR("creating world geometry\n");
  rc = PAMI_Geometry_world(MPIDI_Client, &MPIDI_Process.world_geometry);
  MPID_assert_always(rc == PAMI_SUCCESS);
  TRACE_ERR("calling comm_create on comm world %p\n", MPIR_Process.comm_world);
  MPIR_Process.comm_world->mpid.geometry = MPIDI_Process.world_geometry;
  MPIR_Process.comm_world->mpid.parent   = PAMI_GEOMETRY_NULL;
  MPIR_Comm_commit(MPIR_Process.comm_world);

#ifdef DYNAMIC_TASKING
  if (has_parent) {
     char * parent_port;

     /* FIXME: To allow just the "root" process to
        request the port and then use MPIR_Bcast_intra to
        distribute it to the rest of the processes,
        we need to perform the Bcast after MPI is
        otherwise initialized.  We could do this
        by adding another MPID call that the MPI_Init(_thread)
        routine would make after the rest of MPI is
        initialized, but before MPI_Init returns.
        In fact, such a routine could be used to
        perform various checks, including parameter
        consistency value (e.g., all processes have the
        same environment variable values). Alternately,
        we could allow a few routines to operate with
        predefined parameter choices (e.g., bcast, allreduce)
        for the purposes of initialization. */
	mpi_errno = MPIDI_GetParentPort(&parent_port);
	if (mpi_errno != MPI_SUCCESS) {
          TRACE_ERR("MPIDI_GetParentPort returned with mpi_errno=%d\n", mpi_errno);
	}

	mpi_errno = MPID_Comm_connect(parent_port, NULL, 0,
				      MPIR_Process.comm_world, &comm);
	if (mpi_errno != MPI_SUCCESS) {
	    TRACE_ERR("mpi_errno from Comm_connect=%d\n", mpi_errno);
	}

	MPIR_Process.comm_parent = comm;
	MPIU_Assert(MPIR_Process.comm_parent != NULL);
	MPIU_Strncpy(comm->name, "MPI_COMM_PARENT", MPI_MAX_OBJECT_NAME);

	/* FIXME: Check that this intercommunicator gets freed in MPI_Finalize
	   if not already freed.  */
   }
  mp_world_exiting_handler = &(_mpi_world_exiting_handler);
#endif
  /* ------------------------------- */
  /* Initialize timer data           */
  /* ------------------------------- */
  MPID_Wtime_init();


  /* ------------------------------- */
  /* ???                             */
  /* ------------------------------- */
  *has_args = TRUE;
  *has_env  = TRUE;
#ifdef MPIDI_PRINTENV
  if (MPIDI_Process.mp_printenv) {
      MPIDI_Print_mpenv(rank,size);
  }
#endif
  /* ----------------------------------------------- */
  /* parse params for pami_tune if in benchmark mode */
  /* ----------------------------------------------- */
  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_TUNE)
  {
    if(argc != NULL && argv != NULL)
    {
      if(MPIDI_collsel_pami_tune_parse_params(*argc, *argv) != PAMI_SUCCESS)
      {
        MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
      }
    }
    else
    {
      if(MPIDI_collsel_pami_tune_parse_params(0, NULL) != PAMI_SUCCESS)
      {
        MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
      }
    }
  }
  return MPI_SUCCESS;
}


/**
 * \brief This is called by MPI to let us know that MPI_Init is done.
 */
int MPID_InitCompleted()
{
  MPIDI_NBC_init();
  MPIDI_Progress_init();
  /* ----------------------------------------------- */
  /*    Now all is ready.. call table generate       */
  /* ----------------------------------------------- */
  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_TUNE)
  {
    MPIDI_Collsel_table_generate();
    MPIDI_collsel_pami_tune_cleanup();
  }
  return MPI_SUCCESS;
}

#if (MPIDI_PRINTENV || MPIDI_STATISTICS || MPIDI_BANNER)
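/**
 * \brief Open the PAMI "EXT_pe_extension" extension used for PE statistics
 *        and environment reporting; aborts if it cannot be opened.
 */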
void MPIDI_open_pe_extension() {
    int rc;
     /* open PE extension       */
     memset(&pe_extension,0, sizeof(pami_extension_t));
     rc = PAMI_Extension_open (MPIDI_Client, "EXT_pe_extension", &pe_extension);
     TRACE_ERR("PAMI_Extension_open: rc %d\n", rc);
     if (rc != PAMI_SUCCESS) {
         TRACE_ERR("ERROR open PAMI_Extension_open failed rc %d", rc);
         MPID_assert_always(rc == PAMI_SUCCESS);
     }
}


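/**
 * \brief Format the MPICH banner printed when MPIDI_Process.mp_infolevel >= 1.
 *
 * Combines the address-space size, the IBM release level (when the version
 * string was expanded by CMVC), and the library build date into bufPtr.
 *
 * \param[out] bufPtr Caller-supplied buffer receiving the banner text
 * \return MPI_SUCCESS, or 1 if the build date could not be parsed
 */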
int MPIDI_Banner(char * bufPtr) {
    char  *cp, *level=NULL;
    char buf[30];
    char *ASC_time;
    time_t  ltime;
    char msgBuf[60];
    char type[64], ver_buf[64];
    struct  tm  *tmx,*tm1;

    /* Note: The _ibm_release_version_ will be expanded to a full string  */
    /*       ONLY IF this file is extracted from CMVC.                    */
    /*       If this file is cloned from GIT then the string will be      */
    /*       just "%W%".                                                  */
    if ( strncmp(_ibm_release_version_, "%W", 2) ) {
       /* IBMPE's expanded version string has release name like ppe_rbarlx */
       /* and that '_' in the name is what we are looking for.             */
       /* BGQ's version string does not have a '_' in it.                  */
       level = strrchr(_ibm_release_version_, '_');
       if ( level ) {
          level -=3;

          /* The version string generated by CMVC during weekly build has a */
          /* date which is the time when the mpidi_platform.h file is last  */
          /* updated.  This date can be quite old and is better removed.    */
          memset(ver_buf, 0, sizeof(ver_buf));
          strncpy(ver_buf, level, sizeof(ver_buf)-1);
          if ( (cp = strchr(ver_buf, ',')) != NULL ) *cp = '\0';
       }
    }

    if(sizeof(void*) == 8)
      strcpy(type, "64bit (MPI over PAMI)");
    else if(sizeof(int) == 4)
      strcpy(type, "32bit (MPI over PAMI)");
    else
      strcpy(type, "UNKNOWN-bit (MPI over PAMI)");

    sprintf(msgBuf,"MPICH library was compiled on");

    tmx=MPIU_Malloc(sizeof(struct tm));
    sprintf(buf,__DATE__" "__TIME__);

    if ((void *) NULL == strptime(buf, "%B %d %Y %T", tmx))
    {
       MPIU_Free(tmx);
       return(1);
    }

   /*  update isdst in tmx structure    */
    ltime=0;
    time(&ltime);
    tm1 = localtime(&ltime);
    tmx->tm_isdst=tm1->tm_isdst;

   /* localtime updates tm_wday in tmx structure  */
    ltime=mktime(tmx);
    tm1 = localtime(&ltime);
    tmx->tm_wday = tm1->tm_wday;
    ASC_time = asctime(tmx);

    if (level) {
       sprintf(bufPtr, "%s %s %s %s ", type, ver_buf, msgBuf, ASC_time);
    } else {
       sprintf(bufPtr, "%s %s %s ", type, msgBuf, ASC_time);

    }

    MPIU_Free(tmx);
    return MPI_SUCCESS;
}
#endif


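/**
 * \brief Compile-time checks of the type-size assumptions made by this device.
 */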
static inline void
static_assertions()
{
  MPID_assert_static(sizeof(void*) == sizeof(size_t));
  MPID_assert_static(sizeof(uintptr_t) == sizeof(size_t));
#ifdef __BGQ__
  /* MPID_VCR_GET_LPIDS relies on the VCR being a simple task list */
  MPID_VCR vcr=NULL;
  MPID_assert_static(sizeof(*vcr) == sizeof(pami_task_t));/* VCR is a simple task list */
  MPID_assert_static(sizeof(vcr->taskid) == sizeof(*vcr));/* VCR is a simple task list */

  MPID_assert_static(sizeof(MPIDI_MsgInfo) == 16);
  MPID_assert_static(sizeof(uint64_t) == sizeof(size_t));
#endif
}

#ifdef DYNAMIC_TASKING
/* FIXME: The PG code should supply these, since it knows how the
   pg_ids and other data are represented */
int MPIDI_PG_Compare_ids(void * id1, void * id2)
{
    return (strcmp((char *) id1, (char *) id2) == 0) ? TRUE : FALSE;
}

int MPIDI_PG_Destroy_id(MPIDI_PG_t * pg)
{
    if (pg->id != NULL)
    {
	TRACE_ERR("free pg id =%p pg=%p\n", pg->id, pg);
	MPIU_Free(pg->id);
	TRACE_ERR("done free pg id \n");
    }

    return MPI_SUCCESS;
}


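/**
 * \brief Perform PMI (or PMI2) initialization and create the process group
 *        for MPI_COMM_WORLD.
 *
 * Queries the rank, size, appnum, and process group id from the process
 * manager, initializes the process group tracking subsystem, and returns
 * the new process group and this task's rank within it.
 */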
int MPIDI_InitPG( int *argc, char ***argv,
	          int *has_args, int *has_env, int *has_parent,
	          int *pg_rank_p, MPIDI_PG_t **pg_p )
{
    int pmi_errno;
    int mpi_errno = MPI_SUCCESS;
    int pg_rank, pg_size, appnum, pg_id_sz;
    int usePMI=1;
    char *pg_id;
    MPIDI_PG_t *pg = 0;

    /* If we use PMI here, make the PMI calls to get the
       basic values.  Note that systems that return setvals == true
       do not make use of PMI for the KVS routines either (it is
       assumed that they discover connection information through some
       other mechanism). */
    /* FIXME: We may want to allow the channel to ifdef out the use
       of PMI calls, or ask the channel to provide stubs that
       return errors if the routines are in fact used */
    if (usePMI) {
	/*
	 * Initialize the process management interface (PMI),
	 * and get rank and size information about our process group
	 */

#ifdef USE_PMI2_API
	TRACE_ERR("Calling PMI2_Init\n");
        mpi_errno = PMI2_Init(has_parent, &pg_size, &pg_rank, &appnum);
	TRACE_ERR("PMI2_Init - pg_size=%d pg_rank=%d\n", pg_size, pg_rank);
        /*if (mpi_errno) MPIU_ERR_POP(mpi_errno);*/
#else
	TRACE_ERR("Calling PMI_Init\n");
	pmi_errno = PMI_Init(has_parent);
	if (pmi_errno != PMI_SUCCESS) {
	/*    MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**pmi_init",
			     "**pmi_init %d", pmi_errno); */
	}

	pmi_errno = PMI_Get_rank(&pg_rank);
	if (pmi_errno != PMI_SUCCESS) {
	    /*MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**pmi_get_rank",
			     "**pmi_get_rank %d", pmi_errno); */
	}

	pmi_errno = PMI_Get_size(&pg_size);
	if (pmi_errno != 0) {
	/*MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**pmi_get_size",
			     "**pmi_get_size %d", pmi_errno);*/
	}

	pmi_errno = PMI_Get_appnum(&appnum);
	if (pmi_errno != PMI_SUCCESS) {
/*	    MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**pmi_get_appnum",
				 "**pmi_get_appnum %d", pmi_errno); */
	}
#endif
	/* Note that if pmi is not available, the value of MPI_APPNUM is
	   not set */
	if (appnum != -1) {
	    MPIR_Process.attrs.appnum = appnum;
	}

#ifdef USE_PMI2_API

        /* This memory will be freed by the PG_Destroy if there is an error */
	pg_id = MPIU_Malloc(MAX_JOBID_LEN);

        mpi_errno = PMI2_Job_GetId(pg_id, MAX_JOBID_LEN);
	TRACE_ERR("PMI2_Job_GetId - pg_id=%s\n", pg_id);
#else
	/* Now, initialize the process group information with PMI calls */
	/*
	 * Get the process group id
	 */
	pmi_errno = PMI_KVS_Get_name_length_max(&pg_id_sz);
	if (pmi_errno != PMI_SUCCESS) {
          TRACE_ERR("PMI_KVS_Get_name_length_max returned with pmi_errno=%d\n", pmi_errno);
	}

	/* This memory will be freed by the PG_Destroy if there is an error */
	pg_id = MPIU_Malloc(pg_id_sz + 1);

	/* Note in the singleton init case, the pg_id is a dummy.
	   We'll want to replace this value if we join a
	   process manager */
	pmi_errno = PMI_KVS_Get_my_name(pg_id, pg_id_sz);
	if (pmi_errno != PMI_SUCCESS) {
          TRACE_ERR("PMI_KVS_Get_my_name returned with pmi_errno=%d\n", pmi_errno);
	}
#endif
    }
    else {
	/* Create a default pg id */
	pg_id = MPIU_Malloc(2);
	MPIU_Strncpy( pg_id, "0", 2 );
    }

	TRACE_ERR("pg_size=%d pg_id=%s\n", pg_size, pg_id);
    /*
     * Initialize the process group tracking subsystem
     */
    mpi_errno = MPIDI_PG_Init(argc, argv,
			     MPIDI_PG_Compare_ids, MPIDI_PG_Destroy_id);
    if (mpi_errno != MPI_SUCCESS) {
      TRACE_ERR("MPIDI_PG_Init returned with mpi_errno=%d\n", mpi_errno);
    }

    /*
     * Create a new structure to track the process group for our MPI_COMM_WORLD
     */
    TRACE_ERR("pg_size=%d pg_id=%p pg_id=%s\n", pg_size, pg_id, pg_id);
    mpi_errno = MPIDI_PG_Create(pg_size, pg_id, &pg);
    MPIU_Free(pg_id);
    if (mpi_errno != MPI_SUCCESS) {
      TRACE_ERR("MPIDI_PG_Create returned with mpi_errno=%d\n", mpi_errno);
    }

    /* FIXME: We can allow the channels to tell the PG how to get
       connection information by passing the pg to the channel init routine */
    if (usePMI) {
	/* Tell the process group how to get connection information */
        mpi_errno = MPIDI_PG_InitConnKVS( pg );
        if (mpi_errno)
          TRACE_ERR("MPIDI_PG_InitConnKVS returned with mpi_errno=%d\n", mpi_errno);
    }

    /* FIXME: has_args and has_env need to come from PMI eventually... */
    *has_args = TRUE;
    *has_env  = TRUE;

    *pg_p      = pg;
    *pg_rank_p = pg_rank;

 fn_exit:
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (pg) {
	MPIDI_PG_Destroy( pg );
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
#endif