Blob Blame History Raw
/* begin_generated_IBM_copyright_prolog                             */
/*                                                                  */
/* This is an automatically generated copyright prolog.             */
/* After initializing,  DO NOT MODIFY OR MOVE                       */
/*  --------------------------------------------------------------- */
/* Licensed Materials - Property of IBM                             */
/* Blue Gene/Q 5765-PER 5765-PRP                                    */
/*                                                                  */
/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
/* US Government Users Restricted Rights -                          */
/* Use, duplication, or disclosure restricted                       */
/* by GSA ADP Schedule Contract with IBM Corp.                      */
/*                                                                  */
/*  --------------------------------------------------------------- */
/*                                                                  */
/* end_generated_IBM_copyright_prolog                               */
/*  (C)Copyright IBM Corp.  2007, 2011  */
/**
 * \file include/mpidi_datatypes.h
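 * \brief Type definitions (process descriptor, message headers, and the device-specific parts of requests, communicators and windows) for the PAMI-based MPI device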
 */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */


#ifndef __include_mpidi_datatypes_h__
#define __include_mpidi_datatypes_h__

#ifdef MPIDI_STATISTICS
#include <pami_ext_pe.h>
#endif
#include "mpidi_constants.h"
#include "mpidi_platform.h"
#include "pami.h"
#include "mpidi_trace.h"

#include "opa_primitives.h"



#if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
/* Only a pointer to the request type is needed here, so forward-declare it. */
struct MPID_Request;

/**
 * \brief Request-handle list descriptor
 *
 * Only defined for thread-local handle allocation on BG/Q.  MPIDI_Process_t
 * holds an array of MPIDI_MAX_THREADS of these (request_handles[]).
 */
typedef struct {
  struct MPID_Request *head;  /**< NOTE(review): presumably the first request of a list; confirm */
  size_t               count; /**< NOTE(review): presumably the number of requests reachable from head; confirm */
} MPIDI_RequestHandle_t;
#endif

/**
 * \brief Store one point-to-point limit in the global process descriptor
 *
 * Expands to an assignment into MPIDI_Process.pt2pt.limits_lookup, indexed
 * [is_internal][is_immediate][is_local].  Each argument is parenthesized and
 * the whole expansion is one parenthesized expression, so the macro can be
 * used with compound argument expressions.  The definition deliberately does
 * NOT end in a line-continuation backslash: a trailing '\' would splice
 * whatever line follows the macro into its replacement text.
 *
 * \param[in] is_internal  First index into limits_lookup  (0 or 1)
 * \param[in] is_immediate Second index into limits_lookup (0 or 1)
 * \param[in] is_local     Third index into limits_lookup  (0 or 1)
 * \param[in] value        Limit value to store
 */
#define MPIDI_PT2PT_LIMIT_SET(is_internal,is_immediate,is_local,value)		\
  (MPIDI_Process.pt2pt.limits_lookup[(is_internal)][(is_immediate)][(is_local)] = (value))

/**
 * \brief A pair of limits: the 'remote' value first, then the 'local' value
 *
 * The member order matters: MPIDI_Process_t overlays these structures with
 * a plain unsigned array.
 */
typedef struct {
  unsigned remote; /**< Limit for the remote case */
  unsigned local;  /**< Limit for the local case  */
} MPIDI_remote_and_local_limits_t;

/**
 * \brief Eager and immediate limits, each with a remote and a local value
 *
 * The eager pair comes first, then the immediate pair; keep this order, as
 * MPIDI_Process_t overlays the structure with a plain unsigned array.
 */
typedef struct {
  MPIDI_remote_and_local_limits_t eager;     /**< Limits for the eager case     */
  MPIDI_remote_and_local_limits_t immediate; /**< Limits for the immediate case */
} MPIDI_immediate_and_eager_limits_t;

/**
 * \brief The complete set of point-to-point limits
 *
 * Application limits come first, then internal limits; keep this order, as
 * MPIDI_Process_t overlays the structure with a plain unsigned array.
 */
typedef struct {
  MPIDI_immediate_and_eager_limits_t application; /**< Limits for the application case */
  MPIDI_immediate_and_eager_limits_t internal;    /**< Limits for the internal case    */
} MPIDI_pt2pt_limits_t;

/**
 * \brief MPI Process descriptor
 *
 * This structure contains global configuration flags.
 *
 * Several members are compiled in only under build-time feature macros, so
 * the layout of this structure depends on the build configuration.
 */
typedef struct
{
  unsigned avail_contexts;
  /*
   * Three overlapping views of the same eight point-to-point limits.
   * limits_lookup is indexed [is_internal][is_immediate][is_local] (see
   * MPIDI_PT2PT_LIMIT_SET); given the member order of MPIDI_pt2pt_limits_t,
   * index 0 selects application / eager / remote and index 1 selects
   * internal / immediate / local.
   */
  union
  {
    unsigned             limits_array[8];
    unsigned             limits_lookup[2][2][2];
    MPIDI_pt2pt_limits_t limits;
  } pt2pt;
  unsigned disable_internal_eager_scale; /**< The number of tasks at which point eager will be disabled */
#if TOKEN_FLOW_CONTROL
  unsigned long long mp_buf_mem;
  unsigned long long mp_buf_mem_max;
  unsigned is_token_flow_control_on;
#endif
#if (MPIDI_STATISTICS || MPIDI_PRINTENV)
  unsigned mp_infolevel;
  unsigned mp_statistics;     /* print pamid statistics data                          */
  unsigned mp_printenv;       /* print env data                                       */
  unsigned mp_interrupts;     /* interrupts                                           */
#endif
#ifdef RDMA_FAILOVER
  unsigned mp_s_use_pami_get; /* force the PAMI_Get path instead of PAMI_Rget         */
#endif

#if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
  MPIDI_RequestHandle_t request_handles[MPIDI_MAX_THREADS];
#endif

#if QUEUE_BINARY_SEARCH_SUPPORT
  unsigned queue_binary_search_support_on;
#endif

#if CUDA_AWARE_SUPPORT
  unsigned cuda_aware_support_on;
#endif

  unsigned verbose;        /**< The current level of verbosity for end-of-job stats. */
  unsigned statistics;     /**< The current level of stats collection.               */
  unsigned rma_pending;    /**< The max num outstanding requests during an RMA op    */
  unsigned shmem_pt2pt;    /**< Enable optimized shared memory point-to-point functions. */
  unsigned smp_detect;
  pami_geometry_t world_geometry;

  struct
  {
    unsigned collectives;       /**< Enable optimized collective functions. */
    unsigned subcomms;          /**< Enable hardware optimized subcomm's */
    unsigned select_colls;      /**< Enable collective selection */
    unsigned auto_select_colls; /**< Enable automatic collective selection */
    unsigned memory;            /**< Enable memory optimized subcomm's - See MPID_OPT_LVL_xxxx */
    unsigned num_requests;      /**< Number of requests between flow control barriers */
  }
  optimized;

  struct
  {
    volatile unsigned active;  /**< Number of contexts with active async progress */
    unsigned          mode;    /**< 0 == 'disabled', 1 == 'locked', 2 == 'trigger' */
  }
  async_progress;

  struct
  {
    struct
    {
      unsigned requested;    /**< 1 == application requests context post */
      unsigned active;       /**< 1 == context post is currently required */
    } context_post;
  } perobj;                  /**< This structure is only used in the 'perobj' mpich lock mode. */

  unsigned mpir_nbc;         /**< Enable MPIR_* non-blocking collectives implementations. */
  int  numTasks;             /* total number of tasks on a job                            */
  unsigned typed_onesided;       /**< Enable typed PAMI calls for derived types within MPID_Put and MPID_Get. */
#ifdef DYNAMIC_TASKING
  struct MPIDI_PG_t * my_pg; /**< Process group I belong to */
  int                 my_pg_rank; /**< Rank in process group */
#endif
} MPIDI_Process_t;


/**
 * \brief Protocol identifiers
 *
 * The enumerators are numbered consecutively from 0 and
 * MPIDI_Protocols_COUNT comes last, so it equals the number of protocols.
 * The two Dyntask entries exist only when DYNAMIC_TASKING is defined, which
 * shifts the values of every enumerator after them.
 */
enum {
  MPIDI_Protocols_Short,
  MPIDI_Protocols_ShortSync,
  MPIDI_Protocols_Eager,
  MPIDI_Protocols_RVZ,
  MPIDI_Protocols_Cancel,
  MPIDI_Protocols_Control,
  MPIDI_Protocols_WinCtrl,
  MPIDI_Protocols_WinAccum,
  MPIDI_Protocols_RVZ_zerobyte,
  MPIDI_Protocols_WinGetAccum,
  MPIDI_Protocols_WinGetAccumAck,
#ifdef DYNAMIC_TASKING
  MPIDI_Protocols_Dyntask,
  MPIDI_Protocols_Dyntask_disconnect,
#endif
  MPIDI_Protocols_WinAtomic,
  MPIDI_Protocols_WinAtomicAck,
  MPIDI_Protocols_COUNT, /* must stay last */
};


/**
 * \brief The kind of message being sent or received by a request
 *
 * mpid_startall() invokes the correct start routine based on this type.
 */
typedef enum {
  MPIDI_REQUEST_PTYPE_RECV,
  MPIDI_REQUEST_PTYPE_SEND,
  MPIDI_REQUEST_PTYPE_BSEND,
  MPIDI_REQUEST_PTYPE_SSEND,
} MPIDI_REQUEST_PTYPE;


/**
 * \brief Control message types
 *
 * MPIDI_MsgInfo carries this value in a 3-bit field ("control"), so at most
 * eight enumerators fit.
 */
typedef enum {
  MPIDI_CONTROL_SSEND_ACKNOWLEDGE,
  MPIDI_CONTROL_CANCEL_REQUEST,
  MPIDI_CONTROL_CANCEL_ACKNOWLEDGE,
  MPIDI_CONTROL_CANCEL_NOT_ACKNOWLEDGE,
  MPIDI_CONTROL_RENDEZVOUS_ACKNOWLEDGE,
  MPIDI_CONTROL_RETURN_TOKENS,
} MPIDI_CONTROL;


/** \brief Actions to perform when a request completes */
typedef enum {
  MPIDI_CA_COMPLETE,                  /**< Complete the request.             */
  MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE, /**< Unpack uebuf, then complete.      */
} MPIDI_CA;


/**
 * \brief The fields needed to match an MPI message
 *
 * A sequence number is added when OUT_OF_ORDER_HANDLING is defined.
 */
typedef struct {
  int tag;        /**< Tag to match        */
  int rank;       /**< Rank to match       */
  int context_id; /**< Context to match    */
#ifdef OUT_OF_ORDER_HANDLING
  int seqno;      /**< Sequence number to match */
#endif
} MPIDI_Message_match;


/**
 * \brief MPID pt2pt message header
 *
 * The 16-bit \c flags word and the packed bit-field struct share storage
 * through an anonymous union; the bit-field widths (3+1+1+1+6+4) add up to
 * 16 bits.  The sequence number and token count are present only when
 * OUT_OF_ORDER_HANDLING / TOKEN_FLOW_CONTROL are enabled.
 */
typedef struct
{
  MPI_Request req;         /**< peer's request handle  */
  unsigned    MPItag;      /**< match tag              */
  unsigned    MPIrank;     /**< match rank             */
  uint16_t    MPIctxt;     /**< match context          */

  union {
    uint16_t  flags;       /**< all of the bit-fields below viewed as one word */
    struct {
      unsigned control:3;  /**< message type for control protocols */
      unsigned isSync:1;   /**< set for sync sends     */
      unsigned isRzv :1;   /**< use pt2pt rendezvous   */
      unsigned    noRDMA:1;    /**< msg sent via shm or mem reg. fails */
      unsigned    reserved:6;  /**< unused bits                        */
      unsigned    tokens:4;    /**< tokens need to be returned         */
    } __attribute__ ((__packed__));
  };

#ifdef OUT_OF_ORDER_HANDLING
  unsigned    MPIseqno;    /**< match seqno            */
#endif
#if TOKEN_FLOW_CONTROL
  unsigned    alltokens;   /* control:MPIDI_CONTROL_RETURN_TOKENS  */
#endif
} MPIDI_MsgInfo;

/**
 * \brief Full rendezvous message info, to be sent as two quads of unexpected data
 *
 * Wraps the regular message header together with a memory region, a data
 * pointer and a length.
 */
typedef struct {
  MPIDI_MsgInfo     msginfo;        /**< Regular pt2pt message header */
  pami_memregion_t  memregion;      /**< PAMI memory region           */
#ifdef RDMA_FAILOVER
  uint32_t          memregion_used; /**< NOTE(review): presumably non-zero when memregion is valid; confirm */
#endif
  void             *data;           /**< Data pointer                 */
  size_t            length;         /**< Length — NOTE(review): presumably in bytes; confirm */
} MPIDI_MsgEnvelope;

/** \brief The device-specific portion of MPID_Request */
struct MPIDI_Request {
  struct MPID_Request      *next;              /**< Next request in the queue                  */
  struct MPID_Datatype     *datatype_ptr;      /**< Information about the datatype             */
  pami_work_t               post_request;      /**< PAMI work object                           */

  MPIDI_MsgEnvelope         envelope;          /**< Message header / rendezvous envelope       */

  void                     *userbuf;           /**< User buffer                                */
  MPI_Aint                  userbufcount;      /**< Data count of the user buffer              */
  MPI_Datatype              datatype;          /**< Data type of the message                   */
  pami_task_t               peer_pami;         /**< Peer's rank (in PAMI)                      */
  unsigned                  peer_comm;         /**< Peer's rank (in the original communicator) */
  unsigned                  cancel_pending:16; /**< Cancel status                              */
  unsigned                  uebuf_malloc:16;   /**< Whether uebuf requires free()              */

  unsigned                  uebuflen;          /**< Length of uebuf in bytes                   */
  void                     *uebuf;             /**< Unexpected buffer                          */

  MPIDI_REQUEST_PTYPE       ptype;             /**< The persistent message type                */
  MPIDI_CA                  ca;                /**< Completion action                          */
  pami_memregion_t          memregion;         /**< Rendezvous receive memregion               */
#ifdef OUT_OF_ORDER_HANDLING
  struct MPID_Request      *prev;              /**< Previous request in the queue              */
  void                     *nextR;             /**< Next recv in the out-of-order list (one list per source)     */
  void                     *prevR;             /**< Previous recv in the out-of-order list (one list per source) */
  struct MPID_Request      *oo_peer;           /**< Matched posted-recv request to complete in the out-of-order case */
#endif
#ifdef RDMA_FAILOVER
  uint32_t                  memregion_used:16;
  uint32_t                  shm:16;
#endif
#ifdef MPIDI_TRACE
  int                       partner_id;
  int                       idx;
  int                       PR_idx;
#endif
  /* Anchor of the request-based RMA handle, kept so it can be freed properly when wait is called */
  struct MPIDI_Win_request *win_req;
};

/* Opaque handle for a collective-selection fast query (stored in MPIDI_Comm::collsel_fast_query). */
typedef void *fast_query_t;
/** \brief The device-specific portion of MPID_Comm */
struct MPIDI_Comm {
  pami_geometry_t        geometry; /**< Geometry component for collectives              */
  pami_geometry_t        parent;   /**< The parent geometry this communicator came from */

  /* Per-collective tables: first index is the PAMI transfer type, second index has two slots. */
  pami_algorithm_t      *coll_algorithm[PAMI_XFER_COUNT][2];
  pami_metadata_t       *coll_metadata[PAMI_XFER_COUNT][2];
  char                   coll_count[PAMI_XFER_COUNT][2];

  pami_algorithm_t       user_selected[PAMI_XFER_COUNT];
  /* There is no way to tell whether user_selected[] is NULL, hence the
   * separate type array (these two could probably be a union). */
  char                   user_selected_type[PAMI_XFER_COUNT];
  pami_metadata_t        user_metadata[PAMI_XFER_COUNT];
  char                   last_algorithm[100];
  char                   preallreduces[MPID_NUM_PREALLREDUCES];

  /* \todo Need to figure out how to deal with algorithms above the pami level */
  char                   allgathers[4];
  char                   allgathervs[4];
  char                   scattervs[2];
  char                   optgather;
  char                   optscatter;
  char                   optreduce;
  unsigned               num_requests;

  /* The following must be freed at geometry destroy time, so they are kept
   * in the communicator struct rather than in pointers allocated on the
   * stack. */
  pami_geometry_range_t  range;     /* For create_taskrange */
  pami_task_t           *tasks;     /* For create_tasklist/endpoints, if ever used */
  pami_endpoint_t       *endpoints;

  /* For some protocols the optimized protocol always works and gives the
   * best performance.  Small vs. large cutoffs (vs. medium) are assumed for
   * some protocols. */
  pami_algorithm_t       opt_protocol[PAMI_XFER_COUNT][2];
  int                    must_query[PAMI_XFER_COUNT][2];
  pami_metadata_t        opt_protocol_md[PAMI_XFER_COUNT][2];
  int                    cutoff_size[PAMI_XFER_COUNT][2];

  /* The best allreduce protocol always works on doubles with sum/min/max.
   * Since that is a common occurrence, that protocol is cached and called
   * without checking.  Any other datatype/op must be checked. */
  pami_algorithm_t       cached_allreduce;
  pami_metadata_t        cached_allreduce_md;
  int                    query_cached_allreduce;

  union tasks_descrip_t {
    pami_geometry_range_t *ranges;    /* For create_taskrange */
    pami_task_t           *tasks;     /* For create_tasklist/endpoints, if ever used */
    pami_endpoint_t       *endpoints;
  } tasks_descriptor;
#ifdef DYNAMIC_TASKING
  int                    local_leader;
  long long              world_intercomm_cntr;
  int                   *world_ids; /* ids of the worlds that compose this communicator (inter-communicator created for dynamic tasking) */
#endif
  fast_query_t           collsel_fast_query;
};


/**
 * \brief A PAMI work object paired with a pointer to a collective transfer descriptor
 *
 * NOTE(review): presumably used to post a collective to a context; confirm
 * against the callers.
 */
typedef struct {
  pami_work_t  state;       /**< PAMI work object           */
  pami_xfer_t *coll_struct; /**< Collective transfer descriptor */
} MPIDI_Post_coll_t;


/** \brief Forward declaration of the MPID_Comm structure */
struct MPID_Comm;
/** \brief Forward declaration of the MPID_Win structure (pointed to by MPIDI_Win_info::win) */
struct MPID_Win;
/** \brief Forward declaration of the MPID_Group structure (pointed to by the PSCW sync state in MPIDI_Win) */
struct MPID_Group;

/** \brief Kind of window lock request (stored in MPIDI_Win_lock::mtype) */
typedef enum {
  MPIDI_REQUEST_LOCK,
  MPIDI_REQUEST_LOCKALL,
} MPIDI_LOCK_TYPE_t;

/** \brief One entry in a singly linked list of window lock requests (see MPIDI_Win_queue) */
struct MPIDI_Win_lock {
  struct MPIDI_Win_lock *next;     /**< Next entry in the list                        */
  unsigned               rank;     /**< NOTE(review): presumably the requesting rank; confirm */
  MPIDI_LOCK_TYPE_t      mtype;    /**< MPIDI_REQUEST_LOCK or MPIDI_REQUEST_LOCKALL   */
  int                    type;
  void                  *flagAddr;
};
/** \brief Head and tail pointers of a list of MPIDI_Win_lock entries */
struct MPIDI_Win_queue {
  struct MPIDI_Win_lock *head; /**< First entry */
  struct MPIDI_Win_lock *tail; /**< Last entry  */
};

/**
 * \brief Accumulate ordering flags
 *
 * Each enumerator is a distinct power of two, so values can be OR-ed
 * together as a bit mask.
 */
typedef enum {
  MPIDI_ACCU_ORDER_RAR = 1,
  MPIDI_ACCU_ORDER_RAW = 2,
  MPIDI_ACCU_ORDER_WAR = 4,
  MPIDI_ACCU_ORDER_WAW = 8
} MPIDI_Win_info_accumulate_ordering;

/** \brief Values for the accumulate_ops field of MPIDI_Win_info_args */
typedef enum {
  MPIDI_ACCU_SAME_OP,
  MPIDI_ACCU_SAME_OP_NO_OP
} MPIDI_Win_info_accumulate_ops;

/**
 * \brief Window info arguments
 *
 * NOTE(review): the field names look like MPI window info keys; confirm
 * where they are parsed.
 */
typedef struct MPIDI_Win_info_args {
  int                                no_locks;
  MPIDI_Win_info_accumulate_ordering accumulate_ordering;
  MPIDI_Win_info_accumulate_ops      accumulate_ops;         /**< Default is same_op_no_op */
  int                                same_size;
  int                                alloc_shared_noncontig;
} MPIDI_Win_info_args;

/** \brief A work queue: an untyped pointer to the queued messages plus a count */
typedef struct workQ_t {
  void *msgQ;  /**< Untyped pointer to the message queue storage */
  int   count; /**< NOTE(review): presumably the number of queued entries; confirm */
} workQ_t;


/**
 * \brief Collective information related to a window
 *
 * Shares information about a local window with all nodes in the window
 * communicator, including statistics about RMA operations during
 * access/exposure epochs.
 *
 * Allocated as an array sized for the window communicator; each entry
 * corresponds directly to the node of the same rank.
 */
typedef struct MPIDI_Win_info {
  void             *base_addr;      /**< Node's exposure window base address        */
  struct MPID_Win  *win;            /**< Window object pointer                      */
  uint32_t          disp_unit;      /**< Node's exposure window displacement units  */
  pami_memregion_t  memregion;      /**< Memory region descriptor for each node     */
  uint32_t          memregion_used; /**< NOTE(review): presumably non-zero when memregion is valid; confirm */
  MPI_Aint          base_size;      /**< Node's exposure window base size in bytes  */
} MPIDI_Win_info;

/** \brief Mutex type used for shared-memory windows (see MPIDI_Win_shm_ctrl_t::mutex_lock); an alias for pthread_mutex_t */
typedef pthread_mutex_t MPIDI_SHM_MUTEX;

/** \brief Control block for a shared memory window (pointed to by MPIDI_Win_shm_t::ctrl) */
typedef struct MPIDI_Win_shm_ctrl_t {
  MPIDI_SHM_MUTEX mutex_lock; /**< Shared memory windows: lock for accumulate/atomic operations */
  OPA_int_t       active;
  int             shm_count;
} MPIDI_Win_shm_ctrl_t;

/** \brief Shared memory region information for a window */
typedef struct MPIDI_Win_shm_t {
  int       allocated;   /**< Flag: TRUE iff this window has a shared memory region associated with it */
  void     *base_addr;   /**< Base address of the shared memory region */
  MPI_Aint  segment_len; /**< Size of the shared memory region         */
  /* Identifier of the region; which member applies depends on the shared memory flavour. */
  union {
    uint32_t shm_id;      /**< Shared memory id  - sysv  */
    char     shm_key[64]; /**< Shared memory key - posix */
  };
  MPIDI_Win_shm_ctrl_t *ctrl; /**< Control block */
} MPIDI_Win_shm_t;

/**
 * \brief Structure of PAMI extensions to MPID_Win structure
 *
 * Holds the per-window collective info array, the window info arguments,
 * shared memory state and the synchronization (epoch / PSCW / lock) state.
 */
struct MPIDI_Win
{
  struct MPIDI_Win_info     *info;          /**< allocated array of collective info             */
  MPIDI_Win_info_args info_args;
  void             ** shm_base_addrs; /* base addresses shared by all processes in comm  */
  MPIDI_Win_shm_t  *shm;             /* shared memory info                             */
  workQ_t work;
  int   max_ctrlsends;
  struct MPIDI_Win_sync
  {
#if 0
    /** \todo optimize some of the synchronization assertion */
    uint32_t assert; /**< MPI_MODE_* bits asserted at epoch start              */
#endif

    volatile int origin_epoch_type; /**< current epoch type for origin */
    volatile int target_epoch_type; /**< current epoch type for target */

    /* These fields are reset by the sync functions */
    uint32_t          total;    /**< The number of PAMI requests that we know about (updated only by calling thread) */
    volatile uint32_t started;  /**< The number of PAMI requests made (updated only in the context_post callback) */
    volatile uint32_t complete; /**< The number of completed PAMI requests (only updated by the done callbacks) */

    /* One group/counter pair each for the two members sc and pw.
     * NOTE(review): presumably start/complete and post/wait; confirm. */
    struct MPIDI_Win_sync_pscw
    {
      struct MPID_Group * group;
      volatile unsigned   count;
    } sc, pw;
    /* Lock state, split into what this process holds remotely and what
     * has been requested locally. */
    struct MPIDI_Win_sync_lock
    {
      struct
      {
        volatile unsigned locked;
        volatile unsigned allLocked;
      } remote;
      struct
      {
        struct MPIDI_Win_queue requested; /* queue of MPIDI_Win_lock entries */
        int                    type;
        unsigned               count;
      } local;
    } lock;
  } sync;
  int request_based;          /* flag for request based rma */
  struct MPID_Request *rreq;  /* anchor of MPID_Request for request based rma */
};

/**
 * \brief Structures and typedefs for collective selection extensions in PAMI
 */

/* Opaque handle types used by the collective selection extension. */
typedef void *advisor_t;
typedef void *advisor_table_t;
typedef void *advisor_attribute_name_t;

/** \brief Value of an advisor configuration attribute; which member is valid depends on the attribute */
typedef union {
  size_t        intval;    /**< Integer value                */
  double        doubleval; /**< Floating point value         */
  const char   *chararray; /**< Pointer to constant chars    */
  const size_t *intarray;  /**< Pointer to constant integers */
} advisor_attribute_value_t;

/** \brief One advisor configuration entry: an attribute name and its value */
typedef struct {
  advisor_attribute_name_t  name;  /**< Attribute name (opaque handle) */
  advisor_attribute_value_t value; /**< Attribute value                */
} advisor_configuration_t;

/**
 * \brief Parameters for the collective selection advisor
 *
 * Each pointer member is followed by a matching num_* member.
 * NOTE(review): presumably each num_* is the element count of the array
 * before it; confirm against the PAMI collsel extension.
 */
typedef struct {
  pami_xfer_type_t *collectives;
  size_t            num_collectives;
  size_t           *procs_per_node;
  size_t            num_procs_per_node;
  size_t           *geometry_sizes;
  size_t            num_geometry_sizes;
  size_t           *message_sizes;
  size_t            num_message_sizes;
  int               iter;
  int               verify;
  int               verbose;
  int               checkpoint;
} advisor_params_t;

/** \brief Kind of algorithm stored in an advisor_algorithm_t */
typedef enum {
  COLLSEL_ALGO = 0,      /**< 'Always works' PAMI algorithm */
  COLLSEL_QUERY_ALGO,    /**< 'Must query' PAMI algorithm   */
  COLLSEL_EXTERNAL_ALGO, /**< External algorithm            */
} advisor_algorithm_type_t;

/*
 * External algorithm callback function.
 * NOTE(review): parameter names are inferred from the types and from the
 * 'cookie' member of external_algorithm_t; confirm.
 */
typedef pami_result_t (*external_algorithm_fn)(pami_xfer_t *xfer, void *cookie);

/** \brief An external algorithm: a callback and an untyped cookie pointer stored with it */
typedef struct {
  external_algorithm_fn  callback; /**< External algorithm callback        */
  void                  *cookie;   /**< Untyped pointer kept with callback */
} external_algorithm_t;

/**
 * \brief An algorithm known to the advisor
 *
 * algorithm_type tells which member of the union applies.
 */
typedef struct {
  union {
    pami_algorithm_t     internal; /**< PAMI algorithm     */
    external_algorithm_t external; /**< External algorithm */
  } algorithm;
  pami_metadata_t          *metadata;
  advisor_algorithm_type_t  algorithm_type;
} advisor_algorithm_t;


typedef pami_result_t (*pami_extension_collsel_init)            (pami_client_t,
                                                                 advisor_configuration_t [],
                                                                 size_t,
                                                                 pami_context_t [],
                                                                 size_t,
                                                                 advisor_t *);

typedef pami_result_t (*pami_extension_collsel_destroy)         (advisor_t *);

typedef int (*pami_extension_collsel_initialized)               (pami_client_t, advisor_t *);

typedef pami_result_t (*pami_extension_collsel_table_load)      (advisor_t,
                                                                 char *,
                                                                 advisor_table_t *);

typedef pami_result_t (*pami_extension_collsel_get_collectives) (advisor_table_t,
                                                                 pami_xfer_type_t **,
                                                                 unsigned          *);

typedef pami_result_t (*pami_extension_collsel_register_algorithms) (advisor_table_t,
                                                                     pami_geometry_t,
                                                                     pami_xfer_type_t,
                                                                     advisor_algorithm_t *,
                                                                     size_t);

/*
 * Callback types for externally managed geometries; they are collected in
 * external_geometry_ops_t.
 */

/* Create a geometry from task slices; writes a geometry and a cookie through the out-pointers. */
typedef pami_result_t (*external_geometry_create_fn)(
    pami_geometry_range_t *task_slices,
    size_t                 slice_count,
    pami_geometry_t       *geometry,
    void                 **cookie);

/* Destroy callback; receives a cookie pointer. */
typedef pami_result_t (*external_geometry_destroy_fn)(void *cookie);

/* Algorithm registration callback for one collective; returns an algorithm list and its count through the out-pointers. */
typedef pami_result_t (*external_register_algorithms_fn)(
    void                 *cookie,
    pami_xfer_type_t      collective,
    advisor_algorithm_t **algorithms,
    size_t               *num_algorithms);

/** \brief Table of callbacks for externally managed geometries */
typedef struct {
  external_geometry_create_fn     geometry_create;     /**< Geometry creation callback       */
  external_geometry_destroy_fn    geometry_destroy;    /**< Geometry destruction callback    */
  external_register_algorithms_fn register_algorithms; /**< Algorithm registration callback  */
} external_geometry_ops_t;

/*
 * Further function-pointer types for the collective selection extension.
 * NOTE(review): the parameter names of pami_extension_collsel_table_generate
 * are inferred from the parameter types only; confirm them against the PAMI
 * collsel extension.
 */
typedef pami_result_t (*pami_extension_collsel_table_generate)(
    advisor_t                advisor,
    char                    *filename,
    advisor_params_t        *params,
    external_geometry_ops_t *ops,
    int                      mode);

/* Creates a fast query for a geometry from an advisor table; the handle is returned through 'query'. */
typedef pami_result_t (*pami_extension_collsel_query_create)(
    advisor_table_t  advisor_table,
    pami_geometry_t  geometry,
    fast_query_t    *query);

/* Destroys a fast query; takes a pointer to the handle. */
typedef pami_result_t (*pami_extension_collsel_query_destroy)(fast_query_t *query);

/* Fills algorithms_optimized[] (capacity max_algorithms) for a transfer type and message size. */
typedef int (*pami_extension_collsel_advise)(
    fast_query_t        fast_query,
    pami_xfer_type_t    xfer_type,
    size_t              message_size,
    advisor_algorithm_t algorithms_optimized[],
    size_t              max_algorithms);



#endif