#!/bin/sh
# BEGIN_ICS_COPYRIGHT8 ****************************************
# 
# Copyright (c) 2015, Intel Corporation
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Intel Corporation nor the names of its contributors
#       may be used to endorse or promote products derived from this software
#       without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# 
# END_ICS_COPYRIGHT8   ****************************************

# [ICS VERSION STRING: unknown]


#------------------------------------------------------------------
# Initialization:
# - specify the absolute path of mpirun if it is not in $PATH
# - EXIT_VALUE should not be set to 0
#------------------------------------------------------------------
# cleanup_openmpi
#   Remove the temporary files named by $APP_FILE and $HOST_FILE when they
#   exist as regular files. Signals 1 2 3 15 (HUP INT QUIT TERM) are set to
#   be ignored first so the removal is not interrupted.
# Globals:   APP_FILE, HOST_FILE (read)
# Arguments: none
cleanup_openmpi()
{
    trap "" 1 2 3 15

    # The paths are quoted so that a name containing whitespace or glob
    # characters is treated as one file instead of being word-split.
    for afile in "$APP_FILE" "$HOST_FILE"
    do
        if [ -f "$afile" ]; then
            # Best-effort removal: discard both stdout and stderr.
            rm -f -- "$afile" >/dev/null 2>&1
        fi
    done
}

# sig_handler_opempi
#   Signal handler: ignores further HUP/INT/QUIT/TERM (1 2 3 15), reports the
#   signal on stdout and appends the same message to $LOGFILE, calls
#   cleanup_openmpi, and exits with status 1.
# Globals:   LOGFILE (read)
# Arguments: none
sig_handler_opempi()
{
    trap "" 1 2 3 15
    # Quote the log path so a name with whitespace stays a single file;
    # fall back to /dev/null if LOGFILE has not been set yet.
    echo "Signal received. Exiting ..." | tee -a "${LOGFILE:-/dev/null}"
    cleanup_openmpi
    exit 1
}

# -----------------------------------------------------
# Source the LSF environment. Optional.
# -----------------------------------------------------
# lsf.conf is only sourced when it is readable. Under a POSIX sh, "." on a
# missing file is a fatal error, which would make this optional step abort
# the whole script. The path is quoted so LSF_ENVDIR may contain whitespace.
if [ -n "$LSF_ENVDIR" ] && [ -r "$LSF_ENVDIR/lsf.conf" ]; then
    . "$LSF_ENVDIR/lsf.conf"
fi

# -----------------------------------------------------
# Set up the variable LSF_TS representing the TaskStarter.
# -----------------------------------------------------
LSF_TS="$LSF_BINDIR/TaskStarter"
USR_APP_FILE=""

# mpirun is taken from $MPICH_PREFIX/bin when MPICH_PREFIX is set.
# Otherwise this script assumes mpirun of OPENMPI can be found in the PATH
# environment variable and uses the bare command name.
MPIHOME=/opt/openmpi/gnu
#MPIRUN_CMD="$MPIHOME/bin/mpirun"
if [ -n "$MPICH_PREFIX" ]; then
    MPIRUN_CMD="$MPICH_PREFIX/bin/mpirun"
else
    MPIRUN_CMD="mpirun"
fi

# Defaults used by the rest of the script.
LOGFILE="/dev/null"
EXIT_VALUE="66"
TASK_GEOM_OK="0"
OPENMPI_OPTS=" "

# Directory that holds the generated appfile/host file: $HOME, or the
# current directory when HOME is empty.
JOB_SHAREDIR="${HOME:-$(pwd)}"

#
# Decide whether the host list gets reversed later on:
#   REVERSE_ORDER="n"  when LSB_PJL_TASK_GEOMETRY is non-empty
#   REVERSE_ORDER="y"  otherwise (reversal is done to gain better performance)
#
case "$LSB_PJL_TASK_GEOMETRY" in
    "")
        REVERSE_ORDER="y"
        ;;
    *)
        REVERSE_ORDER="n"
        ;;
esac

#------------------------------------------------------------------
# Derive per-job file names under $JOB_SHAREDIR:
# - UNIQUE_ID is the LSF job ID ($LSB_BATCH_JID) when that is non-empty,
#   otherwise "<hostname>_<pid of this shell>"
# - APP_FILE  is the Open MPI appfile
# - HOST_FILE is the host (machine) file
# Then install the signal handler for HUP, INT, QUIT and TERM.
#------------------------------------------------------------------
UNIQUE_ID="${LSB_BATCH_JID:-$(hostname)_$$}"

HOST_FILE="${JOB_SHAREDIR}/.host_file_${UNIQUE_ID}"
APP_FILE="${JOB_SHAREDIR}/.openmpi_appfile_${UNIQUE_ID}"

trap sig_handler_opempi HUP INT QUIT TERM

#
# Reverse LSB_MCPU_HOSTS
#
# LSB_MCPU_HOSTS is walked as a flat list of "<host> <count>" pairs. Each
# completed pair is prepended to the result, so the pairs come out in the
# opposite order while every host stays in front of its own count. A
# trailing host without a count is dropped.
#
if [ "$REVERSE_ORDER" = "y" ]; then
    NEW_LSB_MCPU_HOSTS=""
    HOST=""
    for tok in $LSB_MCPU_HOSTS; do
        case "$HOST" in
            "")
                # First word of a pair: remember the host name.
                HOST="$tok"
                ;;
            *)
                # Second word of a pair: emit "<host> <count>" in front.
                NEW_LSB_MCPU_HOSTS="$HOST $tok $NEW_LSB_MCPU_HOSTS"
                HOST=""
                ;;
        esac
    done
    LSB_MCPU_HOSTS=$NEW_LSB_MCPU_HOSTS
fi

#  -----------------------------------------------------
#  Process the command line:
# - extract [mpiopts] from the command line
# - extract jobname [jobopts] from the command line
#
# Recognised mpirun options are consumed from the front of "$@":
# - pass-through options (with their arguments) are appended to OPENMPI_OPTS
# - options that this script overrides are logged to LOGFILE and dropped
# - -app/--app is recorded in USR_APP_FILE
# Parsing stops at the first unrecognised word; everything from there on is
# stored, space-joined, in JOB_CMDLN.
#  -----------------------------------------------------
while [ $# -gt 0 ]
do
    # Number of words (the option plus its arguments) consumed this round.
    opt_words=1
    case "$1" in
        -aborted|--aborted|-path|--path|--tmpdir|--universe|-x|-wdir|--wdir|--prefix|-debugger|--debugger)
            # <arg0>
            OPENMPI_OPTS="$OPENMPI_OPTS $1 $2 "
            opt_words=2
            ;;
        -app|--app)
            # --app <arg0>  Provide an appfile; ignore all other command line options
            echo "User defined -app option found" >> "${LOGFILE:-/dev/null}"
            USR_APP_FILE="$2"
            opt_words=2
            ;;
        -bynode|--bynode|-byslot|--byslot)
            if [ -z "$LSB_PJL_TASK_GEOMETRY" ]; then
                OPENMPI_OPTS="$OPENMPI_OPTS $1 "
            else
                echo "LSB_PJL_TASK_GEOMETRY is defined, mpirun option $1 will be ignored" >> "${LOGFILE:-/dev/null}"
            fi
            ;;
        -c|-np|--np|-n|--n)
            # <arg0>       Number of processes to run
            echo "mpirun option $1 $2 will be ignored" >> "${LOGFILE:-/dev/null}"
            opt_words=2
            ;;
        -tv|--tv)
            # mpirun's -tv/--tv takes no argument, so only the flag itself
            # is dropped; the following word must not be swallowed.
            echo "mpirun option $1 will be ignored" >> "${LOGFILE:-/dev/null}"
            ;;
        -d|-debug|--debug|--debug-daemons|--debug-daemons-file|--no-daemonize|--debug-devel|-q|--quiet|-V|--version|-nooversubscribe|--nooversubscribe)
            OPENMPI_OPTS="$OPENMPI_OPTS $1 "
            ;;
        -gmca|--gmca|-mca|--mca)
            #<arg0> <arg1>
            OPENMPI_OPTS="$OPENMPI_OPTS $1 $2 $3 "
            opt_words=3
            ;;
        -h|--help|-nw|--nw|-v|--verbose)
            OPENMPI_OPTS="$OPENMPI_OPTS $1 "
            ;;
        -H|-host|--host|-hostfile|--hostfile|-machinefile|--machinefile)
            # <arg0>   List of hosts to invoke processes on
            echo "mpirun option $1 $2 will be ignored" >> "${LOGFILE:-/dev/null}"
            opt_words=2
            ;;
        -nolocal|--nolocal)
            echo "mpirun option $1 will be ignored" >> "${LOGFILE:-/dev/null}"
            ;;
        *)
            break
            ;;
    esac
    # Never shift past the end of the argument list: a trailing option with
    # its argument missing would otherwise make "shift" fail, which aborts
    # the script under a strict POSIX sh.
    if [ "$opt_words" -gt $# ]; then
        opt_words=$#
    fi
    shift "$opt_words"
done

JOB_CMDLN="$*"

# -----------------------------------------------------
#  Set up the CMD_LINE variable representing the integrated section of the command line:
# - LSF_TS, script variable representing the TaskStarter binary.
#   TaskStarter must start each and every job task process.
# - LSF_TS_OPTIONS, LSF environment variable containing all necessary information
#   for TaskStarter to callback to LSF's Parallel Application Manager.
# - JOB_CMDLN, script variable containing the job and job options
#
# When LSF_TS_OPTIONS is empty the job command line is used as is; otherwise
# it is prefixed with "$LSF_TS $LSF_TS_OPTIONS". CMD_LINE always ends with a
# single separator space; the logged copy omits it.
#--------------------------------------------------------------------------------
if [ -z "$LSF_TS_OPTIONS" ]
then
    CMD_LINE="$JOB_CMDLN "
else
    CMD_LINE="$LSF_TS $LSF_TS_OPTIONS $JOB_CMDLN "
fi
# printf keeps backslashes in the command line literal, and the quoted log
# path survives whitespace; an unset LOGFILE falls back to /dev/null.
printf 'CMD_LINE=%s\n' "${CMD_LINE% }" >> "${LOGFILE:-/dev/null}"

#------------------------------------------------------------------
# Construct LSF Job -app file
#
# Three cases:
# - a user supplied -app file (USR_APP_FILE) is rejected: exit 1
# - LSB_PJL_TASK_GEOMETRY is set: write one host per slot to HOST_FILE, let
#   reorder_file_based_on_task_geom (expected to come from pjllib.sh) reorder
#   it, then write one "-host <h> -n 1 ..." appfile line per host entry
# - otherwise: write one "-host <h> -n <count> ..." appfile line per
#   "<host> <count>" pair of LSB_MCPU_HOSTS
# Every appfile line carries OPENMPI_OPTS and CMD_LINE.
#------------------------------------------------------------------

# Keep using /bin/awk where it exists; otherwise take awk from PATH.
AWK_CMD=/bin/awk
if [ ! -x "$AWK_CMD" ]; then
    AWK_CMD=awk
fi

if [ "$USR_APP_FILE" != "" ]; then
    # User defined --app <appfile>
    echo "User defined -app $USR_APP_FILE is not allowed. Exiting..." | tee -a "${LOGFILE:-/dev/null}"
    cleanup_openmpi
    exit 1
elif [ "$LSB_PJL_TASK_GEOMETRY" != "" ]; then
    #------------------------------------------------------------------
    # handle $LSB_PJL_TASK_GEOMETRY
    # It will shuffle the order the appfile
    # based on the order of task geometry
    #------------------------------------------------------------------
    # Check for pjllib.sh BEFORE sourcing it: "." on a missing file is fatal
    # under a POSIX sh and would hide the diagnostic below.
    if [ ! -f "${LSF_BINDIR}/pjllib.sh" ]; then
        echo "Cannot find pjllib.sh in $LSF_BINDIR. Exiting..." 1>&2
        cleanup_openmpi
        exit 1
    fi
    . "${LSF_BINDIR}/pjllib.sh"

    # Expand "<host> <count>" pairs into HOST_FILE, one line per slot, and
    # capture the total slot count printed by the END rule.
    TOTAL_CPUS=$(echo $LSB_MCPU_HOSTS | "$AWK_CMD" '
BEGIN {counter=0}
{
    size = split($0, a, " ");
    for (i = 1; i <= size; i += 2) {
        counter = counter + a[i + 1];
        for (j = 0; j < a[i + 1]; ++j) {
            print a[i] > hfile;
        }
    }
}
END { print counter}' hfile="$HOST_FILE")

    # get a host list each host per line that satisfies the task geometry
    # then construct the appfile based on new host list
    reorder_file_based_on_task_geom "$HOST_FILE" "sort"
    EXIT_VALUE=$?
    if [ "$EXIT_VALUE" != "0" ]; then
        echo "Error in reorder_file_based_on_task_geom \"$HOST_FILE\" \"sort\", Exit ..." 1>&2
        cleanup_openmpi
        exit ${EXIT_VALUE}
    fi

    # Unquoted on purpose: joins the host file's lines into one
    # space-separated record for awk.
    _new_host_list=$(cat "$HOST_FILE")
    echo $_new_host_list | "$AWK_CMD" ' {
        size = split($0, a, " ");
        for (i = 1; i <= size; i++) {
            newln = sprintf("-host %s -n 1 %s %s",a[i], (openmpi_opt)?openmpi_opt:" ", (cmdln)?cmdln:" ");
            print newln > appfile
        }
    }' appfile="$APP_FILE" cmdln="$CMD_LINE" openmpi_opt="$OPENMPI_OPTS"
else
    # a[i] is the host, a[i + 1] its slot count; step over both each round.
    echo $LSB_MCPU_HOSTS | "$AWK_CMD" ' {
        size = split($0, a, " ");
        for (i = 1; i < size; i += 2) {
            newln = sprintf("-host %s -n %s %s %s",a[i], a[i + 1], (openmpi_opt)?openmpi_opt:" ", (cmdln)?cmdln:" ");
            print newln > appfile
        }
    }' appfile="$APP_FILE" cmdln="$CMD_LINE" openmpi_opt="$OPENMPI_OPTS"
fi

#------------------------------------------------------------------
# Launch:
# - append the generated appfile and the launch command to LOGFILE
# - run mpirun on the appfile and remember its exit status
#------------------------------------------------------------------
# APP_FILE and LOGFILE are quoted so that paths with whitespace work and an
# empty APP_FILE cannot leave "cat" reading from stdin.
echo "appfile $APP_FILE reads" >> "${LOGFILE:-/dev/null}"
cat "$APP_FILE" >> "${LOGFILE:-/dev/null}"
echo "command used to launch the job : " >> "${LOGFILE:-/dev/null}"
echo "   $MPIRUN_CMD -app $APP_FILE " >> "${LOGFILE:-/dev/null}"
# MPIRUN_CMD is left unquoted on purpose so it may carry extra words.
$MPIRUN_CMD --app "$APP_FILE"
EXIT_VALUE=$?

#------------------------------------------------------------------
# Clean up:
# - remove temporary files
# - exit with the exit value of the mpirun command
#------------------------------------------------------------------
cleanup_openmpi
exit $EXIT_VALUE

#-------------------------------------------------------------------------
# End the script.
#-------------------------------------------------------------------------