|
Packit |
857059 |
#!/bin/bash
|
|
Packit |
857059 |
# BEGIN_ICS_COPYRIGHT8 ****************************************
|
|
Packit |
857059 |
#
|
|
Packit |
857059 |
# Copyright (c) 2015, Intel Corporation
|
|
Packit |
857059 |
#
|
|
Packit |
857059 |
# Redistribution and use in source and binary forms, with or without
|
|
Packit |
857059 |
# modification, are permitted provided that the following conditions are met:
|
|
Packit |
857059 |
#
|
|
Packit |
857059 |
# * Redistributions of source code must retain the above copyright notice,
|
|
Packit |
857059 |
# this list of conditions and the following disclaimer.
|
|
Packit |
857059 |
# * Redistributions in binary form must reproduce the above copyright
|
|
Packit |
857059 |
# notice, this list of conditions and the following disclaimer in the
|
|
Packit |
857059 |
# documentation and/or other materials provided with the distribution.
|
|
Packit |
857059 |
# * Neither the name of Intel Corporation nor the names of its contributors
|
|
Packit |
857059 |
# may be used to endorse or promote products derived from this software
|
|
Packit |
857059 |
# without specific prior written permission.
|
|
Packit |
857059 |
#
|
|
Packit |
857059 |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
Packit |
857059 |
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
Packit |
857059 |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
Packit |
857059 |
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
|
Packit |
857059 |
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
Packit |
857059 |
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
Packit |
857059 |
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
Packit |
857059 |
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
Packit |
857059 |
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
Packit |
857059 |
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
Packit |
857059 |
#
|
|
Packit |
857059 |
# END_ICS_COPYRIGHT8 ****************************************
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# [ICS VERSION STRING: unknown]
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Scratch files used while splitting the host list: temp1 holds the hosts
# that have not been assigned to a batch yet, temp2 is a staging file used
# when temp1 is rewritten.
temp1="$(mktemp)" || {
	echo "run_batch_script: unable to create temporary file" >&2
	exit 1
}
temp2="$(mktemp)" || {
	rm -f -- "$temp1"
	echo "run_batch_script: unable to create temporary file" >&2
	exit 1
}
# Per-batch host files and job output are written below the current directory.
tmpdir="$PWD/tmp"

# Remove the scratch files on every exit path. The single quotes defer
# expansion until the trap fires and the inner double quotes keep a
# temporary path containing whitespace intact.
trap 'rm -f -- "$temp1" "$temp2"; exit 1' SIGHUP SIGTERM SIGINT
trap 'rm -f -- "$temp1" "$temp2"' EXIT

BATCH_SIZE=${BATCH_SIZE:-18} # maximum number of hosts per mpi job
MIN_BATCH_SIZE=${MIN_BATCH_SIZE:-2} # minimum number of hosts per mpi job
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Start the given run_* script as a series of small jobs of at most
# BATCH_SIZE hosts each. This approach limits the impact should a few hosts
# crash and also allows for quicker job startup.

# Jobs are nohup'ed so they can run for a long duration in the background.
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Print the command line help and terminate the script.
# Arguments: none
# Outputs:   the help text, written to stderr only
# Exits:     always, with status 2
Usage() {
	# Quoted delimiter: the text is emitted literally, with no expansion.
	cat >&2 <<'EOF'
Usage: ./run_batch_script [-e] run_script [args]
 or
 ./run_batch_script --help
 -e - force an even number of hosts in final batch by skipping the last one
 run_script - a run_* script from this directory
 args - arguments for the run_script
 if the 1st arg is 'NP' it will be replaced with the process count

 This will build a set of mpi_hosts.# files with no
 more than BATCH_SIZE hosts each. If -e is specified and an odd number of
 hosts appear in mpi_hosts, the last one is skipped
 Each run_script MPI job will have its output saved to a corresponding
 ./tmp/nohup.#.out file

 This may only be used for scripts which use MPI_HOSTS.
 To run run_cabletest use run_batch_cabletest
Environment:
 MPI_HOSTS - mpi_hosts file to use, default is mpi_hosts
 BATCH_SIZE - max hosts per MPI job, default is 18, if -e must be even
 MIN_BATCH_SIZE - min hosts per MPI job, default is 2, if -e must be even
Examples:
 ./run_batch_script run_deviation NP ff
 BATCH_SIZE=2 MPI_HOSTS=good ./run_batch_script run_lat2
 BATCH_SIZE=16 MPI_HOSTS=good ./run_batch_script run_deviation ff
 MIN_BATCH_SIZE=16 BATCH_SIZE=16 ./run_batch_script run_hpl2 16
EOF
	exit 2
}
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Command line handling. Sets:
#   even   - y when -e was given, otherwise n
#   script - the run_* script to launch
#   np     - y when the first script argument is the literal NP, otherwise n
# Whatever remains in "$@" afterwards is passed to the script unchanged.

# --help as the first argument prints the usage text, which exits.
if [ x"$1" = x"--help" ]; then
	Usage
fi

even=n
if [ x"$1" = x"-e" ]; then
	even=y
	shift
fi

# The script name is mandatory.
if [ $# -lt 1 ]; then
	Usage
fi
script=$1
shift

# Both expansions are quoted so that a script path containing whitespace is
# compared as one word instead of producing a '[' syntax error.
if [ "$(basename -- "$script")" = 'run_cabletest' ]; then
	echo "To execute run_cabletest use run_batch_cabletest instead" >&2
	exit 1
fi

# A leading NP argument is consumed here; the batch loop later substitutes
# the per-job host count for it.
np=n
if [ x"$1" = x"NP" ]; then
	np=y
	shift
fi
|
|
Packit |
857059 |
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Validate the environment driven settings. Every failure prints a specific
# message to stderr and then calls Usage.

export MPI_HOSTS="${MPI_HOSTS:-mpi_hosts}"
if [ ! -e "$MPI_HOSTS" ]; then
	echo "MPI_HOSTS='$MPI_HOSTS': Not Found" >&2
	Usage
fi

# The redirect hides the error '[' reports for a non-numeric value; such a
# value makes the test fail and is therefore rejected as well.
if ! [ "$BATCH_SIZE" -gt 0 ] 2>/dev/null; then
	echo "Invalid BATCH_SIZE='$BATCH_SIZE': must be a number > 0" >&2
	Usage
fi
if [ "$even" = y ] && [ $(($BATCH_SIZE % 2)) -eq 1 ]; then
	echo "Invalid BATCH_SIZE='$BATCH_SIZE': must be an even number" >&2
	Usage
fi
if ! [ "$MIN_BATCH_SIZE" -gt 0 ] 2>/dev/null; then
	echo "Invalid MIN_BATCH_SIZE='$MIN_BATCH_SIZE': must be a number > 0" >&2
	Usage
fi
if [ "$even" = y ] && [ $(($MIN_BATCH_SIZE % 2)) -eq 1 ]; then
	echo "Invalid MIN_BATCH_SIZE='$MIN_BATCH_SIZE': must be an even number" >&2
	Usage
fi
# Equal values are accepted, so the message states <= rather than <.
if [ "$MIN_BATCH_SIZE" -gt "$BATCH_SIZE" ] 2>/dev/null; then
	echo "Invalid MIN_BATCH_SIZE='$MIN_BATCH_SIZE': must be <= BATCH_SIZE of $BATCH_SIZE" >&2
	Usage
fi
|
|
Packit |
857059 |
|
|
Packit |
857059 |
# Split the host list into batches and start one background job per batch.

# ignore blank and comment lines in mpi_hosts
grep -E -v '^[[:space:]]*(#|$)' "$MPI_HOSTS" > "$temp1"
echo "Starting $script jobs using $MPI_HOSTS: $(wc -l < "$temp1") hosts"
echo " Max batch size $BATCH_SIZE, Min batch size $MIN_BATCH_SIZE"

#set -x
mkdir -p "$tmpdir" || exit 1
loop=1
# temp1 always holds the hosts not yet assigned; keep going while enough
# remain to form a job of at least MIN_BATCH_SIZE hosts.
while [ "$(wc -l < "$temp1")" -ge "$MIN_BATCH_SIZE" ]; do
	batch_hosts="$tmpdir/mpi_hosts.$loop"
	head -n "$BATCH_SIZE" "$temp1" > "$batch_hosts"
	batch_count=$(( $(wc -l < "$batch_hosts") ))
	if [ "$even" = y ] && [ $((batch_count % 2)) -eq 1 ]; then
		# need an even number, drop the last one
		# (the expansion is left unquoted on purpose so the host line is
		# printed with its whitespace collapsed)
		echo "Skipping odd host:" $(tail -n 1 "$batch_hosts")
		batch_count=$((batch_count - 1))
		head -n "$batch_count" "$temp1" > "$batch_hosts"
	fi
	# Remove the hosts just consumed from the pending list.
	tail -n +$(($BATCH_SIZE + 1)) "$temp1" > "$temp2"; mv "$temp2" "$temp1"
	NP=$batch_count
	echo "starting $script job $loop on $NP hosts"
	[ "$np" = n ] && NP= # don't insert process count
	export MPI_HOSTS="$batch_hosts"
	export LOGSUFFIX=".$loop"
	# ${NP:+...} contributes no argument at all when NP is empty.
	nohup "./$script" ${NP:+"$NP"} "$@" > "$tmpdir/nohup.$loop.out" 2>&1 &
	loop=$(($loop + 1))
	#echo "Enter to Continue"
	#read trash
done

# Report the hosts that were left over because fewer than MIN_BATCH_SIZE
# remained (unquoted on purpose: join the lines with single spaces).
if [ "$(wc -l < "$temp1")" -gt 0 ]; then
	echo "Skipping odd hosts:" $(cat "$temp1")
fi
rm -f -- "$temp1" "$temp2"

echo "Started $(($loop - 1 )) MPI $script jobs"
|