Blob Blame History Raw
#!/bin/bash
# BEGIN_ICS_COPYRIGHT8 ****************************************
# 
# Copyright (c) 2015-2018, Intel Corporation
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Intel Corporation nor the names of its contributors
#       may be used to endorse or promote products derived from this software
#       without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# 
# END_ICS_COPYRIGHT8   ****************************************

# [ICS VERSION STRING: unknown]

# Usage: opacapture output_file_name
# captures system information for IntelOPA problem reporting

Usage_full()
{
	echo "Usage: opacapture [-d detail] output_tgz_file" >&2
	echo "              or" >&2
	echo "       opacapture --help" >&2
	echo "   --help - produce full help text" >&2
	echo "   -d detail - level of detail of capture" >&2
	echo "               1-Local 2-Fabric 3-Fabric+FDB 4-Analysis (default=1)" >&2
	echo "This will capture critical system information into a zipped tar file" >&2
	echo "The program will automatically append .tgz to the <output_tgz_file>" >&2
	echo "if it does not already have a .tgz suffix" >&2
	echo "The resulting tar file should be sent to Customer Support along with any" 2>&1
	echo "IntelOPA problem report regarding this system" >&2
}

Usage()
{
	Usage_full
	exit 2
}

if [ `basename $0` = ics_capture ]
then
	echo "warning: ics_capture is depricated, use opacapture" >&2
fi

if [ x"$1" = "x--help" ]
then
	Usage_full
	exit 0
fi


if [ -f /usr/lib/opa/tools/ff_funcs ]
then
	. /usr/lib/opa/tools/ff_funcs
	ff_available=y
else
	ff_available=n
fi

if [ $ff_available = "y" ]
then	
	if [ -f $CONFIG_DIR/opa/opafastfabric.conf  ]
	then
		. $CONFIG_DIR/opa/opafastfabric.conf
	fi
	
	. /usr/lib/opa/tools/opafastfabric.conf.def
fi

detail=1

while getopts d: param
do
	case $param in
        d)
                detail="$OPTARG";;
	?)
		Usage;;
	esac
done
shift $((OPTIND -1))

if [ $# != 1 ]
then
	Usage
fi

if [ `id -u` -ne 0 ]
then
	echo "This must be run as user root" >&2
	Usage
fi

if [ x`expr "$1" : '\(/\).*'` != x'/' ]
then
	# relative path
	tar_file=`pwd`/$1
else
	# absolute path
	tar_file=$1
fi

# append .tgz suffix if not already present
if [ x$(expr "$tar_file" : '.*\(\.tgz\)') != x'.tgz' ]
then
	tar_file="$tar_file.tgz"
fi

dir=tmp/capture$$
rm -rf /$dir
mkdir /$dir

echo "Capture Info: Detail: $detail; Date: $(date)" >> /$dir/capture_info

echo "Getting software and firmware version information ..."
echo "[ICS VERSION STRING: unknown]" > /$dir/sw_version
uname -a > /$dir/os_version
# we use query format so we can get ARCH information
rpm --queryformat '[%{NAME}-%{VERSION}-%{RELEASE}.%{ARCH}\n]' -qa > /$dir/rpms.detailed
# get simple version just to be safe
rpm -qa > /$dir/rpms

sha256sum /usr/lib/opa-fm/runtime/sm > /$dir/sha256sums
sha256sum /usr/lib/opa-fm/runtime/fe >> /$dir/sha256sums

echo "Capturing FM binaries and debuginfo if available"
rpm -q opa-fm-debuginfo > /dev/null 2>&1
if [ $? -eq 0 ]; then
	debuginfofiles=$(rpm -ql opa-fm-debuginfo | xargs -I% bash -c "if [ -f % ]; then echo %; fi")
	tar -zcf /$dir/opa-fm-debuginfo.tgz $debuginfofiles > /dev/null 2>&1
	unset debuginfofiles
fi
tar -zcf /$dir/opa-fm-bins.tgz /usr/lib/opa-fm/runtime/ > /dev/null 2>&1

# Finding the PCI devices
for fw in `lspci -n | grep "8086:24f0"`
do
	# Just get the PCI info for now...
	echo "$fw" >> /$dir/fw_info 
done

opahfirev > /$dir/opahfirev 2>&1

echo "Capturing Firmware info if available"
type hfi1_eprom > /dev/null 2>&1
if [ $? -eq 0 ]; then
	hfi1_eprom -d all -V > /$dir/uefi_version 2>&1
fi

type opatmmtool > /dev/null 2>&1
if [ $? -eq 0 ]; then
	opatmmtool 2>/dev/null 1>/dev/null
	tmm=$?
	if [ $tmm -ne 3 ]
	then
		echo "Getting TMM information..."
		opatmmtool -v status > /$dir/f4status 2>&1
		opatmmtool -f /$dir/f4otpdump dumpotp 2>&1
	fi
fi

echo "Obtaining OS configuration ..."
# get library config
ldconfig -p > /$dir/ldconfig
# get current runlevel
who -r > /$dir/who_r 2>/dev/null	# not available on all OSs
runlevel > /$dir/runlevel	# not available on all OSs
# get service startup configuration
if [ $(command -v systemctl) ]; then
	systemctl list-unit-files > /$dir/chkconfig.systemd
fi
chkconfig --list > /$dir/chkconfig 2>/dev/null
ulimit -a > /$dir/ulimit
uptime > /$dir/uptime

echo "Obtaining dmesg logs ..."
dmesg -T > /$dir/dmesg

echo "Obtaining present process and module list ..."
lsmod > /$dir/lsmod 2>&1
depmod -a 2>&1
cp -p /lib/modules/`uname -r`/modules.dep /$dir/modules.dep
ps -welf > /$dir/ps 2>&1

echo "Obtaining module info for hfi1 ..."
modinfo hfi1 > /$dir/modinfo_hfi1 2>&1

echo "Obtaining PCI device list ..."
lspci -vvv -xxxx > /$dir/lspci 2>&1
ls -l /dev/ipath* /dev/hfi* /dev/infiniband > /$dir/lsdev

echo "Obtaining processor information ..."
cpucount=$(grep -c processor /proc/cpuinfo)
cpupower -c 0-$((cpucount - 1)) frequency-info > /$dir/cpu-frequency-info 2>&1
grep . /sys/devices/system/cpu/cpu*/cpufreq/scaling* > /$dir/cpu-scaling-info 2>&1
grep . /sys/devices/system/cpu/intel_pstate/* > /$dir/cpu-intel_pstate 2>&1
unset cpucount

lscpu > /$dir/lscpu 2>&1
lscpu --extended=CPU,CORE,SOCKET,NODE,BOOK,DRAWER,CACHE,POLARIZATION,ADDRESS,CONFIGURED > /$dir/lscpu-extended 2>&1

echo "Obtaining environment variables ..."
env > /$dir/env 2>&1

echo "Obtaining network interfaces ..."
ip addr show > /$dir/ifconfig 2>&1

echo "Obtaining DMI information ..."
dmidecode > /$dir/dmidecode 2>&1

echo "Obtaining Shared Memory information ..."
sysctl -a > /$dir/sysctl 2>&1
sysctl -a 2>/dev/null | grep kernel.shm > /$dir/shm 2>&1
ls -d /dev/shm >> /$dir/shm 2>&1
ls -lR /dev/shm >> /$dir/shm 2>&1

echo "Obtaining OmniPath information ..."

# concise port statistics
opainfo > /$dir/opainfo 2>&1
opainfo -o stats > /$dir/opainfo-stats 2>&1
opainfo -o info > /$dir/opainfo-info 2>&1

echo "Obtaining MPI configuration ..."
if type mpi-selector >/dev/null 2>/dev/null
then
	mpi-selector --list > /$dir/mpi-selector-list
	mpi-selector --system --query > /$dir/mpi-selector-system
	mpi-selector --user --query > /$dir/mpi-selector-user
fi

mkdir /$dir/proc
echo "Copying configuration and statistics for OPA drivers from /proc ..."
for proc_file in cmdline cpuinfo ksyms meminfo mtrr modules net/arp net/dev net/dev_mcast net/route net/rt_cache net/vlan pci interrupts devices filesystems iomem ioports slabinfo version uptime scsi iba driver/ics_dsc driver/ipoib driver/sdp driver/rds irq acpi/processor
do
	if [ -e /proc/$proc_file ]
	then
		cp -p -r /proc/$proc_file /$dir/proc
	fi
done
for proc_file in `ps -eo pid`
do
	if [ -e /proc/$proc_file/stack ]
	then
		mkdir -p /$dir/proc/$proc_file
		cp -p -r /proc/$proc_file/stack /$dir/proc/$proc_file
	fi
done

echo "Obtaining additional CPU info..."
cpupower frequency-info > /$dir/cpupower-freq-info

# Check if HFI driver debug data dir) is present; log only if present
HFI_DEBUGDIR="/sys/kernel/debug/hfi1"
if [ -d ${HFI_DEBUGDIR} ]
then
	#hfi1stats requires the existance of ${HFI_DEBUGDIR}
	echo "Obtaining HFI statistics ..."
	hfi1stats > /$dir/hfi1stats 2>&1

	mkdir -p /${dir}${HFI_DEBUGDIR}
	echo "Copying kernel debug information from ${HFI_DEBUGDIR}..."
	cp -p -r ${HFI_DEBUGDIR}/* /${dir}/${HFI_DEBUGDIR} 2>/dev/null
fi

# Check if IPOIB debug data dir is present; log if present
IPOIB_DEBUGDIR="/sys/kernel/debug/ipoib"
if [ -d ${IPOIB_DEBUGDIR} ]
then
	echo "Obtaining IPOIB debug information from ${IPOIB_DEBUGDIR}..."
	mkdir -p /${dir}${IPOIB_DEBUGDIR}
	echo "Copying IPOIB debug information from ${IPOIB_DEBUGDIR}..."
	cp -r ${IPOIB_DEBUGDIR}/* /${dir}/${IPOIB_DEBUGDIR} 2>/dev/null
fi

# Check if side channel security issue mitigation information files are
# present; log if present
SIDE_CHANNEL_MITIGATION_INFO="/sys/devices/system/cpu/vulnerabilities"
if [ -d ${SIDE_CHANNEL_MITIGATION_INFO} ]
then
	echo "Obtaining side channel security issue mitigation information from ${SIDE_CHANNEL_MITIGATION_INFO}"
	mkdir -p /${dir}${SIDE_CHANNEL_MITIGATION_INFO}
	echo "Copying side channel security issue mitigation information from ${SIDE_CHANNEL_MITIGATION_INFO}..."
	cp -r ${SIDE_CHANNEL_MITIGATION_INFO}/* /${dir}${SIDE_CHANNEL_MITIGATION_INFO} 2>/dev/null
fi

# Check if side channel security issue mitigation kernel configuration files are
# present; log if present
KERNEL_CONFIG_LOC=/sys/kernel/debug/x86
SIDE_CHANNEL_MITIGATION_KERNEL_CONFIG_FILES="ibpb_enabled ibrs_enabled pti_enabled retp_enabled"
for fname in ${SIDE_CHANNEL_MITIGATION_KERNEL_CONFIG_FILES}
do
	if [ -e ${KERNEL_CONFIG_LOC}/${fname} ]
	then
		echo "Obtaining kernel configuration file ${KERNEL_CONFIG_LOC}/${fname}"
		if [ ! -d /${dir}${KERNEL_CONFIG_LOC} ]
		then
			mkdir -p /${dir}${KERNEL_CONFIG_LOC}
		fi
		echo "Copying kernel configuration file ${KERNEL_CONFIG_LOC}/${fname}..."
		cp ${KERNEL_CONFIG_LOC}/${fname} /${dir}${KERNEL_CONFIG_LOC} 2>/dev/null
	fi
done

mkdir -p /$dir/sys/class
if [ -e /sys/class/infiniband ]
then
	echo "Copying configuration and statistics for ib_ drivers from /sys ..."
	cp -p -r /sys/class/*infiniband* /$dir/sys/class 2>/dev/null
	mkdir -p /$dir/sys/class/infiniband
	for f in /sys/class/infiniband/*
	do
		if [ -h $f ]
		then
			rm -f /$dir/$f
			cp -p -r $f/ /$dir/sys/class/infiniband/ 2>/dev/null
			if [ -h $f/device ]
			then
				dev=`basename $f`
				unit=`expr $dev : 'hfi1_\(.*\)'`
				if [ ! -z "$unit" ]
				then
					echo "   Getting statedump for $dev ..."
					echo -e "unit $unit\nstate save hw /$dir/opa${dev}.dump"|hfidiags -s - > /$dir/opa${dev}.dump.res 2>&1
				fi
			#	rm -f /$dir/$f/device
			#	mkdir -p /$dir/sys/class/infiniband/$dev/ 2>/dev/null
			#	cp -p -r $f/device/ /$dir/sys/class/infiniband/$dev/ 2>/dev/null
			fi
		fi
	done
fi

if [ -e /sys/class/scsi_host ]
then
	cp -p -r /sys/class/scsi_host /$dir/sys/class 2>/dev/null
fi
if [ -e /sys/class/scsi_device ]
then
	cp -p -r /sys/class/scsi_device /$dir/sys/class 2>/dev/null
fi

if [ -e /sys/class/net/ ]
then
	echo "Copying interface information for ipoib"
	mkdir -p /$dir/sys/class/net
	cp -r /sys/class/net/ /$dir/sys/class 2>/dev/null
	for f in /sys/class/net/*
	do
		if [ -h $f ]
		then
			rm -f /$dir/$f
			cp -r $f/ /$dir/sys/class/net/ 2>/dev/null
			if [ -h $f/device ]
			then
				iface=`basename $f`
				rm -f /$dir/$f/device
				mkdir -p /$dir/sys/class/net/$iface/ 2>/dev/null
				cp -r $f/device/ /$dir/sys/class/net/$iface/ 2>/dev/null
			fi
		fi
	done
fi

if [ -e /sys/module ]
then
	echo "Copying configuration and statistics for OPA from /sys/module ..."
	cp -p -r /sys/module /$dir/sys 2>/dev/null 
fi

if [ -f /usr/lib/opa-fm/bin/fm_capture ]
then
	echo "Gathering Host FM Information ..."
	(cd /$dir; /usr/lib/opa-fm/bin/fm_capture)
fi

if [ -f /usr/bin/opa_osd_dump ]
then
	echo "Gathering Distributed SA data..."
	opa_osd_dump > /$dir/opa_osd_dump 2>&1
fi

if [ $detail -ge 2 ]
then
    mkdir -p /$dir/fabric
    cd /$dir/fabric/
    if [ "$ff_available" = y ]
    then
        check_ports_args opacapture
    else
        PORTS='0:0'
        if [ $detail -ge 3 ]
        then
            echo "Warning: opacapture detail=$detail but FastFabric not available"
        fi
    fi

    if [ $detail -ge 3 -a "$ff_available" = y ]
    then
        echo "Gathering Fabric-Level Information with FDBs ..."
    else
        echo "Gathering Fabric-Level Information ..."
    fi

    for hfi_port in $PORTS
    do
        hfi=$(expr $hfi_port : '\([0-9]*\):[0-9]*')
        port=$(expr $hfi_port : '[0-9]*:\([0-9]*\)')
        if [ "$hfi" = "" -o "$port" = "" ]
        then
            echo "opacapture: Error: Invalid port specification: $hfi_port" >&2
            continue
		fi
		## fixup name so winzip won't complain
		hfi_port_dir=${hfi}_${port}

        # make hfi_port directory
        mkdir $hfi_port_dir

        # opasaquery doesn't require FF available
        /usr/sbin/opasaquery -h $hfi -p $port -o node > $hfi_port_dir/nodes 2>&1
        /usr/sbin/opafabricinfo -p $hfi_port > $hfi_port_dir/opafabricinfo 2>&1

        if [ "$ff_available" = y ]
        then
            router_opt=""

            if [ $port -eq 0 ]
            then
                 port_opt="-h $hfi"
            else
                 port_opt="-h $hfi -p $port"
            fi

            # determine if port is management enabled
            /usr/sbin/opasmaquery $port_opt -o pkey 2>/dev/null|grep -q 0xffff
            mgmt_disabled=$?

            # add router table information to snapshot report
            if [ $detail -gt 2 ]
            then
                router_opt="-r"
            fi

            if [ $mgmt_disabled -eq 0 ]
            then
                /usr/sbin/opareport $port_opt -o snapshot -s -V $router_opt > $hfi_port_dir/snapshot.xml 2> $hfi_port_dir/snapshot.xml.err
                /usr/sbin/opareport $port_opt -o snapshot -m -M -s -V $router_opt > $hfi_port_dir/snapshot_direct.xml 2> $hfi_port_dir/snapshot_direct.xml.err
                /usr/sbin/opareport -o links -X $hfi_port_dir/snapshot.xml > $hfi_port_dir/fabric_links 2>&1
                /usr/sbin/opareport -o comps -X $hfi_port_dir/snapshot.xml > $hfi_port_dir/fabric_comps 2>&1
                /usr/sbin/opareport -o errors -X $hfi_port_dir/snapshot.xml > $hfi_port_dir/fabric_errors 2>&1
                /usr/sbin/opareport -o extlinks -X $hfi_port_dir/snapshot.xml > $hfi_port_dir/fabric_extlinks 2>&1
                /usr/sbin/opareport -o slowlinks -X $hfi_port_dir/snapshot.xml > $hfi_port_dir/fabric_slowlinks 2>&1
                /usr/sbin/opareport -o vfmember -V -d 4  > $hfi_port_dir/fabric_vfmember 2>&1
            fi

            if [ $detail -gt 2 ]
            then
                echo "Gathering Multicast Membership ..."
                /usr/sbin/opashowmc -p $hfi:$port > $hfi_port_dir/fabric_showmc 2>&1
                # create cable health report directory and generate report
                if [ $ff_available = "y" ] && [ -e "${FF_CABLE_HEALTH_REPORT_DIR}" ]
                then
                    echo "Gathering Cable Health Report ..."
                    FF_CABLE_HEALTH_REPORT_DIR_WITH_HFI_PORT="${FF_CABLE_HEALTH_REPORT_DIR}/${hfi_port_dir}/"
                    mkdir -p ${FF_CABLE_HEALTH_REPORT_DIR_WITH_HFI_PORT}
                    /usr/sbin/opareport -h $hfi -p $port -o cablehealth 2>/dev/null>${FF_CABLE_HEALTH_REPORT_DIR_WITH_HFI_PORT}/cablehealth$(date "+%Y%m%d%H%M%S").csv
                fi
            fi
        fi
    done
fi

if [ $ff_available = "y" ] && [ -e "${FF_CABLE_HEALTH_REPORT_DIR}" ]
then
        echo "Copying all Cable Health Reports"
        cp -p -r ${FF_CABLE_HEALTH_REPORT_DIR} /$dir/ 2>/dev/null
fi

cd /
files="$dir"
for f in var/log/opa* var/log/ics_* var/log/messages* var/log/ksyms.* var/log/boot* etc/*release* etc/sysconfig/ipoib.cfg* etc/opa etc/modules.conf* etc/modprobe.conf* etc/sysconfig/network-scripts/ifcfg* etc/dapl/ibhosts etc/hosts etc/sysconfig/boot etc/sysconfig/firstboot etc/dat.conf etc/sysconfig/network/ifcfg* etc/infiniband etc/sysconfig/*config etc/security etc/opa-fm/opafm.xml etc/sysconfig/iview_fm.config var/log/fm* var/log/sm* var/log/bm* var/log/pm* var/log/fe* var/log/opensm* var/log/ipath* etc/rc.d/rc.local etc/modprobe.d boot/grub/menu.lst boot/grub/grub.conf boot/grub2/grub.cfg boot/grub2/grubenv boot/grub2/device.map etc/grub*.conf etc/udev* etc/opensm etc/sysconfig/opensm etc/rdma/* etc/modprobe.d/* etc/dracut.conf.d/* etc/nsswitch.conf etc/sysconfig/irqbalance
do
	if [ -e "$f" ]
	then
		files="$files $f"
	fi
done
if [ -e /usr/bin/opaxmlextract ]
then
	for f in $(opaxmlextract -H -e LogFile < /etc/opa-fm/opafm.xml 2>/dev/null)
	do
		case "$f" in
		/var/log/fm*|/var/log/sm*|/var/log/bm*|/var/log/pm*|/var/log/fe*)
			>/dev/null;;
		*)
			if [ -e "$f" ]
			then
				files="$files $(echo $f|sed -e 's|^/||')"
			fi;;
		esac
	done
fi

for f in usr/local/src/mpi_apps/core* usr/src/opa/mpi_apps/core* usr/src/opa/shmem_apps/core*
do
	if [ -e "$f" ]
	then
		files="$files $f"
	fi
done
if [ "$ff_available" = y -a -d "$FF_MPI_APPS_DIR" ]
then
	apps_dir=$(echo $FF_MPI_APPS_DIR|sed -e 's|^/||')	# drop leading /
	for f in $apps_dir/core*
	do
		if [ -e "$f" ]
		then
			files="$files $f"
		fi
	done
fi

if [ $detail -ge 4 -a "$ff_available" = y ]
then
	if [ ! -d $FF_ANALYSIS_DIR/latest ]
	then
		rm -f $FF_ANALYSIS_DIR/opaallanalysis
		mkdir -p $FF_ANALYSIS_DIR
		baseline_opt=""
		if [ ! -d $FF_ANALYSIS_DIR/baseline ]
		then
			echo "Performing Fabric Analysis Baseline ..."
			baseline_opt="-b"
		else
			echo "Performing Fabric Analysis ..."
		fi
		/usr/sbin/opaallanalysis $baseline_opt > $FF_ANALYSIS_DIR/opaallanalysis 2>&1
	else
		echo "Copying Fabric Analysis ..."
	fi
	files="$files $(echo $FF_ANALYSIS_DIR|sed -e 's|^/||')"
fi

echo "Creating tar file $tar_file ..."
tar --format=gnu -czf $tar_file $files
retval=$?
rm -rf /$dir

if [ $retval -ne 0 ]
then
	echo "tar encountered an issue while generating the tarball. Please verify the tarball was created successfully, files that have changed are acceptable." >&2
fi

echo "Done."
echo
echo "Please include $tar_file with any problem reports to Customer Support"

exit $retval