#!/usr/libexec/platform-python
#
# Authors: Jiri Jaburek <jjaburek@redhat.com>
#
# Description: Test watching wrapper for runtest.sh or similar runnable
#
# Copyright (c) 2013 Red Hat, Inc. All rights reserved. This copyrighted
# material is made available to anyone wishing to use, modify, copy, or
# redistribute it subject to the terms and conditions of the GNU General
# Public License v.2.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# test watcher - an idea
#
# LWD = Beaker Local Watchdog, expires when TestTime (Makefile) reaches zero
# EWD = Beaker External Watchdog, expires 30m after LWD expiration
#
# - set up a temporary file usable for cleanup file path transfer
# and export its path via an env variable
# - hook LWD when run from Beaker
# - this hook will send SIGHUP to the watcher on LWD expire and block
# until the watcher process exits
# - set up SIGHUP handling, which
# - sends SIGKILL to test if running
# - sets up SIGALRM handler for EWD and schedules alarm(2)
# - EWD handler SIGKILLs cleanup (if running)
# - run test
# - if it finishes in time, do nothing (unset pid)
# - if INT is received while it is running, SIGKILL the test, unset pid
# - execute possible cleanup
# - if it still finishes in time (no HUP so far), do nothing (unset pid)
# - if INT is received while it is running, SIGKILL cleanup, unset pid
# - exit cleanly
#
# Some considerations taken into account / tested:
#
# - SIGHUP is received while running cleanup (TestTime expired after test exit)
# - testpid is already 0, only SIGALRM (cleanup kill) is scheduled, giving
# the cleanup another ewd_maxsecs seconds to finish
# - SIGTERM is received at any time
# - the only reasonable case is system reboot/poweroff, which we cannot
# delay anyway (to allow cleanup execution), so just exit (SIG_DFL)
# - this doesn't really damage anything, the test can be easily re-run and
# continue creating the cleanup, if the previous cleanup state was saved
# and the test sends the cleanup path to the watcher again
from __future__ import print_function
import os
import sys
import signal
import time
import errno
import fcntl
import tempfile
### CONFIG
#
# Beaker External Watchdog strikes 30 minutes after the LWD expires; by
# default only 25 minutes (1500 secs) of that are used before the cleanup
# gets killed, the amount can be changed via the TESTWATCHER_EWD_SECS env var
try:
    ewd_maxsecs = int(os.environ['TESTWATCHER_EWD_SECS'])
except KeyError:
    # variable not set, go with the default
    ewd_maxsecs = 1500
else:
    # the value is later handed to alarm(2), only positive values are valid
    if ewd_maxsecs <= 0:
        raise Exception("invalid TESTWATCHER_EWD_SECS env var value")
# beah LWD hook: path the watchdog guard script gets written to
lwd_guard_file = '/usr/share/rhts/hooks/watchdog/testwatcher-cleanup-guard'
# file descriptor and file path (name) of a temporary file used to transfer
# the cleanup filename from the test to the watcher; the watcher expects
# the test to write the path of the cleanup executable into this file and
# reads it back just before cleanup execution
clfd, clpath = tempfile.mkstemp(prefix='testwatcher-', dir='/var/tmp') # no-reboot
# env var containing the path, so the test can write to it
os.environ['TESTWATCHER_CLPATH'] = clpath
#
###
### GLOBALS
#
# file name of this script, without the directory part
selfname = os.path.basename(__file__)
# pids of the forked cleanup / test child, 0 when there is none to act on
cleanuppid = 0
testpid = 0
# a non-empty TASKID env variable is taken as "running under beah"
beah = bool(os.environ.get('TASKID'))
#
###
### HELPERS
#
def debug(msg):
    """Write msg to stdout as one 'TESTWATCHER: '-prefixed line and flush it."""
    out = sys.stdout
    out.write('TESTWATCHER: ' + msg + '\n')
    # flush right away instead of leaving the line sitting in the buffer
    out.flush()
def fatal(msg):
    """Write msg to stderr as a 'TESTWATCHER fatal: ' line and exit with status 1.

    Raises SystemExit(1), so it never returns to the caller.
    """
    err = sys.stderr
    err.write('TESTWATCHER fatal: ' + msg + '\n')
    err.flush()
    raise SystemExit(1)
def sigpgkill_safe(pid):
    """SIGKILL the process group led by pid, tolerating a missing target.

    pid -- pid of a child that is (or is about to become) a process group
           leader

    Returns None and never lets OSError escape: when the process cannot be
    signalled at all nothing is done, when only the group kill fails the
    process itself is killed instead.
    """
    # pid 0 / negative values would make kill(2) address our own process
    # group (or more) instead of a single child
    if pid <= 0:
        return
    # if pid does not exist / is not related, return
    try:
        os.kill(pid, 0)
    except OSError:
        return
    try:
        os.killpg(pid, signal.SIGKILL)
    except OSError:
        # no such group - e.g. a freshly forked child which has not reached
        # setpgrp() yet, or everything exited meanwhile; fall back to the
        # process itself and do not mind if it is gone as well
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            pass
def beah_warn(part):
    """Report a WARN result named 'TESTWATCHER (<part>)' via rhts-report-result.

    part -- short text put into the result name; it is pasted into a shell
            command line as-is
    """
    # python "subprocess" not on RHEL4, hence the shell command via os.system
    command = ''.join(
        ('rhts-report-result "TESTWATCHER (', part, ')" WARN /dev/null'))
    os.system(command)
#
###
### BEAH LWD WATCHDOG
#
# custom shell-based watchdog guard
# (selfname[:15] works around 15-char /proc/pid/comm limitation)
# NOTE: the '#!' has to be the very first thing in the written file for the
#       kernel to treat it as an interpreter line, so the literal must not
#       start with a newline
# The script removes itself, checks that the watcher pid (baked in at
# definition time) still belongs to a process with the watcher's name,
# sends it SIGHUP and then blocks until that pid is gone.
watchdog_guard_cont = r"""#!/bin/sh
rm -f "$0"
wrap_pid='"""+str(os.getpid())+r"""'
wrap_name="$(ps c --no-headers -o comm --pid $wrap_pid)"
[ $? -ne 0 ] && { echo "wrapper pid is not running"; exit 0; }
[ "$wrap_name" != '"""+selfname[:15]+r"""' ] && \
{ echo "wrapper pid not a testwatch process: $wrap_name"; exit 0; }
kill -HUP "$wrap_pid"
while ps --no-headers -o pid --pid $wrap_pid >/dev/null; do sleep 1; done;
"""
# write out custom watchdog guard to beaker hooks dir,
# causing it to be launched when local watchdog expires
def beah_lwd_hook():
    """Install the watchdog guard script at lwd_guard_file.

    Creates the directory of lwd_guard_file when it is missing, writes
    watchdog_guard_cont into the file and sets its mode to 0755.

    Raises OSError / IOError when the directory or the file cannot be
    created or written.
    """
    debug('hooking beah LWD')
    try:
        os.makedirs(os.path.dirname(lwd_guard_file))
    except OSError as e:
        # an already existing hooks directory is fine, any other failure
        # must not be hidden
        if e.errno != errno.EEXIST:
            raise
    # the context manager closes the file even if the write fails
    with open(lwd_guard_file, 'w') as f:
        f.write(watchdog_guard_cont)
    os.chmod(lwd_guard_file, 0o755)
# called when EWD (external watchdog) is about to expire
def beah_ewd_action(signum, frame):
    """SIGALRM handler: kill the cleanup process group, if one is tracked.

    signum, frame -- standard signal handler arguments, unused

    Under beah a 'external watchdog' warning is reported as well.
    """
    debug('beah EWD is about to strike')
    pid = cleanuppid
    if pid:
        sigpgkill_safe(pid)
    if beah:
        beah_warn('external watchdog')
# called when LWD expires
def beah_lwd_action(signum, frame):
    """SIGHUP handler: kill the running test and arm the EWD alarm.

    signum, frame -- standard signal handler arguments, unused

    Further SIGHUPs are ignored from now on. The alarm is scheduled
    whether or not a test was running, so a cleanup in progress gets
    ewd_maxsecs seconds before beah_ewd_action is invoked.
    """
    global testpid
    debug('beah LWD expired')
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    running = testpid
    if running:
        sigpgkill_safe(running)
        testpid = 0
    # schedule the EWD action
    signal.signal(signal.SIGALRM, beah_ewd_action)
    signal.alarm(ewd_maxsecs)
    if beah:
        beah_warn('local watchdog')
#
###
### CLEANUP WATCHER
#
# executed by INT sent to the test watcher process
def cleanup_interrupt(signum, frame):
    """SIGINT handler used while the cleanup runs: kill the cleanup group.

    signum, frame -- standard signal handler arguments, unused

    Under beah a 'cleanup interrupt' warning is reported as well.
    """
    debug('cleanup interrupted')
    # ignore future INT
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    pid = cleanuppid
    if pid:
        sigpgkill_safe(pid)
    if beah:
        beah_warn('cleanup interrupt')
def exec_cleanup():
    """Run the cleanup executable the test registered, if any, and wait for it.

    Reads up to 1024 bytes from the start of the clfd temporary file and
    takes the stripped content as the path of the cleanup executable.
    Nothing is run when the content is empty or when the path is not an
    executable regular file. Otherwise cleanup_interrupt is installed for
    SIGINT, the cleanup is forked and exec'ed as a leader of its own
    process group and the parent blocks until it is gone; the child pid is
    kept in the global cleanuppid meanwhile and reset to 0 afterwards.
    """
    global cleanuppid

    # no os.SEEK_SET on RHEL4
    os.lseek(clfd, 0, 0)
    filename = os.read(clfd, 1024).strip()

    # no cleanup
    if not filename:
        debug('no cleanup set')
        return

    # os.read() returns bytes on Python 3, which cannot be concatenated
    # with the str log message in the child below (TypeError, the cleanup
    # would never get exec'ed); on Python 2 it already is a native str
    if not isinstance(filename, str):
        filename = os.fsdecode(filename)

    if not os.path.isfile(filename) or not os.access(filename, os.X_OK):
        debug('cleanup file not found / not executable, skipping')
        return

    # user interrupt now concerns the cleanup
    signal.signal(signal.SIGINT, cleanup_interrupt)

    cleanuppid = os.fork()
    if cleanuppid == 0:
        # child: become process group leader, so the parent can kill the
        # cleanup together with everything it started
        os.setpgrp()
        debug('executing cleanup at '+filename)
        os.execvp(filename, [filename])
    else:
        debug('parent waiting for cleanup '+str(cleanuppid))
        while cleanuppid != 0:
            try:
                os.waitpid(cleanuppid, 0)
                cleanuppid = 0
            except OSError as e:
                # EINTR (interrupted by a signal): just wait again
                # ECHILD: nothing left to wait for, shouldn't happen
                if e.errno == errno.ECHILD:
                    cleanuppid = 0
#
###
### TEST WATCHER
#
# executed by INT sent to the test watcher process
def test_interrupt(signum, frame):
    """SIGINT handler used while the test runs: kill the test process group.

    signum, frame -- standard signal handler arguments, unused

    Under beah a 'test interrupt' warning is reported as well.
    """
    debug('test interrupted')
    # any further INT is ignored
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    # get rid of the (frozen) test along with its process group
    pid = testpid
    if pid:
        sigpgkill_safe(pid)
    # leave a warning in the results
    if beah:
        beah_warn('test interrupt')
def exec_test():
    """Fork, exec the command given in sys.argv[1:] and wait until it is gone.

    Installs beah_lwd_action for SIGHUP and test_interrupt for SIGINT, runs
    the command as a leader of its own process group and keeps its pid in
    the global testpid, which is reset to 0 once waiting is over.
    """
    global testpid

    # NOTE: the handlers are installed before fork on purpose - they do not
    #       make it past execve/execvp in the child and the parent is
    #       covered from the very moment it learns the child pid, so an
    #       INT arriving right after fork can already kill the child
    signal.signal(signal.SIGHUP, beah_lwd_action)   # beaker LWD
    signal.signal(signal.SIGINT, test_interrupt)    # user interrupt

    testpid = os.fork()
    if testpid == 0:
        # child: lead a new process group, so the parent is able to kill
        # all related processes when interrupted
        os.setpgrp()
        command = sys.argv[1:]
        debug('executing test at ' + ' '.join(command))
        os.execvp(command[0], command)

    # parent only from here on, a successful execvp does not return
    debug('parent waiting for test ' + str(testpid))
    while testpid != 0:
        try:
            os.waitpid(testpid, 0)
        except OSError as e:
            # EINTR (interrupted by a signal) means no traceback, just
            # re-check the pid and wait again;
            # ECHILD is a safety measure, shouldn't happen
            if e.errno == errno.ECHILD:
                testpid = 0
        else:
            testpid = 0
#
###
### MAIN
#
# sanity check
if len(sys.argv) < 2:
    # nothing is going to run, so do not leave the mkstemp'ed cleanup
    # transfer file behind on a usage error
    os.close(clfd)
    os.unlink(clpath)
    fatal('usage: '+selfname+' <command> [args]')
# the LWD hook makes sense only when run from Beaker
if beah:
    beah_lwd_hook()
# run the test, then whatever cleanup it registered
exec_test()
debug('parent done waiting for test')
exec_cleanup()
debug('parent done waiting for cleanup')
# remove temporary (mkstemp'ed) file # no-reboot
os.unlink(clpath)
debug('all done, finishing watcher')
sys.exit(0)