| """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS) |
| """ |
| |
| |
| from __future__ import print_function, unicode_literals, absolute_import, division |
| |
| __copyright__ = "Copyright 2000-2019 the Pacemaker project contributors" |
| __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" |
| |
| import os |
| import re |
| import time |
| import subprocess |
| import tempfile |
| |
| from stat import * |
| from cts import CTS |
| from cts.CTSaudits import * |
| from cts.CTSvars import * |
| from cts.patterns import PatternSelector |
| from cts.logging import LogFactory |
| from cts.remote import RemoteFactory, input_wrapper |
| from cts.watcher import LogWatcher |
| from cts.environment import EnvFactory |
| |
| AllTestClasses = [ ] |
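# Registry of runnable test classes: each concrete test below appends itself
# here, and the CTS test runner builds its list of tests from it.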
| |
| |
| class CTSTest(object): |
| ''' |
| A Cluster test. |
| We implement the basic set of properties and behaviors for a generic |
| cluster test. |
| |
| Cluster tests track their own statistics. |
| We keep each of the kinds of counts we track as separate {name,value} |
| pairs. |
| ''' |
| |
| def __init__(self, cm): |
| |
| self.Stats = {"calls":0 |
| , "success":0 |
| , "failure":0 |
| , "skipped":0 |
| , "auditfail":0} |
| |
| |
| |
| self.CM = cm |
| self.Env = EnvFactory().getInstance() |
| self.rsh = RemoteFactory().getInstance() |
| self.logger = LogFactory() |
| self.templates = PatternSelector(cm["Name"]) |
| self.Audits = [] |
| self.timeout = 120 |
| self.passed = 1 |
| self.is_loop = 0 |
| self.is_unsafe = 0 |
| self.is_docker_unsafe = 0 |
| self.is_experimental = 0 |
| self.is_container = 0 |
| self.is_valgrind = 0 |
| self.benchmark = 0 |
| self.timer = {} |
| |
| def log(self, args): |
| self.logger.log(args) |
| |
| def debug(self, args): |
| self.logger.debug(args) |
| |
| def has_key(self, key): |
| return key in self.Stats |
| |
| def __setitem__(self, key, value): |
| self.Stats[key] = value |
| |
| def __getitem__(self, key): |
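        # Guard against "foo in test": without __iter__/__contains__, Python
        # falls back to calling __getitem__(0), which lands here with key 0.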
| if str(key) == "0": |
| raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead") |
| |
| if key in self.Stats: |
| return self.Stats[key] |
| return None |
| |
| def log_mark(self, msg): |
| self.debug("MARK: test %s %s %d" % (self.name,msg,time.time())) |
| return |
| |
    def get_timer(self, key="test"):
        try:
            return self.timer[key]
        except KeyError:
            return 0
| |
| def set_timer(self,key = "test"): |
| self.timer[key] = time.time() |
| return self.timer[key] |
| |
| def log_timer(self,key = "test"): |
| elapsed = 0 |
| if key in self.timer: |
| elapsed = time.time() - self.timer[key] |
        s = self.name if key == "test" else "%s:%s" % (self.name, key)
| self.debug("%s runtime: %.2f" % (s, elapsed)) |
| del self.timer[key] |
| return elapsed |
| |
| def incr(self, name): |
| '''Increment (or initialize) the value associated with the given name''' |
        if name not in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1
| |
| |
| if name == "calls": |
| self.passed = 1 |
| |
| def failure(self, reason="none"): |
| '''Increment the failure count''' |
| self.passed = 0 |
| self.incr("failure") |
| self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason) |
| return None |
| |
| def success(self): |
| '''Increment the success count''' |
| self.incr("success") |
| return 1 |
| |
| def skipped(self): |
| '''Increment the skipped count''' |
| self.incr("skipped") |
| return 1 |
| |
    def __call__(self, node):
        '''Perform the given test'''
        # Subclasses must override this; they increment "calls" themselves
        # and report the outcome via success(), failure(), or skipped().
        raise ValueError("Abstract Class member (__call__)")
| |
| def audit(self): |
| passed = 1 |
| if len(self.Audits) > 0: |
| for audit in self.Audits: |
| if not audit(): |
| self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name())) |
| self.incr("auditfail") |
| passed = 0 |
| return passed |
| |
| def setup(self, node): |
| '''Setup the given test''' |
| return self.success() |
| |
| def teardown(self, node): |
| '''Tear down the given test''' |
| return self.success() |
| |
| def create_watch(self, patterns, timeout, name=None): |
| if not name: |
| name = self.name |
| return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"]) |
| |
| def local_badnews(self, prefix, watch, local_ignore=[]): |
| errcount = 0 |
| if not prefix: |
| prefix = "LocalBadNews:" |
| |
| ignorelist = [] |
| ignorelist.append(" CTS: ") |
| ignorelist.append(prefix) |
| ignorelist.extend(local_ignore) |
| |
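        # Scan whatever the watch has seen so far, but cap at 100 reported
        # errors so a log flood cannot stall the test run.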
| while errcount < 100: |
| match = watch.look(0) |
| if match: |
| add_err = 1 |
| for ignore in ignorelist: |
| if add_err == 1 and re.search(ignore, match): |
| add_err = 0 |
| if add_err == 1: |
| self.logger.log(prefix + " " + match) |
| errcount = errcount + 1 |
| else: |
| break |
| else: |
| self.logger.log("Too many errors!") |
| |
| watch.end() |
| return errcount |
| |
| def is_applicable(self): |
| return self.is_applicable_common() |
| |
| def is_applicable_common(self): |
| '''Return TRUE if we are applicable in the current test configuration''' |
| |
| |
| if self.is_loop and not self.Env["loop-tests"]: |
| return 0 |
| elif self.is_unsafe and not self.Env["unsafe-tests"]: |
| return 0 |
| elif self.is_valgrind and not self.Env["valgrind-tests"]: |
| return 0 |
| elif self.is_experimental and not self.Env["experimental-tests"]: |
| return 0 |
| elif self.is_docker_unsafe and self.Env["docker"]: |
| return 0 |
| elif self.is_container and not self.Env["container-tests"]: |
| return 0 |
| elif self.Env["benchmark"] and self.benchmark == 0: |
| return 0 |
| |
| return 1 |
| |
| def find_ocfs2_resources(self, node): |
| self.r_o2cb = None |
| self.r_ocfs2 = [] |
| |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| r = AuditResource(self.CM, line) |
                if r.rtype == "o2cb" and r.parent != "NA":
                    self.r_o2cb = r.parent
                    self.debug("Found o2cb: %s" % self.r_o2cb)
| if re.search("^Constraint", line): |
| c = AuditConstraint(self.CM, line) |
| if c.type == "rsc_colocation" and c.target == self.r_o2cb: |
| self.r_ocfs2.append(c.rsc) |
| |
| self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2)) |
| return len(self.r_ocfs2) |
| |
| def canrunnow(self, node): |
| '''Return TRUE if we can meaningfully run right now''' |
| return 1 |
| |
| def errorstoignore(self): |
| '''Return list of errors which are 'normal' and should be ignored''' |
| return [] |
| |
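# A minimal sketch (not a registered test) of what a concrete subclass looks
# like: set a name, bump "calls", do the work, and report via success() or
# failure(). The "NoOp" name and behavior are illustrative only.
#
# class NoOpTest(CTSTest):
#     def __init__(self, cm):
#         CTSTest.__init__(self, cm)
#         self.name = "NoOp"
#
#     def __call__(self, node):
#         self.incr("calls")
#         return self.success()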
| |
| class StopTest(CTSTest): |
| '''Stop (deactivate) the cluster manager on a node''' |
| def __init__(self, cm): |
| CTSTest.__init__(self, cm) |
| self.name = "Stop" |
| |
| def __call__(self, node): |
| '''Perform the 'stop' test. ''' |
| self.incr("calls") |
| if self.CM.ShouldBeStatus[node] != "up": |
| return self.skipped() |
| |
| patterns = [] |
| |
| patterns.append(self.templates["Pat:We_stopped"] % node) |
| |
| |
| |
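        # Every other active node should log that this node went away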
| for other in self.Env["nodes"]: |
| if self.CM.ShouldBeStatus[other] == "up" and other != node: |
| patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) |
| |
| |
| watch = self.create_watch(patterns, self.Env["DeadTime"]) |
| watch.setwatch() |
| |
| if node == self.CM.OurNode: |
| self.incr("us") |
| else: |
| if self.CM.upcount() <= 1: |
| self.incr("all") |
| else: |
| self.incr("them") |
| |
| self.CM.StopaCM(node) |
| watch_result = watch.lookforall() |
| |
| failreason = None |
| UnmatchedList = "||" |
| if watch.unmatched: |
| (rc, output) = self.rsh(node, "/bin/ps axf", None) |
| for line in output: |
| self.debug(line) |
| |
| (rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None) |
| for line in output: |
| self.debug(line) |
| |
            for regex in watch.unmatched:
                self.logger.log("ERROR: Shutdown pattern not found: %s" % regex)
                UnmatchedList += regex + "||"
| failreason = "Missing shutdown pattern" |
| |
| self.CM.cluster_stable(self.Env["DeadTime"]) |
| |
| if not watch.unmatched or self.CM.upcount() == 0: |
| return self.success() |
| |
| if len(watch.unmatched) >= self.CM.upcount(): |
| return self.failure("no match against (%s)" % UnmatchedList) |
| |
        if failreason is None:
| return self.success() |
| else: |
| return self.failure(failreason) |
| |
| |
| |
| |
| |
| |
| class StartTest(CTSTest): |
| '''Start (activate) the cluster manager on a node''' |
| def __init__(self, cm, debug=None): |
| CTSTest.__init__(self,cm) |
| self.name = "start" |
| self.debug = debug |
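        # Note: when a debug function is supplied, it replaces the debug()
        # method inherited from CTSTest for this instance.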
| |
| def __call__(self, node): |
| '''Perform the 'start' test. ''' |
| self.incr("calls") |
| |
| if self.CM.upcount() == 0: |
| self.incr("us") |
| else: |
| self.incr("them") |
| |
| if self.CM.ShouldBeStatus[node] != "down": |
| return self.skipped() |
| elif self.CM.StartaCM(node): |
| return self.success() |
| else: |
| return self.failure("Startup %s on node %s failed" |
| % (self.Env["Name"], node)) |
| |
| |
| |
| |
| |
| |
| |
| class FlipTest(CTSTest): |
| '''If it's running, stop it. If it's stopped start it. |
| Overthrow the status quo... |
| ''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "Flip" |
| self.start = StartTest(cm) |
| self.stop = StopTest(cm) |
| |
| def __call__(self, node): |
| '''Perform the 'Flip' test. ''' |
| self.incr("calls") |
| if self.CM.ShouldBeStatus[node] == "up": |
| self.incr("stopped") |
| ret = self.stop(node) |
| type = "up->down" |
| |
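            # give the remaining cluster time to settle after the stop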
| time.sleep(self.Env["StableTime"]) |
| elif self.CM.ShouldBeStatus[node] == "down": |
| self.incr("started") |
| ret = self.start(node) |
| type = "down->up" |
| else: |
| return self.skipped() |
| |
        self.incr(action)
| if ret: |
| return self.success() |
| else: |
| return self.failure("%s failure" % type) |
| |
| |
| AllTestClasses.append(FlipTest) |
| |
| |
| class RestartTest(CTSTest): |
| '''Stop and restart a node''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "Restart" |
| self.start = StartTest(cm) |
| self.stop = StopTest(cm) |
| self.benchmark = 1 |
| |
| def __call__(self, node): |
| '''Perform the 'restart' test. ''' |
| self.incr("calls") |
| |
| self.incr("node:" + node) |
| |
| if self.CM.StataCM(node): |
| self.incr("WasStopped") |
| if not self.start(node): |
| return self.failure("start (setup) failure: "+node) |
| |
| self.set_timer() |
| if not self.stop(node): |
| return self.failure("stop failure: "+node) |
| if not self.start(node): |
| return self.failure("start failure: "+node) |
| return self.success() |
| |
| |
| AllTestClasses.append(RestartTest) |
| |
| |
| class StonithdTest(CTSTest): |
| def __init__(self, cm): |
| CTSTest.__init__(self, cm) |
| self.name = "Stonithd" |
| self.startall = SimulStartLite(cm) |
| self.benchmark = 1 |
| |
| def __call__(self, node): |
| self.incr("calls") |
| if len(self.Env["nodes"]) < 2: |
| return self.skipped() |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| is_dc = self.CM.is_node_dc(node) |
| |
| watchpats = [] |
| watchpats.append(self.templates["Pat:FenceOpOK"] % node) |
| watchpats.append(self.templates["Pat:NodeFenced"] % node) |
| |
| if self.Env["at-boot"] == 0: |
| self.debug("Expecting %s to stay down" % node) |
| self.CM.ShouldBeStatus[node] = "down" |
| else: |
| self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"])) |
| watchpats.append("%s.* S_STARTING -> S_PENDING" % node) |
| watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node) |
| |
| watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) |
| watch.setwatch() |
| |
| origin = self.Env.RandomGen.choice(self.Env["nodes"]) |
| |
| rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node) |
| |
| if rc == 194: |
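            # 194 - 256 = -62 = ETIME ("Timer expired"): our reading is that
            # the fencing command timed out rather than failed outright, so
            # log it but keep watching for the fencing to complete anyway.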
| self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node)) |
| |
| elif origin != node and rc != 0: |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| |
| self.debug("Waiting for fenced node to come back up") |
| self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) |
| |
| self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc)) |
| |
| elif origin == node and rc != 255: |
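            # Fencing ourselves should kill the connection mid-command; we
            # assume rc 255 is that lost-connection case, so anything else
            # from locally originated fencing is worth recording.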
| |
| self.logger.log("Locally originated fencing returned %d" % rc) |
| |
| self.set_timer("fence") |
| matched = watch.lookforall() |
| self.log_timer("fence") |
| self.set_timer("reform") |
| if watch.unmatched: |
| self.logger.log("Patterns not found: " + repr(watch.unmatched)) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| |
| self.debug("Waiting for fenced node to come back up") |
| self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) |
| |
| self.debug("Waiting for the cluster to re-stabilize with all nodes") |
| is_stable = self.CM.cluster_stable(self.Env["StartTime"]) |
| |
| if not matched: |
| return self.failure("Didn't find all expected patterns") |
| elif not is_stable: |
| return self.failure("Cluster did not become stable") |
| |
| self.log_timer("reform") |
| return self.success() |
| |
| def errorstoignore(self): |
| return [ |
| self.templates["Pat:Fencing_start"] % ".*", |
| self.templates["Pat:Fencing_ok"] % ".*", |
| r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery", |
| r"error.*: Operation 'reboot' targeting .* on .* for stonith_admin.*: Timer expired", |
| ] |
| |
| def is_applicable(self): |
| if not self.is_applicable_common(): |
| return 0 |
| |
| if "DoFencing" in list(self.Env.keys()): |
| return self.Env["DoFencing"] |
| |
| return 1 |
| |
| AllTestClasses.append(StonithdTest) |
| |
| |
| class StartOnebyOne(CTSTest): |
    '''Start all the nodes, one by one'''
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "StartOnebyOne" |
| self.stopall = SimulStopLite(cm) |
| self.start = StartTest(cm) |
| self.ns = CTS.NodeStatus(cm.Env) |
| |
| def __call__(self, dummy): |
| '''Perform the 'StartOnebyOne' test. ''' |
| self.incr("calls") |
| |
| |
| |
| |
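        # Stop the cluster on all nodes so each start begins from a known state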
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Test setup failed") |
| |
| failed = [] |
| self.set_timer() |
| for node in self.Env["nodes"]: |
| if not self.start(node): |
| failed.append(node) |
| |
| if len(failed) > 0: |
| return self.failure("Some node failed to start: " + repr(failed)) |
| |
| return self.success() |
| |
| |
| AllTestClasses.append(StartOnebyOne) |
| |
| |
| class SimulStart(CTSTest): |
    '''Start all the nodes, more or less simultaneously'''
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SimulStart" |
| self.stopall = SimulStopLite(cm) |
| self.startall = SimulStartLite(cm) |
| |
| def __call__(self, dummy): |
| '''Perform the 'SimulStart' test. ''' |
| self.incr("calls") |
| |
| |
| |
| |
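        # Stop everything first so the simultaneous start is from a clean slate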
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| if not self.startall(None): |
| return self.failure("Startall failed") |
| |
| return self.success() |
| |
| |
| AllTestClasses.append(SimulStart) |
| |
| |
| class SimulStop(CTSTest): |
    '''Stop all the nodes, more or less simultaneously'''
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SimulStop" |
| self.startall = SimulStartLite(cm) |
| self.stopall = SimulStopLite(cm) |
| |
| def __call__(self, dummy): |
| '''Perform the 'SimulStop' test. ''' |
| self.incr("calls") |
| |
| |
| |
| |
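        # Start everything first so there is something to stop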
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| if not self.stopall(None): |
| return self.failure("Stopall failed") |
| |
| return self.success() |
| |
| |
| AllTestClasses.append(SimulStop) |
| |
| |
| class StopOnebyOne(CTSTest): |
| '''Stop all the nodes in order''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "StopOnebyOne" |
| self.startall = SimulStartLite(cm) |
| self.stop = StopTest(cm) |
| |
| def __call__(self, dummy): |
| '''Perform the 'StopOnebyOne' test. ''' |
| self.incr("calls") |
| |
| |
| |
| |
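        # Make sure all nodes are up before stopping them one at a time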
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| failed = [] |
| self.set_timer() |
| for node in self.Env["nodes"]: |
| if not self.stop(node): |
| failed.append(node) |
| |
| if len(failed) > 0: |
| return self.failure("Some node failed to stop: " + repr(failed)) |
| |
| return self.success() |
| |
| |
| AllTestClasses.append(StopOnebyOne) |
| |
| |
| class RestartOnebyOne(CTSTest): |
| '''Restart all the nodes in order''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "RestartOnebyOne" |
| self.startall = SimulStartLite(cm) |
| |
| def __call__(self, dummy): |
| '''Perform the 'RestartOnebyOne' test. ''' |
| self.incr("calls") |
| |
| |
| |
| |
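        # All nodes need to be running before we can restart them in turn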
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| did_fail = [] |
| self.set_timer() |
| self.restart = RestartTest(self.CM) |
| for node in self.Env["nodes"]: |
| if not self.restart(node): |
| did_fail.append(node) |
| |
| if did_fail: |
| return self.failure("Could not restart %d nodes: %s" |
| % (len(did_fail), repr(did_fail))) |
| return self.success() |
| |
| |
| AllTestClasses.append(RestartOnebyOne) |
| |
| |
| class PartialStart(CTSTest): |
| '''Start a node - but tell it to stop before it finishes starting up''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "PartialStart" |
| self.startall = SimulStartLite(cm) |
| self.stopall = SimulStopLite(cm) |
| self.stop = StopTest(cm) |
| |
| |
| def __call__(self, node): |
| '''Perform the 'PartialStart' test. ''' |
| self.incr("calls") |
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| |
| |
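        # Watch for the controller beginning its startup sequence, then stop
        # the node before startup can complete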
| watchpats = [] |
| watchpats.append("pacemaker-controld.*Connecting to cluster infrastructure") |
| watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) |
| watch.setwatch() |
| |
| self.CM.StartaCMnoBlock(node) |
| ret = watch.lookforall() |
| if not ret: |
| self.logger.log("Patterns not found: " + repr(watch.unmatched)) |
| return self.failure("Setup of %s failed" % node) |
| |
| ret = self.stop(node) |
| if not ret: |
| return self.failure("%s did not stop in time" % node) |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| |
| |
| return [ |
| r"Executing reboot fencing operation", |
| r"Requesting fencing \([^)]+\) of node ", |
| ] |
| |
| |
| AllTestClasses.append(PartialStart) |
| |
| |
| class StandbyTest(CTSTest): |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "Standby" |
| self.benchmark = 1 |
| |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| |
| |
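    # Test plan: ensure the node is active, put it in standby and verify no
    # resources remain on it, then reactivate it and verify stability.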
| def __call__(self, node): |
| |
| self.incr("calls") |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Start all nodes failed") |
| |
| self.debug("Make sure node %s is active" % node) |
| if self.CM.StandbyStatus(node) != "off": |
| if not self.CM.SetStandbyMode(node, "off"): |
| return self.failure("can't set node %s to active mode" % node) |
| |
| self.CM.cluster_stable() |
| |
| status = self.CM.StandbyStatus(node) |
| if status != "off": |
| return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) |
| |
| self.debug("Getting resources running on node %s" % node) |
| rsc_on_node = self.CM.active_resources(node) |
| |
| watchpats = [] |
| watchpats.append(r"State transition .* -> S_POLICY_ENGINE") |
| watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) |
| watch.setwatch() |
| |
| self.debug("Setting node %s to standby mode" % node) |
| if not self.CM.SetStandbyMode(node, "on"): |
| return self.failure("can't set node %s to standby mode" % node) |
| |
| self.set_timer("on") |
| |
| ret = watch.lookforall() |
| if not ret: |
| self.logger.log("Patterns not found: " + repr(watch.unmatched)) |
| self.CM.SetStandbyMode(node, "off") |
| return self.failure("cluster didn't react to standby change on %s" % node) |
| |
| self.CM.cluster_stable() |
| |
| status = self.CM.StandbyStatus(node) |
| if status != "on": |
| return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) |
| self.log_timer("on") |
| |
| self.debug("Checking resources") |
| bad_run = self.CM.active_resources(node) |
| if len(bad_run) > 0: |
| rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) |
| self.debug("Setting node %s to active mode" % node) |
| self.CM.SetStandbyMode(node, "off") |
| return rc |
| |
| self.debug("Setting node %s to active mode" % node) |
| if not self.CM.SetStandbyMode(node, "off"): |
| return self.failure("can't set node %s to active mode" % node) |
| |
| self.set_timer("off") |
| self.CM.cluster_stable() |
| |
| status = self.CM.StandbyStatus(node) |
| if status != "off": |
| return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) |
| self.log_timer("off") |
| |
| return self.success() |
| |
| AllTestClasses.append(StandbyTest) |
| |
| |
| class ValgrindTest(CTSTest): |
| '''Check for memory leaks''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "Valgrind" |
| self.stopall = SimulStopLite(cm) |
| self.startall = SimulStartLite(cm) |
| self.is_valgrind = 1 |
| self.is_loop = 1 |
| |
| def setup(self, node): |
| self.incr("calls") |
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Stop all nodes failed") |
| |
| |
| |
| |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Start all nodes failed") |
| |
| return self.success() |
| |
| def teardown(self, node): |
| |
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Stop all nodes failed") |
| |
| return self.success() |
| |
| def find_leaks(self): |
| |
| |
| leaked = [] |
| self.stop = StopTest(self.CM) |
| |
| for node in self.Env["nodes"]: |
| rc = self.stop(node) |
| if not rc: |
| self.failure("Couldn't shut down %s" % node) |
| |
| rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0) |
| if rc != 1: |
| leaked.append(node) |
| self.failure("Valgrind errors detected on %s" % node) |
| (rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None) |
| for line in output: |
| self.logger.log(line) |
| (rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None) |
| for line in output: |
| self.debug(line) |
| |
| self.rsh(node, "rm -f %s" % self.logger.logPat, None) |
| return leaked |
| |
| def __call__(self, node): |
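        # The base Valgrind test does nothing per-node; leak detection is
        # done in find_leaks(), used by subclasses such as StandbyLoopTest.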
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ |
| r"pacemaker-based.*: \*\*\*\*\*\*\*\*\*\*\*\*\*", |
| r"pacemaker-based.*: .* avoid confusing Valgrind", |
| r"HA_VALGRIND_ENABLED", |
| ] |
| |
| |
| class StandbyLoopTest(ValgrindTest): |
| '''Check for memory leaks by putting a node in and out of standby for an hour''' |
| |
| def __init__(self, cm): |
| ValgrindTest.__init__(self,cm) |
| self.name = "StandbyLoop" |
| |
| def __call__(self, node): |
| |
| lpc = 0 |
| delay = 2 |
| failed = 0 |
| done = time.time() + self.Env["loop-minutes"] * 60 |
| while time.time() <= done and not failed: |
| lpc = lpc + 1 |
| |
| time.sleep(delay) |
| if not self.CM.SetStandbyMode(node, "on"): |
| self.failure("can't set node %s to standby mode" % node) |
| failed = lpc |
| |
| time.sleep(delay) |
| if not self.CM.SetStandbyMode(node, "off"): |
| self.failure("can't set node %s to active mode" % node) |
| failed = lpc |
| |
| leaked = self.find_leaks() |
| if failed: |
| return self.failure("Iteration %d failed" % failed) |
| elif len(leaked) > 0: |
| return self.failure("Nodes %s leaked" % repr(leaked)) |
| |
| return self.success() |
| |
| |
| |
| |
| class BandwidthTest(CTSTest): |
| '''Test the bandwidth which the cluster uses''' |
| def __init__(self, cm): |
| CTSTest.__init__(self, cm) |
| self.name = "Bandwidth" |
| self.start = StartTest(cm) |
| self.__setitem__("min",0) |
| self.__setitem__("max",0) |
| self.__setitem__("totalbandwidth",0) |
| (handle, self.tempfile) = tempfile.mkstemp(".cts") |
| os.close(handle) |
| self.startall = SimulStartLite(cm) |
| |
| def __call__(self, node): |
| '''Perform the Bandwidth test''' |
| self.incr("calls") |
| |
| if self.CM.upcount() < 1: |
| return self.skipped() |
| |
| Path = self.CM.InternalCommConfig() |
| if "ip" not in Path["mediatype"]: |
| return self.skipped() |
| |
| port = Path["port"][0] |
| port = int(port) |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Test setup failed") |
| time.sleep(5) |
| |
| fstmpfile = "/var/run/band_estimate" |
| dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ |
| % (port, fstmpfile) |
| |
| rc = self.rsh(node, dumpcmd) |
| if rc == 0: |
| farfile = "root@%s:%s" % (node, fstmpfile) |
| self.rsh.cp(farfile, self.tempfile) |
| Bandwidth = self.countbandwidth(self.tempfile) |
| if not Bandwidth: |
| self.logger.log("Could not compute bandwidth.") |
| return self.success() |
| intband = int(Bandwidth + 0.5) |
| self.logger.log("...bandwidth: %d bits/sec" % intband) |
| self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth |
| if self.Stats["min"] == 0: |
| self.Stats["min"] = Bandwidth |
| if Bandwidth > self.Stats["max"]: |
| self.Stats["max"] = Bandwidth |
| if Bandwidth < self.Stats["min"]: |
| self.Stats["min"] = Bandwidth |
| self.rsh(node, "rm -f %s" % fstmpfile) |
| os.unlink(self.tempfile) |
| return self.success() |
| else: |
| return self.failure("no response from tcpdump command [%d]!" % rc) |
| |
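    # Estimate bandwidth from tcpdump output: sum the "length:" fields of
    # ~100 UDP packets and divide by the capture's elapsed wall-clock time.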
    def countbandwidth(self, filename):
        fp = open(filename, "r")
| fp.seek(0) |
| count = 0 |
| sum = 0 |
| while 1: |
| line = fp.readline() |
| if not line: |
| return None |
| if re.search("udp",line) or re.search("UDP,", line): |
| count = count + 1 |
| linesplit = line.split(" ") |
| for j in range(len(linesplit)-1): |
| if linesplit[j] == "udp": break |
| if linesplit[j] == "length:": break |
| |
| try: |
| sum = sum + int(linesplit[j+1]) |
| except ValueError: |
| self.logger.log("Invalid tcpdump line: %s" % line) |
| return None |
| T1 = linesplit[0] |
| timesplit = T1.split(":") |
| time2split = timesplit[2].split(".") |
| time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 |
| break |
| |
| while count < 100: |
| line = fp.readline() |
| if not line: |
| return None |
| if re.search("udp",line) or re.search("UDP,", line): |
| count = count+1 |
| linessplit = line.split(" ") |
| for j in range(len(linessplit)-1): |
| if linessplit[j] == "udp": break |
| if linessplit[j] == "length:": break |
| try: |
| sum = int(linessplit[j+1]) + sum |
| except ValueError: |
| self.logger.log("Invalid tcpdump line: %s" % line) |
| return None |
| |
| T2 = linessplit[0] |
| timesplit = T2.split(":") |
| time2split = timesplit[2].split(".") |
| time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 |
        elapsed = time2 - time1
        if elapsed <= 0:
            return 0
        return int((sum*8)/elapsed)
| |
| def is_applicable(self): |
| '''BandwidthTest never applicable''' |
| return 0 |
| |
| AllTestClasses.append(BandwidthTest) |
| |
| |
| |
| class MaintenanceMode(CTSTest): |
| |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "MaintenanceMode" |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| self.max = 30 |
| |
| self.benchmark = 1 |
| self.action = "asyncmon" |
| self.interval = 0 |
| self.rid = "maintenanceDummy" |
| |
| def toggleMaintenanceMode(self, node, action): |
| pats = [] |
| pats.append(self.templates["Pat:DC_IDLE"]) |
| |
| |
| |
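        # With maintenance mode on, the injected failure should be reported
        # but not recovered; once it is off, expect a normal stop/start cycle.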
| if action == "On": |
| pats.append(self.templates["Pat:RscOpFail"] % (self.action, self.rid)) |
| else: |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) |
| |
| watch = self.create_watch(pats, 60) |
| watch.setwatch() |
| |
| self.debug("Turning maintenance mode %s" % action) |
| self.rsh(node, self.templates["MaintenanceMode%s" % (action)]) |
| if (action == "On"): |
| self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) |
| |
| self.set_timer("recover%s" % (action)) |
| watch.lookforall() |
| self.log_timer("recover%s" % (action)) |
| if watch.unmatched: |
| self.debug("Failed to find patterns when turning maintenance mode %s" % action) |
| return repr(watch.unmatched) |
| |
| return "" |
| |
| def insertMaintenanceDummy(self, node): |
| pats = [] |
| pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid))) |
| |
| watch = self.create_watch(pats, 60) |
| watch.setwatch() |
| |
| self.CM.AddDummyRsc(node, self.rid) |
| |
| self.set_timer("addDummy") |
| watch.lookforall() |
| self.log_timer("addDummy") |
| |
| if watch.unmatched: |
| self.debug("Failed to find patterns when adding maintenance dummy resource") |
| return repr(watch.unmatched) |
| return "" |
| |
| def removeMaintenanceDummy(self, node): |
| pats = [] |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) |
| |
| watch = self.create_watch(pats, 60) |
| watch.setwatch() |
| self.CM.RemoveDummyRsc(node, self.rid) |
| |
| self.set_timer("removeDummy") |
| watch.lookforall() |
| self.log_timer("removeDummy") |
| |
| if watch.unmatched: |
| self.debug("Failed to find patterns when removing maintenance dummy resource") |
| return repr(watch.unmatched) |
| return "" |
| |
| def managedRscList(self, node): |
| rscList = [] |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| tmp = AuditResource(self.CM, line) |
| if tmp.managed(): |
| rscList.append(tmp.id) |
| |
| return rscList |
| |
| def verifyResources(self, node, rscList, managed): |
| managedList = list(rscList) |
| managed_str = "managed" |
| if not managed: |
| managed_str = "unmanaged" |
| |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| tmp = AuditResource(self.CM, line) |
| if managed and not tmp.managed(): |
| continue |
| elif not managed and tmp.managed(): |
| continue |
| elif managedList.count(tmp.id): |
| managedList.remove(tmp.id) |
| |
| if len(managedList) == 0: |
| self.debug("Found all %s resources on %s" % (managed_str, node)) |
| return True |
| |
| self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList)) |
| return False |
| |
| def __call__(self, node): |
| '''Perform the 'MaintenanceMode' test. ''' |
| self.incr("calls") |
| verify_managed = False |
| verify_unmanaged = False |
| failPat = "" |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| |
| |
| |
| |
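        # Record which resources are managed now: after enabling maintenance
        # mode they should all become unmanaged, and managed again afterwards.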
| managedResources = self.managedRscList(node) |
| if len(managedResources) == 0: |
| self.logger.log("No managed resources on %s" % node) |
| return self.skipped() |
| |
| |
| |
| |
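        # Add a dummy resource so there is something to fail while in
        # maintenance mode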
| failPat = failPat + self.insertMaintenanceDummy(node) |
| |
| |
| failPat = failPat + self.toggleMaintenanceMode(node, "On") |
| |
| |
| if self.verifyResources(node, managedResources, False): |
| verify_unmanaged = True |
| |
| |
| failPat = failPat + self.toggleMaintenanceMode(node, "Off") |
| |
| |
| if self.verifyResources(node, managedResources, True): |
| verify_managed = True |
| |
| |
| failPat = failPat + self.removeMaintenanceDummy(node) |
| |
| self.CM.cluster_stable() |
| |
| if failPat != "": |
| return self.failure("Unmatched patterns: %s" % (failPat)) |
| elif verify_unmanaged is False: |
| return self.failure("Failed to verify resources became unmanaged during maintenance mode") |
| elif verify_managed is False: |
| return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ |
| r"Updating failcount for %s" % self.rid, |
| r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid, |
| r"Unknown operation: fail", |
| self.templates["Pat:RscOpOK"] % (self.action, self.rid), |
| r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), |
| ] |
| |
| AllTestClasses.append(MaintenanceMode) |
| |
| |
| class ResourceRecover(CTSTest): |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "ResourceRecover" |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| self.max = 30 |
| self.rid = None |
| self.rid_alt = None |
| |
| self.benchmark = 1 |
| |
| |
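        # how we inject the failure and how the logs will refer to it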
| self.action = "asyncmon" |
| self.interval = 0 |
| |
| def __call__(self, node): |
| '''Perform the 'ResourceRecover' test. ''' |
| self.incr("calls") |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| resourcelist = self.CM.active_resources(node) |
| |
| if len(resourcelist) == 0: |
| self.logger.log("No active resources on %s" % node) |
| return self.skipped() |
| |
| self.rid = self.Env.RandomGen.choice(resourcelist) |
| self.rid_alt = self.rid |
| |
| rsc = None |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| tmp = AuditResource(self.CM, line) |
| if tmp.id == self.rid: |
| rsc = tmp |
| |
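                    # for clones, act on the clone instance ID reported here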
| self.rid = rsc.clone_id |
| break |
| |
| if not rsc: |
| return self.failure("Could not find %s in the resource list" % self.rid) |
| |
| self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id)) |
| |
| pats = [] |
| pats.append(self.templates["Pat:CloneOpFail"] % (self.action, rsc.id, rsc.clone_id)) |
| |
| if rsc.managed(): |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid)) |
| if rsc.unique(): |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid)) |
| else: |
| |
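                # anonymous clone instances may restart under a different ID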
| pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) |
| |
| watch = self.create_watch(pats, 60) |
| watch.setwatch() |
| |
| self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) |
| |
| self.set_timer("recover") |
| watch.lookforall() |
| self.log_timer("recover") |
| |
| self.CM.cluster_stable() |
| recovered = self.CM.ResourceLocation(self.rid) |
| |
| if watch.unmatched: |
| return self.failure("Patterns not found: %s" % repr(watch.unmatched)) |
| |
| elif rsc.unique() and len(recovered) > 1: |
| return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) |
| |
| elif len(recovered) > 0: |
| self.debug("%s is running on: %s" % (self.rid, repr(recovered))) |
| |
| elif rsc.managed(): |
| return self.failure("%s was not recovered and is inactive" % self.rid) |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ |
| r"Updating failcount for %s" % self.rid, |
| r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt), |
| r"Unknown operation: fail", |
| self.templates["Pat:RscOpOK"] % (self.action, self.rid), |
| r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), |
| ] |
| |
| AllTestClasses.append(ResourceRecover) |
| |
| |
| class ComponentFail(CTSTest): |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "ComponentFail" |
| |
| self.is_docker_unsafe = 1 |
| self.startall = SimulStartLite(cm) |
| self.complist = cm.Components() |
| self.patterns = [] |
| self.okerrpatterns = [] |
| self.is_unsafe = 1 |
| |
| def __call__(self, node): |
| '''Perform the 'ComponentFail' test. ''' |
| self.incr("calls") |
| self.patterns = [] |
| self.okerrpatterns = [] |
| |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| if not self.CM.cluster_stable(self.Env["StableTime"]): |
| return self.failure("Setup failed - unstable") |
| |
| node_is_dc = self.CM.is_node_dc(node, None) |
| |
| |
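        # pick a component to kill; DC-only components require a DC node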
| chosen = self.Env.RandomGen.choice(self.complist) |
| while chosen.dc_only == 1 and node_is_dc == 0: |
| chosen = self.Env.RandomGen.choice(self.complist) |
| |
| self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) |
| self.incr(chosen.name) |
| |
| if chosen.name != "corosync": |
| self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) |
| self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) |
| |
| self.patterns.extend(chosen.pats) |
| if node_is_dc: |
| self.patterns.extend(chosen.dc_pats) |
| |
| |
| if chosen.name in [ "corosync", "pacemaker-based", "pacemaker-fenced" ]: |
| |
| |
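            # Killing these components can disrupt fencing, so tolerate
            # recovery noise from every configured stonith resource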
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| r = AuditResource(self.CM, line) |
| if r.rclass == "stonith": |
| self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) |
| self.okerrpatterns.append(self.templates["Pat:Fencing_active"] % r.id) |
| self.okerrpatterns.append(self.templates["Pat:Fencing_probe"] % r.id) |
| |
| |
| tmpPats = [] |
| tmpPats.extend(self.patterns) |
| self.patterns.extend(chosen.badnews_ignore) |
| |
| |
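        # also watch for the node being fenced, since the kill may escalate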
| stonithPats = [] |
| stonithPats.append(self.templates["Pat:Fencing_ok"] % node) |
| stonith = self.create_watch(stonithPats, 0) |
| stonith.setwatch() |
| |
| |
| watch = self.create_watch( |
| tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) |
| watch.setwatch() |
| |
| |
| chosen.kill(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| |
| self.debug("Waiting for any fenced node to come back up") |
| self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) |
| |
| self.debug("Waiting for the cluster to re-stabilize with all nodes") |
| self.CM.cluster_stable(self.Env["StartTime"]) |
| |
| self.debug("Checking if %s was shot" % node) |
| shot = stonith.look(60) |
| if shot: |
| self.debug("Found: " + repr(shot)) |
| self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) |
| |
| if self.Env["at-boot"] == 0: |
| self.CM.ShouldBeStatus[node] = "down" |
| |
| |
| |
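            # Fencing is a legitimate recovery here, and the respawn
            # patterns below no longer apply, so call it a success.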
| return self.success() |
| |
| |
| matched = watch.lookforall(allow_multiple_matches=1) |
| if watch.unmatched: |
| self.logger.log("Patterns not found: " + repr(watch.unmatched)) |
| |
| self.debug("Waiting for the cluster to re-stabilize with all nodes") |
| is_stable = self.CM.cluster_stable(self.Env["StartTime"]) |
| |
| if not matched: |
| return self.failure("Didn't find all expected %s patterns" % chosen.name) |
| elif not is_stable: |
| return self.failure("Cluster did not become stable after killing %s" % chosen.name) |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| |
| |
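        # Everything we deliberately watched for is expected noise for the
        # component we killed, so fold it into the ignore list too.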
| self.okerrpatterns.extend(self.patterns) |
| return self.okerrpatterns |
| |
| AllTestClasses.append(ComponentFail) |
| |
| |
| class SplitBrainTest(CTSTest): |
    '''Test split-brain handling: sever communication between groups of
    nodes, then verify that the partitions are detected, that the cluster
    reforms once the paths are healed, and that no audits fail.'''
| def __init__(self,cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SplitBrain" |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| self.is_experimental = 1 |
| |
| def isolate_partition(self, partition): |
| other_nodes = [] |
| other_nodes.extend(self.Env["nodes"]) |
| |
| for node in partition: |
| try: |
| other_nodes.remove(node) |
| except ValueError: |
| self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition)) |
| |
| if len(other_nodes) == 0: |
| return 1 |
| |
| self.debug("Creating partition: " + repr(partition)) |
| self.debug("Everyone else: " + repr(other_nodes)) |
| |
| for node in partition: |
| if not self.CM.isolate_node(node, other_nodes): |
| self.logger.log("Could not isolate %s" % node) |
| return 0 |
| |
| return 1 |
| |
| def heal_partition(self, partition): |
| other_nodes = [] |
| other_nodes.extend(self.Env["nodes"]) |
| |
| for node in partition: |
| try: |
| other_nodes.remove(node) |
| except ValueError: |
| self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"])) |
| |
| if len(other_nodes) == 0: |
| return 1 |
| |
| self.debug("Healing partition: " + repr(partition)) |
| self.debug("Everyone else: " + repr(other_nodes)) |
| |
| for node in partition: |
| self.CM.unisolate_node(node, other_nodes) |
| |
| def __call__(self, node): |
| '''Perform split-brain test''' |
| self.incr("calls") |
| self.passed = 1 |
| partitions = {} |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed") |
| |
| while 1: |
| |
| partitions = {} |
| p_max = len(self.Env["nodes"]) |
| for node in self.Env["nodes"]: |
| p = self.Env.RandomGen.randint(1, p_max) |
                if p not in partitions:
| partitions[p] = [] |
| partitions[p].append(node) |
| p_max = len(list(partitions.keys())) |
| if p_max > 1: |
| break |
| |
| |
| self.debug("Created %d partitions" % p_max) |
| for key in list(partitions.keys()): |
| self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) |
| |
| |
| self.rsh(node, "crm_attribute -V -n stonith-enabled -v false") |
| |
| for key in list(partitions.keys()): |
| self.isolate_partition(partitions[key]) |
| |
        count = 30
        while count > 0:
            if len(self.CM.find_partitions()) != p_max:
                time.sleep(10)
                count -= 1
            else:
                break
        else:
            self.failure("Expected partitions were not created")
| |
| |
| if not self.CM.cluster_stable(): |
| self.failure("Partitioned cluster not stable") |
| |
| |
| self.CM.partitions_expected = p_max |
| if not self.audit(): |
| self.failure("Audits failed") |
| self.CM.partitions_expected = 1 |
| |
| |
| for key in list(partitions.keys()): |
| self.heal_partition(partitions[key]) |
| |
| |
| count = 30 |
| while count > 0: |
| if len(self.CM.find_partitions()) != 1: |
| time.sleep(10) |
| count -= 1 |
| else: |
| break |
| else: |
| self.failure("Cluster did not reform") |
| |
| |
| count = 30 |
| while count > 0: |
| members = [] |
| |
| partitions = self.CM.find_partitions() |
| if len(partitions) > 0: |
| members = partitions[0].split() |
| |
| if len(members) != len(self.Env["nodes"]): |
| time.sleep(10) |
| count -= 1 |
| else: |
| break |
| else: |
| self.failure("Cluster did not completely reform") |
| |
| |
| |
| if not self.CM.cluster_stable(1200): |
| self.failure("Reformed cluster not stable") |
| if self.Env["continue"] == 1: |
| answer = "Y" |
| else: |
| try: |
| answer = input_wrapper('Continue? [nY]') |
| except EOFError as e: |
| answer = "n" |
| if answer and answer == "n": |
| raise ValueError("Reformed cluster not stable") |
| |
| |
| if self.Env["DoFencing"]: |
| self.rsh(node, "crm_attribute -V -D -n stonith-enabled") |
| |
| self.CM.cluster_stable() |
| |
| if self.passed: |
| return self.success() |
| return self.failure("See previous errors") |
| |
| def errorstoignore(self): |
| '''Return list of errors which are 'normal' and should be ignored''' |
| return [ |
| r"Another DC detected:", |
| r"(ERROR|error).*: .*Application of an update diff failed", |
| r"pacemaker-controld.*:.*not in our membership list", |
| r"CRIT:.*node.*returning after partition", |
| ] |
| |
| def is_applicable(self): |
| if not self.is_applicable_common(): |
| return 0 |
| return len(self.Env["nodes"]) > 2 |
| |
| AllTestClasses.append(SplitBrainTest) |
| |
| |
| class Reattach(CTSTest): |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "Reattach" |
| self.startall = SimulStartLite(cm) |
| self.restart1 = RestartTest(cm) |
| self.stopall = SimulStopLite(cm) |
| self.is_unsafe = 0 |
| |
| def _is_managed(self, node): |
| is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1) |
| is_managed = is_managed[:-1] |
| return is_managed == "true" |
| |
| def _set_unmanaged(self, node): |
| self.debug("Disable resource management") |
| self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") |
| |
| def _set_managed(self, node): |
| self.debug("Re-enable resource management") |
| self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") |
| |
| def setup(self, node): |
| attempt = 0 |
| if not self.startall(None): |
| return None |
| |
| |
| |
| |
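        # Reattach is only meaningful against a genuinely stable cluster,
        # so double-check stability, retrying a few times if necessary.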
| while not self.CM.cluster_stable(double_check=True): |
| if attempt < 5: |
| attempt += 1 |
| self.debug("Not stable yet, re-testing") |
| else: |
| self.logger.log("Cluster is not stable") |
| return None |
| |
| return 1 |
| |
| def teardown(self, node): |
| |
| |
| start = StartTest(self.CM) |
| start(node) |
| |
| if not self._is_managed(node): |
| self.logger.log("Attempting to re-enable resource management on %s" % node) |
| self._set_managed(node) |
| self.CM.cluster_stable() |
| if not self._is_managed(node): |
| self.logger.log("Could not re-enable resource management") |
| return 0 |
| |
| return 1 |
| |
| def canrunnow(self, node): |
| '''Return TRUE if we can meaningfully run right now''' |
| if self.find_ocfs2_resources(node): |
| self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") |
| return 0 |
| return 1 |
| |
| def __call__(self, node): |
| self.incr("calls") |
| |
| pats = [] |
| |
| |
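        # We take "Delaying fencing operations" in the logs as confirmation
        # that resource management was actually disabled.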
| managed = self.create_watch(["Delaying fencing operations"], 60) |
| managed.setwatch() |
| |
| self._set_unmanaged(node) |
| |
| if not managed.lookforall(): |
| self.logger.log("Patterns not found: " + repr(managed.unmatched)) |
| return self.failure("Resource management not disabled") |
| |
| pats = [] |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*")) |
| |
| watch = self.create_watch(pats, 60, "ShutdownActivity") |
| watch.setwatch() |
| |
| self.debug("Shutting down the cluster") |
| ret = self.stopall(None) |
| if not ret: |
| self._set_managed(node) |
| return self.failure("Couldn't shut down the cluster") |
| |
| self.debug("Bringing the cluster back up") |
| ret = self.startall(None) |
| time.sleep(5) |
| if not ret: |
| self._set_managed(node) |
| return self.failure("Couldn't restart the cluster") |
| |
| if self.local_badnews("ResourceActivity:", watch): |
| self._set_managed(node) |
| return self.failure("Resources stopped or started during cluster restart") |
| |
| watch = self.create_watch(pats, 60, "StartupActivity") |
| watch.setwatch() |
| |
| |
| self._set_managed(node) |
| self.CM.cluster_stable() |
| if not self._is_managed(node): |
| return self.failure("Could not re-enable resource management") |
| |
| |
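        # Fencing agents may legitimately start when management resumes, so
        # don't count their start operations as unexpected activity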
| ignore = [] |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| r = AuditResource(self.CM, line) |
| if r.rclass == "stonith": |
| |
| self.debug("Ignoring start actions for %s" % r.id) |
| ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) |
| |
| if self.local_badnews("ResourceActivity:", watch, ignore): |
| return self.failure("Resources stopped or started after resource management was re-enabled") |
| |
| return ret |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ |
| r"resource( was|s were) active at shutdown", |
| ] |
| |
| def is_applicable(self): |
| return 1 |
| |
| AllTestClasses.append(Reattach) |
| |
| |
| class SpecialTest1(CTSTest): |
| '''Set up a custom test to cause quorum failure issues for Andrew''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SpecialTest1" |
| self.startall = SimulStartLite(cm) |
| self.restart1 = RestartTest(cm) |
| self.stopall = SimulStopLite(cm) |
| |
| def __call__(self, node): |
| '''Perform the 'SpecialTest1' test for Andrew. ''' |
| self.incr("calls") |
| |
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Could not stop all nodes") |
| |
| |
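        # Wipe this node's CIB so configuration recovery is exercised when
        # the other nodes come back up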
| self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") |
| |
| |
| ret = self.restart1(node) |
| if not ret: |
| return self.failure("Could not start "+node) |
| |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Could not start the remaining nodes") |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| |
| return [ |
| r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", |
| r"error.*: Resource start-up disabled since no STONITH resources have been defined", |
| r"error.*: Either configure some or disable STONITH with the stonith-enabled option", |
| r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity", |
| ] |
| |
| AllTestClasses.append(SpecialTest1) |
| |
| |
| class HAETest(CTSTest): |
    '''Base class for HA Extension (HAE) tests exercising the
    DLM/O2CB/OCFS2 resource stack'''
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "HAETest" |
| self.stopall = SimulStopLite(cm) |
| self.startall = SimulStartLite(cm) |
| self.is_loop = 1 |
| |
| def setup(self, node): |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Couldn't start all nodes") |
| return self.success() |
| |
| def teardown(self, node): |
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Couldn't stop all nodes") |
| return self.success() |
| |
| def wait_on_state(self, node, resource, expected_clones, attempts=240): |
| while attempts > 0: |
| active = 0 |
| (rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None) |
| |
| |
| if rc == 0 and lines: |
| active = len(lines) |
| |
| if len(lines) == expected_clones: |
| return 1 |
| |
| elif rc == 1: |
| self.debug("Resource %s is still inactive" % resource) |
| |
| elif rc == 234: |
| self.logger.log("Unknown resource %s" % resource) |
| return 0 |
| |
| elif rc == 246: |
| self.logger.log("Cluster is inactive") |
| return 0 |
| |
| elif rc != 0: |
| self.logger.log("Call to crm_resource failed, rc=%d" % rc) |
| return 0 |
| |
| else: |
| self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones)) |
| |
| attempts -= 1 |
| time.sleep(1) |
| |
| return 0 |
| |
| def find_dlm(self, node): |
| self.r_dlm = None |
| |
| (rc, lines) = self.rsh(node, "crm_resource -c", None) |
| for line in lines: |
| if re.search("^Resource", line): |
| r = AuditResource(self.CM, line) |
| if r.rtype == "controld" and r.parent != "NA": |
| self.debug("Found dlm: %s" % self.r_dlm) |
| self.r_dlm = r.parent |
| return 1 |
| return 0 |
| |
| def find_hae_resources(self, node): |
| self.r_dlm = None |
| self.r_o2cb = None |
| self.r_ocfs2 = [] |
| |
| if self.find_dlm(node): |
| self.find_ocfs2_resources(node) |
| |
| def is_applicable(self): |
| if not self.is_applicable_common(): |
| return 0 |
| if self.Env["Schema"] == "hae": |
| return 1 |
| return None |
| |
| |
class HAERoleTest(HAETest):
    '''Lars' mount/unmount test for the HA extension. '''
    def __init__(self, cm):
        HAETest.__init__(self, cm)
        self.name = "HAERoleTest"
| |
| def change_state(self, node, resource, target): |
| rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target)) |
| return rc |
| |
| def __call__(self, node): |
| self.incr("calls") |
| lpc = 0 |
| failed = 0 |
| delay = 2 |
| done = time.time() + self.Env["loop-minutes"]*60 |
| self.find_hae_resources(node) |
| |
| clone_max = len(self.Env["nodes"]) |
| while time.time() <= done and not failed: |
| lpc = lpc + 1 |
| |
| self.change_state(node, self.r_dlm, "Stopped") |
| if not self.wait_on_state(node, self.r_dlm, 0): |
| self.failure("%s did not go down correctly" % self.r_dlm) |
| failed = lpc |
| |
| self.change_state(node, self.r_dlm, "Started") |
| if not self.wait_on_state(node, self.r_dlm, clone_max): |
| self.failure("%s did not come up correctly" % self.r_dlm) |
| failed = lpc |
| |
| if not self.wait_on_state(node, self.r_o2cb, clone_max): |
| self.failure("%s did not come up correctly" % self.r_o2cb) |
| failed = lpc |
| |
| for fs in self.r_ocfs2: |
| if not self.wait_on_state(node, fs, clone_max): |
| self.failure("%s did not come up correctly" % fs) |
| failed = lpc |
| |
| if failed: |
| return self.failure("iteration %d failed" % failed) |
| return self.success() |
| |
| AllTestClasses.append(HAERoleTest) |
| |
| |
| class HAEStandbyTest(HAETest): |
    '''Repeatedly toggle a node's standby state and verify the HAE
    resource clones adjust their instance counts accordingly'''
| def __init__(self, cm): |
| HAETest.__init__(self,cm) |
| self.name = "HAEStandbyTest" |
| |
| def change_state(self, node, resource, target): |
| rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target)) |
| return rc |
| |
| def __call__(self, node): |
| self.incr("calls") |
| |
| lpc = 0 |
| failed = 0 |
| done = time.time() + self.Env["loop-minutes"]*60 |
| self.find_hae_resources(node) |
| |
| clone_max = len(self.Env["nodes"]) |
| while time.time() <= done and not failed: |
| lpc = lpc + 1 |
| |
| self.change_state(node, self.r_dlm, "true") |
| if not self.wait_on_state(node, self.r_dlm, clone_max-1): |
| self.failure("%s did not go down correctly" % self.r_dlm) |
| failed = lpc |
| |
| self.change_state(node, self.r_dlm, "false") |
| if not self.wait_on_state(node, self.r_dlm, clone_max): |
| self.failure("%s did not come up correctly" % self.r_dlm) |
| failed = lpc |
| |
| if not self.wait_on_state(node, self.r_o2cb, clone_max): |
| self.failure("%s did not come up correctly" % self.r_o2cb) |
| failed = lpc |
| |
| for fs in self.r_ocfs2: |
| if not self.wait_on_state(node, fs, clone_max): |
| self.failure("%s did not come up correctly" % fs) |
| failed = lpc |
| |
| if failed: |
| return self.failure("iteration %d failed" % failed) |
| return self.success() |
| |
| AllTestClasses.append(HAEStandbyTest) |
| |
| |
| class NearQuorumPointTest(CTSTest): |
| ''' |
| This test brings larger clusters near the quorum point (50%). |
| In addition, it will test doing starts and stops at the same time. |
| |
| Here is how I think it should work: |
    - loop over the nodes and decide randomly which will be up and which
      will be down. Use a 50% probability for each of up/down.
| - figure out what to do to get into that state from the current state |
| - in parallel, bring up those going up and bring those going down. |
| ''' |
| |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "NearQuorumPoint" |
| |
| def __call__(self, dummy): |
| '''Perform the 'NearQuorumPoint' test. ''' |
| self.incr("calls") |
| startset = [] |
| stopset = [] |
| |
| stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint") |
| |
| for node in self.Env["nodes"]: |
| action = self.Env.RandomGen.choice(["start","stop"]) |
| |
| if action == "start" : |
| startset.append(node) |
| elif action == "stop" : |
| stopset.append(node) |
| |
| self.debug("start nodes:" + repr(startset)) |
| self.debug("stop nodes:" + repr(stopset)) |
| |
| |
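        # Expect stop confirmations for nodes going down and startup
        # messages for nodes coming up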
| watchpats = [ ] |
| for node in stopset: |
| if self.CM.ShouldBeStatus[node] == "up": |
| watchpats.append(self.templates["Pat:We_stopped"] % node) |
| |
| for node in startset: |
| if self.CM.ShouldBeStatus[node] == "down": |
| |
| watchpats.append(self.templates["Pat:Local_started"] % node) |
| else: |
| for stopping in stopset: |
| if self.CM.ShouldBeStatus[stopping] == "up": |
| watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping))) |
| |
| if len(watchpats) == 0: |
| return self.skipped() |
| |
| if len(startset) != 0: |
| watchpats.append(self.templates["Pat:DC_IDLE"]) |
| |
| watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) |
| |
| watch.setwatch() |
| |
| |
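        # kick off all the stops and starts without waiting on each one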
| for node in stopset: |
| if self.CM.ShouldBeStatus[node] == "up": |
| self.CM.StopaCMnoBlock(node) |
| |
| for node in startset: |
| if self.CM.ShouldBeStatus[node] == "down": |
| self.CM.StartaCMnoBlock(node) |
| |
| |
| if watch.lookforall(): |
| self.CM.cluster_stable() |
| self.CM.fencing_cleanup("NearQuorumPoint", stonith) |
| return self.success() |
| |
| self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched)) |
| |
| |
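        # work out which nodes ended up in the wrong state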
| upnodes = [] |
| for node in stopset: |
| if self.CM.StataCM(node) == 1: |
| upnodes.append(node) |
| |
| downnodes = [] |
| for node in startset: |
| if self.CM.StataCM(node) == 0: |
| downnodes.append(node) |
| |
| self.CM.fencing_cleanup("NearQuorumPoint", stonith) |
| if upnodes == [] and downnodes == []: |
| self.CM.cluster_stable() |
| |
| |
            # Everything reached the intended state anyway; make sure the
            # stopped nodes are completely down
            for node in stopset:
| self.rsh(node, self.templates["StopCmd"]) |
| |
| return self.success() |
| |
| if len(upnodes) > 0: |
| self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes)) |
| |
| if len(downnodes) > 0: |
| self.logger.log("Warn: Unstartable nodes: " + repr(downnodes)) |
| |
| return self.failure() |
| |
| def is_applicable(self): |
| return 1 |
| |
| AllTestClasses.append(NearQuorumPointTest) |
| |
| |
| class RollingUpgradeTest(CTSTest): |
| '''Perform a rolling upgrade of the cluster''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "RollingUpgrade" |
| self.start = StartTest(cm) |
| self.stop = StopTest(cm) |
| self.stopall = SimulStopLite(cm) |
| self.startall = SimulStartLite(cm) |
| |
| def setup(self, node): |
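        '''Stop all nodes, downgrade them to the previous version, and restart them'''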
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Couldn't stop all nodes") |
| |
| for node in self.Env["nodes"]: |
| if not self.downgrade(node, None): |
| return self.failure("Couldn't downgrade %s" % node) |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Couldn't start all nodes") |
| return self.success() |
| |
| def teardown(self, node): |
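        '''Stop all nodes and upgrade them back to the current version'''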
| |
| ret = self.stopall(None) |
| if not ret: |
| return self.failure("Couldn't stop all nodes") |
| |
| for node in self.Env["nodes"]: |
| if not self.upgrade(node, None): |
| return self.failure("Couldn't upgrade %s" % node) |
| |
| return self.success() |
| |
| def install(self, node, version, start=1, flags="--force"): |
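        '''Install the RPMs for the given version on the node, optionally restarting the cluster there'''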
| |
| target_dir = "/tmp/rpm-%s" % version |
| src_dir = "%s/%s" % (self.Env["rpm-dir"], version) |
| |
| self.logger.log("Installing %s on %s with %s" % (version, node, flags)) |
| if not self.stop(node): |
| return self.failure("stop failure: "+node) |
| |
| rc = self.rsh(node, "mkdir -p %s" % target_dir) |
| rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir) |
| (rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None) |
| for line in lines: |
            line = line[:-1]    # strip the trailing newline
| rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir)) |
| rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) |
| |
| if start and not self.start(node): |
| return self.failure("start failure: "+node) |
| |
| return self.success() |
| |
| def upgrade(self, node, start=1): |
| return self.install(node, self.Env["current-version"], start) |
| |
| def downgrade(self, node, start=1): |
| return self.install(node, self.Env["previous-version"], start, "--force --nodeps") |
| |
| def __call__(self, node): |
| '''Perform the 'Rolling Upgrade' test. ''' |
| self.incr("calls") |
| |
| for node in self.Env["nodes"]: |
            if not self.upgrade(node):
| return self.failure("Couldn't upgrade %s" % node) |
| |
| self.CM.cluster_stable() |
| |
| return self.success() |
| |
| def is_applicable(self): |
| if not self.is_applicable_common(): |
| return None |
| |
| if not "rpm-dir" in list(self.Env.keys()): |
| return None |
| if not "current-version" in list(self.Env.keys()): |
| return None |
| if not "previous-version" in list(self.Env.keys()): |
| return None |
| |
| return 1 |
| |
| |
| AllTestClasses.append(RollingUpgradeTest) |
| |
| |
| class BSC_AddResource(CTSTest): |
| '''Add a resource to the cluster''' |
| def __init__(self, cm): |
| CTSTest.__init__(self, cm) |
| self.name = "AddResource" |
| self.resource_offset = 0 |
| self.cib_cmd = """cibadmin -C -o %s -X '%s' """ |
| |
| def __call__(self, node): |
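        '''Perform the 'AddResource' test. '''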
| self.incr("calls") |
| self.resource_offset = self.resource_offset + 1 |
| |
| r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) |
| start_pat = "pacemaker-controld.*%s_start_0.*confirmed.*ok" |
| |
| patterns = [] |
| patterns.append(start_pat % r_id) |
| |
| watch = self.create_watch(patterns, self.Env["DeadTime"]) |
| watch.setwatch() |
| |
| ip = self.NextIP() |
| if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): |
| return self.failure("Make resource %s failed" % r_id) |
| |
| failed = 0 |
| watch_result = watch.lookforall() |
| if watch.unmatched: |
| for regex in watch.unmatched: |
                self.logger.log("Warn: Pattern not found: %s" % regex)
| failed = 1 |
| |
| if failed: |
| return self.failure("Resource pattern(s) not found") |
| |
| if not self.CM.cluster_stable(self.Env["DeadTime"]): |
| return self.failure("Unstable cluster") |
| |
| return self.success() |
| |
    def NextIP(self):
        # Increment the last field of the base IP address (hexadecimal for
        # IPv6, decimal for IPv4), and remember the result for the next call
        ip = self.Env["IPBase"]
        if ":" in ip:
            (prefix, sep, suffix) = ip.rpartition(":")
            suffix = ("%X" % (int(suffix, 16)+1)).lower()
        else:
            (prefix, sep, suffix) = ip.rpartition('.')
            suffix = str(int(suffix)+1)

        ip = prefix + sep + suffix
        self.Env["IPBase"] = ip
        return ip.strip()
| |
| def make_ip_resource(self, node, id, rclass, type, ip): |
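        '''Create an IPaddr resource with a location preference for the given node'''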
| self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) |
| rsc_xml=""" |
| <primitive id="%s" class="%s" type="%s" provider="heartbeat"> |
| <instance_attributes id="%s"><attributes> |
| <nvpair id="%s" name="ip" value="%s"/> |
| </attributes></instance_attributes> |
| </primitive>""" % (id, rclass, type, id, id, ip) |
| |
| node_constraint = """ |
| <rsc_location id="run_%s" rsc="%s"> |
| <rule id="pref_run_%s" score="100"> |
| <expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/> |
| </rule> |
| </rsc_location>""" % (id, id, id, id, node) |
| |
| rc = 0 |
| (rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None) |
| if rc != 0: |
| self.logger.log("Constraint creation failed: %d" % rc) |
| return None |
| |
| (rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None) |
| if rc != 0: |
| self.logger.log("Resource creation failed: %d" % rc) |
| return None |
| |
| return 1 |
| |
| def is_applicable(self): |
| if self.Env["DoBSC"]: |
| return 1 |
| return None |
| |
| AllTestClasses.append(BSC_AddResource) |
| |
| |
| class SimulStopLite(CTSTest): |
| '''Stop any active nodes ~ simultaneously''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SimulStopLite" |
| |
| def __call__(self, dummy): |
| '''Perform the 'SimulStopLite' setup work. ''' |
| self.incr("calls") |
| |
| self.debug("Setup: " + self.name) |
| |
| |
| watchpats = [ ] |
| |
| for node in self.Env["nodes"]: |
| if self.CM.ShouldBeStatus[node] == "up": |
| self.incr("WasStarted") |
| watchpats.append(self.templates["Pat:We_stopped"] % node) |
| |
| if len(watchpats) == 0: |
| return self.success() |
| |
| |
| watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) |
| |
| watch.setwatch() |
| self.set_timer() |
| for node in self.Env["nodes"]: |
| if self.CM.ShouldBeStatus[node] == "up": |
| self.CM.StopaCMnoBlock(node) |
| if watch.lookforall(): |
| |
| for node in self.Env["nodes"]: |
| self.rsh(node, self.templates["StopCmd"]) |
| |
| return self.success() |
| |
| did_fail = 0 |
| up_nodes = [] |
| for node in self.Env["nodes"]: |
| if self.CM.StataCM(node) == 1: |
| did_fail = 1 |
| up_nodes.append(node) |
| |
| if did_fail: |
| return self.failure("Active nodes exist: " + repr(up_nodes)) |
| |
| self.logger.log("Warn: All nodes stopped but CTS didnt detect: " |
| + repr(watch.unmatched)) |
| |
| return self.failure("Missing log message: "+repr(watch.unmatched)) |
| |
| def is_applicable(self): |
| '''SimulStopLite is a setup test and never applicable''' |
| return 0 |
| |
| |
| class SimulStartLite(CTSTest): |
| '''Start any stopped nodes ~ simultaneously''' |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "SimulStartLite" |
| |
| def __call__(self, dummy): |
        '''Perform the 'SimulStartLite' setup work. '''
| self.incr("calls") |
| self.debug("Setup: " + self.name) |
| |
| |
| node_list = [] |
| for node in self.Env["nodes"]: |
| if self.CM.ShouldBeStatus[node] == "down": |
| self.incr("WasStopped") |
| node_list.append(node) |
| |
| self.set_timer() |
| while len(node_list) > 0: |
| |
| watchpats = [ ] |
| |
| uppat = self.templates["Pat:NonDC_started"] |
| if self.CM.upcount() == 0: |
| uppat = self.templates["Pat:Local_started"] |
| |
| watchpats.append(self.templates["Pat:DC_IDLE"]) |
| for node in node_list: |
| watchpats.append(uppat % node) |
| watchpats.append(self.templates["Pat:InfraUp"] % node) |
| watchpats.append(self.templates["Pat:PacemakerUp"] % node) |
| |
| |
| watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) |
| watch.setwatch() |
| |
| stonith = self.CM.prepare_fencing_watcher(self.name) |
| |
| for node in node_list: |
| self.CM.StartaCMnoBlock(node) |
| |
| watch.lookforall() |
| |
| node_list = self.CM.fencing_cleanup(self.name, stonith) |
| |
            if node_list is None:
| return self.failure("Cluster did not stabilize") |
| |
| |
        # Remove the startup patterns for the nodes returned by
        # fencing_cleanup() from the unmatched list
        for node in node_list:
            self.logger.debug("Dealing with stonith operations for %s" % node)
            if watch.unmatched:
                try:
                    watch.unmatched.remove(uppat % node)
                except ValueError:
                    self.debug("Already matched: %s" % (uppat % node))
                try:
                    watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
                except ValueError:
                    self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
                try:
                    watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
                except ValueError:
                    self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
| |
| if watch.unmatched: |
| for regex in watch.unmatched: |
| self.logger.log ("Warn: Startup pattern not found: %s" %(regex)) |
| |
| if not self.CM.cluster_stable(): |
| return self.failure("Cluster did not stabilize") |
| |
        did_fail = 0
        unstarted = []
        for node in self.Env["nodes"]:
            if self.CM.StataCM(node) == 0:
                did_fail = 1
                unstarted.append(node)

        if did_fail:
            return self.failure("Unstarted nodes exist: " + repr(unstarted))
| |
| unstable = [] |
| for node in self.Env["nodes"]: |
| if not self.CM.node_stable(node): |
| did_fail = 1 |
| unstable.append(node) |
| |
| if did_fail: |
| return self.failure("Unstable cluster nodes exist: " + repr(unstable)) |
| |
| return self.success() |
| |
| def is_applicable(self): |
| '''SimulStartLite is a setup test and never applicable''' |
| return 0 |
| |
| |
| def TestList(cm, audits): |
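    '''Return a list of instances of all applicable test classes, with audits attached'''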
| result = [] |
| for testclass in AllTestClasses: |
| bound_test = testclass(cm) |
| if bound_test.is_applicable(): |
| bound_test.Audits = audits |
| result.append(bound_test) |
| return result |
| |
| |
| class RemoteLXC(CTSTest): |
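    '''Start LXC guest nodes and resources on a cluster node, then clean them up'''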
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = "RemoteLXC" |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| self.num_containers = 2 |
| self.is_container = 1 |
| self.is_docker_unsafe = 1 |
| self.failed = 0 |
| self.fail_string = "" |
| |
| def start_lxc_simple(self, node): |
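        """ Create and start LXC containers and resources on the node. """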
| |
| |
| self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") |
| |
| |
| pats = [ ] |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms")) |
| |
| self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers) |
| self.set_timer("remoteSimpleInit") |
| watch.lookforall() |
| self.log_timer("remoteSimpleInit") |
| if watch.unmatched: |
| self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) |
| self.failed = 1 |
| |
| def cleanup_lxc_simple(self, node): |
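        """ Stop and remove the LXC containers, or reset everything if the test failed. """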
| |
| pats = [ ] |
| |
| |
        # If the test failed, clean up the CIB and libvirt environment
        # as best we can
        if self.failed == 1:
            self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
| return |
| |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1")) |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2")) |
| |
| self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null") |
| self.set_timer("remoteSimpleCleanup") |
| watch.lookforall() |
| self.log_timer("remoteSimpleCleanup") |
| |
| if watch.unmatched: |
| self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) |
| self.failed = 1 |
| |
| |
| self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null") |
| |
| def __call__(self, node): |
| '''Perform the 'RemoteLXC' test. ''' |
| self.incr("calls") |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("Setup failed, start all nodes failed.") |
| |
        # Check whether the node can host LXC containers before proceeding
        rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
| if rc == 1: |
| self.log("Environment test for lxc support failed.") |
| return self.skipped() |
| |
| self.start_lxc_simple(node) |
| self.cleanup_lxc_simple(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| |
| if self.failed == 1: |
| return self.failure(self.fail_string) |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ |
| r"Updating failcount for ping", |
| r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)", |
            # The orphaned lxc-ms resource can briefly appear to be active on
            # two nodes, producing an expected transition error
            r"Calculated [Tt]ransition .*pe-error",
| r"Resource lxc-ms .* is active on 2 nodes attempting recovery", |
| r"Unknown operation: fail", |
| r"VirtualDomain.*ERROR: Unable to determine emulator", |
| ] |
| |
| AllTestClasses.append(RemoteLXC) |
| |
| |
| class RemoteDriver(CTSTest): |
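    """ Base class for tests exercising a Pacemaker Remote node (not a test
        itself; see the Remote* subclasses below) """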
| |
| def __init__(self, cm): |
| CTSTest.__init__(self,cm) |
| self.name = self.__class__.__name__ |
| self.is_docker_unsafe = 1 |
| self.start = StartTest(cm) |
| self.startall = SimulStartLite(cm) |
| self.stop = StopTest(cm) |
| self.remote_rsc = "remote-rsc" |
| self.cib_cmd = """cibadmin -C -o %s -X '%s' """ |
| self.reset() |
| |
| def reset(self): |
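        """ Reset test state so the instance can be reused. """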
| self.pcmk_started = 0 |
| self.failed = False |
| self.fail_string = "" |
| self.remote_node_added = 0 |
| self.remote_rsc_added = 0 |
| self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False]) |
| |
    def fail(self, msg):
        """ Mark test as failed. """

        self.failed = True

        # Always log the failure
        self.logger.log(msg)

        # Use the first failure as the test status, as it's likely to be
        # the most useful
        if not self.fail_string:
            self.fail_string = msg
| |
| def get_othernode(self, node): |
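        """ Return a cluster node other than the one under test. """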
| for othernode in self.Env["nodes"]: |
            if othernode == node:
                # We don't want to run commands on the node under test
                continue
| else: |
| return othernode |
| |
| def del_rsc(self, node, rsc): |
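        """ Delete the given resource via a node other than the one under test. """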
| othernode = self.get_othernode(node) |
| rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc)) |
| if rc != 0: |
| self.fail("Removal of resource '%s' failed" % rsc) |
| |
| def add_rsc(self, node, rsc_xml): |
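        """ Add a resource, specified as XML, via a node other than the one under test. """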
| othernode = self.get_othernode(node) |
| rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml)) |
| if rc != 0: |
| self.fail("resource creation failed") |
| |
| def add_primitive_rsc(self, node): |
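        """ Add a dummy primitive resource intended to run on the remote node. """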
| rsc_xml = """ |
| <primitive class="ocf" id="%(node)s" provider="heartbeat" type="Dummy"> |
| <meta_attributes id="%(node)s-meta_attributes"/> |
| <operations> |
| <op id="%(node)s-monitor-interval-20s" interval="20s" name="monitor"/> |
| </operations> |
| </primitive>""" % { "node": self.remote_rsc } |
| self.add_rsc(node, rsc_xml) |
| if not self.failed: |
| self.remote_rsc_added = 1 |
| |
| def add_connection_rsc(self, node): |
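        """ Add a remote connection resource for the given node. """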
| rsc_xml = """ |
| <primitive class="ocf" id="%(node)s" provider="pacemaker" type="remote"> |
| <instance_attributes id="%(node)s-instance_attributes"> |
| <nvpair id="%(node)s-instance_attributes-server" name="server" value="%(server)s"/> |
| """ % { "node": self.remote_node, "server": node } |
| |
        if self.remote_use_reconnect_interval:
            # Randomly chosen at reset(): test with a reconnect_interval set
            rsc_xml = rsc_xml + """
| <nvpair id="%s-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/> |
| """ % (self.remote_node) |
| |
| rsc_xml = rsc_xml + """ |
| </instance_attributes> |
| <operations> |
| <op id="%(node)s-start" name="start" interval="0" timeout="120s"/> |
| <op id="%(node)s-monitor-20s" name="monitor" interval="20s" timeout="45s"/> |
| </operations> |
| </primitive> |
| """ % { "node": self.remote_node } |
| |
| self.add_rsc(node, rsc_xml) |
| if not self.failed: |
| self.remote_node_added = 1 |
| |
| def disable_services(self, node): |
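        """ Disable any enabled cluster services on the node, remembering which were enabled. """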
| self.corosync_enabled = self.Env.service_is_enabled(node, "corosync") |
| if self.corosync_enabled: |
| self.Env.disable_service(node, "corosync") |
| |
| self.pacemaker_enabled = self.Env.service_is_enabled(node, "pacemaker") |
| if self.pacemaker_enabled: |
| self.Env.disable_service(node, "pacemaker") |
| |
| def restore_services(self, node): |
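        """ Re-enable any cluster services that disable_services() disabled. """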
| if self.corosync_enabled: |
| self.Env.enable_service(node, "corosync") |
| |
| if self.pacemaker_enabled: |
| self.Env.enable_service(node, "pacemaker") |
| |
| def stop_pcmk_remote(self, node): |
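        """ Stop the pacemaker_remote service on the node, retrying for up to a minute. """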
| |
| for i in range(10): |
| rc = self.rsh(node, "service pacemaker_remote stop") |
| if rc != 0: |
| time.sleep(6) |
| else: |
| break |
| |
| def start_pcmk_remote(self, node): |
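        """ Start the pacemaker_remote service on the node, retrying for up to a minute. """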
| for i in range(10): |
| rc = self.rsh(node, "service pacemaker_remote start") |
| if rc != 0: |
| time.sleep(6) |
| else: |
| self.pcmk_started = 1 |
| break |
| |
| def freeze_pcmk_remote(self, node): |
| """ Simulate a Pacemaker Remote daemon failure. """ |

        # Freeze the daemon with SIGSTOP, so it stops responding without
        # exiting cleanly
        self.rsh(node, "killall -STOP pacemaker-remoted")
| |
    def resume_pcmk_remote(self, node):
        """ Simulate recovery of the Pacemaker Remote daemon. """

        self.rsh(node, "killall -CONT pacemaker-remoted")
| |
| def start_metal(self, node): |

        # Cluster nodes are reused as remote nodes in these tests. If cluster
        # services were enabled at boot, then after the remote node is fenced
        # and reboots, it could rejoin as a cluster node rather than the
        # expected remote node, and pacemaker_remote would fail to start.
        # Temporarily disable any enabled cluster services.
        self.disable_services(node)
| |
        # Make sure the test resources don't already exist from a previous run
        self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
| self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node)) |
| |
| if not self.stop(node): |
| self.fail("Failed to shutdown cluster node %s" % node) |
| return |
| |
| self.start_pcmk_remote(node) |
| |
| if self.pcmk_started == 0: |
| self.fail("Failed to start pacemaker_remote on node %s" % node) |
| return |
| |
| |
| pats = [ ] |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node)) |
| pats.append(self.templates["Pat:DC_IDLE"]) |
| |
| self.add_connection_rsc(node) |
| |
| self.set_timer("remoteMetalInit") |
| watch.lookforall() |
| self.log_timer("remoteMetalInit") |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| |
| def migrate_connection(self, node): |
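        """ Move the remote connection resource to another cluster node. """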
| if self.failed: |
| return |
| |
| pats = [ ] |
| pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node)) |
| pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node)) |
| pats.append(self.templates["Pat:DC_IDLE"]) |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| |
| (rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None) |
| if rc != 0: |
| self.fail("failed to move remote node connection resource") |
| return |
| |
| self.set_timer("remoteMetalMigrate") |
| watch.lookforall() |
| self.log_timer("remoteMetalMigrate") |
| |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| return |
| |
| def fail_rsc(self, node): |
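        """ Make the dummy resource on the remote node fail, and watch for recovery. """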
| if self.failed: |
| return |
| |
| watchpats = [ ] |
| watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node)) |
| watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) |
| watchpats.append(self.templates["Pat:DC_IDLE"]) |
| |
| watch = self.create_watch(watchpats, 120) |
| watch.setwatch() |
| |
| self.debug("causing dummy rsc to fail.") |
| |
| rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*") |
| |
| self.set_timer("remoteRscFail") |
| watch.lookforall() |
| self.log_timer("remoteRscFail") |
| if watch.unmatched: |
| self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched) |
| |
| def fail_connection(self, node): |
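        """ Freeze the remote daemon, watch for the remote node to be fenced, then restart it. """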
| if self.failed: |
| return |
| |
| watchpats = [ ] |
| watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node) |
| watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node) |
| |
| watch = self.create_watch(watchpats, 120) |
| watch.setwatch() |
| |
| |
| self.debug("Force stopped active remote node") |
| self.freeze_pcmk_remote(node) |
| |
| self.debug("Waiting for remote node to be fenced.") |
| self.set_timer("remoteMetalFence") |
| watch.lookforall() |
| self.log_timer("remoteMetalFence") |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| return |
| |
| self.debug("Waiting for the remote node to come back up") |
        self.CM.ns.WaitForNodeToComeUp(node, 120)
| |
| pats = [ ] |
| watch = self.create_watch(pats, 240) |
| watch.setwatch() |
| pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node)) |
| if self.remote_rsc_added == 1: |
| pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) |
| |
| |
        # Start the remote daemon again, and watch the connection resource
        # (and the dummy resource, if we added one) come back up
        self.start_pcmk_remote(node)
| if self.pcmk_started == 0: |
| self.fail("Failed to start pacemaker_remote on node %s" % node) |
| return |
| |
| self.debug("Waiting for remote node to rejoin cluster after being fenced.") |
| self.set_timer("remoteMetalRestart") |
| watch.lookforall() |
| self.log_timer("remoteMetalRestart") |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| return |
| |
| def add_dummy_rsc(self, node): |
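        """ Verify that a resource can be placed on the remote node. """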
| if self.failed: |
| return |
| |
| |
| pats = [ ] |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node)) |
| pats.append(self.templates["Pat:DC_IDLE"]) |
| |
| |
        # Add a dummy resource that should run on the remote node
        self.add_primitive_rsc(node)
| |
| |
| (rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None) |
| if rc != 0: |
| self.fail("Failed to place remote resource on remote node.") |
| return |
| |
| self.set_timer("remoteMetalRsc") |
| watch.lookforall() |
| self.log_timer("remoteMetalRsc") |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| |
| def test_attributes(self, node): |
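        """ Verify that node attributes can be set, queried, and deleted for the remote node. """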
| if self.failed: |
| return |
| |
| |
| |
| (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None) |
| if rc != 0: |
| self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)) |
| return |
| |
| (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None) |
| if rc != 0: |
| self.fail("Failed to get remote-node attribute") |
| return |
| |
| (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None) |
| if rc != 0: |
| self.fail("Failed to delete remote-node attribute") |
| return |
| |
| def cleanup_metal(self, node): |
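        """ Remove the test resources, restore services, and stop pacemaker_remote. """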
| self.restore_services(node) |
| |
| if self.pcmk_started == 0: |
| return |
| |
| pats = [ ] |
| |
| watch = self.create_watch(pats, 120) |
| watch.setwatch() |
| |
| if self.remote_rsc_added == 1: |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc)) |
| if self.remote_node_added == 1: |
| pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node)) |
| |
| self.set_timer("remoteMetalCleanup") |
| |
| self.resume_pcmk_remote(node) |
| |
| if self.remote_rsc_added == 1: |
| |
| |
| self.debug("Cleaning up dummy rsc put on remote node") |
| self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % self.remote_rsc) |
| self.del_rsc(node, self.remote_rsc) |
| |
| if self.remote_node_added == 1: |
| |
| |
| self.debug("Cleaning up remote node connection resource") |
| self.rsh(self.get_othernode(node), "crm_resource -U -r %s" % (self.remote_node)) |
| self.del_rsc(node, self.remote_node) |
| |
| watch.lookforall() |
| self.log_timer("remoteMetalCleanup") |
| |
| if watch.unmatched: |
| self.fail("Unmatched patterns: %s" % watch.unmatched) |
| |
| self.stop_pcmk_remote(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| |
| if self.remote_node_added == 1: |
| |
| self.debug("Cleaning up node entry for remote node") |
| self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node) |
| |
| def setup_env(self, node): |
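        """ Ensure a Pacemaker Remote authkey exists and is identical on all cluster nodes. """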
| |
| self.remote_node = "remote-%s" % (node) |

        # If all nodes already have an authkey, assume it is correct; if any
        # node is missing it, regenerate and redistribute it everywhere
        if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
            return
| |
| |
        # Create a random key locally
        (handle, keyfile) = tempfile.mkstemp(".cts")
| os.close(handle) |
| devnull = open(os.devnull, 'wb') |
| subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"], |
| stdout=devnull, stderr=devnull) |
| devnull.close() |
| |
| |
| for node in self.Env["nodes"]: |
| self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker") |
| self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node) |
| self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") |
| self.rsh(node, "chmod 0640 /etc/pacemaker/authkey") |
| os.unlink(keyfile) |
| |
| def is_applicable(self): |
| if not self.is_applicable_common(): |
| return False |
| |
| for node in self.Env["nodes"]: |
| rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1") |
| if rc != 0: |
| return False |
| return True |
| |
| def start_new_test(self, node): |
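        """ Start all cluster nodes, then set up the remote node and a dummy resource on it. """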
| self.incr("calls") |
| self.reset() |
| |
| ret = self.startall(None) |
| if not ret: |
| return self.failure("setup failed: could not start all nodes") |
| |
| self.setup_env(node) |
| self.start_metal(node) |
| self.add_dummy_rsc(node) |
| return True |
| |
| def __call__(self, node): |
| return self.failure("This base class is not meant to be called directly.") |
| |
| def errorstoignore(self): |
| '''Return list of errors which should be ignored''' |
| return [ r"""is running on remote.*which isn't allowed""", |
| r"""Connection terminated""", |
| r"""Could not send remote""", |
| ] |
| |
| |
| |
| |
| class RemoteBasic(RemoteDriver): |
| |
| def __call__(self, node): |
        '''Perform the 'RemoteBasic' test. '''
| |
| if not self.start_new_test(node): |
| return self.failure(self.fail_string) |
| |
| self.test_attributes(node) |
| self.cleanup_metal(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| if self.failed: |
| return self.failure(self.fail_string) |
| |
| return self.success() |
| |
| AllTestClasses.append(RemoteBasic) |
| |
| class RemoteStonithd(RemoteDriver): |
| |
| def __call__(self, node): |
| '''Perform the 'RemoteStonithd' test. ''' |
| |
| if not self.start_new_test(node): |
| return self.failure(self.fail_string) |
| |
| self.fail_connection(node) |
| self.cleanup_metal(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| if self.failed: |
| return self.failure(self.fail_string) |
| |
| return self.success() |
| |
| def is_applicable(self): |
| if not RemoteDriver.is_applicable(self): |
| return False |
| |
| if "DoFencing" in list(self.Env.keys()): |
| return self.Env["DoFencing"] |
| |
| return True |
| |
| def errorstoignore(self): |
| ignore_pats = [ |
| r"Lost connection to Pacemaker Remote node", |
| r"Software caused connection abort", |
| r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", |
| r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", |
| r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", |
| r"error: Result of monitor operation for .* on remote-.*: No executor connection", |
| ] |
| |
| ignore_pats.extend(RemoteDriver.errorstoignore(self)) |
| return ignore_pats |
| |
| AllTestClasses.append(RemoteStonithd) |
| |
| |
| class RemoteMigrate(RemoteDriver): |
| |
| def __call__(self, node): |
| '''Perform the 'RemoteMigrate' test. ''' |
| |
| if not self.start_new_test(node): |
| return self.failure(self.fail_string) |
| |
| self.migrate_connection(node) |
| self.cleanup_metal(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| if self.failed: |
| return self.failure(self.fail_string) |
| |
| return self.success() |
| |
| AllTestClasses.append(RemoteMigrate) |
| |
| |
| class RemoteRscFailure(RemoteDriver): |
| |
| def __call__(self, node): |
| '''Perform the 'RemoteRscFailure' test. ''' |
| |
| if not self.start_new_test(node): |
| return self.failure(self.fail_string) |
| |
| |
| |
| |
| self.migrate_connection(node) |
| |
| self.fail_rsc(node) |
| self.cleanup_metal(node) |
| |
| self.debug("Waiting for the cluster to recover") |
| self.CM.cluster_stable() |
| if self.failed: |
| return self.failure(self.fail_string) |
| |
| return self.success() |
| |
| def errorstoignore(self): |
| ignore_pats = [ |
| r"schedulerd.*: Recover remote-rsc\s*\(.*\)", |
| r"Dummy.*: No process state file found", |
| ] |
| |
| ignore_pats.extend(RemoteDriver.errorstoignore(self)) |
| return ignore_pats |
| |
| AllTestClasses.append(RemoteRscFailure) |
| |
| |