From 02f0109099ac673ae534e8827b5f710f2fdaace2 Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Feb 05 2021 06:23:36 +0000
Subject: Fix: scheduler: process remote shutdowns correctly

When unpacking node histories, the scheduler can make multiple passes through
the node_state entries, because the state of remote node connections (on other
nodes) must be known before the history of the remote node itself can be
unpacked.

When unpacking a remote or guest node's history, the scheduler also unpacks its
transient attributes. If the shutdown attribute has been set, the scheduler
marks the node as shutting down.

Previously, at that time, it would also set the remote connection's next role
to stopped. However, if it so happened that remote connection history on
another node was processed later in the node history unpacking, and a probe had
found the connection not running, this would reset the next role to unknown.
The connection stop would not be scheduled, and the shutdown would hang until
it timed out.

Now, set the remote connection to stopped for shutdowns after all node
histories have been unpacked.
---

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index ce51429..2d91abc 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -979,9 +979,6 @@ unpack_handle_remote_attrs(pe_node_t *this_node, xmlNode *state, pe_working_set_
     if (pe__shutdown_requested(this_node)) {
         crm_info("Node %s is shutting down", this_node->details->uname);
         this_node->details->shutdown = TRUE;
-        if (rsc) {
-            pe__set_next_role(rsc, RSC_ROLE_STOPPED, "remote shutdown");
-        }
     }
 
     if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) {
@@ -1289,17 +1286,24 @@ unpack_status(xmlNode * status, pe_working_set_t * data_set)
         data_set->stop_needed = NULL;
     }
 
+    /* Now that we know status of all Pacemaker Remote connections and nodes,
+     * we can stop connections for node shutdowns, and check the online status
+     * of remote/guest nodes that didn't have any node history to unpack.
+     */
     for (GListPtr gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
         pe_node_t *this_node = gIter->data;
 
-        if (this_node == NULL) {
-            continue;
-        } else if (!pe__is_guest_or_remote_node(this_node)) {
-            continue;
-        } else if(this_node->details->unpacked) {
+        if (!pe__is_guest_or_remote_node(this_node)) {
             continue;
         }
-        determine_remote_online_status(data_set, this_node);
+        if (this_node->details->shutdown
+            && (this_node->details->remote_rsc != NULL)) {
+            pe__set_next_role(this_node->details->remote_rsc, RSC_ROLE_STOPPED,
+                              "remote shutdown");
+        }
+        if (!this_node->details->unpacked) {
+            determine_remote_online_status(data_set, this_node);
+        }
     }
 
     return TRUE;