From fefe3f420864db4194b8dbed7a7afcc3688ea81a Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Apr 07 2021 22:24:55 +0000 Subject: Feature: scheduler: new on-fail="demote" recovery policy for promoted resources --- /* Review notes (outside the commit message): this patch adds the action_fail_demote policy and the pe_rsc_stop flag. The unpack.c hunks set pe_rsc_stop next to pe_rsc_failed on the existing recovery paths, while on-fail=demote sets pe_rsc_failed alone; native_create_actions() then forces a stop only when both flags are set and otherwise logs a recovery by demotion. */ diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h index ba88491..ed5eb12 100644 --- a/include/crm/pengine/pe_types.h +++ b/include/crm/pengine/pe_types.h @@ -246,6 +246,7 @@ struct pe_node_s { # define pe_rsc_allocating 0x00000200ULL # define pe_rsc_merging 0x00000400ULL +# define pe_rsc_stop 0x00001000ULL /* set together with pe_rsc_failed when recovery must stop the resource */ # define pe_rsc_reload 0x00002000ULL # define pe_rsc_allow_remote_remotes 0x00004000ULL diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c index b9bca80..4e3bd7c 100644 --- a/lib/pacemaker/pcmk_sched_native.c +++ b/lib/pacemaker/pcmk_sched_native.c @@ -1205,6 +1205,7 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) pe_node_t *chosen = NULL; pe_node_t *current = NULL; gboolean need_stop = FALSE; + bool need_promote = FALSE; /* becomes TRUE when a demotion recovery has next_role == RSC_ROLE_MASTER */ gboolean is_moving = FALSE; gboolean allow_migrate = is_set(rsc->flags, pe_rsc_allow_migrate) ? 
TRUE : FALSE; @@ -1309,8 +1310,15 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) need_stop = TRUE; } else if (is_set(rsc->flags, pe_rsc_failed)) { - pe_rsc_trace(rsc, "Recovering %s", rsc->id); - need_stop = TRUE; + if (is_set(rsc->flags, pe_rsc_stop)) { /* failed with pe_rsc_stop: recover through a stop */ + need_stop = TRUE; + pe_rsc_trace(rsc, "Recovering %s", rsc->id); + } else { /* failed without pe_rsc_stop: recover by demotion, no stop is forced here */ + pe_rsc_trace(rsc, "Recovering %s by demotion", rsc->id); + if (rsc->next_role == RSC_ROLE_MASTER) { + need_promote = TRUE; + } + } } else if (is_set(rsc->flags, pe_rsc_block)) { pe_rsc_trace(rsc, "Block %s", rsc->id); @@ -1344,10 +1352,16 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) while (rsc->role <= rsc->next_role && role != rsc->role && is_not_set(rsc->flags, pe_rsc_block)) { + bool required = need_stop; /* starts as need_stop; additionally forced true for the step to master when need_promote is set */ + next_role = rsc_state_matrix[role][rsc->role]; + if ((next_role == RSC_ROLE_MASTER) && need_promote) { + required = true; + } pe_rsc_trace(rsc, "Up: Executing: %s->%s (%s)%s", role2text(role), role2text(next_role), - rsc->id, need_stop ? " required" : ""); - if (rsc_action_matrix[role][next_role] (rsc, chosen, !need_stop, data_set) == FALSE) { + rsc->id, (required? 
" required" : "")); + if (rsc_action_matrix[role][next_role](rsc, chosen, !required, + data_set) == FALSE) { break; } role = next_role; @@ -2631,7 +2645,8 @@ LogActions(pe_resource_t * rsc, pe_working_set_t * data_set, gboolean terminal) free(key); - } else if (stop && is_set(rsc->flags, pe_rsc_failed)) { + } else if (stop && is_set(rsc->flags, pe_rsc_failed) + && is_set(rsc->flags, pe_rsc_stop)) { /* Recover is logged only for failures flagged with pe_rsc_stop */ /* 'stop' may be NULL if the failure was ignored */ LogAction("Recover", rsc, current, next, stop, start, terminal); STOP_SANITY_ASSERT(__LINE__); diff --git a/lib/pengine/common.c b/lib/pengine/common.c index ded6df8..f4f2106 100644 --- a/lib/pengine/common.c +++ b/lib/pengine/common.c @@ -326,6 +326,9 @@ fail2text(enum action_fail_response fail) case action_fail_ignore: result = "ignore"; break; + case action_fail_demote: + result = "demote"; + break; case action_fail_block: result = "block"; break; diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 6a350e5..a219805 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -108,6 +108,7 @@ pe_fence_node(pe_working_set_t * data_set, pe_node_t * node, */ node->details->remote_requires_reset = TRUE; set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); } } @@ -117,6 +118,7 @@ pe_fence_node(pe_working_set_t * data_set, pe_node_t * node, "and guest resource no longer exists", node->details->uname, reason); set_bit(node->details->remote_rsc->flags, pe_rsc_failed); + set_bit(node->details->remote_rsc->flags, pe_rsc_stop); } else if (pe__is_remote_node(node)) { pe_resource_t *rsc = node->details->remote_rsc; @@ -1914,6 +1916,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, */ if (pe__is_guest_node(node)) { set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); should_fence = TRUE; } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) { @@ -1956,6 +1959,11 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, /* nothing to do */ break; + case 
action_fail_demote: /* on-fail=demote: flag the failure but leave pe_rsc_stop clear, then call demote_action() */ + set_bit(rsc->flags, pe_rsc_failed); + demote_action(rsc, node, FALSE); + break; + case action_fail_fence: /* treat it as if it is still running * but also mark the node as unclean @@ -1992,12 +2000,14 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, case action_fail_recover: if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); stop_action(rsc, node, FALSE); } break; case action_fail_restart_container: set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); if (rsc->container && pe_rsc_is_bundled(rsc)) { /* A bundle's remote connection can run on a different node than @@ -2016,6 +2026,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, case action_fail_reset_remote: set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); if (is_set(data_set->flags, pe_flag_stonith_enabled)) { tmpnode = NULL; if (rsc->is_remote_node) { @@ -2071,8 +2082,17 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, } native_add_running(rsc, node, data_set); - if (on_fail != action_fail_ignore) { - set_bit(rsc->flags, pe_rsc_failed); + switch (on_fail) { + case action_fail_ignore: + break; + case action_fail_demote: + case action_fail_block: /* demote and block: mark failed without pe_rsc_stop */ + set_bit(rsc->flags, pe_rsc_failed); + break; + default: /* all remaining policies also set pe_rsc_stop */ + set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); + break; } } else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) { @@ -2595,6 +2615,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, } else { /* Consider it failed here - forces a restart, prevents migration */ set_bit(rsc->flags, pe_rsc_failed); + set_bit(rsc->flags, pe_rsc_stop); clear_bit(rsc->flags, pe_rsc_allow_migrate); } } @@ -2785,9 +2806,21 @@ static int cmp_on_fail(enum action_fail_response first, enum action_fail_response second) { switch (first) { + case action_fail_demote: /* first is demote: greater than ignore, equal to demote, less than anything else */ + switch (second) { + 
case action_fail_ignore: + return 1; + case action_fail_demote: + return 0; + default: + return -1; + } + break; + case action_fail_reset_remote: switch (second) { case action_fail_ignore: + case action_fail_demote: case action_fail_recover: return 1; case action_fail_reset_remote: @@ -2800,6 +2833,7 @@ cmp_on_fail(enum action_fail_response first, enum action_fail_response second) case action_fail_restart_container: switch (second) { case action_fail_ignore: + case action_fail_demote: case action_fail_recover: case action_fail_reset_remote: return 1; @@ -2814,9 +2848,13 @@ cmp_on_fail(enum action_fail_response first, enum action_fail_response second) break; } switch (second) { + case action_fail_demote: /* first cannot be demote here (that case returned above), so only ignore compares lower */ + return (first == action_fail_ignore)? -1 : 1; + case action_fail_reset_remote: switch (first) { case action_fail_ignore: + case action_fail_demote: case action_fail_recover: return -1; default: @@ -2827,6 +2865,7 @@ cmp_on_fail(enum action_fail_response first, enum action_fail_response second) case action_fail_restart_container: switch (first) { case action_fail_ignore: + case action_fail_demote: case action_fail_recover: case action_fail_reset_remote: return -1; @@ -3426,7 +3465,11 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { - /* Demote from Master does not clear an error */ + + if (*on_fail == action_fail_demote) { + // Demote clears an error only if on-fail=demote + clear_past_failure = TRUE; + } rsc->role = RSC_ROLE_SLAVE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { @@ -3454,6 +3497,7 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c case action_fail_block: case action_fail_ignore: + case action_fail_demote: case action_fail_recover: case action_fail_restart_container: *on_fail = action_fail_ignore; @@ -3714,6 +3758,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, * that, 
ensure the remote connection is considered failed. */ set_bit(node->details->remote_rsc->flags, pe_rsc_failed); + set_bit(node->details->remote_rsc->flags, pe_rsc_stop); } // fall through diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index 3fb7e62..fee9efb 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -720,6 +720,7 @@ static bool valid_stop_on_fail(const char *value) { return safe_str_neq(value, "standby") + && safe_str_neq(value, "demote") /* demote joins standby and stop as values rejected for a stop action */ && safe_str_neq(value, "stop"); } @@ -727,6 +728,11 @@ static const char * unpack_operation_on_fail(pe_action_t * action) { + const char *name = NULL; + const char *role = NULL; + const char *on_fail = NULL; + const char *interval_spec = NULL; + const char *enabled = NULL; const char *value = g_hash_table_lookup(action->meta, XML_OP_ATTR_ON_FAIL); if (safe_str_eq(action->task, CRMD_ACTION_STOP) @@ -736,14 +742,10 @@ unpack_operation_on_fail(pe_action_t * action) "action to default value because '%s' is not " "allowed for stop", action->rsc->id, value); return NULL; + } else if (safe_str_eq(action->task, CRMD_ACTION_DEMOTE) && !value) { /* demote on_fail defaults to master monitor value if present */ xmlNode *operation = NULL; - const char *name = NULL; - const char *role = NULL; - const char *on_fail = NULL; - const char *interval_spec = NULL; - const char *enabled = NULL; CRM_CHECK(action->rsc != NULL, return NULL); @@ -766,12 +768,31 @@ unpack_operation_on_fail(pe_action_t * action) continue; } else if (crm_parse_interval_spec(interval_spec) == 0) { continue; + } else if (safe_str_eq(on_fail, "demote")) { /* an on-fail of demote on the scanned operation is not adopted as the default for the demote action */ + continue; } value = on_fail; } } else if (safe_str_eq(action->task, CRM_OP_LRM_DELETE)) { value = "ignore"; + + } else if (safe_str_eq(value, "demote")) { /* demote is kept only for a CRMD_ACTION_PROMOTE operation, or a CRMD_ACTION_STATUS operation with role Master whose interval does not parse to 0; otherwise a config error is logged and NULL is returned */ + name = crm_element_value(action->op_entry, "name"); + role = crm_element_value(action->op_entry, "role"); + on_fail = crm_element_value(action->op_entry, XML_OP_ATTR_ON_FAIL); /* NOTE(review): on_fail is assigned here but never read in this branch */ + interval_spec = crm_element_value(action->op_entry, + 
XML_LRM_ATTR_INTERVAL); + + if (safe_str_neq(name, CRMD_ACTION_PROMOTE) + && (safe_str_neq(name, CRMD_ACTION_STATUS) + || safe_str_neq(role, "Master") + || (crm_parse_interval_spec(interval_spec) == 0))) { + pcmk__config_err("Resetting '" XML_OP_ATTR_ON_FAIL "' for %s %s " + "action to default value because 'demote' is not " + "allowed for it", action->rsc->id, name); + return NULL; + } } return value; @@ -1170,6 +1191,10 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai value = NULL; } + } else if (safe_str_eq(value, "demote")) { + action->on_fail = action_fail_demote; + value = "demote instance"; + } else { pe_err("Resource %s: Unknown failure type (%s)", action->rsc->id, value); value = NULL;