# Print the number of truly active resources: the entries of
# $OCF_RESKEY_CRM_meta_notify_active_resource that are NOT also listed in
# $OCF_RESKEY_CRM_meta_notify_stop_resource.
#
# According to the Pacemaker documentation, during "Post-notification (stop) /
# Pre-notification (start)" transitions the true active resource set is:
#   Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
#                      minus $OCF_RESKEY_CRM_meta_notify_stop_resource
# This handles the case where a resource appears in both the active and the
# stop lists during rapid restart scenarios (e.g., process crash recovery).
#
# Globals:
#   OCF_RESKEY_CRM_meta_notify_active_resource (read) whitespace-separated list
#   OCF_RESKEY_CRM_meta_notify_stop_resource   (read) whitespace-separated list
# Arguments:
#   none
# Outputs:
#   the count on stdout as a bare decimal integer (no padding), so callers can
#   match it directly in a `case` statement
# Returns:
#   0
get_truly_active_resources_count() {
	local active_list="${OCF_RESKEY_CRM_meta_notify_active_resource:-}"
	local stop_list="${OCF_RESKEY_CRM_meta_notify_stop_resource:-}"
	# Loop variables are declared local so that calling this function never
	# overwrites same-named globals in the agent.
	local resource stop_resource is_stopping
	local count=0

	# Both lists are whitespace-separated, so the unquoted expansions below
	# rely on word splitting on purpose. An empty active list yields zero
	# iterations (count stays 0); an empty stop list filters nothing out.
	for resource in $active_list; do
		is_stopping=0
		for stop_resource in $stop_list; do
			# Exact string comparison: "etcd:1" must not match "etcd:10"
			if [ "$resource" = "$stop_resource" ]; then
				is_stopping=1
				break
			fi
		done
		if [ "$is_stopping" -eq 0 ]; then
			count=$((count + 1))
		fi
	done

	# Counting in the shell (instead of piping through `wc -w`) keeps the
	# output free of the leading blanks some wc implementations emit.
	echo "$count"
}
starting: both agents are starting simultaneously local active_resources_count - active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) - ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')" + active_resources_count=$(get_truly_active_resources_count) + ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')" + ocf_log info "DEBUG: truly_active_count=$active_resources_count, raw_active='$OCF_RESKEY_CRM_meta_notify_active_resource', stop='$OCF_RESKEY_CRM_meta_notify_stop_resource'" case "$active_resources_count" in 1) if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then @@ -1545,6 +1588,7 @@ podman_start() local start_resources_count start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')" + ocf_log info "DEBUG: start_count=$start_resources_count, start='$OCF_RESKEY_CRM_meta_notify_start_resource'" # we need to compare the revisions in any of the following branches # so call the function only once here @@ -1599,18 +1643,32 @@ podman_start() fi fi - podman_create_mounts - local run_opts="--detach --name=${CONTAINER} --replace" + # IMPORTANT: Check for force-new-cluster deadlock BEFORE checking container status + # If we check container status first and it's already running, we return early + # and never get to detect the deadlock condition + if ocf_is_true "$JOIN_AS_LEARNER"; then + # Check if peer needs to force-new-cluster first + # This prevents a deadlock where we wait for the peer to add us as learner, + # but the peer has lost quorum and needs to restart with force-new-cluster first. 
+ # By failing fast here, we let Pacemaker recalculate and restart the peer first. + local peer_node=$(get_peer_node_name) - run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + ocf_log info "checking if peer ($peer_node) needs force-new-cluster before joining as learner" + if ! fnc_holders=$(get_force_new_cluster); then + ocf_exit_reason "Failed to get force_new_cluster node holders while joining as learner" + return $OCF_ERR_GENERIC + fi - # check to see if the container has already started - podman_simple_status - if [ $? -eq $OCF_SUCCESS ]; then - return "$OCF_SUCCESS" - fi + if echo "$fnc_holders" | grep -qw "$peer_node"; then + ocf_log warn "peer ($peer_node) needs to force-new-cluster but we are trying to join as learner" + ocf_log warn "this creates a deadlock: peer cannot add us until it restarts with force-new-cluster" + ocf_log warn "failing fast to allow Pacemaker to restart peer first" + ocf_exit_reason "Peer node ($peer_node) needs force-new-cluster, cannot join as learner. Pacemaker should restart peer first." + return $OCF_NOT_RUNNING + fi + + ocf_log info "peer does not need force-new-cluster, proceeding to join as learner" - if ocf_is_true "$JOIN_AS_LEARNER"; then local wait_timeout_sec=$((10*60)) local poll_interval_sec=5 local retries=$(( wait_timeout_sec / poll_interval_sec )) @@ -1634,6 +1692,17 @@ podman_start() archive_data_folder fi + podman_create_mounts + local run_opts="--detach --name=${CONTAINER} --replace" + + run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + + # check to see if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" if ! can_reuse_container ; then rc="$?"