valkey-io · hpatro · Apr 7, 2025 · Apr 8, 2025 · Apr 9, 2025 · Apr 9, 2025
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -2146,12 +2146,22 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
         clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG);
     }
 
+    /* If none of the replicas can failover, or this is a primary-only setup,
+     * then immediately mark the node as alive. */
+    int dont_wait = 1;
+    for (int j = 0; j < node->num_replicas; j++) {
+        if (!clusterNodeIsNoFailover(node->replicas[j])) {
+            dont_wait = 0;
+            break;
+        }
+    }
+
     /* If it is a primary and...
      * 1) The FAIL state is old enough.
      * 2) It is yet serving slots from our point of view (not failed over).
      * Apparently no one is going to fix these slots, clear the FAIL flag. */
     if (clusterNodeIsVotingPrimary(node) &&
-        (now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) {
+        ((now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT) || dont_wait)) {
         serverLog(
             LL_NOTICE,
             "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.",
@@ -4735,6 +4745,10 @@ void clusterLogCantFailover(int reason) {
     case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
     case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
     case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
+    case CLUSTER_CANT_FAILOVER_DISABLED:
+        msg = "Failover has been disabled. "
+              "Please check the 'cluster-replica-no-failover' configuration option";
+        break;
     default: serverPanic("Unknown cant failover reason code.");
     }
     lastlog_time = time(NULL);
@@ -4827,14 +4841,19 @@ void clusterHandleReplicaFailover(void) {
      * 3) We don't have the no failover configuration set, and this is
      *    not a manual failover. */
     if (clusterNodeIsPrimary(myself) || myself->replicaof == NULL ||
-        (!nodeFailed(myself->replicaof) && !manual_failover) ||
-        (server.cluster_replica_no_failover && !manual_failover)) {
+        (!nodeFailed(myself->replicaof) && !manual_failover)) {
         /* There are no reasons to failover, so we set the reason why we
          * are returning without failing over to NONE. */
         server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
         return;
     }
 
+    if (server.cluster_replica_no_failover && !manual_failover) {
+        server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_DISABLED;
+        clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DISABLED);
+        return;
+    }
+
     /* Set data_age to the number of milliseconds we are disconnected from
      * the primary. */
     if (server.repl_state == REPL_STATE_CONNECTED) {
@@ -6602,7 +6621,7 @@ int clusterNodeIsFailing(clusterNode *node) {
 }
 
 int clusterNodeIsNoFailover(clusterNode *node) {
-    return node->flags & CLUSTER_NODE_NOFAILOVER;
+    return nodeCantFailover(node);
 }
 
 const char **clusterDebugCommandExtendedHelp(void) {

diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
@@ -16,6 +16,7 @@
 #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
 #define CLUSTER_CANT_FAILOVER_EXPIRED 3
 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
+#define CLUSTER_CANT_FAILOVER_DISABLED 5
 #define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 1 /* seconds. */
 
 /* clusterState todo_before_sleep flags. */

diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
@@ -101,6 +101,66 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 } ;# start_cluster
 
+# Tests to verify scenarios where failover is not possible, and to verify faster
+# availability of the primary once the network partition heals.
+foreach type {"primary-only" "primary-with-replicas"} {
+    set ::node_timeout 5000
+    if {$type eq "primary-only"} {
+        set ::primary_count 6
+        set ::replica_count 0
+    } else {
+        set ::primary_count 3
+        set ::replica_count 3
+    }
+
+    set options [list \
+    tags {external:skip cluster} \
+    overrides [list \
+        cluster-ping-interval 1000 \
+        cluster-node-timeout $::node_timeout \
+        cluster-replica-no-failover yes \
+    ]]
+
+    start_cluster $::primary_count $::replica_count $options {
+        # Pause one primary node (it is resumed later) so the others see it as failed.
+        pause_process [srv 0 pid]
+
+        if {$::replica_count > 0} {
+            test "no failover - verify replica is not promoted if failover has been disabled" {
+                # Observe no failover
+                wait_for_log_messages -3 {"*Currently unable to failover: Failover has been disabled*"} 0 200 50
+            }
+        } else {
+            # wait for node failure detection
+            after $::node_timeout
+        }
+
+        test "no failover - cluster is in failed state" {
+            for {set j 0} {$j < [llength $::servers]} {incr j} {                
+                if {[process_is_paused [srv -$j pid]]} continue
+                wait_for_condition 100 25 {
+                    [CI $j cluster_state] eq "fail"
+                } else {
+                    set ts [clock format [clock seconds] -format %H:%M:%S]
+                    fail "Cluster node $j cluster_state:[r -1 CLUSTER NODES]"
+                }
+            }
+        }
+
+        resume_process [srv 0 pid]
+
+        test "no failover - cluster is in healthy state" {
+            for {set j 0} {$j < [llength $::servers]} {incr j} {
+                wait_for_condition 100 50 {
+                    [CI $j cluster_state] eq "ok"
+                } else {
+                    fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+                }
+            }
+        }
+    } ;# start_cluster
+}
+
 run_solo {cluster} {
     start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
         test "Multiple primary nodes are down, rank them based on the failed primary" {