NRG: When clfs=0, don't snapshot very often (#6277)

derekcollison · web-flow · commit dc8b3445f996 · 2024-12-18T09:07:32.000-05:00
If the stream is ingesting loads of new messages, for example in a
benchmark, either the `compactNumMin` or `compactSizeMin` would be
reached fairly quickly. The `minSnapDelta` (10s) is used to not snapshot
too often, since that would degrade performance.

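However, if `clfs=0` then `lastSnapTime` was reset to the zero time right
before calling `doSnapshot()`, which defeated the `minSnapDelta` cooldown,
so we'd be forcing snapshots every time the compact minimums are reached.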

Signed-off-by: Maurice van Veen <github@mauricevanveen.com>
diff --git a/server/jetstream_cluster.go b/server/jetstream_cluster.go
@@ -2606,12 +2606,6 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps
 			// Check about snapshotting
 			// If we have at least min entries to compact, go ahead and try to snapshot/compact.
 			if ne >= compactNumMin || nb > compactSizeMin || mset.getCLFS() > pclfs {
-				// We want to make sure we do not short circuit if transistioning from no clfs.
-				if pclfs == 0 {
-					// This is always false by default.
-					lastState.firstNeedsUpdate = true
-					lastSnapTime = time.Time{}
-				}
 				doSnapshot()
 			}
 
diff --git a/server/jetstream_cluster_1_test.go b/server/jetstream_cluster_1_test.go
@@ -6945,6 +6945,45 @@ func TestJetStreamClusterConsumerInfoAfterCreate(t *testing.T) {
 	require_NoError(t, err)
 }
 
+func TestJetStreamClusterDontSnapshotTooOften(t *testing.T) {
+	c := createJetStreamClusterExplicit(t, "R3S", 3)
+	defer c.shutdown()
+
+	nc, js := jsClientConnect(t, c.randomServer())
+	defer nc.Close()
+
+	_, err := js.AddStream(&nats.StreamConfig{
+		Name:     "TEST",
+		Subjects: []string{"foo"},
+		Replicas: 3,
+	})
+	require_NoError(t, err)
+
+	// We force the snapshot compact size to hit multiple times.
+	// But, we should not be making snapshots too often since that would degrade performance.
+	data := make([]byte, 1024*1024) // 1MB payload
+	_, err = crand.Read(data)
+	require_NoError(t, err)
+	for i := 0; i < 50; i++ {
+		// We do synchronous publishes so we're more likely to have entries pass through the apply queue.
+		_, err = js.Publish("foo", data)
+		require_NoError(t, err)
+	}
+
+	for _, s := range c.servers {
+		acc, err := s.lookupAccount(globalAccountName)
+		require_NoError(t, err)
+		mset, err := acc.lookupStream("TEST")
+		require_NoError(t, err)
+		snap, err := mset.node.(*raft).loadLastSnapshot()
+		require_NoError(t, err)
+		// This measure is not exact and more of a side effect.
+		// We expect one snapshot to be made pretty soon and to be on cooldown after.
+		// So no snapshots should be made after that.
+		require_LessThan(t, snap.lastIndex, 20)
+	}
+}
+
 //
 // DO NOT ADD NEW TESTS IN THIS FILE (unless to balance test times)
 // Add at the end of jetstream_cluster_<n>_test.go, with <n> being the highest value.