manticoresoftware · donhardman · Mar 4, 2025
diff --git a/src/searchdreplication.cpp b/src/searchdreplication.cpp
@@ -923,6 +923,17 @@ bool HandleCmdReplicated ( RtAccum_t & tAcc )
 	if ( tAcc.m_dCmd.IsEmpty() )
 		return TlsMsg::Err ( "empty accumulator" );
 
+	// Increment the active replication operations counter when the accumulator has an index (only null-checked; the cast to RtIndex_c* below is unchecked)
+	auto pIndex = (RtIndex_c*)tAcc.m_pIndex;
+	if ( pIndex )
+		pIndex->m_iActiveReplicationOperations.fetch_add(1, std::memory_order_release);
+
+	// Ensure we decrement the counter when we leave this function
+	auto tCleanup = AtScopeExit([pIndex]() {
+		if ( pIndex )
+			pIndex->m_iActiveReplicationOperations.fetch_sub(1, std::memory_order_release);
+	});
+
 	const ReplicationCommand_t & tCmd = *tAcc.m_dCmd[0];
 	bool bCmdCluster = ( tAcc.m_dCmd.GetLength()==1
 		&& ( tCmd.m_eCommand==ReplCmd_e::CLUSTER_ALTER_ADD || tCmd.m_eCommand==ReplCmd_e::CLUSTER_ALTER_DROP ) );
@@ -1130,6 +1141,17 @@ static bool HandleCmdReplicateImpl ( RtAccum_t & tAcc, int * pDeletedCount, CSph
 {
 	TRACE_CORO ( "sph", "HandleCmdReplicateImpl" );
 
+	// Increment the active replication operations counter when the accumulator has an index (only null-checked; the cast to RtIndex_c* below is unchecked)
+	auto pIndex = (RtIndex_c*)tAcc.m_pIndex;
+	if ( pIndex )
+		pIndex->m_iActiveReplicationOperations.fetch_add(1, std::memory_order_release);
+
+	// Ensure we decrement the counter when we leave this function
+	auto tCleanup = AtScopeExit([pIndex]() {
+		if ( pIndex )
+			pIndex->m_iActiveReplicationOperations.fetch_sub(1, std::memory_order_release);
+	});
+
 	CommitMonitor_c tMonitor ( tAcc, pDeletedCount, pWarning, pUpdated );
 
 	// with cluster path

diff --git a/src/sphinxrt.cpp b/src/sphinxrt.cpp
@@ -1453,6 +1453,7 @@ class RtIndex_c final : public RtIndex_i, public ISphNoncopyable, public ISphWor
 	mutable DWORD				m_uDiskAttrStatus = 0;
 	std::atomic<int64_t>		m_tmDataWriten = 0;
 	std::atomic<int64_t>		m_tmDataSearched = 0;
+	std::atomic<int>			m_iActiveReplicationOperations = 0;  // Counter for active replication operations
 
 	bool						m_bKeywordDict;
 	int							m_iWordsCheckpoint = RTDICT_CHECKPOINT_V5;
@@ -4250,6 +4251,15 @@ bool RtIndex_c::SaveDiskChunk ( bool bForced, bool bEmergent ) REQUIRES ( m_tWor
 
 	assert ( Coro::CurrentScheduler() == m_tWorkers.SerialChunkAccess() );
 
+	// Skip a non-forced save while replication operations are in progress: this is meant to
+	// prevent data loss in clusters when operations have not fully propagated before the save.
+	// Note that the skip path returns true even though no disk chunk was written.
+	if ( m_iActiveReplicationOperations.load(std::memory_order_acquire) > 0 && !bForced )
+	{
+		RTSAVELOG << "SaveDiskChunk skipped due to active replication operations";
+		return true;
+	}
+
 	RTSAVELOG << "SaveDiskChunk (" << ( bForced ? "forced, " : "not forced, " ) << ( bEmergent ? "emergent, " : "not emergent " ) << ")";
 
 	m_tNSavesNow.Wait ( [] ( int iVal ) { return iVal < SIMULTANEOUS_SAVE_LIMIT; } );