Skip to content

Commit 99d935c

Browse files
Fix Concurrent Snapshot Create+Delete + Delete Index (#61770) (#61774)
We had a bug here where we put a `null` value into the shard assignment mapping when reassigning work after a snapshot delete had gone through. This only affects partial snapshots but essentially dead-locks the snapshot process. Closes #61762
1 parent 13b9536 commit 99d935c

File tree

3 files changed

+47
-6
lines changed

3 files changed

+47
-6
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,28 @@ public void testMultipleSnapshotsQueuedAfterDelete() throws Exception {
819819
assertAcked(deleteFuture.get());
820820
}
821821

822+
public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception {
823+
final String masterNode = internalCluster().startMasterOnlyNode();
824+
internalCluster().startDataOnlyNode();
825+
final String repoName = "test-repo";
826+
createRepository(repoName, "mock");
827+
createIndexWithContent("index-one");
828+
createIndexWithContent("index-two");
829+
createNSnapshots(repoName, randomIntBetween(1, 5));
830+
831+
final ActionFuture<AcknowledgedResponse> deleteFuture = startAndBlockOnDeleteSnapshot(repoName, "*");
832+
final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshot(repoName, "snapshot-three", true);
833+
final ActionFuture<CreateSnapshotResponse> snapshotFour = startFullSnapshot(repoName, "snapshot-four", true);
834+
awaitNSnapshotsInProgress(2);
835+
836+
assertAcked(client().admin().indices().prepareDelete("index-two"));
837+
unblockNode(repoName, masterNode);
838+
839+
assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
840+
assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
841+
assertAcked(deleteFuture.get());
842+
}
843+
822844
public void testQueuedSnapshotsWaitingForShardReady() throws Exception {
823845
internalCluster().startMasterOnlyNode();
824846
internalCluster().startDataOnlyNodes(2);
@@ -1238,8 +1260,13 @@ private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromMasterClient(S
12381260
}
12391261

12401262
private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName) {
1263+
return startFullSnapshot(repoName, snapshotName, false);
1264+
}
1265+
1266+
private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName, boolean partial) {
12411267
logger.info("--> creating full snapshot [{}] to repo [{}]", snapshotName, repoName);
1242-
return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true).execute();
1268+
return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true)
1269+
.setPartial(partial).execute();
12431270
}
12441271

12451272
private void awaitClusterState(Predicate<ClusterState> statePredicate) throws Exception {

server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,13 @@ public static class ShardSnapshotStatus {
345345
public static final ShardSnapshotStatus UNASSIGNED_QUEUED =
346346
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.QUEUED, null);
347347

348+
/**
349+
* Shard snapshot status for shards that could not be snapshotted because their index was deleted from before the shard snapshot
350+
* started.
351+
*/
352+
public static final ShardSnapshotStatus MISSING =
353+
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null);
354+
348355
private final ShardState state;
349356

350357
@Nullable

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2192,9 +2192,17 @@ private SnapshotsInProgress updatedSnapshotsInProgress(ClusterState currentState
21922192
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> updatedAssignmentsBuilder =
21932193
ImmutableOpenMap.builder(entry.shards());
21942194
for (ShardId shardId : canBeUpdated) {
2195-
final boolean added = reassignedShardIds.add(shardId);
2196-
assert added;
2197-
updatedAssignmentsBuilder.put(shardId, shardAssignments.get(shardId));
2195+
final ShardSnapshotStatus updated = shardAssignments.get(shardId);
2196+
if (updated == null) {
2197+
// We don't have a new assignment for this shard because its index was concurrently deleted
2198+
assert currentState.routingTable().hasIndex(shardId.getIndex()) == false :
2199+
"Missing assignment for [" + shardId + "]";
2200+
updatedAssignmentsBuilder.put(shardId, ShardSnapshotStatus.MISSING);
2201+
} else {
2202+
final boolean added = reassignedShardIds.add(shardId);
2203+
assert added;
2204+
updatedAssignmentsBuilder.put(shardId, updated);
2205+
}
21982206
}
21992207
snapshotEntries.add(entry.withShards(updatedAssignmentsBuilder.build()));
22002208
changed = true;
@@ -2284,8 +2292,7 @@ private static ImmutableOpenMap<ShardId, SnapshotsInProgress.ShardSnapshotStatus
22842292
IndexMetadata indexMetadata = metadata.index(indexName);
22852293
if (indexMetadata == null) {
22862294
// The index was deleted before we managed to start the snapshot - mark it as missing.
2287-
builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0),
2288-
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null));
2295+
builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0), ShardSnapshotStatus.MISSING);
22892296
} else {
22902297
final IndexRoutingTable indexRoutingTable = routingTable.index(indexName);
22912298
for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) {

0 commit comments

Comments
 (0)