Skip to content

Commit b18d572

Browse files
committed
[Bugfix] Fix NPE in ReplicaShardAllocator (#13993)
1 parent 0d38d14 commit b18d572

File tree

2 files changed

+59
-2
lines changed

2 files changed

+59
-2
lines changed

server/src/main/java/org/opensearch/gateway/ReplicaShardAllocator.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,10 @@ protected Runnable cancelExistingRecoveryForBetterMatch(
100100
Metadata metadata = allocation.metadata();
101101
RoutingNodes routingNodes = allocation.routingNodes();
102102
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard.shardId());
103-
assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary";
104-
assert primaryShard.currentNodeId() != null;
103+
if (primaryShard == null || primaryShard.currentNodeId() == null) {
104+
logger.trace("{}: no active primary shard found or allocated, letting actual allocation figure it out", shard);
105+
return null;
106+
}
105107
final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
106108

107109
final StoreFilesMetadata primaryStore = findStore(primaryNode, nodeShardStores);

server/src/test/java/org/opensearch/gateway/ReplicaShardBatchAllocatorTests.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,26 @@ public void testDoNotCancelForBrokenNode() {
644644
assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
645645
}
646646

647+
public void testDoNotCancelForInactivePrimaryNode() {
648+
RoutingAllocation allocation = oneInactivePrimaryOnNode1And1ReplicaRecovering(yesAllocationDeciders(), null);
649+
testBatchAllocator.addData(
650+
node1,
651+
null,
652+
"MATCH",
653+
null,
654+
new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION)
655+
)
656+
.addData(node2, randomSyncId(), null, new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION));
657+
658+
testBatchAllocator.processExistingRecoveries(
659+
allocation,
660+
Collections.singletonList(new ArrayList<>(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING)))
661+
);
662+
663+
assertThat(allocation.routingNodesChanged(), equalTo(false));
664+
assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
665+
}
666+
647667
public void testAllocateUnassignedBatchThrottlingAllocationDeciderIsHonoured() throws InterruptedException {
648668
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
649669
AllocationDeciders allocationDeciders = randomAllocationDeciders(
@@ -872,6 +892,41 @@ private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDecid
872892
);
873893
}
874894

895+
private RoutingAllocation oneInactivePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders, UnassignedInfo unassignedInfo) {
896+
ShardRouting primaryShard = TestShardRouting.newShardRouting(shardId, node1.getId(), true, ShardRoutingState.INITIALIZING);
897+
RoutingTable routingTable = RoutingTable.builder()
898+
.add(
899+
IndexRoutingTable.builder(shardId.getIndex())
900+
.addIndexShard(
901+
new IndexShardRoutingTable.Builder(shardId).addShard(primaryShard)
902+
.addShard(
903+
TestShardRouting.newShardRouting(
904+
shardId,
905+
node2.getId(),
906+
null,
907+
false,
908+
ShardRoutingState.INITIALIZING,
909+
unassignedInfo
910+
)
911+
)
912+
.build()
913+
)
914+
)
915+
.build();
916+
ClusterState state = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
917+
.routingTable(routingTable)
918+
.nodes(DiscoveryNodes.builder().add(node1).add(node2))
919+
.build();
920+
return new RoutingAllocation(
921+
deciders,
922+
new RoutingNodes(state, false),
923+
state,
924+
ClusterInfo.EMPTY,
925+
SnapshotShardSizeInfo.EMPTY,
926+
System.nanoTime()
927+
);
928+
}
929+
875930
private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders) {
876931
return onePrimaryOnNode1And1ReplicaRecovering(deciders, new UnassignedInfo(UnassignedInfo.Reason.CLUSTER_RECOVERED, null));
877932
}

0 commit comments

Comments
 (0)