 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.cluster.routing.ShardRoutingState;
 import org.opensearch.cluster.routing.UnassignedInfo;
+import org.opensearch.cluster.routing.allocation.ExistingShardsAllocator;
 import org.opensearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand;
 import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand;
 import org.opensearch.cluster.service.ClusterService;
@@ -1479,9 +1480,14 @@ public void testDoNotInfinitelyWaitForMapping() {
     }

     /** Makes sure the new cluster-manager does not repeatedly fetch index metadata from recovering replicas */
-    public void testOngoingRecoveryAndClusterManagerFailOver() throws Exception {
+    public void testOngoingRecoveryAndClusterManagerFailOverForASFDisabled() throws Exception {
         String indexName = "test";
-        internalCluster().startNodes(2);
+        // ASF Disabled
+        internalCluster().startNodes(
+            2,
+            Settings.builder().put(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.getKey(), false).build()
+        );
+
         String nodeWithPrimary = internalCluster().startDataOnlyNode();
         assertAcked(
             client().admin()
@@ -1544,6 +1550,84 @@ public void testOngoingRecoveryAndClusterManagerFailOver() throws Exception {
         ensureGreen(indexName);
     }

+    public void testOngoingRecoveryAndClusterManagerFailOver() throws Exception {
+        String indexName = "test";
+        internalCluster().startNodes(2);
+        String nodeWithPrimary = internalCluster().startDataOnlyNode();
+        assertAcked(
+            client().admin()
+                .indices()
+                .prepareCreate(indexName)
+                .setSettings(
+                    Settings.builder()
+                        .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
+                        .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
+                        .put("index.routing.allocation.include._name", nodeWithPrimary)
+                )
+        );
+        MockTransportService transport = (MockTransportService) internalCluster().getInstance(TransportService.class, nodeWithPrimary);
+        CountDownLatch phase1ReadyBlocked = new CountDownLatch(1);
+        CountDownLatch allowToCompletePhase1Latch = new CountDownLatch(1);
+        Semaphore blockRecovery = new Semaphore(1);
+        transport.addSendBehavior((connection, requestId, action, request, options) -> {
+            if (PeerRecoveryTargetService.Actions.CLEAN_FILES.equals(action) && blockRecovery.tryAcquire()) {
+                phase1ReadyBlocked.countDown();
+                try {
+                    allowToCompletePhase1Latch.await();
+                } catch (InterruptedException e) {
+                    throw new AssertionError(e);
+                }
+            }
+            connection.sendRequest(requestId, action, request, options);
+        });
+        try {
+            String nodeWithReplica = internalCluster().startDataOnlyNode();
+            assertAcked(
+                client().admin()
+                    .indices()
+                    .prepareUpdateSettings(indexName)
+                    .setSettings(
+                        Settings.builder()
+                            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
+                            .put("index.routing.allocation.include._name", nodeWithPrimary + "," + nodeWithReplica)
+                    )
+            );
+            phase1ReadyBlocked.await();
+            internalCluster().restartNode(
+                clusterService().state().nodes().getClusterManagerNode().getName(),
+                new InternalTestCluster.RestartCallback()
+            );
+            internalCluster().ensureAtLeastNumDataNodes(3);
+            assertAcked(
+                client().admin()
+                    .indices()
+                    .prepareUpdateSettings(indexName)
+                    .setSettings(
+                        Settings.builder()
+                            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)
+                            .putNull("index.routing.allocation.include._name")
+                    )
+            );
+
+            ClusterState state = client().admin().cluster().prepareState().get().getState();
+            assertTrue(
+                state.routingTable().index(indexName).shardsWithState(ShardRoutingState.UNASSIGNED).size() == 1
+                    && state.routingTable().index(indexName).shardsWithState(ShardRoutingState.INITIALIZING).size() == 1
+            );
+            /*
+             Shard assignment is stuck because recovery is blocked at the CLEAN_FILES stage. Once it times out after 60s, the replica shards get assigned.
+             https://github.com/opensearch-project/OpenSearch/issues/18098
+
+             Stack trace:
+             Caused by: org.opensearch.transport.ReceiveTimeoutTransportException: [node_t3][127.0.0.1:56648][internal:index/shard/recovery/clean_files] request_id [20] timed out after [60026ms]
+                 at org.opensearch.transport.TransportService$TimeoutHandler.run(TransportService.java:1399) ~[main/:?]
+             */
+            ensureGreen(TimeValue.timeValueSeconds(62), indexName);
+        } finally {
+            allowToCompletePhase1Latch.countDown();
+        }
+    }
+
     public void testRecoverLocallyUpToGlobalCheckpoint() throws Exception {
         internalCluster().ensureAtLeastNumDataNodes(2);
         List<String> nodes = randomSubsetOf(