Skip to content

Commit 4cdd33e

Browse files
author
Harish Bhakuni
committed
[Snapshot Interop][Bug Fix] Make sure lock file gets deleted if snapshot shard md upload fails during snapshot creation.
Signed-off-by: Harish Bhakuni <[email protected]>
1 parent 293905a commit 4cdd33e

File tree

5 files changed

+385
-42
lines changed

5 files changed

+385
-42
lines changed
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.remotestore;
10+
11+
import org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
12+
import org.opensearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
13+
import org.opensearch.client.Client;
14+
import org.opensearch.cluster.metadata.IndexMetadata;
15+
import org.opensearch.common.settings.Settings;
16+
import org.opensearch.index.IndexSettings;
17+
import org.opensearch.plugins.Plugin;
18+
import org.opensearch.remotestore.multipart.mocks.MockFsRepositoryPlugin;
19+
import org.opensearch.repositories.blobstore.BlobStoreRepository;
20+
import org.opensearch.snapshots.AbstractSnapshotIntegTestCase;
21+
import org.opensearch.snapshots.SnapshotState;
22+
import org.opensearch.test.OpenSearchIntegTestCase;
23+
import org.junit.After;
24+
import org.junit.Before;
25+
26+
import java.io.IOException;
27+
import java.nio.file.Path;
28+
import java.util.Arrays;
29+
import java.util.Collection;
30+
import java.util.stream.Collectors;
31+
import java.util.stream.Stream;
32+
33+
import static org.opensearch.remotestore.RemoteStoreBaseIntegTestCase.remoteStoreClusterSettings;
34+
import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked;
35+
import static org.hamcrest.Matchers.equalTo;
36+
import static org.hamcrest.Matchers.greaterThan;
37+
38+
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
39+
public class RemoteSnapshotFailureScenariosIT extends AbstractSnapshotIntegTestCase {
40+
private static final String BASE_REMOTE_REPO = "test-rs-repo" + TEST_REMOTE_STORE_REPO_SUFFIX;
41+
private static final String SNAP_REPO = "test-snap-repo";
42+
private static final String INDEX_NAME_1 = "testindex1";
43+
private static final String RESTORED_INDEX_NAME_1 = INDEX_NAME_1 + "-restored";
44+
private static final String INDEX_NAME_2 = "testindex2";
45+
private static final String RESTORED_INDEX_NAME_2 = INDEX_NAME_2 + "-restored";
46+
47+
private Path remoteRepoPath;
48+
private Path snapRepoPath;
49+
50+
@Override
51+
protected Collection<Class<? extends Plugin>> nodePlugins() {
52+
return Stream.concat(super.nodePlugins().stream(), Stream.of(MockFsRepositoryPlugin.class)).collect(Collectors.toList());
53+
}
54+
55+
@Before
56+
public void setup() {
57+
remoteRepoPath = randomRepoPath().toAbsolutePath();
58+
snapRepoPath = randomRepoPath().toAbsolutePath();
59+
}
60+
61+
@After
62+
public void teardown() {
63+
clusterAdmin().prepareCleanupRepository(BASE_REMOTE_REPO).get();
64+
clusterAdmin().prepareCleanupRepository(SNAP_REPO).get();
65+
}
66+
67+
@Override
68+
protected Settings nodeSettings(int nodeOrdinal) {
69+
return Settings.builder()
70+
.put(super.nodeSettings(nodeOrdinal))
71+
.put(remoteStoreClusterSettings(BASE_REMOTE_REPO, remoteRepoPath, "mock", BASE_REMOTE_REPO, remoteRepoPath, "mock"))
72+
.build();
73+
}
74+
75+
private Settings.Builder getIndexSettings(int numOfShards, int numOfReplicas) {
76+
Settings.Builder settingsBuilder = Settings.builder()
77+
.put(super.indexSettings())
78+
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numOfShards)
79+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, numOfReplicas)
80+
.put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), "300s");
81+
return settingsBuilder;
82+
}
83+
84+
private void indexDocuments(Client client, String indexName, int numOfDocs) {
85+
indexDocuments(client, indexName, 0, numOfDocs);
86+
}
87+
88+
private void indexDocuments(Client client, String indexName, int fromId, int toId) {
89+
for (int i = fromId; i < toId; i++) {
90+
String id = Integer.toString(i);
91+
client.prepareIndex(indexName).setId(id).setSource("text", "sometext").get();
92+
}
93+
client.admin().indices().prepareFlush(indexName).get();
94+
}
95+
96+
public void testRestoreDownloadFromRemoteStore() {
97+
String snapshotName1 = "test-restore-snapshot1";
98+
99+
createRepository(SNAP_REPO, "fs", getRepositorySettings(snapRepoPath, true));
100+
101+
Client client = client();
102+
Settings indexSettings = getIndexSettings(1, 0).build();
103+
createIndex(INDEX_NAME_1, indexSettings);
104+
105+
Settings indexSettings2 = getIndexSettings(1, 0).build();
106+
createIndex(INDEX_NAME_2, indexSettings2);
107+
108+
final int numDocsInIndex1 = 5;
109+
final int numDocsInIndex2 = 6;
110+
indexDocuments(client, INDEX_NAME_1, numDocsInIndex1);
111+
indexDocuments(client, INDEX_NAME_2, numDocsInIndex2);
112+
ensureGreen(INDEX_NAME_1, INDEX_NAME_2);
113+
114+
logger.info("--> snapshot");
115+
CreateSnapshotResponse createSnapshotResponse = client.admin()
116+
.cluster()
117+
.prepareCreateSnapshot(SNAP_REPO, snapshotName1)
118+
.setWaitForCompletion(true)
119+
.setIndices(INDEX_NAME_1, INDEX_NAME_2)
120+
.get();
121+
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(), greaterThan(0));
122+
assertThat(
123+
createSnapshotResponse.getSnapshotInfo().successfulShards(),
124+
equalTo(createSnapshotResponse.getSnapshotInfo().totalShards())
125+
);
126+
assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
127+
assertTrue(createSnapshotResponse.getSnapshotInfo().isRemoteStoreIndexShallowCopyEnabled());
128+
129+
logger.info("--> updating repository to induce remote store upload failure");
130+
assertAcked(
131+
client.admin()
132+
.cluster()
133+
.preparePutRepository(BASE_REMOTE_REPO)
134+
.setType("mock")
135+
.setSettings(
136+
Settings.builder().put("location", remoteRepoPath).put("regexes_to_fail_io", ".si").put("max_failure_number", 6L)
137+
)
138+
); // we retry IO 5 times, keeping it 6 so that first read for single segment file will fail
139+
140+
RestoreSnapshotResponse restoreSnapshotResponse = client.admin()
141+
.cluster()
142+
.prepareRestoreSnapshot(SNAP_REPO, snapshotName1)
143+
.setWaitForCompletion(true)
144+
.setIndices(INDEX_NAME_1)
145+
.setRenamePattern(INDEX_NAME_1)
146+
.setRenameReplacement(RESTORED_INDEX_NAME_1)
147+
.get();
148+
149+
ensureRed(RESTORED_INDEX_NAME_1);
150+
assertEquals(1, restoreSnapshotResponse.getRestoreInfo().failedShards());
151+
152+
assertAcked(client().admin().indices().prepareClose(RESTORED_INDEX_NAME_1).get());
153+
restoreSnapshotResponse = client.admin()
154+
.cluster()
155+
.prepareRestoreSnapshot(SNAP_REPO, snapshotName1)
156+
.setWaitForCompletion(true)
157+
.setIndices(INDEX_NAME_1)
158+
.setRenamePattern(INDEX_NAME_1)
159+
.setRenameReplacement(RESTORED_INDEX_NAME_1)
160+
.get();
161+
162+
ensureGreen(RESTORED_INDEX_NAME_1);
163+
assertEquals(0, restoreSnapshotResponse.getRestoreInfo().failedShards());
164+
165+
// resetting repository to original settings.
166+
logger.info("--> removing repository settings overrides");
167+
assertAcked(
168+
client.admin()
169+
.cluster()
170+
.preparePutRepository(BASE_REMOTE_REPO)
171+
.setType("mock")
172+
.setSettings(Settings.builder().put("location", remoteRepoPath))
173+
);
174+
}
175+
176+
public void testAcquireLockFails() throws IOException {
177+
final Client client = client();
178+
179+
logger.info("--> creating snapshot repository");
180+
createRepository(SNAP_REPO, "mock", getRepositorySettings(snapRepoPath, true));
181+
182+
logger.info("--> updating remote store repository to induce lock file upload failure");
183+
assertAcked(
184+
client.admin()
185+
.cluster()
186+
.preparePutRepository(BASE_REMOTE_REPO)
187+
.setType("mock")
188+
.setSettings(Settings.builder().put("location", remoteRepoPath).put("regexes_to_fail_io", "lock$"))
189+
);
190+
191+
logger.info("--> creating indices and index documents");
192+
Settings indexSettings = getIndexSettings(1, 0).build();
193+
createIndex(INDEX_NAME_1, indexSettings);
194+
createIndex(INDEX_NAME_2, indexSettings);
195+
196+
final int numDocsInIndex1 = 5;
197+
final int numDocsInIndex2 = 6;
198+
indexDocuments(client, INDEX_NAME_1, numDocsInIndex1);
199+
indexDocuments(client, INDEX_NAME_2, numDocsInIndex2);
200+
ensureGreen(INDEX_NAME_1, INDEX_NAME_2);
201+
202+
logger.info("--> create first shallow snapshot");
203+
CreateSnapshotResponse createSnapshotResponse = client.admin()
204+
.cluster()
205+
.prepareCreateSnapshot(SNAP_REPO, "test-snap")
206+
.setWaitForCompletion(true)
207+
.setIndices(INDEX_NAME_1, INDEX_NAME_2)
208+
.get();
209+
assertTrue(createSnapshotResponse.getSnapshotInfo().isRemoteStoreIndexShallowCopyEnabled());
210+
assertTrue(createSnapshotResponse.getSnapshotInfo().failedShards() > 0);
211+
assertSame(createSnapshotResponse.getSnapshotInfo().state(), SnapshotState.PARTIAL);
212+
logger.info("--> delete partial snapshot");
213+
assertAcked(client().admin().cluster().prepareDeleteSnapshot(SNAP_REPO, "test-snap").get());
214+
// resetting repository to original settings.
215+
logger.info("--> removing repository settings overrides");
216+
assertAcked(
217+
client.admin()
218+
.cluster()
219+
.preparePutRepository(BASE_REMOTE_REPO)
220+
.setType("mock")
221+
.setSettings(Settings.builder().put("location", remoteRepoPath))
222+
);
223+
}
224+
225+
public void testWriteShallowSnapFileFails() throws IOException, InterruptedException {
226+
final Client client = client();
227+
228+
Settings.Builder snapshotRepoSettingsBuilder = randomRepositorySettings().put(
229+
BlobStoreRepository.REMOTE_STORE_INDEX_SHALLOW_COPY.getKey(),
230+
Boolean.TRUE
231+
).putList("regexes_to_fail_io", "^" + BlobStoreRepository.SHALLOW_SNAPSHOT_PREFIX);
232+
233+
logger.info("--> creating snapshot repository");
234+
createRepository(SNAP_REPO, "mock", snapshotRepoSettingsBuilder);
235+
236+
logger.info("--> creating indices and index documents");
237+
Settings indexSettings = getIndexSettings(1, 0).build();
238+
createIndex(INDEX_NAME_1, indexSettings);
239+
createIndex(INDEX_NAME_2, indexSettings);
240+
241+
final int numDocsInIndex1 = 5;
242+
final int numDocsInIndex2 = 6;
243+
indexDocuments(client, INDEX_NAME_1, numDocsInIndex1);
244+
indexDocuments(client, INDEX_NAME_2, numDocsInIndex2);
245+
ensureGreen(INDEX_NAME_1, INDEX_NAME_2);
246+
247+
logger.info("--> create first shallow snapshot");
248+
CreateSnapshotResponse createSnapshotResponse = client().admin()
249+
.cluster()
250+
.prepareCreateSnapshot(SNAP_REPO, "test-snap")
251+
.setWaitForCompletion(true)
252+
.setIndices(INDEX_NAME_1, INDEX_NAME_2)
253+
.get();
254+
assertTrue(createSnapshotResponse.getSnapshotInfo().isRemoteStoreIndexShallowCopyEnabled());
255+
256+
assertTrue(createSnapshotResponse.getSnapshotInfo().failedShards() > 0);
257+
assertSame(createSnapshotResponse.getSnapshotInfo().state(), SnapshotState.PARTIAL);
258+
String[] lockFiles = getLockFilesInRemoteStore(INDEX_NAME_1, BASE_REMOTE_REPO);
259+
assertEquals("there should be no lock files, but found " + Arrays.toString(lockFiles), 0, lockFiles.length);
260+
assertAcked(client().admin().cluster().prepareDeleteSnapshot(SNAP_REPO, "test-snap").get());
261+
}
262+
}

server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -490,17 +490,28 @@ public void acquireLock(long primaryTerm, long generation, String acquirerId) th
490490

491491
/**
492492
* Releases a lock which was acquired on given segment commit.
493+
* it will be no-op if corresponding metadata file or lock file is not present.
493494
*
494495
* @param primaryTerm Primary Term of index at the time of commit.
495496
* @param generation Commit Generation
496497
* @param acquirerId Acquirer ID for which lock needs to be released.
497-
* @throws IOException will be thrown in case i) listing lock files failed or ii) deleting the lock file failed.
498-
* @throws NoSuchFileException when metadata file is not present for given commit point.
498+
* @throws IOException will be thrown in case i) listing lock files failed or ii) deleting the lock file failed.
499499
*/
500500
@Override
501501
public void releaseLock(long primaryTerm, long generation, String acquirerId) throws IOException {
502-
String metadataFile = getMetadataFileForCommit(primaryTerm, generation);
503-
mdLockManager.release(FileLockInfo.getLockInfoBuilder().withFileToLock(metadataFile).withAcquirerId(acquirerId).build());
502+
String metadataFilePrefix = MetadataFilenameUtils.getMetadataFilePrefixForCommit(primaryTerm, generation);
503+
try {
504+
String metadataFile = ((RemoteStoreMetadataLockManager) mdLockManager).fetchLockedMetadataFile(metadataFilePrefix, acquirerId);
505+
mdLockManager.release(FileLockInfo.getLockInfoBuilder().withFileToLock(metadataFile).withAcquirerId(acquirerId).build());
506+
} catch (FileNotFoundException e) {
507+
// Ignoring if the metadata file or the lock to be released is not present.
508+
logger.info(
509+
"No lock file found for acquirerId: {} during release lock operation for primaryTerm: {} and generation: {}",
510+
primaryTerm,
511+
generation,
512+
acquirerId
513+
);
514+
}
504515
}
505516

506517
/**

server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreMetadataLockManager.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import java.io.FileNotFoundException;
1919
import java.io.IOException;
2020
import java.nio.file.NoSuchFileException;
21+
import java.util.Arrays;
2122
import java.util.Collection;
2223
import java.util.List;
2324
import java.util.Objects;
@@ -66,7 +67,7 @@ public void acquire(LockInfo lockInfo) throws IOException {
6667
@Override
6768
public void release(LockInfo lockInfo) throws IOException {
6869
assert lockInfo instanceof FileLockInfo : "lockInfo should be instance of FileLockInfo";
69-
String[] lockFiles = lockDirectory.listAll();
70+
String[] lockFiles = getLockFiles(lockDirectory.listAll());
7071
try {
7172
String lockToRelease = ((FileLockInfo) lockInfo).getLockForAcquirer(lockFiles);
7273
lockDirectory.deleteFile(lockToRelease);
@@ -122,7 +123,7 @@ public void cloneLock(LockInfo originalLockInfo, LockInfo clonedLockInfo) throws
122123
String originalResourceId = Objects.requireNonNull(((FileLockInfo) originalLockInfo).getAcquirerId());
123124
String clonedResourceId = Objects.requireNonNull(((FileLockInfo) clonedLockInfo).getAcquirerId());
124125
assert originalResourceId != null && clonedResourceId != null : "provided resourceIds should not be null";
125-
String[] lockFiles = lockDirectory.listAll();
126+
String[] lockFiles = getLockFiles(lockDirectory.listAll());
126127
String lockNameForAcquirer = ((FileLockInfo) originalLockInfo).getLockForAcquirer(lockFiles);
127128
String fileToLockName = FileLockInfo.LockFileUtils.getFileToLockNameFromLock(lockNameForAcquirer);
128129
acquire(FileLockInfo.getLockInfoBuilder().withFileToLock(fileToLockName).withAcquirerId(clonedResourceId).build());
@@ -131,4 +132,19 @@ public void cloneLock(LockInfo originalLockInfo, LockInfo clonedLockInfo) throws
131132
public void delete() throws IOException {
132133
lockDirectory.delete();
133134
}
135+
136+
private String[] getLockFiles(String[] lockDirectoryContents) throws IOException {
137+
if (lockDirectoryContents == null || lockDirectoryContents.length == 0) {
138+
return new String[0];
139+
}
140+
// filtering lock files from lock directory contents.
141+
// this is a good to have check, there is no known prod scenarios where this can happen
142+
// however, during tests sometimes while creating local file directory lucene adds extraFS files.
143+
return Arrays.stream(lockDirectory.listAll())
144+
.filter(
145+
file -> file.endsWith(RemoteStoreLockManagerUtils.LOCK_FILE_EXTENSION)
146+
|| file.endsWith(RemoteStoreLockManagerUtils.PRE_OS210_LOCK_FILE_EXTENSION)
147+
)
148+
.toArray(String[]::new);
149+
}
134150
}

0 commit comments

Comments
 (0)