66
66
import org .opensearch .common .Nullable ;
67
67
import org .opensearch .common .Numbers ;
68
68
import org .opensearch .common .Priority ;
69
+ import org .opensearch .common .Randomness ;
69
70
import org .opensearch .common .SetOnce ;
70
71
import org .opensearch .common .UUIDs ;
71
72
import org .opensearch .common .blobstore .BlobContainer ;
@@ -831,7 +832,7 @@ boolean getPrefixModeVerification() {
831
832
* maintains single lazy instance of {@link BlobContainer}
832
833
*/
833
834
protected BlobContainer blobContainer () {
834
- // assertSnapshotOrGenericThread();
835
+ assertSnapshotOrGenericThread ();
835
836
836
837
BlobContainer blobContainer = this .blobContainer .get ();
837
838
if (blobContainer == null ) {
@@ -1204,6 +1205,10 @@ private void asyncCleanupUnlinkedShardLevelBlobs(
1204
1205
ActionListener <Void > listener
1205
1206
) {
1206
1207
final List <Tuple <BlobPath , String >> filesToDelete = resolveFilesToDelete (oldRepositoryData , snapshotIds , deleteResults );
1208
+ long startTimeNs = System .nanoTime ();
1209
+ Randomness .shuffle (filesToDelete );
1210
+ logger .debug ("[{}] shuffled the filesToDelete with timeElapsedNs={}" , metadata .name (), (System .nanoTime () - startTimeNs ));
1211
+
1207
1212
if (filesToDelete .isEmpty ()) {
1208
1213
listener .onResponse (null );
1209
1214
return ;
@@ -1221,8 +1226,8 @@ private void asyncCleanupUnlinkedShardLevelBlobs(
1221
1226
staleFilesToDeleteInBatch .size ()
1222
1227
);
1223
1228
1224
- // Start as many workers as fit into the snapshot pool at once at the most
1225
- final int workers = Math .min (threadPool .info (ThreadPool .Names .SNAPSHOT ).getMax (), staleFilesToDeleteInBatch .size ());
1229
+ // Start as many workers as fit into the snapshot_deletion pool at once at the most
1230
+ final int workers = Math .min (threadPool .info (ThreadPool .Names .SNAPSHOT_DELETION ).getMax (), staleFilesToDeleteInBatch .size ());
1226
1231
for (int i = 0 ; i < workers ; ++i ) {
1227
1232
executeStaleShardDelete (staleFilesToDeleteInBatch , remoteStoreLockManagerFactory , groupedListener );
1228
1233
}
@@ -1326,7 +1331,7 @@ private void executeStaleShardDelete(
1326
1331
if (filesToDelete == null ) {
1327
1332
return ;
1328
1333
}
1329
- threadPool .executor (ThreadPool .Names .SNAPSHOT ).execute (ActionRunnable .wrap (listener , l -> {
1334
+ threadPool .executor (ThreadPool .Names .SNAPSHOT_DELETION ).execute (ActionRunnable .wrap (listener , l -> {
1330
1335
try {
1331
1336
// filtering files for which remote store lock release and cleanup succeeded,
1332
1337
// remaining files for which it failed will be retried in next snapshot delete run.
@@ -1390,7 +1395,7 @@ private void writeUpdatedShardMetaDataAndComputeDeletes(
1390
1395
ActionListener <Collection <ShardSnapshotMetaDeleteResult >> onAllShardsCompleted
1391
1396
) {
1392
1397
1393
- final Executor executor = threadPool .executor (ThreadPool .Names .SNAPSHOT );
1398
+ final Executor executor = threadPool .executor (ThreadPool .Names .SNAPSHOT_DELETION );
1394
1399
final List <IndexId > indices = oldRepositoryData .indicesToUpdateAfterRemovingSnapshot (snapshotIds );
1395
1400
1396
1401
if (indices .isEmpty ()) {
@@ -1578,7 +1583,7 @@ private void cleanupStaleBlobs(
1578
1583
listener .onResponse (deleteResult );
1579
1584
}, listener ::onFailure ), 2 );
1580
1585
1581
- final Executor executor = threadPool .executor (ThreadPool .Names .SNAPSHOT );
1586
+ final Executor executor = threadPool .executor (ThreadPool .Names .SNAPSHOT_DELETION );
1582
1587
final List <String > staleRootBlobs = staleRootBlobs (newRepoData , rootBlobs .keySet ());
1583
1588
if (staleRootBlobs .isEmpty ()) {
1584
1589
groupedListener .onResponse (DeleteResult .ZERO );
@@ -1781,7 +1786,7 @@ void cleanupStaleIndices(
1781
1786
1782
1787
// Start as many workers as fit into the snapshot pool at once at the most
1783
1788
final int workers = Math .min (
1784
- threadPool .info (ThreadPool .Names .SNAPSHOT ).getMax (),
1789
+ threadPool .info (ThreadPool .Names .SNAPSHOT_DELETION ).getMax (),
1785
1790
foundIndices .size () - survivingIndexIds .size ()
1786
1791
);
1787
1792
for (int i = 0 ; i < workers ; ++i ) {
@@ -1833,7 +1838,7 @@ private void executeOneStaleIndexDelete(
1833
1838
return ;
1834
1839
}
1835
1840
final String indexSnId = indexEntry .getKey ();
1836
- threadPool .executor (ThreadPool .Names .SNAPSHOT ).execute (ActionRunnable .supply (listener , () -> {
1841
+ threadPool .executor (ThreadPool .Names .SNAPSHOT_DELETION ).execute (ActionRunnable .supply (listener , () -> {
1837
1842
try {
1838
1843
logger .debug ("[{}] Found stale index [{}]. Cleaning it up" , metadata .name (), indexSnId );
1839
1844
List <String > matchingShardPaths = findMatchingShardPaths (indexSnId , snapshotShardPaths );
@@ -2097,8 +2102,7 @@ public void finalizeSnapshot(
2097
2102
stateTransformer ,
2098
2103
repositoryUpdatePriority ,
2099
2104
ActionListener .wrap (newRepoData -> {
2100
- cleanupOldShardGens (existingRepositoryData , updatedRepositoryData );
2101
- listener .onResponse (newRepoData );
2105
+ cleanupOldShardGens (existingRepositoryData , updatedRepositoryData , newRepoData , listener );
2102
2106
}, onUpdateFailure )
2103
2107
);
2104
2108
}, onUpdateFailure ), 2 + indices .size ());
@@ -2254,7 +2258,12 @@ private void logShardPathsOperationWarning(IndexId indexId, SnapshotId snapshotI
2254
2258
}
2255
2259
2256
2260
// Delete all old shard gen blobs that aren't referenced any longer as a result from moving to updated repository data
2257
- private void cleanupOldShardGens (RepositoryData existingRepositoryData , RepositoryData updatedRepositoryData ) {
2261
+ private void cleanupOldShardGens (
2262
+ RepositoryData existingRepositoryData ,
2263
+ RepositoryData updatedRepositoryData ,
2264
+ RepositoryData newRepositoryData ,
2265
+ ActionListener <RepositoryData > listener
2266
+ ) {
2258
2267
final List <String > toDelete = new ArrayList <>();
2259
2268
updatedRepositoryData .shardGenerations ()
2260
2269
.obsoleteShardGenerations (existingRepositoryData .shardGenerations ())
@@ -2263,10 +2272,62 @@ private void cleanupOldShardGens(RepositoryData existingRepositoryData, Reposito
2263
2272
(shardId , oldGen ) -> toDelete .add (shardPath (indexId , shardId ).buildAsString () + INDEX_FILE_PREFIX + oldGen )
2264
2273
)
2265
2274
);
2275
+ if (toDelete .isEmpty ()) {
2276
+ listener .onResponse (newRepositoryData );
2277
+ return ;
2278
+ }
2266
2279
try {
2267
- deleteFromContainer (rootBlobContainer (), toDelete );
2280
+ AtomicInteger counter = new AtomicInteger ();
2281
+ Collection <List <String >> subList = toDelete .stream ()
2282
+ .collect (Collectors .groupingBy (it -> counter .getAndIncrement () / maxShardBlobDeleteBatch ))
2283
+ .values ();
2284
+ final BlockingQueue <List <String >> staleFilesToDeleteInBatch = new LinkedBlockingQueue <>(subList );
2285
+ logger .info (
2286
+ "[{}] cleanupOldShardGens toDeleteSize={} groupSize={}" ,
2287
+ metadata .name (),
2288
+ toDelete .size (),
2289
+ staleFilesToDeleteInBatch .size ()
2290
+ );
2291
+ final GroupedActionListener <Void > groupedListener = new GroupedActionListener <>(ActionListener .wrap (r -> {
2292
+ logger .info ("[{}] completed cleanupOldShardGens" , metadata .name ());
2293
+ listener .onResponse (newRepositoryData );
2294
+ }, ex -> {
2295
+ logger .error (new ParameterizedMessage ("[{}] exception in cleanupOldShardGens" , metadata .name ()), ex );
2296
+ listener .onResponse (newRepositoryData );
2297
+ }), staleFilesToDeleteInBatch .size ());
2298
+
2299
+ // Start as many workers as fit into the snapshot pool at once at the most
2300
+ final int workers = Math .min (threadPool .info (ThreadPool .Names .SNAPSHOT_DELETION ).getMax (), staleFilesToDeleteInBatch .size ());
2301
+ for (int i = 0 ; i < workers ; ++i ) {
2302
+ executeOldShardGensCleanup (staleFilesToDeleteInBatch , groupedListener );
2303
+ }
2268
2304
} catch (Exception e ) {
2269
- logger .warn ("Failed to clean up old shard generation blobs" , e );
2305
+ logger .warn (new ParameterizedMessage (" [{}] Failed to clean up old shard generation blobs" , metadata .name ()), e );
2306
+ listener .onResponse (newRepositoryData );
2307
+ }
2308
+ }
2309
+
2310
+ private void executeOldShardGensCleanup (BlockingQueue <List <String >> staleFilesToDeleteInBatch , GroupedActionListener <Void > listener )
2311
+ throws InterruptedException {
2312
+ List <String > filesToDelete = staleFilesToDeleteInBatch .poll (0L , TimeUnit .MILLISECONDS );
2313
+ if (filesToDelete != null ) {
2314
+ threadPool .executor (ThreadPool .Names .SNAPSHOT_DELETION ).execute (ActionRunnable .wrap (listener , l -> {
2315
+ try {
2316
+ deleteFromContainer (rootBlobContainer (), filesToDelete );
2317
+ l .onResponse (null );
2318
+ } catch (Exception e ) {
2319
+ logger .warn (
2320
+ () -> new ParameterizedMessage (
2321
+ "[{}] Failed to delete following blobs during cleanupOldFiles : {}" ,
2322
+ metadata .name (),
2323
+ filesToDelete
2324
+ ),
2325
+ e
2326
+ );
2327
+ l .onFailure (e );
2328
+ }
2329
+ executeOldShardGensCleanup (staleFilesToDeleteInBatch , listener );
2330
+ }));
2270
2331
}
2271
2332
}
2272
2333
@@ -2383,10 +2444,11 @@ public long getRemoteDownloadThrottleTimeInNanos() {
2383
2444
}
2384
2445
2385
2446
protected void assertSnapshotOrGenericThread () {
2386
- assert Thread .currentThread ().getName ().contains ('[' + ThreadPool .Names .SNAPSHOT + ']' )
2447
+ assert Thread .currentThread ().getName ().contains ('[' + ThreadPool .Names .SNAPSHOT_DELETION + ']' )
2448
+ || Thread .currentThread ().getName ().contains ('[' + ThreadPool .Names .SNAPSHOT + ']' )
2387
2449
|| Thread .currentThread ().getName ().contains ('[' + ThreadPool .Names .GENERIC + ']' ) : "Expected current thread ["
2388
2450
+ Thread .currentThread ()
2389
- + "] to be the snapshot or generic thread." ;
2451
+ + "] to be the snapshot_deletion or snapshot or generic thread." ;
2390
2452
}
2391
2453
2392
2454
@ Override
0 commit comments