Skip to content

Commit cba3ccd

Browse files
authored
Destination S3: Deferred deletes on sync success in OVERWRITE (#42579)
1 parent 325525d commit cba3ccd

File tree

12 files changed

+912
-401
lines changed

12 files changed

+912
-401
lines changed

airbyte-cdk/java/airbyte-cdk/README.md

+289-288
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
version=0.44.12
1+
version=0.44.13

airbyte-cdk/java/airbyte-cdk/db-destinations/src/testFixtures/kotlin/io/airbyte/cdk/integrations/standardtest/destination/DestinationAcceptanceTest.kt

+1-1
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ abstract class DestinationAcceptanceTest(
796796
.withDestinationSyncMode(DestinationSyncMode.APPEND)
797797
.withSyncId(42)
798798
.withGenerationId(12)
799-
.withMinimumGenerationId(12)
799+
.withMinimumGenerationId(0)
800800
}
801801

802802
val firstSyncMessages: List<AirbyteMessage> =

airbyte-cdk/java/airbyte-cdk/s3-destinations/src/main/kotlin/io/airbyte/cdk/integrations/destination/s3/BlobStorageOperations.kt

+17
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,23 @@ abstract class BlobStorageOperations protected constructor() {
5353
pathFormat: String
5454
)
5555

56+
/** Clean up all the objects matching the provided [keysToDelete] */
57+
abstract fun cleanUpObjects(keysToDelete: List<String>)
58+
59+
/**
60+
* List all the existing bucket objects for a given [namespace], [streamName], [objectPath] and
61+
* an optional [currentGenerationId] which matches the [pathFormat] regex. The returned objects
62+
* will be filtered with generationId metadata strictly less than [currentGenerationId]
63+
* @return List of keys of the objects
64+
*/
65+
abstract fun listExistingObjects(
66+
namespace: String?,
67+
streamName: String,
68+
objectPath: String,
69+
pathFormat: String,
70+
currentGenerationId: Long? = null // Sentinel default
71+
): List<String>
72+
5673
abstract fun dropBucketObject(objectPath: String)
5774

5875
abstract fun isValidData(jsonNode: JsonNode): Boolean

airbyte-cdk/java/airbyte-cdk/s3-destinations/src/main/kotlin/io/airbyte/cdk/integrations/destination/s3/S3ConsumerFactory.kt

+106-61
Original file line numberDiff line numberDiff line change
@@ -69,19 +69,20 @@ class S3ConsumerFactory {
6969
val stream = writeConfig.streamName
7070
val outputBucketPath = writeConfig.outputBucketPath
7171
val pathFormat = writeConfig.pathFormat
72-
if (mustCleanUpExistingObjects(writeConfig, storageOperations)) {
72+
if (!isAppendSync(writeConfig)) {
7373
LOGGER.info {
74-
"Clearing storage area in destination started for namespace $namespace " +
74+
"Listing objects to cleanup for namespace $namespace " +
7575
"stream $stream bucketObject $outputBucketPath pathFormat $pathFormat"
7676
}
77-
storageOperations.cleanUpBucketObject(
78-
namespace,
79-
stream,
80-
outputBucketPath,
81-
pathFormat
77+
writeConfig.objectsFromOldGeneration.addAll(
78+
keysForOverwriteDeletion(
79+
writeConfig,
80+
storageOperations,
81+
),
8282
)
8383
LOGGER.info {
84-
"Clearing storage area in destination completed for namespace $namespace stream $stream bucketObject $outputBucketPath"
84+
"Marked ${writeConfig.objectsFromOldGeneration.size} keys for deletion at end of sync " +
85+
"for namespace $namespace stream $stream bucketObject $outputBucketPath"
8586
}
8687
} else {
8788
LOGGER.info {
@@ -140,17 +141,43 @@ class S3ConsumerFactory {
140141
storageOperations: BlobStorageOperations,
141142
writeConfigs: List<WriteConfig>
142143
): OnCloseFunction {
143-
return OnCloseFunction { hasFailed: Boolean, _: Map<StreamDescriptor, StreamSyncSummary> ->
144-
if (hasFailed) {
145-
LOGGER.info { "Cleaning up destination started for ${writeConfigs.size} streams" }
146-
for (writeConfig in writeConfigs) {
147-
storageOperations.cleanUpBucketObject(
148-
writeConfig.fullOutputPath,
149-
writeConfig.storedFiles
150-
)
151-
writeConfig.clearStoredFiles()
144+
145+
val streamDescriptorToWriteConfig =
146+
writeConfigs.associateBy {
147+
StreamDescriptor().withNamespace(it.namespace).withName(it.streamName)
148+
}
149+
return OnCloseFunction {
150+
_: Boolean,
151+
streamSyncSummaries: Map<StreamDescriptor, StreamSyncSummary> ->
152+
// On stream success clean up the objects marked for deletion per stream. This is done
153+
// on per-stream basis
154+
streamSyncSummaries.forEach { (streamDescriptor, streamSummary) ->
155+
val streamSuccessful =
156+
streamSummary.terminalStatus ==
157+
AirbyteStreamStatusTraceMessage.AirbyteStreamStatus.COMPLETE
158+
if (streamSuccessful) {
159+
val writeConfig = streamDescriptorToWriteConfig[streamDescriptor]!!
160+
if (writeConfig.objectsFromOldGeneration.isNotEmpty()) {
161+
// Although S3API is safe to send empty list of keys, just avoiding
162+
// unnecessary S3 call
163+
// Logic to determine what to delete is in onStart so not doing any
164+
// redundant checks for
165+
// destinationSyncMode.
166+
LOGGER.info {
167+
"Found ${writeConfig.objectsFromOldGeneration.size} marked for deletion in namespace: ${streamDescriptor.namespace},stream: ${streamDescriptor.name} " +
168+
"Proceeding with cleaning up the objects"
169+
}
170+
storageOperations.cleanUpObjects(writeConfig.objectsFromOldGeneration)
171+
LOGGER.info {
172+
"Cleaning up completed for namespace: ${streamDescriptor.namespace},stream: ${streamDescriptor.name}"
173+
}
174+
}
175+
} else {
176+
LOGGER.info {
177+
"Stream not successful with status ${streamSummary.terminalStatus} for namespace: ${streamDescriptor.namespace}, name: ${streamDescriptor.name} " +
178+
"Skipping deletion of any old objects marked for deletion."
179+
}
152180
}
153-
LOGGER.info { "Cleaning up destination completed." }
154181
}
155182
}
156183
}
@@ -225,56 +252,74 @@ class S3ConsumerFactory {
225252
)
226253
}
227254

228-
private fun mustCleanUpExistingObjects(
255+
private fun isAppendSync(writeConfig: WriteConfig): Boolean {
256+
// This is an additional safety check, that this really is OVERWRITE
257+
// mode, this avoids bad things happening like deleting all objects
258+
// in APPEND mode.
259+
return writeConfig.minimumGenerationId == 0L &&
260+
writeConfig.syncMode != DestinationSyncMode.OVERWRITE
261+
}
262+
263+
private fun keysForOverwriteDeletion(
229264
writeConfig: WriteConfig,
230265
storageOperations: BlobStorageOperations
231-
): Boolean {
232-
return when (writeConfig.minimumGenerationId) {
233-
// This is an additional safety check, that this really is OVERWRITE
234-
// mode, this avoids bad things happening like deleting all objects
235-
// in APPEND mode.
236-
0L -> writeConfig.syncMode == DestinationSyncMode.OVERWRITE
237-
writeConfig.generationId -> {
238-
// This is truncate sync and try to determine if the current generation
239-
// data is already present
240-
val namespace = writeConfig.namespace
241-
val stream = writeConfig.streamName
242-
val outputBucketPath = writeConfig.outputBucketPath
243-
val pathFormat = writeConfig.pathFormat
244-
// generationId is missing, assume the last sync was ran in non-resumeable refresh
245-
// mode,
246-
// cleanup files
247-
val currentGenerationId =
248-
storageOperations.getStageGeneration(
249-
namespace,
250-
stream,
251-
outputBucketPath,
252-
pathFormat
253-
)
254-
if (currentGenerationId == null) {
255-
LOGGER.info {
256-
"Missing generationId from the lastModified object, proceeding with cleanup for stream ${writeConfig.streamName}"
257-
}
258-
return true
266+
): List<String> {
267+
// Guards to fail fast
268+
if (writeConfig.minimumGenerationId == 0L) {
269+
throw IllegalArgumentException(
270+
"Keys should not be marked for deletion when not in OVERWRITE mode"
271+
)
272+
}
273+
if (writeConfig.minimumGenerationId != writeConfig.generationId) {
274+
throw IllegalArgumentException("Hybrid refreshes are not yet supported.")
275+
}
276+
277+
// This is truncate sync and try to determine if the current generation
278+
// data is already present
279+
val namespace = writeConfig.namespace
280+
val stream = writeConfig.streamName
281+
val outputBucketPath = writeConfig.outputBucketPath
282+
val pathFormat = writeConfig.pathFormat
283+
// generationId is missing, assume the last sync was ran in non-resumeable refresh
284+
// mode,
285+
// cleanup files
286+
val currentGenerationId =
287+
storageOperations.getStageGeneration(namespace, stream, outputBucketPath, pathFormat)
288+
var filterByCurrentGen = false
289+
if (currentGenerationId != null) {
290+
// if minGen = gen = retrievedGen and skip clean up
291+
val hasDataFromCurrentGeneration = currentGenerationId == writeConfig.generationId
292+
if (hasDataFromCurrentGeneration) {
293+
LOGGER.info {
294+
"Preserving data from previous sync for stream ${writeConfig.streamName} since it matches the current generation ${writeConfig.generationId}"
259295
}
260-
// if minGen = gen = retrievedGen and skip clean up
261-
val hasDataFromCurrentGeneration = currentGenerationId == writeConfig.generationId
262-
if (hasDataFromCurrentGeneration) {
263-
LOGGER.info {
264-
"Preserving data from previous sync for stream ${writeConfig.streamName} since it matches the current generation ${writeConfig.generationId}"
265-
}
266-
} else {
267-
LOGGER.info {
268-
"No data exists from previous sync for stream ${writeConfig.streamName} from current generation ${writeConfig.generationId}, " +
269-
"proceeding to clean up existing data"
270-
}
296+
// There could be data dangling from T-2 sync if current generation failed in T-1
297+
// sync.
298+
filterByCurrentGen = true
299+
} else {
300+
LOGGER.info {
301+
"No data exists from previous sync for stream ${writeConfig.streamName} from current generation ${writeConfig.generationId}, " +
302+
"proceeding to clean up existing data"
271303
}
272-
return !hasDataFromCurrentGeneration
273304
}
274-
else -> {
275-
throw IllegalArgumentException("Hybrid refreshes are not yet supported.")
305+
} else {
306+
LOGGER.info {
307+
"Missing generationId from the lastModified object, proceeding with cleanup for stream ${writeConfig.streamName}"
276308
}
277309
}
310+
311+
return storageOperations.listExistingObjects(
312+
namespace,
313+
stream,
314+
outputBucketPath,
315+
pathFormat,
316+
currentGenerationId =
317+
if (filterByCurrentGen) {
318+
writeConfig.generationId
319+
} else {
320+
null
321+
},
322+
)
278323
}
279324

280325
companion object {

0 commit comments

Comments
 (0)