Skip to content

Commit 8f42de9

Browse files
authored
[source-mongodb-v2] : Implement WASS algorithm (#42561)
Fixes airbytehq/airbyte-internal-issues#8730 Port of #38240 for MongoDB
1 parent bebe5fd commit 8f42de9

File tree

11 files changed

+165
-35
lines changed

11 files changed

+165
-35
lines changed

airbyte-integrations/connectors/source-mongodb-v2/build.gradle

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ plugins {
33
}
44

55
airbyteJavaConnector {
6-
cdkVersionRequired = '0.40.7'
6+
cdkVersionRequired = '0.43.4'
77
features = ['db-sources', 'datastore-mongo']
88
useLocalCdk = false
99
}

airbyte-integrations/connectors/source-mongodb-v2/integration_tests/expected_spec.json

+10
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,16 @@
185185
"default": "Lookup",
186186
"order": 12,
187187
"group": "advanced"
188+
},
189+
"initial_load_timeout_hours": {
190+
"type": "integer",
191+
"title": "Initial Load Timeout in Hours (Advanced)",
192+
"description": "The amount of time an initial load is allowed to continue for before catching up on CDC logs.",
193+
"default": 8,
194+
"min": 4,
195+
"max": 24,
196+
"order": 13,
197+
"group": "advanced"
188198
}
189199
},
190200
"groups": [

airbyte-integrations/connectors/source-mongodb-v2/metadata.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ data:
88
connectorSubtype: database
99
connectorType: source
1010
definitionId: b2e713cd-cc36-4c0a-b5bd-b47cb8a0561e
11-
dockerImageTag: 1.4.3
11+
dockerImageTag: 1.5.0
1212
dockerRepository: airbyte/source-mongodb-v2
1313
documentationUrl: https://docs.airbyte.com/integrations/sources/mongodb-v2
1414
githubIssueLabel: source-mongodb-v2

airbyte-integrations/connectors/source-mongodb-v2/src/main/java/io/airbyte/integrations/source/mongodb/InitialSnapshotHandler.java

+6-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import io.airbyte.protocol.models.v0.AirbyteStreamStatusTraceMessage;
2525
import io.airbyte.protocol.models.v0.CatalogHelpers;
2626
import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream;
27+
import java.time.Duration;
28+
import java.time.Instant;
2729
import java.util.ArrayList;
2830
import java.util.List;
2931
import java.util.Optional;
@@ -51,7 +53,9 @@ public List<AutoCloseableIterator<AirbyteMessage>> getIterators(
5153
final MongoDatabase database,
5254
final MongoDbSourceConfig config,
5355
final boolean decorateWithStartedStatus,
54-
final boolean decorateWithCompletedStatus) {
56+
final boolean decorateWithCompletedStatus,
57+
final Instant emittedAt,
58+
final Optional<Duration> cdcInitialLoadTimeout) {
5559
final boolean isEnforceSchema = config.getEnforceSchema();
5660
final var checkpointInterval = config.getCheckpointInterval();
5761
final String MULTIPLE_ID_TYPES_ANALYTICS_MESSAGE_KEY = "db-sources-mongo-multiple-id-types";
@@ -85,7 +89,7 @@ public List<AutoCloseableIterator<AirbyteMessage>> getIterators(
8589

8690
final Optional<CollectionStatistics> collectionStatistics = MongoUtil.getCollectionStatistics(database, airbyteStream);
8791
final var recordIterator = new MongoDbInitialLoadRecordIterator(collection, fields, existingState, isEnforceSchema,
88-
MongoUtil.getChunkSizeForCollection(collectionStatistics, airbyteStream));
92+
MongoUtil.getChunkSizeForCollection(collectionStatistics, airbyteStream), emittedAt, cdcInitialLoadTimeout);
8993
final var stateIterator =
9094
new SourceStateIterator<>(recordIterator, airbyteStream, stateManager, new StateEmitFrequency(checkpointInterval,
9195
MongoConstants.CHECKPOINT_DURATION));

airbyte-integrations/connectors/source-mongodb-v2/src/main/java/io/airbyte/integrations/source/mongodb/MongoDbInitialLoadRecordIterator.java

+22-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
package io.airbyte.integrations.source.mongodb;
66

7+
import static io.airbyte.cdk.db.DbAnalyticsUtils.cdcSnapshotForceShutdownMessage;
78
import static io.airbyte.integrations.source.mongodb.state.IdType.idToStringRepresenation;
89
import static io.airbyte.integrations.source.mongodb.state.IdType.parseBinaryIdString;
910
import static io.airbyte.integrations.source.mongodb.state.InitialSnapshotStatus.IN_PROGRESS;
@@ -13,10 +14,14 @@
1314
import com.mongodb.client.MongoCursor;
1415
import com.mongodb.client.model.Filters;
1516
import com.mongodb.client.model.Sorts;
17+
import io.airbyte.cdk.integrations.base.AirbyteTraceMessageUtility;
1618
import io.airbyte.commons.exceptions.ConfigErrorException;
19+
import io.airbyte.commons.exceptions.TransientErrorException;
1720
import io.airbyte.commons.util.AutoCloseableIterator;
1821
import io.airbyte.integrations.source.mongodb.state.IdType;
1922
import io.airbyte.integrations.source.mongodb.state.MongoDbStreamState;
23+
import java.time.Duration;
24+
import java.time.Instant;
2025
import java.util.Optional;
2126
import org.bson.*;
2227
import org.bson.conversions.Bson;
@@ -44,21 +49,37 @@ public class MongoDbInitialLoadRecordIterator extends AbstractIterator<Document>
4449

4550
private int numSubqueries = 0;
4651

52+
private final Instant startInstant;
53+
private Optional<Duration> cdcInitialLoadTimeout;
54+
4755
MongoDbInitialLoadRecordIterator(final MongoCollection<Document> collection,
4856
final Bson fields,
4957
final Optional<MongoDbStreamState> existingState,
5058
final boolean isEnforceSchema,
51-
final int chunkSize) {
59+
final int chunkSize,
60+
final Instant startInstant,
61+
final Optional<Duration> cdcInitialLoadTimeout) {
5262
this.collection = collection;
5363
this.fields = fields;
5464
this.currentState = existingState;
5565
this.isEnforceSchema = isEnforceSchema;
5666
this.chunkSize = chunkSize;
5767
this.currentIterator = buildNewQueryIterator();
68+
this.startInstant = startInstant;
69+
this.cdcInitialLoadTimeout = cdcInitialLoadTimeout;
5870
}
5971

6072
@Override
6173
protected Document computeNext() {
74+
if (cdcInitialLoadTimeout.isPresent()
75+
&& Duration.between(startInstant, Instant.now()).compareTo(cdcInitialLoadTimeout.get()) > 0) {
76+
final String cdcInitialLoadTimeoutMessage = String.format(
77+
"Initial load for table %s has taken longer than %s, Canceling sync so that CDC replication can catch-up on subsequent attempt, and then initial snapshotting will resume",
78+
getAirbyteStream().get(), cdcInitialLoadTimeout.get());
79+
LOGGER.info(cdcInitialLoadTimeoutMessage);
80+
AirbyteTraceMessageUtility.emitAnalyticsTrace(cdcSnapshotForceShutdownMessage());
81+
throw new TransientErrorException(cdcInitialLoadTimeoutMessage);
82+
}
6283
if (shouldBuildNextQuery()) {
6384
try {
6485
LOGGER.info("Finishing subquery number : {}, processing at id : {}", numSubqueries,

airbyte-integrations/connectors/source-mongodb-v2/src/main/java/io/airbyte/integrations/source/mongodb/MongoDbSource.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.time.Instant;
2828
import java.util.ArrayList;
2929
import java.util.List;
30+
import java.util.Optional;
3031
import org.slf4j.Logger;
3132
import org.slf4j.LoggerFactory;
3233

@@ -184,7 +185,9 @@ List<AutoCloseableIterator<AirbyteMessage>> createFullRefreshIterators(final Mon
184185
mongoClient.getDatabase(sourceConfig.getDatabaseName()),
185186
sourceConfig,
186187
true,
187-
true);
188+
true,
189+
emmitedAt,
190+
Optional.empty());
188191

189192
return fullRefreshIterators;
190193
}

airbyte-integrations/connectors/source-mongodb-v2/src/main/java/io/airbyte/integrations/source/mongodb/cdc/MongoDbCdcInitializer.java

+91-12
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import com.mongodb.client.MongoDatabase;
1313
import io.airbyte.cdk.integrations.base.AirbyteTraceMessageUtility;
1414
import io.airbyte.cdk.integrations.debezium.AirbyteDebeziumHandler;
15+
import io.airbyte.cdk.integrations.source.relationaldb.InitialLoadTimeoutUtil;
1516
import io.airbyte.cdk.integrations.source.relationaldb.streamstatus.StreamStatusTraceEmitterIterator;
1617
import io.airbyte.commons.exceptions.ConfigErrorException;
1718
import io.airbyte.commons.json.Jsons;
@@ -21,6 +22,7 @@
2122
import io.airbyte.integrations.source.mongodb.InitialSnapshotHandler;
2223
import io.airbyte.integrations.source.mongodb.MongoDbSourceConfig;
2324
import io.airbyte.integrations.source.mongodb.MongoUtil;
25+
import io.airbyte.integrations.source.mongodb.state.InitialSnapshotStatus;
2426
import io.airbyte.integrations.source.mongodb.state.MongoDbStateManager;
2527
import io.airbyte.protocol.models.v0.*;
2628
import io.airbyte.protocol.models.v0.AirbyteMessage;
@@ -30,6 +32,7 @@
3032
import java.time.Instant;
3133
import java.util.*;
3234
import java.util.function.Supplier;
35+
import java.util.stream.Collectors;
3336
import java.util.stream.Stream;
3437
import org.bson.BsonDocument;
3538
import org.bson.BsonTimestamp;
@@ -91,19 +94,25 @@ public List<AutoCloseableIterator<AirbyteMessage>> createCdcIterators(
9194
// a workaround to allow making subsequent wait time configurable.
9295
final Duration subsequentRecordWaitTime = firstRecordWaitTime;
9396
LOGGER.info("Subsequent cdc record wait time: {} seconds", subsequentRecordWaitTime);
97+
final Duration initialLoadTimeout = InitialLoadTimeoutUtil.getInitialLoadTimeout(config.rawConfig());
98+
9499
final int queueSize = MongoUtil.getDebeziumEventQueueSize(config);
95100
final String databaseName = config.getDatabaseName();
96101
final boolean isEnforceSchema = config.getEnforceSchema();
102+
97103
final Properties defaultDebeziumProperties = MongoDbCdcProperties.getDebeziumProperties();
98104
logOplogInfo(mongoClient);
105+
99106
final BsonDocument initialResumeToken =
100107
MongoDbResumeTokenHelper.getMostRecentResumeToken(mongoClient, databaseName, incrementalOnlyStreamsCatalog);
101108
final JsonNode initialDebeziumState =
102109
mongoDbDebeziumStateUtil.constructInitialDebeziumState(initialResumeToken, databaseName);
110+
103111
final MongoDbCdcState cdcState =
104112
(stateManager.getCdcState() == null || stateManager.getCdcState().state() == null || stateManager.getCdcState().state().isNull())
105113
? new MongoDbCdcState(initialDebeziumState, isEnforceSchema)
106114
: new MongoDbCdcState(Jsons.clone(stateManager.getCdcState().state()), stateManager.getCdcState().schema_enforced());
115+
107116
final Optional<BsonDocument> optSavedOffset = mongoDbDebeziumStateUtil.savedOffset(
108117
Jsons.clone(defaultDebeziumProperties),
109118
incrementalOnlyStreamsCatalog,
@@ -127,6 +136,7 @@ public List<AutoCloseableIterator<AirbyteMessage>> createCdcIterators(
127136
throw new ConfigErrorException(
128137
"Saved offset is not valid. Please reset the connection, and then increase oplog retention and/or increase sync frequency to prevent his from happening in the future. See https://docs.airbyte.com/integrations/sources/mongodb-v2#mongodb-oplog-and-change-streams for more details");
129138
}
139+
130140
LOGGER.info("Saved offset is not valid. Airbyte will trigger a full refresh.");
131141
// If the offset in the state is invalid, reset the state to the initial STATE
132142
stateManager.resetState(new MongoDbCdcState(initialDebeziumState, config.getEnforceSchema()));
@@ -145,33 +155,42 @@ public List<AutoCloseableIterator<AirbyteMessage>> createCdcIterators(
145155
final List<ConfiguredAirbyteStream> initialSnapshotStreams =
146156
MongoDbCdcInitialSnapshotUtils.getStreamsForInitialSnapshot(mongoClient, stateManager, incrementalOnlyStreamsCatalog, savedOffsetIsValid);
147157
final InitialSnapshotHandler initialSnapshotHandler = new InitialSnapshotHandler();
158+
159+
final Set<AirbyteStreamNameNamespacePair> streamsStillInInitialSnapshot = stateManager.getStreamStates().entrySet().stream()
160+
.filter(e -> InitialSnapshotStatus.IN_PROGRESS.equals(e.getValue().status()))
161+
.map(Map.Entry::getKey)
162+
.collect(Collectors.toSet());
163+
164+
// Fetch the streams from the catalog that still need to complete the initial snapshot sync
165+
List<ConfiguredAirbyteStream> inProgressSnapshotStreams = new ArrayList<>(incrementalOnlyStreamsCatalog.getStreams().stream()
166+
.filter(stream -> streamsStillInInitialSnapshot.contains(AirbyteStreamNameNamespacePair.fromAirbyteStream(stream.getStream())))
167+
.map(Jsons::clone)
168+
.toList());
169+
final var startedCdcStreamList = incrementalOnlyStreamsCatalog.getStreams().stream()
170+
.filter(stream -> (!initialSnapshotStreams.contains(stream) || inProgressSnapshotStreams.contains(stream)))
171+
.map(stream -> stream.getStream().getNamespace() + "." + stream.getStream().getName()).toList();
172+
148173
final List<AutoCloseableIterator<AirbyteMessage>> initialSnapshotIterators =
149174
initialSnapshotHandler.getIterators(initialSnapshotStreams, stateManager, mongoClient.getDatabase(databaseName),
150-
config, true, false);
175+
config, false, false, emittedAt, Optional.of(initialLoadTimeout));
151176

152177
final AirbyteDebeziumHandler<BsonTimestamp> handler = new AirbyteDebeziumHandler<>(config.getDatabaseConfig(),
153178
new MongoDbCdcTargetPosition(initialResumeToken), false, firstRecordWaitTime, queueSize, false);
179+
154180
final MongoDbCdcStateHandler mongoDbCdcStateHandler = new MongoDbCdcStateHandler(stateManager);
155181
final MongoDbCdcSavedInfoFetcher cdcSavedInfoFetcher = new MongoDbCdcSavedInfoFetcher(stateToBeUsed);
182+
156183
final var cdcStreamList = incrementalOnlyStreamsCatalog.getStreams().stream()
157184
.filter(stream -> stream.getSyncMode() == SyncMode.INCREMENTAL)
158185
.map(s -> s.getStream().getNamespace() + "\\." + s.getStream().getName())
159186
.toList();
160-
final var propertiesManager =
161-
new MongoDbDebeziumPropertiesManager(defaultDebeziumProperties, config.getDatabaseConfig(), incrementalOnlyStreamsCatalog, cdcStreamList);
162-
final var eventConverter =
163-
new MongoDbDebeziumEventConverter(cdcMetadataInjector, incrementalOnlyStreamsCatalog, emittedAt, config.getDatabaseConfig());
164-
165-
final Supplier<AutoCloseableIterator<AirbyteMessage>> incrementalIteratorSupplier = () -> handler.getIncrementalIterators(
166-
propertiesManager, eventConverter, cdcSavedInfoFetcher, mongoDbCdcStateHandler);
167187

168188
// We can close the client after the initial snapshot is complete, incremental
169189
// iterator does not make use of the client.
170190
final AutoCloseableIterator<AirbyteMessage> initialSnapshotIterator = AutoCloseableIterators.appendOnClose(
171191
AutoCloseableIterators.concatWithEagerClose(initialSnapshotIterators), mongoClient::close);
172192

173193
final List<AutoCloseableIterator<AirbyteMessage>> cdcStreamsStartStatusEmitters = incrementalOnlyStreamsCatalog.getStreams().stream()
174-
.filter(stream -> !initialSnapshotStreams.contains(stream))
175194
.map(stream -> (AutoCloseableIterator<AirbyteMessage>) new StreamStatusTraceEmitterIterator(new AirbyteStreamStatusHolder(
176195
new io.airbyte.protocol.models.AirbyteStreamNameNamespacePair(stream.getStream().getName(), stream.getStream().getNamespace()),
177196
AirbyteStreamStatusTraceMessage.AirbyteStreamStatus.STARTED)))
@@ -183,9 +202,69 @@ public List<AutoCloseableIterator<AirbyteMessage>> createCdcIterators(
183202
AirbyteStreamStatusTraceMessage.AirbyteStreamStatus.COMPLETE)))
184203
.toList();
185204

186-
return Stream.of(Collections.singletonList(initialSnapshotIterator), cdcStreamsStartStatusEmitters,
187-
Collections.singletonList(AutoCloseableIterators.lazyIterator(incrementalIteratorSupplier, null)),
188-
cdcStreamsCompleteStatusEmitters).flatMap(Collection::stream).toList();
205+
if (startedCdcStreamList.isEmpty()) {
206+
LOGGER.info("First sync - no cdc streams have been completed or started");
207+
/*
208+
* This is the first run case - no initial loads have been started. In this case, we want to run the
209+
* iterators in the following order: 1. Run the initial load iterators. This step will timeout and
210+
* throw a transient error if run for too long (> 8hrs by default). 2. Run the debezium iterators
211+
* with ALL of the incremental streams configured. This is because if step 1 completes, the initial
212+
* load can be considered finished.
213+
*/
214+
final var propertiesManager =
215+
new MongoDbDebeziumPropertiesManager(defaultDebeziumProperties, config.getDatabaseConfig(), incrementalOnlyStreamsCatalog, cdcStreamList);
216+
final var eventConverter =
217+
new MongoDbDebeziumEventConverter(cdcMetadataInjector, incrementalOnlyStreamsCatalog, emittedAt, config.getDatabaseConfig());
218+
final Supplier<AutoCloseableIterator<AirbyteMessage>> incrementalIteratorSupplier = () -> handler.getIncrementalIterators(
219+
propertiesManager, eventConverter, cdcSavedInfoFetcher, mongoDbCdcStateHandler);
220+
221+
return Stream.of(
222+
cdcStreamsStartStatusEmitters,
223+
Collections.singletonList(initialSnapshotIterator),
224+
Collections.singletonList(AutoCloseableIterators.lazyIterator(incrementalIteratorSupplier, null)),
225+
cdcStreamsCompleteStatusEmitters).flatMap(Collection::stream).toList();
226+
} else if (initialSnapshotIterators.isEmpty()) {
227+
LOGGER.info("Initial load has finished completely - only reading the oplog");
228+
/*
229+
* In this case, the initial load has completed and only debezium should be run. The iterators
230+
* should be run in the following order: 1. Run the debezium iterators with ALL of the incremental
231+
* streams configured.
232+
*/
233+
final var propertiesManager =
234+
new MongoDbDebeziumPropertiesManager(defaultDebeziumProperties, config.getDatabaseConfig(), incrementalOnlyStreamsCatalog, cdcStreamList);
235+
final var eventConverter =
236+
new MongoDbDebeziumEventConverter(cdcMetadataInjector, incrementalOnlyStreamsCatalog, emittedAt, config.getDatabaseConfig());
237+
final Supplier<AutoCloseableIterator<AirbyteMessage>> incrementalIteratorSupplier = () -> handler.getIncrementalIterators(
238+
propertiesManager, eventConverter, cdcSavedInfoFetcher, mongoDbCdcStateHandler);
239+
return Stream.of(
240+
cdcStreamsStartStatusEmitters,
241+
Collections.singletonList(AutoCloseableIterators.lazyIterator(incrementalIteratorSupplier, null)),
242+
cdcStreamsCompleteStatusEmitters).flatMap(Collection::stream).toList();
243+
} else {
244+
LOGGER.info("Initial load is in progress - reading oplog first and then resuming with initial load.");
245+
/*
246+
* In this case, the initial load has partially completed (WASS case). The iterators should be run
247+
* in the following order: 1. Run the debezium iterators with only the incremental streams which
248+
* have been fully or partially completed configured. 2. Resume initial load for partially completed
249+
* and not started streams. This step will timeout and throw a transient error if run for too long
250+
* (> 8hrs by default). 3. Emit a transient error. This is to signal to the platform to restart the
251+
* sync to clear the oplog. We cannot simply add the same cdc iterators as their target end position
252+
* is fixed to the tip of the oplog at the start of the sync.
253+
*/
254+
final var propertiesManager =
255+
new MongoDbDebeziumPropertiesManager(defaultDebeziumProperties, config.getDatabaseConfig(), incrementalOnlyStreamsCatalog,
256+
startedCdcStreamList);
257+
final var eventConverter =
258+
new MongoDbDebeziumEventConverter(cdcMetadataInjector, incrementalOnlyStreamsCatalog, emittedAt, config.getDatabaseConfig());
259+
final Supplier<AutoCloseableIterator<AirbyteMessage>> incrementalIteratorSupplier = () -> handler.getIncrementalIterators(
260+
propertiesManager, eventConverter, cdcSavedInfoFetcher, mongoDbCdcStateHandler);
261+
return Stream.of(
262+
cdcStreamsStartStatusEmitters,
263+
Collections.singletonList(AutoCloseableIterators.lazyIterator(incrementalIteratorSupplier, null)),
264+
Collections.singletonList(initialSnapshotIterator),
265+
cdcStreamsCompleteStatusEmitters)
266+
.flatMap(Collection::stream).toList();
267+
}
189268
}
190269

191270
private void logOplogInfo(final MongoClient mongoClient) {

airbyte-integrations/connectors/source-mongodb-v2/src/main/resources/spec.json

+10
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,16 @@
185185
"default": "Lookup",
186186
"order": 12,
187187
"group": "advanced"
188+
},
189+
"initial_load_timeout_hours": {
190+
"type": "integer",
191+
"title": "Initial Load Timeout in Hours (Advanced)",
192+
"description": "The amount of time an initial load is allowed to continue for before catching up on CDC logs.",
193+
"default": 8,
194+
"min": 4,
195+
"max": 24,
196+
"order": 13,
197+
"group": "advanced"
188198
}
189199
},
190200
"groups": [

0 commit comments

Comments
 (0)