Skip to content

Commit 89f2db4

Browse files
(Incomplete) First Cut Load CDK with E2E Destination (#44822)
1 parent bf2295d commit 89f2db4

32 files changed

+2154
-0
lines changed
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
dependencies {
    // Core bulk CDK this load CDK builds on.
    implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')
    implementation 'org.apache.commons:commons-lang3:3.14.0'
    // For ranges and rangesets (record-range bookkeeping in Batch handling).
    implementation 'com.google.guava:guava:33.3.0-jre'
    implementation 'org.jetbrains.kotlin:kotlin-reflect:2.0.0'

    testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base'))

    testImplementation 'org.jetbrains.kotlinx:kotlinx-coroutines-test:1.7.1'
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.command
6+
7+
import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog
8+
import io.micronaut.context.annotation.Factory
9+
import jakarta.inject.Singleton
10+
11+
/**
 * Internal representation of destination streams. This is intended to be a case class specialized
 * for usability.
 */
data class DestinationCatalog(
    val streams: List<DestinationStream> = emptyList(),
) {
    // Index of streams by (namespace, name) for O(1) lookup.
    private val byDescriptor: Map<DestinationStream.Descriptor, DestinationStream> =
        streams.associateBy { it.descriptor }

    /**
     * Looks up the stream for the given (namespace, name) pair.
     *
     * @throws IllegalArgumentException if no such stream exists in the catalog.
     */
    fun getStream(name: String, namespace: String): DestinationStream {
        val key = DestinationStream.Descriptor(namespace = namespace, name = name)
        return requireNotNull(byDescriptor[key]) {
            "Stream not found: namespace=$namespace, name=$name"
        }
    }
}
27+
28+
/** Micronaut factory deriving the singleton [DestinationCatalog] from the configured protocol catalog. */
@Factory
class DestinationCatalogFactory(
    private val catalog: ConfiguredAirbyteCatalog,
    private val streamFactory: DestinationStreamFactory
) {
    @Singleton
    fun make(): DestinationCatalog {
        // Convert each configured protocol stream into the internal representation.
        val destinationStreams = catalog.streams.map { configured -> streamFactory.make(configured) }
        return DestinationCatalog(streams = destinationStreams)
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.command
6+
7+
import io.micronaut.context.annotation.ConfigurationProperties
8+
import io.micronaut.context.annotation.Factory
9+
import jakarta.inject.Singleton
10+
11+
@ConfigurationProperties("destination.config")
interface DestinationConfiguration : Configuration {
    /**
     * Micronaut factory which glues [ConfigurationJsonObjectSupplier] and
     * [DestinationConfigurationFactory] together to produce a [DestinationConfiguration] singleton.
     */
    @Factory
    private class MicronautFactory {
        /**
         * Builds the configuration singleton from the connector's validated config POJO.
         *
         * Renamed from `sourceConfig` (copy-paste from the source CDK): Micronaut wires this bean
         * by its return type, so the method name of this private factory is not part of the API.
         */
        @Singleton
        fun <I : ConfigurationJsonObjectBase> destinationConfig(
            pojoSupplier: ConfigurationJsonObjectSupplier<I>,
            factory: DestinationConfigurationFactory<I, out DestinationConfiguration>,
        ): DestinationConfiguration = factory.make(pojoSupplier.get())
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.command
6+
7+
import io.airbyte.cdk.ConfigErrorException
8+
9+
/**
 * Implementors map the raw connector config POJO to a typed [DestinationConfiguration].
 * Callers should use [make], which normalizes failures into [ConfigErrorException].
 */
interface DestinationConfigurationFactory<
    I : ConfigurationJsonObjectBase, O : DestinationConfiguration> {
    fun makeWithoutExceptionHandling(pojo: I): O

    /** Wraps [makeWithoutExceptionHandling] exceptions in [ConfigErrorException]. */
    fun make(pojo: I): O =
        try {
            makeWithoutExceptionHandling(pojo)
        } catch (e: ConfigErrorException) {
            // Already a config error: rethrow as-is rather than double-wrapping it.
            throw e
        } catch (e: Exception) {
            // Wrap NPEs (mostly) in ConfigErrorException.
            throw ConfigErrorException("Failed to build ConnectorConfiguration.", e)
        }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.command
6+
7+
import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream
8+
import jakarta.inject.Singleton
9+
10+
/**
 * Internal representation of destination streams. This is intended to be a case class specialized
 * for usability.
 *
 * TODO: Add missing info like sync type, generation_id, etc.
 *
 * TODO: Add dedicated schema type, converted from json-schema.
 */
class DestinationStream(val descriptor: Descriptor) {
    data class Descriptor(val namespace: String, val name: String)

    // Identity is defined solely by the descriptor, so streams with the same
    // (namespace, name) compare equal regardless of any future metadata fields.
    override fun hashCode(): Int = descriptor.hashCode()

    override fun equals(other: Any?): Boolean =
        (other as? DestinationStream)?.descriptor == descriptor

    override fun toString(): String = "DestinationStream(descriptor=$descriptor)"
}
33+
34+
/** Converts protocol-level configured streams into internal [DestinationStream]s. */
@Singleton
class DestinationStreamFactory {
    fun make(stream: ConfiguredAirbyteStream): DestinationStream {
        val descriptor =
            DestinationStream.Descriptor(
                namespace = stream.stream.namespace,
                name = stream.stream.name,
            )
        return DestinationStream(descriptor = descriptor)
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.command
6+
7+
import io.micronaut.context.annotation.Secondary
8+
import jakarta.inject.Singleton
9+
10+
/**
 * General configuration for the write operation. The implementor can override this to tweak runtime
 * behavior.
 */
interface WriteConfiguration {
    /** Batch accumulation settings. */
    val recordBatchSizeBytes: Long
    // Prefix used for temp files while staging raw records locally.
    val firstStageTmpFilePrefix: String

    /** Memory queue settings */
    val maxMessageQueueMemoryUsageRatio: Double // as fraction of available memory
    val estimatedRecordMemoryOverheadRatio: Double // 0 => No overhead, 1.0 => 2x overhead
}

// Fallback defaults: @Secondary means any connector-supplied WriteConfiguration bean wins.
@Singleton
@Secondary
open class DefaultWriteConfiguration : WriteConfiguration {
    // 200 MiB target size per accumulated record batch.
    override val recordBatchSizeBytes: Long = 200L * 1024L * 1024L
    override val firstStageTmpFilePrefix = "airbyte-cdk-load-staged-raw-records"

    // Queue may use up to 20% of available memory; each record is assumed to
    // cost an extra 10% over its serialized size.
    override val maxMessageQueueMemoryUsageRatio: Double = 0.2
    override val estimatedRecordMemoryOverheadRatio: Double = 0.1
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.message
6+
7+
import io.airbyte.protocol.models.v0.AirbyteGlobalState
8+
import io.airbyte.protocol.models.v0.AirbyteStateMessage
9+
import io.airbyte.protocol.models.v0.AirbyteStateStats
10+
import io.airbyte.protocol.models.v0.AirbyteStreamState
11+
import io.airbyte.protocol.models.v0.StreamDescriptor
12+
import jakarta.inject.Singleton
13+
14+
/**
 * Converts the internal @[DestinationStateMessage] case class to the Protocol state messages
 * required by @[io.airbyte.cdk.output.OutputConsumer]
 *
 * Implementations must preserve the source stats carried on the internal message so that
 * platform-side record accounting remains correct.
 */
interface AirbyteStateMessageFactory {
    fun fromDestinationStateMessage(message: DestinationStateMessage): AirbyteStateMessage
}
21+
22+
@Singleton
class DefaultAirbyteStateMessageFactory : AirbyteStateMessageFactory {
    /**
     * Maps the internal state message to its protocol equivalent.
     *
     * Note the asymmetry: STREAM state requires destination stats (throws
     * [IllegalStateException] when absent), while GLOBAL state passes a missing
     * destinationStats through as null.
     */
    override fun fromDestinationStateMessage(
        message: DestinationStateMessage
    ): AirbyteStateMessage {
        return when (message) {
            is DestinationStreamState ->
                AirbyteStateMessage()
                    .withSourceStats(
                        AirbyteStateStats()
                            .withRecordCount(message.sourceStats.recordCount.toDouble())
                    )
                    .withDestinationStats(
                        // Destination stats are mandatory for stream-level state.
                        message.destinationStats?.let {
                            AirbyteStateStats().withRecordCount(it.recordCount.toDouble())
                        }
                            ?: throw IllegalStateException(
                                "Destination stats must be provided for DestinationStreamState"
                            )
                    )
                    .withType(AirbyteStateMessage.AirbyteStateType.STREAM)
                    .withStream(fromStreamState(message.streamState))
            is DestinationGlobalState ->
                AirbyteStateMessage()
                    .withSourceStats(
                        AirbyteStateStats()
                            .withRecordCount(message.sourceStats.recordCount.toDouble())
                    )
                    .withDestinationStats(
                        // Optional here: null destinationStats yields null protocol stats.
                        message.destinationStats?.let {
                            AirbyteStateStats().withRecordCount(it.recordCount.toDouble())
                        }
                    )
                    .withType(AirbyteStateMessage.AirbyteStateType.GLOBAL)
                    .withGlobal(
                        AirbyteGlobalState()
                            .withSharedState(message.state)
                            .withStreamStates(message.streamStates.map { fromStreamState(it) })
                    )
        }
    }

    // Converts one internal per-stream state entry to the protocol AirbyteStreamState.
    private fun fromStreamState(
        streamState: DestinationStateMessage.StreamState
    ): AirbyteStreamState {
        return AirbyteStreamState()
            .withStreamDescriptor(
                StreamDescriptor()
                    .withNamespace(streamState.stream.descriptor.namespace)
                    .withName(streamState.stream.descriptor.name)
            )
            .withStreamState(streamState.state)
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.message
6+
7+
import com.google.common.collect.Range
8+
import com.google.common.collect.RangeSet
9+
import com.google.common.collect.TreeRangeSet
10+
import java.nio.file.Path
11+
12+
/**
 * Represents an accumulated batch of records in some stage of processing.
 *
 * Emitted by @[io.airbyte.cdk.write.StreamLoader.processRecords] to describe the batch accumulated.
 * Non-[State.COMPLETE] batches are routed to @[io.airbyte.cdk.write.StreamLoader.processBatch]
 * re-entrantly until completion.
 *
 * The framework will track the association between the Batch and the range of records it
 * represents, by [Batch.State]s. The [State.PERSISTED] state has special meaning: it indicates that
 * the associated ranges have been persisted remotely, and that platform checkpoint messages can be
 * emitted.
 *
 * [State.SPOOLED] is used internally to indicate that records have been spooled to disk for
 * processing and should not be used by implementors.
 *
 * When a stream has been read to End-of-stream, and all ranges between 0 and End-of-stream are
 * [State.COMPLETE], then all records are considered to have been processed.
 *
 * The intended usage for implementors is to implement the provided interfaces in case classes that
 * contain the necessary metadata for processing, using them in @
 * [io.airbyte.cdk.write.StreamLoader.processBatch] to route to the appropriate handler(s).
 *
 * For example:
 *
 * ```kotlin
 * sealed class MyBatch: Batch
 * data class MyLocalFile(
 *   override val localPath: Path,
 *   override val totalSizeBytes: Long
 * ): StagedLocalFile()
 * data class MyRemoteObject(
 *   override val key: String
 * ): RemoteObject()
 * // etc...
 * ```
 */
interface Batch {
    // Lifecycle stages, ordered from least to most finished.
    enum class State {
        SPOOLED,
        LOCAL,
        PERSISTED,
        COMPLETE
    }

    val state: State
}
58+
59+
/** Simple batch: use if you need no other metadata for processing. */
data class SimpleBatch(override val state: Batch.State) : Batch
61+
62+
/**
 * Represents a file of records locally staged.
 *
 * Note: the redundant empty primary-constructor parentheses were removed (ktlint/detekt
 * flag them); subclasses still invoke the implicit no-arg constructor as `StagedLocalFile()`.
 */
abstract class StagedLocalFile : Batch {
    override val state: Batch.State = Batch.State.LOCAL
    abstract val localPath: Path
    abstract val totalSizeBytes: Long
}
68+
69+
/**
 * Represents a remote object containing persisted records.
 *
 * Note: the redundant empty primary-constructor parentheses were removed (ktlint/detekt
 * flag them); subclasses still invoke the implicit no-arg constructor as `RemoteObject()`.
 */
abstract class RemoteObject : Batch {
    override val state: Batch.State = Batch.State.PERSISTED
    abstract val key: String
}
74+
75+
/**
 * Represents a file of raw records staged to disk for pre-processing. Used internally by the
 * framework
 */
data class SpooledRawMessagesLocalFile(
    override val localPath: Path,
    override val totalSizeBytes: Long,
    // Defaults to SPOOLED, overriding the LOCAL state inherited from StagedLocalFile.
    override val state: Batch.State = Batch.State.SPOOLED
) : StagedLocalFile()
84+
85+
/**
 * Internally-used wrapper for tracking the association between a batch and the range of records it
 * contains.
 */
data class BatchEnvelope<B : Batch>(
    val batch: B,
    val ranges: RangeSet<Long> = TreeRangeSet.create()
) {
    /** Convenience constructor for the common case of a single contiguous range. */
    constructor(
        batch: B,
        range: Range<Long>
    ) : this(batch = batch, ranges = TreeRangeSet.create(listOf(range)))

    /** Re-wraps [newBatch] while carrying over the record ranges already tracked. */
    fun <C : Batch> withBatch(newBatch: C): BatchEnvelope<C> = BatchEnvelope(newBatch, ranges)
}

0 commit comments

Comments
 (0)