-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Load-CDK/Destination-S3DataLake: DirectLoader (no spill2disk, partitioning) (#53241)
- Loading branch information
1 parent
91ccf4e
commit b88b37b
Showing
54 changed files
with
1,870 additions
and
239 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/message/PartitionedQueue.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.message | ||
|
||
import io.airbyte.cdk.load.util.CloseableCoroutine | ||
import kotlinx.coroutines.flow.Flow | ||
|
||
/**
 * A fixed set of [MessageQueue]s addressed by a zero-based partition index.
 *
 * A value can be published to or consumed from a single partition, or broadcast to all of them.
 */
class PartitionedQueue<T>(private val queues: Array<MessageQueue<T>>) : CloseableCoroutine {
    /** Number of partitions, i.e. the number of underlying queues. */
    val partitions = queues.size

    /**
     * Returns the consuming [Flow] of the queue backing [partition].
     *
     * @throws IllegalArgumentException if [partition] is not a valid index into the queues.
     */
    fun consume(partition: Int): Flow<T> = queueFor(partition).consume()

    /**
     * Publishes [value] to the queue backing [partition].
     *
     * @throws IllegalArgumentException if [partition] is not a valid index into the queues.
     */
    suspend fun publish(value: T, partition: Int) {
        queueFor(partition).publish(value)
    }

    /** Publishes [value] to every underlying queue, in index order. */
    suspend fun broadcast(value: T) = queues.forEach { it.publish(value) }

    /** Closes every underlying queue, in index order. */
    override suspend fun close() {
        queues.forEach { it.close() }
    }

    /**
     * Validates [partition] and returns the queue backing it.
     *
     * Single home for the bounds check so [consume] and [publish] cannot drift apart.
     */
    private fun queueFor(partition: Int): MessageQueue<T> {
        require(partition in queues.indices) { "Invalid partition: $partition" }
        return queues[partition]
    }
}
24 changes: 24 additions & 0 deletions
24
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/message/PipelineEvent.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.message | ||
|
||
import io.airbyte.cdk.load.command.DestinationStream | ||
import io.airbyte.cdk.load.state.CheckpointId | ||
|
||
/**
 * Used internally by the CDK to pass messages between steps in the loader pipeline.
 *
 * The implementations declared alongside this interface are [PipelineMessage] (carries a keyed
 * value) and [PipelineEndOfStream] (carries only a stream descriptor).
 *
 * @param K key type; must expose its stream via [WithStream]
 * @param T type of the value carried by [PipelineMessage]
 */
sealed interface PipelineEvent<K : WithStream, T>
|
||
/**
 * A [PipelineEvent] that carries a [value] together with the [key] it belongs to.
 *
 * @property checkpointCounts a count per [CheckpointId]. NOTE(review): presumably the number of
 * records this message represents for each checkpoint — confirm against the producer.
 * @property key the key of this message; its stream is available through [WithStream.stream]
 * @property value the payload
 */
class PipelineMessage<K : WithStream, T>(
    val checkpointCounts: Map<CheckpointId, Long>,
    val key: K,
    val value: T,
) : PipelineEvent<K, T>
|
||
/**
 * End-of-stream marker for the loader pipeline.
 *
 * We send the end message on the stream and not the key, because there's no way to partition an
 * empty message.
 *
 * @property stream descriptor of the stream this marker refers to
 */
class PipelineEndOfStream<K : WithStream, T>(
    val stream: DestinationStream.Descriptor,
) : PipelineEvent<K, T>
15 changes: 15 additions & 0 deletions
15
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/message/WithBatchState.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.message | ||
|
||
/**
 * Used internally by the CDK to implement Loaders.
 *
 * It is added to outputs of [io.airbyte.cdk.load.pipeline.BatchAccumulator] that can ack or
 * complete record batches. This is done *when stitching the dev interface to the pipeline*, so the
 * dev does not have to think about internal state.
 */
interface WithBatchState {
    /** The [Batch.State] associated with this output. */
    val state: Batch.State
}
17 changes: 17 additions & 0 deletions
17
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/message/WithStream.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.message | ||
|
||
import io.airbyte.cdk.load.command.DestinationStream | ||
|
||
/**
 * Used internally by the CDK to keep track of streams while still allowing for partitioning on
 * key: any key type can be used as long as it exposes the stream it belongs to.
 */
interface WithStream {
    /** Descriptor of the stream this key belongs to. */
    val stream: DestinationStream.Descriptor
}
|
||
/**
 * The default key: partitioned by stream.
 *
 * Being a data class, two keys are equal exactly when their [stream] descriptors are equal.
 */
data class StreamKey(
    override val stream: DestinationStream.Descriptor,
) : WithStream
17 changes: 17 additions & 0 deletions
17
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/BatchAccumulator.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.pipeline | ||
|
||
import io.airbyte.cdk.load.message.WithStream | ||
|
||
/**
 * [BatchAccumulator] is used internally by the CDK to implement RecordLoaders. Connector devs
 * should never need to implement this interface.
 *
 * @param S state type produced by [start] and threaded through [accept] and [finish]
 * @param K key type; must expose its stream via [WithStream]
 * @param T type of the records passed to [accept]
 * @param U output type
 */
interface BatchAccumulator<S, K : WithStream, T, U> {
    /**
     * Produces a state for [key] and [part].
     *
     * NOTE(review): [part] is presumably the partition index of the calling worker — confirm.
     */
    fun start(key: K, part: Int): S

    /**
     * Feeds [record] into [state], returning a state paired with an optional output.
     *
     * NOTE(review): a null second component presumably means "nothing to emit yet" — confirm
     * against the caller.
     */
    fun accept(record: T, state: S): Pair<S, U?>

    /** Produces the (non-null) output for [state]. */
    fun finish(state: S): U
}
24 changes: 24 additions & 0 deletions
24
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/BatchStateUpdate.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.pipeline | ||
|
||
import io.airbyte.cdk.load.command.DestinationStream | ||
import io.airbyte.cdk.load.message.Batch | ||
import io.airbyte.cdk.load.state.CheckpointId | ||
|
||
/**
 * Used internally by the CDK to track record ranges to ack.
 *
 * Every update is tied to a single stream; the implementations declared alongside this interface
 * are [BatchStateUpdate] and [BatchEndOfStream].
 */
sealed interface BatchUpdate {
    /** Descriptor of the stream this update applies to. */
    val stream: DestinationStream.Descriptor
}
|
||
/**
 * A [BatchUpdate] that reports a [Batch.State] for a set of checkpoints on [stream].
 *
 * @property stream descriptor of the stream this update applies to
 * @property checkpointCounts a count per [CheckpointId]. NOTE(review): presumably the number of
 * records covered for each checkpoint — confirm against the producer.
 * @property state the batch state being reported
 */
data class BatchStateUpdate(
    override val stream: DestinationStream.Descriptor,
    val checkpointCounts: Map<CheckpointId, Long>,
    val state: Batch.State,
) : BatchUpdate
|
||
/** A [BatchUpdate] that carries no payload beyond the [stream] it marks as ended. */
data class BatchEndOfStream(override val stream: DestinationStream.Descriptor) : BatchUpdate
20 changes: 20 additions & 0 deletions
20
...yte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/DirectLoadPipeline.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.pipeline | ||
|
||
import io.airbyte.cdk.load.write.DirectLoaderFactory | ||
import io.micronaut.context.annotation.Requires | ||
import jakarta.inject.Singleton | ||
|
||
/**
 * Used internally by the CDK to implement the DirectLoader.
 *
 * Creates a single pipeline step reading from a (possibly partitioned) record stream. Batch
 * updates are written to the batchStateUpdateQueue whenever the loader returns.
 *
 * The pipeline consists of exactly one step, [pipelineStep]. The bean is only registered when a
 * [DirectLoaderFactory] bean is present.
 */
@Singleton
@Requires(bean = DirectLoaderFactory::class)
class DirectLoadPipeline(
    val pipelineStep: DirectLoadPipelineStep<*>,
) : LoadPipeline(listOf(pipelineStep))
50 changes: 50 additions & 0 deletions
50
...cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/DirectLoadPipelineStep.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* | ||
* Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.load.pipeline | ||
|
||
import io.airbyte.cdk.load.message.DestinationRecordAirbyteValue | ||
import io.airbyte.cdk.load.message.PartitionedQueue | ||
import io.airbyte.cdk.load.message.PipelineEvent | ||
import io.airbyte.cdk.load.message.QueueWriter | ||
import io.airbyte.cdk.load.message.StreamKey | ||
import io.airbyte.cdk.load.state.Reserved | ||
import io.airbyte.cdk.load.task.internal.LoadPipelineStepTask | ||
import io.airbyte.cdk.load.write.DirectLoader | ||
import io.airbyte.cdk.load.write.DirectLoaderFactory | ||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import io.micronaut.context.annotation.Requires | ||
import io.micronaut.context.annotation.Value | ||
import jakarta.inject.Named | ||
import jakarta.inject.Singleton | ||
|
||
/**
 * The single step of the direct-load pipeline.
 *
 * Requests one worker per input partition reported by [directLoaderFactory]. Each worker gets a
 * [LoadPipelineStepTask] that reads its partition of [inputQueue], runs records through
 * [accumulator] and reports to [batchQueue]. The step has no downstream step, so no output queue
 * or output partitioner is supplied.
 *
 * The bean is only registered when a [DirectLoaderFactory] bean is present.
 */
@Singleton
@Requires(bean = DirectLoaderFactory::class)
class DirectLoadPipelineStep<S : DirectLoader>(
    val accumulator: DirectLoadRecordAccumulator<S, StreamKey>,
    @Named("recordQueue")
    val inputQueue:
        PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
    @Named("batchStateUpdateQueue") val batchQueue: QueueWriter<BatchUpdate>,
    @Value("\${airbyte.destination.core.record-batch-size-override:null}")
    val batchSizeOverride: Long? = null,
    val directLoaderFactory: DirectLoaderFactory<S>,
) : LoadPipelineStep {
    private val log = KotlinLogging.logger {}

    /** One worker per input partition declared by the loader factory. */
    override val numWorkers: Int = directLoaderFactory.inputPartitions

    /**
     * Builds the task that processes [partition] of the input queue.
     *
     * When [batchSizeOverride] is set, a [RecordCountFlushStrategy] built from it is handed to the
     * task; otherwise no flush strategy is passed.
     */
    override fun taskForPartition(partition: Int): LoadPipelineStepTask<*, *, *, *, *> {
        log.info { "Creating DirectLoad pipeline step task for partition $partition" }

        val partitionInput = inputQueue.consume(partition)
        // Explicitly typed null: pins the task's output type parameters without a cast.
        val noOutputQueue: PartitionedQueue<PipelineEvent<StreamKey, DirectLoadAccResult>>? = null
        val flushStrategy = batchSizeOverride?.let { RecordCountFlushStrategy(it) }

        return LoadPipelineStepTask(
            accumulator,
            partitionInput,
            batchUpdateQueue = batchQueue,
            outputPartitioner = null,
            outputQueue = noOutputQueue,
            flushStrategy,
            partition
        )
    }
}
Oops, something went wrong.