diff --git a/airbyte-cdk/bulk/core/base/build.gradle b/airbyte-cdk/bulk/core/base/build.gradle index f60f0a3347626..1431f08c225fc 100644 --- a/airbyte-cdk/bulk/core/base/build.gradle +++ b/airbyte-cdk/bulk/core/base/build.gradle @@ -47,6 +47,5 @@ dependencies { } testFixturesApi 'io.micronaut.test:micronaut-test-core:4.5.0' testFixturesApi 'io.micronaut.test:micronaut-test-junit5:4.5.0' - testFixturesApi 'com.h2database:h2:2.2.224' testFixturesApi 'io.github.deblockt:json-diff:1.0.1' } diff --git a/airbyte-cdk/bulk/core/extract/build.gradle b/airbyte-cdk/bulk/core/extract/build.gradle index d0b76d9b15dba..bc631cc3e37b5 100644 --- a/airbyte-cdk/bulk/core/extract/build.gradle +++ b/airbyte-cdk/bulk/core/extract/build.gradle @@ -4,6 +4,4 @@ dependencies { implementation 'hu.webarticum:tree-printer:3.2.1' testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')) - - testImplementation project(':airbyte-cdk:bulk:toolkits:bulk-cdk-toolkit-extract-jdbc') } diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt deleted file mode 100644 index 4553510cdcf9c..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.discover - -import io.airbyte.protocol.models.v0.AirbyteStream - -/** - * Stateless object for building an [AirbyteStream] during DISCOVER. - * - * [DefaultAirbyteStreamDecorator] is the sane default implementation, to be replaced with - * connector-specific implementations when required. - */ -interface AirbyteStreamDecorator { - /** Connector-specific [AirbyteStream] decoration logic for GLOBAL-state streams. 
*/ - fun decorateGlobal(airbyteStream: AirbyteStream) - - /** - * Connector-specific [AirbyteStream] decoration logic for STREAM-state streams for which at - * least one discovered field can be used as a user-defined cursor in incremental syncs. - */ - fun decorateNonGlobal(airbyteStream: AirbyteStream) - - /** - * Connector-specific [AirbyteStream] decoration logic for STREAM-state streams for which no - * discovered field can be used as a user-defined cursor in incremental syncs. - */ - fun decorateNonGlobalNoCursor(airbyteStream: AirbyteStream) - - /** - * Can the field be used as part of a primary key? - * - * For this to be possible, - * 1. the field needs to be part of a key as defined by the source, - * 2. and its values must be deserializable from the checkpoint persisted in an Airbyte state - * message. - * - * This method does not determine (1), of course, because the source keys are defined in the - * source database itself and are retrieved via [MetadataQuerier.primaryKey]. Instead, this - * method determines (2) based on the type information of the field, typically the [FieldType] - * objects. For instance if the [Field.type] does not map to a [LosslessFieldType] then the - * field can't reliably round-trip checkpoint values during a resumable initial sync. - */ - fun isPossiblePrimaryKeyElement(field: Field): Boolean - - /** - * Can the field be used as a cursor in a cursor-based incremental sync? - * - * This predicate is like [isPossiblePrimaryKeyElement] but tighter: in addition to being able - * to round-trip the column values, we need to be able to query the max value from the source at - * the start of the sync. 
- */ - fun isPossibleCursor(field: Field): Boolean -} diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt new file mode 100644 index 0000000000000..93705c4d25122 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt @@ -0,0 +1,27 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.discover + +import io.airbyte.protocol.models.Field as AirbyteField +import io.airbyte.protocol.models.v0.AirbyteStream +import io.airbyte.protocol.models.v0.CatalogHelpers + +/** Stateless object for building an [AirbyteStream] during DISCOVER. */ +interface AirbyteStreamFactory { + /** Connector-specific [AirbyteStream] creation logic for GLOBAL-state streams. */ + fun createGlobal(discoveredStream: DiscoveredStream): AirbyteStream + + /** Connector-specific [AirbyteStream] creation logic for STREAM-state streams. 
*/ + fun createNonGlobal(discoveredStream: DiscoveredStream): AirbyteStream + + companion object { + + fun createAirbyteStream(discoveredStream: DiscoveredStream): AirbyteStream = + CatalogHelpers.createAirbyteStream( + discoveredStream.name, + discoveredStream.namespace, + discoveredStream.columns.map { + AirbyteField.of(it.id, it.type.airbyteType.asJsonSchemaType()) + }, + ) + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt index 7884aef6d66dc..90f1732e6dd96 100644 --- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt +++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt @@ -4,10 +4,8 @@ package io.airbyte.cdk.discover import io.airbyte.cdk.Operation import io.airbyte.cdk.command.SourceConfiguration import io.airbyte.cdk.output.OutputConsumer -import io.airbyte.protocol.models.Field as AirbyteField import io.airbyte.protocol.models.v0.AirbyteCatalog import io.airbyte.protocol.models.v0.AirbyteStream -import io.airbyte.protocol.models.v0.CatalogHelpers import io.github.oshai.kotlinlogging.KotlinLogging import io.micronaut.context.annotation.Requires import jakarta.inject.Singleton @@ -18,7 +16,7 @@ import jakarta.inject.Singleton class DiscoverOperation( val config: SourceConfiguration, val metadataQuerierFactory: MetadataQuerier.Factory, - val airbyteStreamDecorator: AirbyteStreamDecorator, + val airbyteStreamFactory: AirbyteStreamFactory, val outputConsumer: OutputConsumer, ) : Operation { private val log = KotlinLogging.logger {} @@ -39,50 +37,16 @@ class DiscoverOperation( } val primaryKey: List<List<String>> = metadataQuerier.primaryKey(name, namespace) val discoveredStream = DiscoveredStream(name, namespace, fields, primaryKey) - airbyteStreams.add(toAirbyteStream(discoveredStream)) + val airbyteStream: AirbyteStream = + if 
(config.global) { + airbyteStreamFactory.createGlobal(discoveredStream) + } else { + airbyteStreamFactory.createNonGlobal(discoveredStream) + } + airbyteStreams.add(airbyteStream) } } } outputConsumer.accept(AirbyteCatalog().withStreams(airbyteStreams)) } - - fun toAirbyteStream(discoveredStream: DiscoveredStream): AirbyteStream { - val allColumnsByID: Map<String, Field> = discoveredStream.columns.associateBy { it.id } - val airbyteStream: AirbyteStream = - CatalogHelpers.createAirbyteStream( - discoveredStream.name, - discoveredStream.namespace, - discoveredStream.columns.map { - AirbyteField.of(it.id, it.type.airbyteType.asJsonSchemaType()) - }, - ) - val isValidPK: Boolean = - discoveredStream.primaryKeyColumnIDs.all { idComponents: List<String> -> - val id: String = idComponents.joinToString(separator = ".") - val field: Field? = allColumnsByID[id] - field != null && airbyteStreamDecorator.isPossiblePrimaryKeyElement(field) - } - airbyteStream.withSourceDefinedPrimaryKey( - if (isValidPK) discoveredStream.primaryKeyColumnIDs else listOf(), - ) - airbyteStream.isResumable = airbyteStream.sourceDefinedPrimaryKey.isNotEmpty() - if (config.global) { - // There is a global feed of incremental records, like CDC. - airbyteStreamDecorator.decorateGlobal(airbyteStream) - } else if (discoveredStream.columns.any { airbyteStreamDecorator.isPossibleCursor(it) }) { - // There is one field whose values can be round-tripped and aggregated by MAX. - airbyteStreamDecorator.decorateNonGlobal(airbyteStream) - } else { - // There is no such field. 
- airbyteStreamDecorator.decorateNonGlobalNoCursor(airbyteStream) - } - return airbyteStream - } - - data class DiscoveredStream( - val name: String, - val namespace: String?, - val columns: List<Field>, - val primaryKeyColumnIDs: List<List<String>>, - ) } diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoveredStream.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoveredStream.kt new file mode 100644 index 0000000000000..57453f29b29f4 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoveredStream.kt @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +data class DiscoveredStream( + val name: String, + val namespace: String?, + val columns: List<Field>, + val primaryKeyColumnIDs: List<List<String>>, +) diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt index 65b4f1e04534c..d671576cc76fc 100644 --- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt +++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt @@ -3,23 +3,22 @@ package io.airbyte.cdk.discover import io.airbyte.cdk.command.SourceConfiguration -/** A very thin abstraction around JDBC metadata queries. */ +/** An abstraction for a catalog discovery session. */ interface MetadataQuerier : AutoCloseable { - /** - * Queries the information_schema for all table names in the schemas specified by the connector - * configuration. - */ + + /** Returns all available namespaces. */ fun streamNamespaces(): List<String> + /** Returns all available stream names in the given namespace. */ fun streamNames(streamNamespace: String?): List<String> - /** Executes a SELECT * on the table, discards the results, and extracts all column metadata. 
*/ + /** Returns all available fields in the given stream. */ fun fields( streamName: String, streamNamespace: String?, ): List<Field> - /** Queries the information_schema for any primary key on the given table. */ + /** Returns the primary key for the given stream, if it exists; empty list otherwise. */ fun primaryKey( streamName: String, streamNamespace: String?, diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt similarity index 85% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt index 388bc805ee6ff..f160a07453a55 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt @@ -1,8 +1,8 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.check import io.airbyte.cdk.Operation -import io.airbyte.cdk.check.CheckOperation +import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject import io.airbyte.cdk.output.BufferingOutputConsumer import io.airbyte.protocol.models.v0.AirbyteConnectionStatus import io.micronaut.context.annotation.Property @@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "check") -class FakeSourceCheckTest { +class CheckTest { @Inject lateinit var checkOperation: CheckOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer @@ -22,6 +22,7 @@ class FakeSourceCheckTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.port", value = "-1") @Property(name = "airbyte.connector.config.database", value = "testdb") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testConfigBadPort() { assertFailed(" must have a minimum value of 0".toRegex()) } @@ -29,7 +30,7 @@ class FakeSourceCheckTest { @Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testSuccess() { assertSucceeded() } @@ -37,7 +38,7 @@ class FakeSourceCheckTest { @Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-empty.json") + @Property(name = "metadata.resource", value = "discover/metadata-empty.json") fun testBadSchema() { assertFailed("Discovered zero tables".toRegex()) } @@ -45,7 +46,7 @@ class FakeSourceCheckTest { 
@Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-column-query-fails.json") + @Property(name = "metadata.resource", value = "discover/metadata-column-query-fails.json") fun testBadTables() { assertFailed("Unable to query any of the [0-9]+ discovered table".toRegex()) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt similarity index 87% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt index 53f1a8e460287..e01a55f1bcb9b 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt @@ -1,7 +1,8 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.command -import io.airbyte.cdk.command.SourceConfiguration +import io.airbyte.cdk.fakesource.FakeSourceConfiguration +import io.airbyte.cdk.fakesource.UserDefinedCursor import io.airbyte.cdk.ssh.SshConnectionOptions import io.airbyte.cdk.ssh.SshPasswordAuthTunnelMethod import io.micronaut.context.annotation.Property @@ -13,7 +14,7 @@ import org.junit.jupiter.api.Assertions import org.junit.jupiter.api.Test @MicronautTest(rebuildContext = true) -class FakeSourceConfigurationTest { +class ConfigurationFactoryTest { @Inject lateinit var actual: SourceConfiguration @Test @@ -39,10 +40,7 @@ class FakeSourceConfigurationTest { sshTunnel = SshPasswordAuthTunnelMethod("localhost", 22, "sshuser", "secret"), sshConnectionOptions = SshConnectionOptions(1_000.milliseconds, 2_000.milliseconds, Duration.ZERO), - jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:testdb", - schemas = setOf("PUBLIC", "TESTSCHEMA"), cursor = UserDefinedCursor, - resumablePreferred = true, maxConcurrency = 1, checkpointTargetInterval = java.time.Duration.ofDays(100L), ) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt index c0972dd4b05b0..9cc3733ac0c5b 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt @@ -21,7 +21,7 @@ class ConfigurationJsonObjectSupplierTest { @Test fun testSchema() { Assertions.assertEquals(FakeSourceConfigurationJsonObject::class.java, supplier.javaClass) - val expected: String = ResourceUtils.readResource("command/expected-schema.json") + val expected: String = ResourceUtils.readResource("fakesource/expected-schema.json") 
Assertions.assertEquals(Jsons.readTree(expected), supplier.jsonSchema) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt new file mode 100644 index 0000000000000..a12e2fc3d3f22 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt @@ -0,0 +1,12 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.command + +import org.junit.jupiter.api.Test + +class SyncsTestFixtureTest { + + @Test + fun testSpec() { + SyncsTestFixture.testSpec("fakesource/expected-spec.json") + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt similarity index 91% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt index d7ef4de471d7f..8c7426fe9373d 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt @@ -1,8 +1,7 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.discover import io.airbyte.cdk.Operation -import io.airbyte.cdk.discover.DiscoverOperation import io.airbyte.cdk.output.BufferingOutputConsumer import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.AirbyteCatalog @@ -16,7 +15,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "discover") -class FakeSourceDiscoverTest { +class DiscoverTest { @Inject lateinit var discoverOperation: DiscoverOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer @@ -25,7 +24,7 @@ class FakeSourceDiscoverTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor", value = "user_defined") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testCursorBasedIncremental() { val events = AirbyteStream() @@ -33,6 +32,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(EVENTS_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("ID"))) .withIsResumable(true) val kv = @@ -41,6 +41,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(KV_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("K"))) .withIsResumable(true) val expected = AirbyteCatalog().withStreams(listOf(events, kv)) @@ -52,7 +53,7 @@ class FakeSourceDiscoverTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = 
"testdb") @Property(name = "airbyte.connector.config.cursor", value = "cdc") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testCdcIncremental() { val events = AirbyteStream() @@ -60,6 +61,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(EVENTS_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("ID"))) .withIsResumable(true) val kv = @@ -68,6 +70,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(KV_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("K"))) .withIsResumable(true) val expected = AirbyteCatalog().withStreams(listOf(events, kv)) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt index 8e2ce24022c4f..215850a945ce5 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt @@ -1,7 +1,6 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.fakesource -import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.command.SourceConfiguration import io.airbyte.cdk.command.SourceConfigurationFactory import io.airbyte.cdk.ssh.SshConnectionOptions @@ -13,27 +12,23 @@ import io.micronaut.context.env.Environment import jakarta.inject.Singleton import java.time.Duration -/** [SourceConfiguration] implementation for [FakeSource]. 
*/ +/** [SourceConfiguration] implementation for a fake source. */ data class FakeSourceConfiguration( override val realHost: String, override val realPort: Int, override val sshTunnel: SshTunnelMethodConfiguration, override val sshConnectionOptions: SshConnectionOptions, - override val jdbcUrlFmt: String, - override val schemas: Set, val cursor: CursorConfiguration, - val resumablePreferred: Boolean, override val maxConcurrency: Int, override val checkpointTargetInterval: Duration, -) : JdbcSourceConfiguration { +) : SourceConfiguration { override val global: Boolean = cursor is CdcCursor - override val jdbcProperties: Map = mapOf() override val resourceAcquisitionHeartbeat: Duration get() = Duration.ofMillis(10) } -/** [SourceConfigurationFactory] implementation for [FakeSource]. */ +/** [SourceConfigurationFactory] implementation for a fake source. */ @Singleton @Requires(env = [Environment.TEST]) @Secondary @@ -49,10 +44,7 @@ class FakeSourceConfigurationFactory : realPort = pojo.port, sshTunnel = pojo.getTunnelMethodValue() ?: SshNoTunnelMethod, sshConnectionOptions = sshConnectionOptions, - jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:${pojo.database}", - schemas = pojo.schemas?.takeUnless { it.isEmpty() }?.toSet() ?: setOf("PUBLIC"), cursor = pojo.getCursorConfigurationValue() ?: UserDefinedCursor, - resumablePreferred = pojo.resumablePreferred != false, maxConcurrency = 1, checkpointTargetInterval = Duration.parse(pojo.timeout).takeIf { it.isPositive } ?: Duration.ofDays(100L), diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt index 6a799c0cae274..aa8e85d444103 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt +++ 
b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt @@ -26,7 +26,7 @@ import io.micronaut.context.annotation.ConfigurationProperties import io.micronaut.context.annotation.Secondary import jakarta.inject.Singleton -/** [ConfigurationJsonObjectBase] implementation for [FakeSource]. */ +/** [ConfigurationJsonObjectBase] implementation for a fake source. */ @JsonSchemaTitle("Test Source Spec") @JsonPropertyOrder( value = diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt deleted file mode 100644 index e8b4466d10d63..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.fakesource - -import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.read.CreateNoPartitions -import io.airbyte.cdk.read.Feed -import io.airbyte.cdk.read.Global -import io.airbyte.cdk.read.PartitionsCreator -import io.airbyte.cdk.read.PartitionsCreatorFactory -import io.airbyte.cdk.read.StateQuerier -import io.airbyte.cdk.read.Stream -import io.airbyte.cdk.read.StreamPartitionReader -import io.airbyte.cdk.read.StreamPartitionsCreator -import io.airbyte.cdk.read.StreamReadContext -import io.airbyte.cdk.read.StreamReadContextManager -import io.airbyte.cdk.read.streamPartitionsCreatorInput -import jakarta.inject.Singleton - -@Singleton -class FakeSourcePartitionsCreatorFactory( - val streamReadContextManager: StreamReadContextManager, -) : PartitionsCreatorFactory { - override fun make( - stateQuerier: StateQuerier, - feed: Feed, - ): PartitionsCreator { - val opaqueStateValue: OpaqueStateValue? 
= stateQuerier.current(feed) - return when (feed) { - is Global -> CreateNoPartitions - is Stream -> { - val ctx: StreamReadContext = streamReadContextManager[feed] - StreamPartitionsCreator( - ctx, - opaqueStateValue.streamPartitionsCreatorInput(ctx), - StreamPartitionsCreator.Parameters(preferParallelized = false), - StreamPartitionReader.Parameters(preferResumable = false), - ) - } - } - } -} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt index c6e9f292f45e1..c66bc180bb169 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt @@ -20,7 +20,7 @@ import org.junit.jupiter.api.Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor.cursor_method", value = "cdc") -@Property(name = "metadata.resource", value = "read/metadata.json") +@Property(name = "metadata.resource", value = "discover/metadata-valid.json") class StateManagerGlobalStatesTest { @Inject lateinit var config: SourceConfiguration @@ -37,7 +37,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = @@ -55,7 +55,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property(name = "airbyte.connector.state.json", value = "[]") fun 
testColdStart() { val streams: Streams = prelude() @@ -98,7 +98,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = @@ -147,7 +147,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt index 4774fefdbb759..8225a9b55f2fe 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt @@ -22,7 +22,7 @@ import org.junit.jupiter.api.Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor.cursor_method", value = "user_defined") -@Property(name = "metadata.resource", value = "read/metadata.json") +@Property(name = "metadata.resource", value = "discover/metadata-valid.json") class StateManagerStreamStatesTest { @Inject lateinit var config: SourceConfiguration diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt deleted file mode 100644 index 5be288d662286..0000000000000 --- 
a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.TestClockFactory -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.fakesource.FakeSourceConfiguration -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject -import io.airbyte.cdk.fakesource.FakeSourceOperations -import io.airbyte.cdk.h2.H2TestFixture -import io.airbyte.cdk.jdbc.IntFieldType -import io.airbyte.cdk.jdbc.JdbcConnectionFactory -import io.airbyte.cdk.jdbc.StringFieldType -import io.airbyte.cdk.output.BufferingCatalogValidationFailureHandler -import io.airbyte.cdk.output.BufferingOutputConsumer -import io.airbyte.cdk.read.MemoryFetchSizeEstimator.Companion.DEFAULT_FETCH_SIZE -import io.airbyte.cdk.read.Sample.Kind -import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.SyncMode -import org.junit.jupiter.api.Assertions -import org.junit.jupiter.api.Test - -class StreamPartitionsCreatorUtilsTest { - val h2 = H2TestFixture() - - init { - h2.execute( - """CREATE TABLE kv ( - |k INT PRIMARY KEY, - |v VARCHAR(60)) - | - """ - .trimMargin() - .replace('\n', ' '), - ) - h2.execute( - "INSERT INTO kv (k, v) " + - "VALUES (1, 'foo'), (2, 'bar'), (3, NULL), (4, 'baz'), (5, 'quux');", - ) - } - - val k = Field("k", IntFieldType) - val v = Field("v", StringFieldType) - - val stream = - Stream( - name = "kv", - namespace = "public", - fields = listOf(k, v), - configuredSyncMode = SyncMode.FULL_REFRESH, - configuredPrimaryKey = listOf(k), - configuredCursor = null, - ) - - val querySpec = - SelectQuerySpec( - SelectColumns(listOf(k)), - From("kv", "public"), - orderBy = OrderBy(listOf(k)), - ) - - val testParameters = - StreamPartitionsCreator.Parameters( - 
preferParallelized = true, - tableSampleSize = 2, - throughputBytesPerSecond = 10L, - ) - - @Test - fun testCollectSample() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.SMALL, sample.kind) - } - - @Test - fun testCollectTinySample() { - val utils: StreamPartitionsCreatorUtils = - createUtils(testParameters.copy(tableSampleSize = 100)) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.TINY, sample.kind) - } - - @Test - fun testCollectEmptySample() { - h2.execute("TRUNCATE TABLE kv") - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.EMPTY, sample.kind) - } - - @Test - fun testCollectSampleInLargeTable() { - h2.execute("INSERT INTO kv(k, v) SELECT X, NULL FROM SYSTEM_RANGE(6, 100000)") - val utils: StreamPartitionsCreatorUtils = - createUtils(testParameters.copy(tableSampleSize = 100)) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.SMALL, sample.kind) - } - - @Test - fun testMemoryFetchSizeEstimator() { - Assertions.assertEquals( - 14000, - MemoryFetchSizeEstimator(700_000, 1).apply(Sample(listOf(10, 20, 30), Kind.SMALL, 0L)), - ) - Assertions.assertEquals( - 7000, - MemoryFetchSizeEstimator(700_000, 2).apply(Sample(listOf(10, 20, 30), Kind.SMALL, 0L)), - ) - Assertions.assertEquals( - DEFAULT_FETCH_SIZE, - MemoryFetchSizeEstimator(700_000, 2).apply(Sample(listOf(), Kind.MEDIUM, 0L)), - ) - } - - @Test - fun testCursorUpperBound() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - utils.computeCursorUpperBound(k) - Assertions.assertEquals( - "5", - utils.ctx.transientCursorUpperBoundState.get()?.toString(), - ) - } - - @Test - fun testSplitPrimaryKey() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val input = - 
StreamPartitionReader.SnapshotInput( - primaryKey = listOf(k), - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - ) - val splits: List?, List?>> = - utils.split(input, input.primaryKeyLowerBound, input.primaryKeyUpperBound) - val actual: String = splits.joinToString { (l, r) -> "]${l?.first()}, ${r?.first()}]" } - Assertions.assertEquals("]null, 1], ]1, 2], ]2, null]", actual) - } - - @Test - fun testSplitCursor() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val input = - StreamPartitionReader.CursorIncrementalInput( - cursor = k, - cursorLowerBound = Jsons.numberNode(1), - cursorUpperBound = Jsons.numberNode(4), - ) - val splits: List?, List?>> = - utils.split(input, listOf(input.cursorLowerBound), listOf(input.cursorUpperBound)) - val actual: String = splits.joinToString { (l, r) -> "]${l?.first()}, ${r?.first()}]" } - Assertions.assertEquals("]1, 2], ]2, 4]", actual) - } - - private fun createUtils( - params: StreamPartitionsCreator.Parameters, - ): StreamPartitionsCreatorUtils { - val configPojo: FakeSourceConfigurationJsonObject = - FakeSourceConfigurationJsonObject().apply { - port = h2.port - database = h2.database - timeout = "PT1S" - } - val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) - val ctxManager = - StreamReadContextManager( - config, - BufferingCatalogValidationFailureHandler(), - FakeSourceOperations(), - JdbcSelectQuerier(JdbcConnectionFactory(config)), - BufferingOutputConsumer(TestClockFactory().fixed()), - ) - val ctx = ctxManager[stream] - ctx.resetStream() - return StreamPartitionsCreatorUtils(ctx, params) - } -} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt similarity index 94% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt rename to 
airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt index c725179edca93..1538cadcf420a 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.spec import com.deblock.jsondiff.DiffGenerator import com.deblock.jsondiff.diff.JsonDiff @@ -11,7 +11,6 @@ import com.deblock.jsondiff.matcher.StrictPrimitivePartialMatcher import com.deblock.jsondiff.viewer.OnlyErrorDiffViewer import io.airbyte.cdk.Operation import io.airbyte.cdk.output.BufferingOutputConsumer -import io.airbyte.cdk.spec.SpecOperation import io.airbyte.cdk.util.Jsons import io.airbyte.cdk.util.ResourceUtils import io.micronaut.context.annotation.Property @@ -23,7 +22,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "spec") @Property(name = "airbyte.connector.metadata.documentation-url", value = "https://docs.airbyte.com") -class FakeSourceSpecTest { +class SpecTest { @Inject lateinit var specOperation: SpecOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json index 0b4c5b8d3af91..08eec4fcc91c1 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json @@ -6,22 +6,22 @@ "json_schema": { "type": "object", "properties": { + "MSG": { + "type": "string" + }, "ID": { - "type": "string", - "contentEncoding": "base64" + "type": "string" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" - 
}, - "MSG": { - "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["ID", "TS"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, "namespace": "PUBLIC" @@ -37,17 +37,18 @@ "json_schema": { "type": "object", "properties": { + "V": { + "type": "string" + }, "K": { "type": "number", "airbyte_type": "integer" - }, - "V": { - "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["K"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, "namespace": "PUBLIC" diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json index 3520f52b260b4..f1b4850c1fe1f 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json @@ -6,17 +6,16 @@ "json_schema": { "type": "object", "properties": { + "MSG": { + "type": "string" + }, "ID": { - "type": "string", - "contentEncoding": "base64" + "type": "string" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" - }, - "MSG": { - "type": "string" } } }, @@ -37,12 +36,12 @@ "json_schema": { "type": "object", "properties": { + "V": { + "type": "string" + }, "K": { "type": "number", "airbyte_type": "integer" - }, - "V": { - "type": "string" } } }, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/command/expected-schema.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-schema.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/command/expected-schema.json rename to airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-schema.json diff 
--git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json deleted file mode 100644 index 332020e65a5be..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json +++ /dev/null @@ -1,25 +0,0 @@ -[ - { - "name": "EVENTS", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "ID": "io.airbyte.cdk.jdbc.StringFieldType", - "TS": "io.airbyte.cdk.jdbc.OffsetDateTimeFieldType", - "MSG": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["ID"]] - } - }, - { - "name": "KV", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "K": "io.airbyte.cdk.jdbc.IntFieldType", - "V": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["K"]] - } - } -] diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json b/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json deleted file mode 100644 index 332020e65a5be..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json +++ /dev/null @@ -1,25 +0,0 @@ -[ - { - "name": "EVENTS", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "ID": "io.airbyte.cdk.jdbc.StringFieldType", - "TS": "io.airbyte.cdk.jdbc.OffsetDateTimeFieldType", - "MSG": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["ID"]] - } - }, - { - "name": "KV", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "K": "io.airbyte.cdk.jdbc.IntFieldType", - "V": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["K"]] - } - } -] diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt new file mode 100644 index 0000000000000..821082136bd26 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt @@ -0,0 
+1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType + +data object IntFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.INTEGER + override val jsonEncoder: JsonEncoder<*> = IntCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt new file mode 100644 index 0000000000000..ecf3c2035395b --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType +import io.airbyte.cdk.data.OffsetDateTimeCodec + +data object OffsetDateTimeFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.TIMESTAMP_WITH_TIMEZONE + override val jsonEncoder: JsonEncoder<*> = OffsetDateTimeCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt new file mode 100644 index 0000000000000..8f1d386cf2ef2 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType +import io.airbyte.cdk.data.TextCodec + +data object StringFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.STRING + override val jsonEncoder: JsonEncoder<*> = TextCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt new file mode 100644 index 0000000000000..5b84e88c2faa9 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.protocol.models.v0.AirbyteStream +import io.airbyte.protocol.models.v0.SyncMode +import io.micronaut.context.annotation.Requires +import io.micronaut.context.env.Environment +import jakarta.inject.Singleton + +@Singleton +@Requires(env = [Environment.TEST]) +@Requires(notEnv = [Environment.CLI]) +class TestAirbyteStreamFactory : AirbyteStreamFactory { + + override fun createGlobal(discoveredStream: DiscoveredStream): AirbyteStream = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + (jsonSchema["properties"] as ObjectNode).apply { + for (metaField in CommonMetaField.entries) { + set(metaField.id, metaField.type.airbyteType.asJsonSchema()) + } + } + defaultCursorField = listOf(CommonMetaField.CDC_LSN.id) + sourceDefinedCursor = true + if (discoveredStream.primaryKeyColumnIDs.isNotEmpty()) { + sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs + isResumable = true + } else { + isResumable = false + } + } 
+ + override fun createNonGlobal(discoveredStream: DiscoveredStream): AirbyteStream = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + sourceDefinedCursor = false + if (discoveredStream.primaryKeyColumnIDs.isNotEmpty()) { + sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs + isResumable = true + } else { + isResumable = false + } + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-column-query-fails.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-column-query-fails.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-column-query-fails.json rename to airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-column-query-fails.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-empty.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-empty.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-empty.json rename to airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-empty.json diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json new file mode 100644 index 0000000000000..2c1da810db3af --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json @@ -0,0 +1,25 @@ +[ + { + "name": "EVENTS", + "namespace": "PUBLIC", + "metadata": { + "columns": { + "ID": "io.airbyte.cdk.discover.StringFieldType", + "TS": "io.airbyte.cdk.discover.OffsetDateTimeFieldType", + "MSG": "io.airbyte.cdk.discover.StringFieldType" + }, + "primaryKeys": [["ID"]] + } + }, + { + "name": "KV", + "namespace": "PUBLIC", + "metadata": { + 
"columns": { + "K": "io.airbyte.cdk.discover.IntFieldType", + "V": "io.airbyte.cdk.discover.StringFieldType" + }, + "primaryKeys": [["K"]] + } + } +] diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle index 83f004146c5ee..ec56ca6ca6471 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle @@ -1,4 +1,9 @@ dependencies { implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-base') implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-extract') + + testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')) + testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-extract')) + testFixturesImplementation 'com.h2database:h2:2.2.224' + testFixturesImplementation 'org.apache.commons:commons-lang3:3.14.0' } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt deleted file mode 100644 index 070b85314d76f..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.discover - -import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.jdbc.BinaryStreamFieldType -import io.airbyte.cdk.jdbc.BooleanFieldType -import io.airbyte.cdk.jdbc.CharacterStreamFieldType -import io.airbyte.cdk.jdbc.ClobFieldType -import io.airbyte.cdk.jdbc.JsonStringFieldType -import io.airbyte.cdk.jdbc.NCharacterStreamFieldType -import io.airbyte.cdk.jdbc.NClobFieldType -import io.airbyte.protocol.models.v0.AirbyteStream -import io.airbyte.protocol.models.v0.SyncMode -import jakarta.inject.Singleton - -@Singleton -class JdbcAirbyteStreamDecorator : AirbyteStreamDecorator { - override fun decorateGlobal(airbyteStream: AirbyteStream) { - airbyteStream.apply { - supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) - (jsonSchema["properties"] as ObjectNode).apply { - for (metaField in CommonMetaField.entries) { - set(metaField.id, metaField.type.airbyteType.asJsonSchema()) - } - } - defaultCursorField = listOf(CommonMetaField.CDC_LSN.id) - sourceDefinedCursor = true - } - } - - override fun decorateNonGlobal(airbyteStream: AirbyteStream) { - airbyteStream.apply { - supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) - } - } - - override fun decorateNonGlobalNoCursor(airbyteStream: AirbyteStream) { - airbyteStream.apply { supportedSyncModes = listOf(SyncMode.FULL_REFRESH) } - } - - override fun isPossiblePrimaryKeyElement(field: Field): Boolean = - when (field.type) { - !is LosslessFieldType -> false - BinaryStreamFieldType, - CharacterStreamFieldType, - NCharacterStreamFieldType, - ClobFieldType, - NClobFieldType, - JsonStringFieldType, -> false - else -> true - } - - override fun isPossibleCursor(field: Field): Boolean = - isPossiblePrimaryKeyElement(field) && field.type !is BooleanFieldType -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt 
b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt new file mode 100644 index 0000000000000..8818a0380773b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt @@ -0,0 +1,100 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.discover + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.jdbc.BinaryStreamFieldType +import io.airbyte.cdk.jdbc.BooleanFieldType +import io.airbyte.cdk.jdbc.CharacterStreamFieldType +import io.airbyte.cdk.jdbc.ClobFieldType +import io.airbyte.cdk.jdbc.JsonStringFieldType +import io.airbyte.cdk.jdbc.NCharacterStreamFieldType +import io.airbyte.cdk.jdbc.NClobFieldType +import io.airbyte.protocol.models.v0.SyncMode +import jakarta.inject.Singleton + +@Singleton +class JdbcAirbyteStreamFactory : AirbyteStreamFactory { + + override fun createGlobal(discoveredStream: DiscoveredStream) = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + (jsonSchema["properties"] as ObjectNode).apply { + for (metaField in CommonMetaField.entries) { + set(metaField.id, metaField.type.airbyteType.asJsonSchema()) + } + } + defaultCursorField = listOf(CommonMetaField.CDC_LSN.id) + sourceDefinedCursor = true + if (hasValidPrimaryKey(discoveredStream)) { + sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs + isResumable = true + } + } + + override fun createNonGlobal(discoveredStream: DiscoveredStream) = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + if (hasCursorFields(discoveredStream)) { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + } else { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH) + } + sourceDefinedCursor = false + if (hasValidPrimaryKey(discoveredStream)) { + sourceDefinedPrimaryKey = 
discoveredStream.primaryKeyColumnIDs + isResumable = true + } + } + + /** Does the [discoveredStream] have a field that could serve as a cursor? */ + fun hasCursorFields(discoveredStream: DiscoveredStream): Boolean = + !discoveredStream.columns.none(::isPossibleCursor) + + /** Does the [discoveredStream] have a valid primary key declared? */ + fun hasValidPrimaryKey(discoveredStream: DiscoveredStream): Boolean { + if (discoveredStream.primaryKeyColumnIDs.isEmpty()) { + return false + } + val allColumnsByID: Map = discoveredStream.columns.associateBy { it.id } + return discoveredStream.primaryKeyColumnIDs.all { idComponents: List -> + val id: String = idComponents.joinToString(separator = ".") + val field: Field? = allColumnsByID[id] + field != null && isPossiblePrimaryKeyElement(field) + } + } + + /** + * Can the field be used as part of a primary key? + * + * For this to be possible, + * 1. the field needs to be part of a key as defined by the source, + * 2. and its values must be deserializable from the checkpoint persisted in an Airbyte state + * message. + * + * This method does not determine (1), of course, because the source keys are defined in the + * source database itself and are retrieved via [MetadataQuerier.primaryKey]. Instead, this + * method determines (2) based on the type information of the field, typically the [FieldType] + * objects. For instance if the [Field.type] does not map to a [LosslessFieldType] then the + * field can't reliably round-trip checkpoint values during a resumable initial sync. + */ + fun isPossiblePrimaryKeyElement(field: Field): Boolean = + when (field.type) { + !is LosslessFieldType -> false + BinaryStreamFieldType, + CharacterStreamFieldType, + NCharacterStreamFieldType, + ClobFieldType, + NClobFieldType, + JsonStringFieldType, -> false + else -> true + } + + /** + * Can the field be used as a cursor in a cursor-based incremental sync? 
+ * + * This predicate is like [isPossiblePrimaryKeyElement] but tighter: in addition to being able + * to round-trip the column values, we need to be able to query the max value from the source at + * the start of the sync. + */ + fun isPossibleCursor(field: Field): Boolean = + isPossiblePrimaryKeyElement(field) && field.type !is BooleanFieldType +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt deleted file mode 100644 index 0e51c8cd5eaaf..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.annotation.JsonProperty -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.discover.FieldOrMetaField -import io.airbyte.cdk.output.InvalidCursor -import io.airbyte.cdk.output.InvalidPrimaryKey -import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.SyncMode - -/** - * [CheckpointStreamState] is the type used to represent state checkpoints for source connectors - * which make use of this package. This maps to the value of an Airbyte STATE message of type - * STREAM, interpreted using the provided configuration and configured catalog. 
- */ -sealed interface CheckpointStreamState - -data object SnapshotCompleted : CheckpointStreamState - -data class SnapshotCheckpoint( - val primaryKey: List, - val primaryKeyCheckpoint: List, -) : CheckpointStreamState - -data class SnapshotWithCursorCheckpoint( - val primaryKey: List, - val primaryKeyCheckpoint: List, - val cursor: Field, - val cursorUpperBound: JsonNode, -) : CheckpointStreamState - -data class CursorIncrementalCheckpoint( - val cursor: Field, - val cursorCheckpoint: JsonNode, -) : CheckpointStreamState - -/** Serializes a [CheckpointStreamState] into an [OpaqueStateValue]. */ -fun CheckpointStreamState.opaqueStateValue(): OpaqueStateValue = Jsons.valueToTree(jsonValue()) - -private fun CheckpointStreamState.jsonValue(): StreamStateJsonValue = - when (this) { - SnapshotCompleted -> StreamStateJsonValue() - is SnapshotCheckpoint -> - StreamStateJsonValue( - primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(), - ) - is SnapshotWithCursorCheckpoint -> - StreamStateJsonValue( - primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(), - cursors = mapOf(cursor.id to cursorUpperBound), - ) - is CursorIncrementalCheckpoint -> - StreamStateJsonValue(cursors = mapOf(cursor.id to cursorCheckpoint)) - } - -/** - * Deserializes a nullable [OpaqueStateValue] into a nullable [CheckpointStreamState] based on the - * current [StreamReadContext], which contains the configuration and the catalog. - */ -fun OpaqueStateValue?.checkpoint(ctx: StreamReadContext): CheckpointStreamState? = - if (this == null) { - null - } else { - Jsons.treeToValue(this, StreamStateJsonValue::class.java).checkpoint(ctx) - } - -/** - * [StreamStateJsonValue] is like [CheckpointStreamState] but configuration- and catalog-agnostic. - * This is the object which is used for de/serializing Airbyte STATE message values from/to - * [OpaqueStateValue]s. 
- */ -data class StreamStateJsonValue( - @JsonProperty("primary_key") val primaryKey: Map = mapOf(), - @JsonProperty("cursors") val cursors: Map = mapOf(), -) - -private fun StreamStateJsonValue.checkpoint(ctx: StreamReadContext): CheckpointStreamState? { - val pkMap: Map = run { - if (primaryKey.isEmpty()) { - return@run mapOf() - } - val pk: List = ctx.stream.configuredPrimaryKey ?: listOf() - if (primaryKey.keys != pk.map { it.id }.toSet()) { - ctx.handler.accept( - InvalidPrimaryKey( - ctx.stream.name, - ctx.stream.namespace, - primaryKey.keys.toList(), - ), - ) - return null - } - pk.associateWith { primaryKey[it.id]!! } - } - val cursorPair: Pair? = run { - if (cursors.isEmpty()) { - return@run null - } - if (cursors.size > 1) { - ctx.handler.accept( - InvalidCursor(ctx.stream.name, ctx.stream.namespace, cursors.keys.toString()), - ) - return null - } - val cursorLabel: String = cursors.keys.first() - val cursor: FieldOrMetaField? = ctx.stream.fields.find { it.id == cursorLabel } - if (cursor !is Field) { - ctx.handler.accept( - InvalidCursor(ctx.stream.name, ctx.stream.namespace, cursorLabel), - ) - return null - } - cursor to cursors[cursorLabel]!! 
- } - val isCursorBasedIncremental: Boolean = - ctx.stream.configuredSyncMode == SyncMode.INCREMENTAL && !ctx.configuration.global - - return if (cursorPair == null) { - if (isCursorBasedIncremental) { - null - } else if (pkMap.isEmpty()) { - SnapshotCompleted - } else { - SnapshotCheckpoint(pkMap.keys.toList(), pkMap.values.toList()) - } - } else { - val (cursor: Field, cursorCheckpoint: JsonNode) = cursorPair - if (!isCursorBasedIncremental) { - null - } else if (pkMap.isEmpty()) { - CursorIncrementalCheckpoint(cursor, cursorCheckpoint) - } else { - SnapshotWithCursorCheckpoint( - pkMap.keys.toList(), - pkMap.values.toList(), - cursor, - cursorCheckpoint, - ) - } - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt new file mode 100644 index 0000000000000..af103b75fde8f --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt @@ -0,0 +1,40 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.read + +import io.github.oshai.kotlinlogging.KotlinLogging + +/** + * [DefaultJdbcFetchSizeEstimator] estimates the value of the JDBC fetchSize parameter to fill up a + * portion of the JVM heap defined by [memoryCapacityRatio]. + */ +class DefaultJdbcFetchSizeEstimator( + val maxMemoryBytes: Long, + val maxConcurrency: Int, + val minFetchSize: Int, + val defaultFetchSize: Int, + val maxFetchSize: Int, + val memoryCapacityRatio: Double, +) : JdbcSharedState.JdbcFetchSizeEstimator { + private val log = KotlinLogging.logger {} + + override fun apply(rowByteSizeSample: Sample): Int { + val maxRowBytes: Long = rowByteSizeSample.sampledValues.maxOrNull() ?: 0L + log.info { + "Maximum row size in ${rowByteSizeSample.kind.name} table is $maxRowBytes bytes." 
+ } + val targetMemoryUse: Long = (maxMemoryBytes * memoryCapacityRatio).toLong() + if (listOf(maxRowBytes, targetMemoryUse, maxConcurrency.toLong()).any { it <= 0L }) { + return defaultFetchSize + } + val targetMemoryUsePerQuery: Long = targetMemoryUse / maxConcurrency + log.info { + "Targeting a maximum of $targetMemoryUsePerQuery bytes " + + "for each of up to $maxConcurrency queries." + } + val maxRowsFetchedPerQuery: Long = targetMemoryUsePerQuery / maxRowBytes + return maxRowsFetchedPerQuery + .coerceAtLeast(minFetchSize.toLong()) + .coerceAtMost(maxFetchSize.toLong()) + .toInt() + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt new file mode 100644 index 0000000000000..a87f883b3d5ce --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.util.Jsons + +/** Base class for default implementations of [JdbcPartition]. */ +sealed class DefaultJdbcPartition( + val selectQueryGenerator: SelectQueryGenerator, + final override val streamState: DefaultJdbcStreamState, +) : JdbcPartition { + val stream: Stream = streamState.stream + val from = From(stream.name, stream.namespace) +} + +/** Base class for default implementations of [JdbcPartition] for unsplittable partitions. 
*/ +sealed class DefaultJdbcUnsplittablePartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, +) : DefaultJdbcPartition(selectQueryGenerator, streamState) { + + override val nonResumableQuery: SelectQuery + get() = selectQueryGenerator.generate(nonResumableQuerySpec.optimize()) + + val nonResumableQuerySpec = SelectQuerySpec(SelectColumns(stream.fields), from) + + override fun samplingQuery(sampleRateInvPow2: Int): SelectQuery { + val sampleSize: Int = streamState.sharedState.maxSampleSize + val querySpec = + SelectQuerySpec( + SelectColumns(stream.fields), + FromSample(stream.name, stream.namespace, sampleRateInvPow2, sampleSize), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } +} + +/** Default implementation of a [JdbcPartition] for an unsplittable snapshot partition. */ +class DefaultJdbcUnsplittableSnapshotPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, +) : DefaultJdbcUnsplittablePartition(selectQueryGenerator, streamState) { + + override val completeState: OpaqueStateValue = DefaultJdbcStreamStateValue.snapshotCompleted +} + +/** + * Default implementation of a [JdbcPartition] for an unsplittable snapshot partition preceding a + * cursor-based incremental sync. + */ +class DefaultJdbcUnsplittableSnapshotWithCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + val cursor: Field, +) : + DefaultJdbcUnsplittablePartition(selectQueryGenerator, streamState), + JdbcCursorPartition { + + override val completeState: OpaqueStateValue + get() = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = streamState.cursorUpperBound!! 
+ ) + + override val cursorUpperBoundQuery: SelectQuery + get() = selectQueryGenerator.generate(cursorUpperBoundQuerySpec.optimize()) + + val cursorUpperBoundQuerySpec = SelectQuerySpec(SelectColumnMaxValue(cursor), from) +} + +/** Base class for default implementations of [JdbcPartition] for splittable partitions. */ +sealed class DefaultJdbcSplittablePartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + val checkpointColumns: List, +) : + DefaultJdbcPartition(selectQueryGenerator, streamState), + JdbcSplittablePartition { + abstract val lowerBound: List? + abstract val upperBound: List? + + override val nonResumableQuery: SelectQuery + get() = selectQueryGenerator.generate(nonResumableQuerySpec.optimize()) + + val nonResumableQuerySpec: SelectQuerySpec + get() = SelectQuerySpec(SelectColumns(stream.fields), from, where) + + override fun resumableQuery(limit: Long): SelectQuery { + val querySpec = + SelectQuerySpec( + SelectColumns((stream.fields + checkpointColumns).distinct()), + from, + where, + OrderBy(checkpointColumns), + Limit(limit), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } + + override fun samplingQuery(sampleRateInvPow2: Int): SelectQuery { + val sampleSize: Int = streamState.sharedState.maxSampleSize + val querySpec = + SelectQuerySpec( + SelectColumns(stream.fields + checkpointColumns), + FromSample(stream.name, stream.namespace, sampleRateInvPow2, sampleSize), + where, + OrderBy(checkpointColumns), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } + + val where: Where + get() { + val zippedLowerBound: List> = + lowerBound?.let { checkpointColumns.zip(it) } ?: listOf() + val lowerBoundDisj: List = + zippedLowerBound.mapIndexed { idx: Int, (gtCol: Field, gtValue: JsonNode) -> + val lastLeaf: WhereClauseLeafNode = + if (isLowerBoundIncluded && idx == checkpointColumns.size - 1) { + GreaterOrEqual(gtCol, gtValue) + } else { + Greater(gtCol, gtValue) + } + And( + 
zippedLowerBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> + Equal(eqCol, eqValue) + } + listOf(lastLeaf), + ) + } + val zippedUpperBound: List> = + upperBound?.let { checkpointColumns.zip(it) } ?: listOf() + val upperBoundDisj: List = + zippedUpperBound.mapIndexed { idx: Int, (leqCol: Field, leqValue: JsonNode) -> + val lastLeaf: WhereClauseLeafNode = + if (idx < zippedUpperBound.size - 1) { + Lesser(leqCol, leqValue) + } else { + LesserOrEqual(leqCol, leqValue) + } + And( + zippedUpperBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> + Equal(eqCol, eqValue) + } + listOf(lastLeaf), + ) + } + return Where(And(Or(lowerBoundDisj), Or(upperBoundDisj))) + } + + open val isLowerBoundIncluded: Boolean = false +} + +/** Default implementation of a [JdbcPartition] for a splittable snapshot partition. */ +class DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + primaryKey: List, + override val lowerBound: List?, + override val upperBound: List?, +) : DefaultJdbcSplittablePartition(selectQueryGenerator, streamState, primaryKey) { + + override val completeState: OpaqueStateValue + get() = + when (upperBound) { + null -> DefaultJdbcStreamStateValue.snapshotCompleted + else -> + DefaultJdbcStreamStateValue.snapshotCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = upperBound, + ) + } + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.snapshotCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = checkpointColumns.map { lastRecord[it.id] ?: Jsons.nullNode() }, + ) +} + +/** + * Default implementation of a [JdbcPartition] for a splittable partition involving cursor columns. 
+ */ +sealed class DefaultJdbcCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + checkpointColumns: List, + val cursor: Field, + private val explicitCursorUpperBound: JsonNode?, +) : + DefaultJdbcSplittablePartition(selectQueryGenerator, streamState, checkpointColumns), + JdbcCursorPartition { + + val cursorUpperBound: JsonNode + get() = explicitCursorUpperBound ?: streamState.cursorUpperBound!! + + override val cursorUpperBoundQuery: SelectQuery + get() = selectQueryGenerator.generate(cursorUpperBoundQuerySpec.optimize()) + + val cursorUpperBoundQuerySpec = SelectQuerySpec(SelectColumnMaxValue(cursor), from) +} + +/** + * Default implementation of a [JdbcPartition] for a splittable snapshot partition preceding a + * cursor-based incremental sync. + */ +class DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + primaryKey: List, + override val lowerBound: List?, + override val upperBound: List?, + cursor: Field, + cursorUpperBound: JsonNode?, +) : + DefaultJdbcCursorPartition( + selectQueryGenerator, + streamState, + primaryKey, + cursor, + cursorUpperBound + ) { + + override val completeState: OpaqueStateValue + get() = + when (upperBound) { + null -> + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorUpperBound + ) + else -> + DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = upperBound, + cursor, + cursorUpperBound, + ) + } + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = checkpointColumns.map { lastRecord[it.id] ?: Jsons.nullNode() }, + cursor, + cursorUpperBound, + ) +} + +/** + * Default implementation of a [JdbcPartition] for a cursor incremental partition. 
These are always + * splittable. + */ +class DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + cursor: Field, + val cursorLowerBound: JsonNode, + override val isLowerBoundIncluded: Boolean, + cursorUpperBound: JsonNode?, +) : + DefaultJdbcCursorPartition( + selectQueryGenerator, + streamState, + listOf(cursor), + cursor, + cursorUpperBound + ) { + + override val lowerBound: List = listOf(cursorLowerBound) + override val upperBound: List + get() = listOf(cursorUpperBound) + + override val completeState: OpaqueStateValue + get() = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = cursorUpperBound, + ) + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = lastRecord[cursor.id] ?: Jsons.nullNode(), + ) +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt new file mode 100644 index 0000000000000..e324756517103 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import io.airbyte.cdk.ConfigErrorException +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.discover.FieldOrMetaField +import io.airbyte.cdk.output.CatalogValidationFailureHandler +import io.airbyte.cdk.output.InvalidCursor +import io.airbyte.cdk.output.InvalidPrimaryKey +import io.airbyte.cdk.output.ResetStream +import io.airbyte.cdk.util.Jsons +import io.airbyte.protocol.models.v0.SyncMode +import jakarta.inject.Singleton +import java.util.concurrent.ConcurrentHashMap + +/** Default implementation of [JdbcPartitionFactory]. */ +@Singleton +class DefaultJdbcPartitionFactory( + override val sharedState: DefaultJdbcSharedState, + val handler: CatalogValidationFailureHandler, + val selectQueryGenerator: SelectQueryGenerator, +) : + JdbcPartitionFactory< + DefaultJdbcSharedState, + DefaultJdbcStreamState, + DefaultJdbcPartition, + > { + + private val streamStates = ConcurrentHashMap() + + override fun streamState(stream: Stream): DefaultJdbcStreamState = + streamStates.getOrPut(stream.label) { DefaultJdbcStreamState(sharedState, stream) } + + override fun create( + stream: Stream, + opaqueStateValue: OpaqueStateValue?, + ): DefaultJdbcPartition? { + val streamState: DefaultJdbcStreamState = streamState(stream) + if (opaqueStateValue == null) { + return coldStart(streamState) + } + val sv: DefaultJdbcStreamStateValue = + Jsons.treeToValue(opaqueStateValue, DefaultJdbcStreamStateValue::class.java) + val pkMap: Map = + sv.pkMap(stream) + ?: run { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + return coldStart(streamState) + } + val cursorPair: Pair? 
= + if (sv.cursors.isEmpty()) { + null + } else { + sv.cursorPair(stream) + ?: run { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + return coldStart(streamState) + } + } + + val isCursorBasedIncremental: Boolean = + stream.configuredSyncMode == SyncMode.INCREMENTAL && !configuration.global + + return if (cursorPair == null) { + if (isCursorBasedIncremental) { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + coldStart(streamState) + } else if (pkMap.isEmpty()) { + // Snapshot complete. + null + } else { + // Snapshot ongoing. + DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + primaryKey = pkMap.keys.toList(), + lowerBound = pkMap.values.toList(), + upperBound = null + ) + } + } else { + val (cursor: Field, cursorCheckpoint: JsonNode) = cursorPair + if (!isCursorBasedIncremental) { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + coldStart(streamState) + } else if (pkMap.isNotEmpty()) { + // Snapshot ongoing. + DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + primaryKey = pkMap.keys.toList(), + lowerBound = pkMap.values.toList(), + upperBound = null, + cursor, + cursorUpperBound = cursorCheckpoint, + ) + } else if (cursorCheckpoint == streamState.cursorUpperBound) { + // Incremental complete. + null + } else { + // Incremental ongoing. + DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator, + streamState, + cursor, + cursorLowerBound = cursorCheckpoint, + isLowerBoundIncluded = true, + cursorUpperBound = streamState.cursorUpperBound, + ) + } + } + } + + private fun DefaultJdbcStreamStateValue.pkMap(stream: Stream): Map? 
{ + if (primaryKey.isEmpty()) { + return mapOf() + } + val fields: List = stream.configuredPrimaryKey ?: listOf() + if (primaryKey.keys != fields.map { it.id }.toSet()) { + handler.accept( + InvalidPrimaryKey(stream.name, stream.namespace, primaryKey.keys.toList()), + ) + return null + } + return fields.associateWith { primaryKey[it.id]!! } + } + + private fun DefaultJdbcStreamStateValue.cursorPair(stream: Stream): Pair? { + if (cursors.size > 1) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursors.keys.toString()), + ) + return null + } + val cursorLabel: String = cursors.keys.first() + val cursor: FieldOrMetaField? = stream.fields.find { it.id == cursorLabel } + if (cursor !is Field) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursorLabel), + ) + return null + } + if (stream.configuredCursor != cursor) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursorLabel), + ) + return null + } + return cursor to cursors[cursorLabel]!! + } + + private fun coldStart(streamState: DefaultJdbcStreamState): DefaultJdbcPartition { + val stream: Stream = streamState.stream + val pkChosenFromCatalog: List = stream.configuredPrimaryKey ?: listOf() + if (stream.configuredSyncMode == SyncMode.FULL_REFRESH || configuration.global) { + if (pkChosenFromCatalog.isEmpty()) { + return DefaultJdbcUnsplittableSnapshotPartition( + selectQueryGenerator, + streamState, + ) + } + return DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + pkChosenFromCatalog, + lowerBound = null, + upperBound = null, + ) + } + val cursorChosenFromCatalog: Field = + stream.configuredCursor as? 
Field ?: throw ConfigErrorException("no cursor") + if (pkChosenFromCatalog.isEmpty()) { + return DefaultJdbcUnsplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + cursorChosenFromCatalog + ) + } + return DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + pkChosenFromCatalog, + lowerBound = null, + upperBound = null, + cursorChosenFromCatalog, + cursorUpperBound = null, + ) + } + + val configuration: JdbcSourceConfiguration = sharedState.configuration + + override fun split( + unsplitPartition: DefaultJdbcPartition, + opaqueStateValues: List + ): List { + val splitPartitionBoundaries: List by lazy { + opaqueStateValues.map { Jsons.treeToValue(it, DefaultJdbcStreamStateValue::class.java) } + } + return when (unsplitPartition) { + is DefaultJdbcSplittableSnapshotPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcSplittableSnapshotWithCursorPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcCursorIncrementalPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcUnsplittableSnapshotPartition -> listOf(unsplitPartition) + is DefaultJdbcUnsplittableSnapshotWithCursorPartition -> listOf(unsplitPartition) + } + } + + private fun DefaultJdbcSplittableSnapshotPartition.split( + splitPointValues: List + ): List { + val inners: List> = + splitPointValues.mapNotNull { it.pkMap(streamState.stream)?.values?.toList() } + val lbs: List?> = listOf(lowerBound) + inners + val ubs: List?> = inners + listOf(upperBound) + return lbs.zip(ubs).map { (lowerBound, upperBound) -> + DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + primaryKey = checkpointColumns, + lowerBound, + upperBound, + ) + } + } + + private fun DefaultJdbcSplittableSnapshotWithCursorPartition.split( + splitPointValues: List + ): List { + val inners: List> = + splitPointValues.mapNotNull { it.pkMap(streamState.stream)?.values?.toList() } + val 
lbs: List?> = listOf(lowerBound) + inners + val ubs: List?> = inners + listOf(upperBound) + return lbs.zip(ubs).map { (lowerBound, upperBound) -> + DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + primaryKey = checkpointColumns, + lowerBound, + upperBound, + cursor, + cursorUpperBound, + ) + } + } + + private fun DefaultJdbcCursorIncrementalPartition.split( + splitPointValues: List + ): List { + val inners: List = splitPointValues.mapNotNull { it.cursorPair(stream)?.second } + val lbs: List = listOf(cursorLowerBound) + inners + val ubs: List = inners + listOf(cursorUpperBound) + return lbs.zip(ubs).mapIndexed { idx: Int, (lowerBound, upperBound) -> + DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator, + streamState, + cursor, + lowerBound, + isLowerBoundIncluded = idx == 0, + upperBound, + ) + } + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt new file mode 100644 index 0000000000000..0eb02dc9dc491 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.output.OutputConsumer +import io.micronaut.context.annotation.ConfigurationProperties +import jakarta.inject.Singleton +import kotlinx.coroutines.sync.Semaphore + +/** Default implementation of [JdbcSharedState]. 
*/ +@Singleton +class DefaultJdbcSharedState( + override val configuration: JdbcSourceConfiguration, + override val outputConsumer: OutputConsumer, + override val selectQuerier: SelectQuerier, + val constants: Constants, +) : JdbcSharedState { + + @ConfigurationProperties(JDBC_PROPERTY_PREFIX) + data class Constants( + val withSampling: Boolean = WITH_SAMPLING, + val maxSampleSize: Int = TABLE_SAMPLE_SIZE, + /** How many bytes per second we can expect the database to send to the connector. */ + val expectedThroughputBytesPerSecond: Long = THROUGHPUT_BYTES_PER_SECOND, + /** Smallest possible fetchSize value. */ + val minFetchSize: Int = FETCH_SIZE_LOWER_BOUND, + /** Default fetchSize value, in absence of any other estimate. */ + val defaultFetchSize: Int = DEFAULT_FETCH_SIZE, + /** Largest possible fetchSize value. */ + val maxFetchSize: Int = FETCH_SIZE_UPPER_BOUND, + /** How much of the JVM heap can we fill up with [java.sql.ResultSet] data. */ + val memoryCapacityRatio: Double = MEM_CAPACITY_RATIO, + /** Estimated bytes used as overhead for each row in a [java.sql.ResultSet]. */ + val estimatedRecordOverheadBytes: Long = RECORD_OVERHEAD_BYTES, + /** Estimated bytes used as overhead for each column value in a [java.sql.ResultSet]. */ + val estimatedFieldOverheadBytes: Long = FIELD_OVERHEAD_BYTES, + /** Overrides the JVM heap capacity to provide determinism in tests. */ + val maxMemoryBytesForTesting: Long? = null + ) { + companion object { + + // Sampling defaults. + internal const val WITH_SAMPLING: Boolean = false + internal const val TABLE_SAMPLE_SIZE: Int = 1024 + internal const val THROUGHPUT_BYTES_PER_SECOND: Long = 10L shl 20 + + // fetchSize defaults + internal const val FETCH_SIZE_LOWER_BOUND: Int = 10 + internal const val DEFAULT_FETCH_SIZE: Int = 1_000 + internal const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000 + + // Memory estimate defaults. 
+ internal const val RECORD_OVERHEAD_BYTES = 16L + internal const val FIELD_OVERHEAD_BYTES = 16L + // We're targeting use of 60% of the available memory in order to allow + // for some headroom for other garbage collection. + internal const val MEM_CAPACITY_RATIO: Double = 0.6 + } + } + + override val withSampling: Boolean + get() = constants.withSampling + + override val maxSampleSize: Int + get() = constants.maxSampleSize + + val maxPartitionThroughputBytesPerSecond: Long = + constants.expectedThroughputBytesPerSecond / configuration.maxConcurrency + + override val targetPartitionByteSize: Long = + maxPartitionThroughputBytesPerSecond * configuration.checkpointTargetInterval.seconds + + override fun jdbcFetchSizeEstimator(): JdbcSharedState.JdbcFetchSizeEstimator = + DefaultJdbcFetchSizeEstimator( + maxMemoryBytes = constants.maxMemoryBytesForTesting ?: Runtime.getRuntime().maxMemory(), + configuration.maxConcurrency, + constants.minFetchSize, + constants.defaultFetchSize, + constants.maxFetchSize, + constants.memoryCapacityRatio, + ) + + override fun rowByteSizeEstimator(): JdbcSharedState.RowByteSizeEstimator = + DefaultRowByteSizeEstimator( + constants.estimatedRecordOverheadBytes, + constants.estimatedFieldOverheadBytes, + ) + + internal val semaphore = Semaphore(configuration.maxConcurrency) + + override fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? = + if (semaphore.tryAcquire()) { + JdbcPartitionsCreator.AcquiredResources { semaphore.release() } + } else { + null + } + + override fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? 
= + if (semaphore.tryAcquire()) { + JdbcPartitionReader.AcquiredResources { semaphore.release() } + } else { + null + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt new file mode 100644 index 0000000000000..f4c16b87a7224 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import java.util.concurrent.atomic.AtomicReference + +/** Default implementation of [JdbcStreamState]. */ +class DefaultJdbcStreamState( + override val sharedState: DefaultJdbcSharedState, + override val stream: Stream, +) : JdbcStreamState { + + override var cursorUpperBound: JsonNode? + get() = transient.get().cursorUpperBound + set(value) { + transient.updateAndGet { it.copy(cursorUpperBound = value) } + } + + override var fetchSize: Int? 
+ get() = transient.get().fetchSize + set(value) { + transient.updateAndGet { it.copy(fetchSize = value) } + } + + override val fetchSizeOrDefault: Int + get() = fetchSize ?: sharedState.constants.defaultFetchSize + + override val limit: Long + get() = fetchSizeOrDefault * transient.get().limitState.current + + private val transient = AtomicReference(Transient.initial) + + override fun updateLimitState(fn: (LimitState) -> LimitState) { + transient.updateAndGet { it.copy(limitState = fn(it.limitState)) } + } + + override fun reset() { + transient.set(Transient.initial) + } + + private data class Transient( + val fetchSize: Int?, + val limitState: LimitState, + val cursorUpperBound: JsonNode?, + ) { + companion object { + val initial = Transient(fetchSize = null, LimitState.minimum, cursorUpperBound = null) + } + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt new file mode 100644 index 0000000000000..ee4ac3f398767 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.annotation.JsonProperty +import com.fasterxml.jackson.databind.JsonNode +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.util.Jsons + +/** + * [DefaultJdbcStreamStateValue] is used by [DefaultJdbcPartitionFactory] for deserializing an + * [OpaqueStateValue] into a [DefaultJdbcPartition]. The latter is able to, in turn, serialize a + * partition boundary into an [OpaqueStateValue] with [DefaultJdbcStreamStateValue]. 
+ */ +data class DefaultJdbcStreamStateValue( + @JsonProperty("primary_key") val primaryKey: Map = mapOf(), + @JsonProperty("cursors") val cursors: Map = mapOf(), +) { + companion object { + /** Value representing the completion of a FULL_REFRESH snapshot. */ + val snapshotCompleted: OpaqueStateValue + get() = Jsons.valueToTree(DefaultJdbcStreamStateValue()) + + /** Value representing the progress of a ongoing snapshot not involving cursor columns. */ + fun snapshotCheckpoint( + primaryKey: List, + primaryKeyCheckpoint: List, + ): OpaqueStateValue = + Jsons.valueToTree( + DefaultJdbcStreamStateValue( + primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(), + ) + ) + + /** Value representing the progress of an ongoing snapshot involving cursor columns. */ + fun snapshotWithCursorCheckpoint( + primaryKey: List, + primaryKeyCheckpoint: List, + cursor: Field, + cursorUpperBound: JsonNode, + ): OpaqueStateValue = + Jsons.valueToTree( + DefaultJdbcStreamStateValue( + primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(), + cursors = mapOf(cursor.id to cursorUpperBound), + ) + ) + + /** Value representing the progress of an ongoing incremental cursor read. */ + fun cursorIncrementalCheckpoint( + cursor: Field, + cursorCheckpoint: JsonNode, + ): OpaqueStateValue = + Jsons.valueToTree( + DefaultJdbcStreamStateValue( + cursors = mapOf(cursor.id to cursorCheckpoint), + ) + ) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt new file mode 100644 index 0000000000000..7694472f9ad06 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.core.JsonGenerator +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.util.Jsons +import java.io.OutputStream + +/** Estimates the in-memory byte size of a table row based on its [ObjectNode] representation. */ +class DefaultRowByteSizeEstimator( + val estimatedRecordOverheadBytes: Long, + val estimatedFieldOverheadBytes: Long, +) : JdbcSharedState.RowByteSizeEstimator { + private var counter: Long = 0L + + override fun apply(record: ObjectNode): Long { + counter = 0L + Jsons.writeValue(jsonGenerator, record) + // The counter value includes the byte count on field name encodings; subtract this. + // We don't want the estimate to depend on the column name lengths. + val adjustedFieldOverheadBytes: Long = + record.fields().asSequence().sumOf { (fieldName: String, _) -> + val fieldNameOvercount: Int = ",\"".length + fieldName.length + "\":".length + estimatedFieldOverheadBytes - fieldNameOvercount + } + return estimatedRecordOverheadBytes + counter + adjustedFieldOverheadBytes + } + + private val countingOutputStream = + object : OutputStream() { + override fun write(b: Int) { + counter++ + } + } + + private val jsonGenerator: JsonGenerator = Jsons.createGenerator(countingOutputStream) +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt new file mode 100644 index 0000000000000..e6bd519911aa1 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue + +/** + * Encapsulates database-specific aspects relating to a JDBC stream partition, consumed by + * [JdbcPartitionReader] and friends. + */ +interface JdbcPartition> { + + /** The partition's stream's transient state, including parameters like fetchSize, etc. */ + val streamState: S + + /** Query which produces all records in the partition in no particular order. */ + val nonResumableQuery: SelectQuery + + /** State value to emit when the partition is read in its entirety. */ + val completeState: OpaqueStateValue + + /** Query which samples records in the partition at the rate of 2^-[sampleRateInvPow2]. */ + fun samplingQuery(sampleRateInvPow2: Int): SelectQuery + + /** Tries to acquire resources for [JdbcPartitionsCreator]. */ + fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? = + // Acquire global resources by default. + streamState.sharedState.tryAcquireResourcesForCreator() + + /** Tries to acquire resources for [JdbcPartitionReader]. */ + fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? = + // Acquire global resources by default. + streamState.sharedState.tryAcquireResourcesForReader() +} + +/** A [JdbcPartition] which can be subdivided. */ +interface JdbcSplittablePartition> : JdbcPartition { + + /** Query which produces a subset of records at the beginning of the partition. */ + fun resumableQuery(limit: Long): SelectQuery + + /** State value to emit when the partition is read up to (and including) [lastRecord]. */ + fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue +} + +/** A [JdbcPartition] which allows cursor-based incremental reads. */ +interface JdbcCursorPartition> : JdbcPartition { + + /** Query which produces the current maximum cursor value in the stream. 
*/ + val cursorUpperBoundQuery: SelectQuery +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt new file mode 100644 index 0000000000000..6d885654b8c3b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.OpaqueStateValue +import io.micronaut.context.annotation.DefaultImplementation + +/** Encapsulates database-specific logic turning [OpaqueStateValue] into [JdbcPartition]. */ +@DefaultImplementation(DefaultJdbcPartitionFactory::class) +interface JdbcPartitionFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +> { + + /** The state shared by all partitions. Includes global resources. */ + val sharedState: A + + /** Get or create the [JdbcStreamState] for a [stream]. */ + fun streamState(stream: Stream): S + + /** + * Deserializes [opaqueStateValue] and creates a [JdbcPartition] instance corresponding to all + * remaining unread data in the [stream], if any; null otherwise. + */ + fun create(stream: Stream, opaqueStateValue: OpaqueStateValue?): P? + + /** Subdivides the [unsplitPartition] by splitting at the [opaqueStateValues], if possible. */ + fun split(unsplitPartition: P, opaqueStateValues: List): List

+} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt new file mode 100644 index 0000000000000..77c2944befc1f --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt @@ -0,0 +1,149 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.output.OutputConsumer +import io.airbyte.cdk.util.Jsons +import io.airbyte.protocol.models.v0.AirbyteRecordMessage +import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.atomic.AtomicLong +import java.util.concurrent.atomic.AtomicReference +import kotlin.coroutines.coroutineContext +import kotlinx.coroutines.ensureActive + +/** Base class for JDBC implementations of [PartitionReader]. */ +sealed class JdbcPartitionReader

>( + val partition: P, +) : PartitionReader { + + val streamState: JdbcStreamState<*> = partition.streamState + val stream: Stream = streamState.stream + val sharedState: JdbcSharedState = streamState.sharedState + val outputConsumer: OutputConsumer = sharedState.outputConsumer + val selectQuerier: SelectQuerier = sharedState.selectQuerier + + private val acquiredResources = AtomicReference() + + /** Calling [close] releases the resources acquired for the [JdbcPartitionReader]. */ + fun interface AcquiredResources : AutoCloseable + + override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus { + val acquiredResources: AcquiredResources = + partition.tryAcquireResourcesForReader() + ?: return PartitionReader.TryAcquireResourcesStatus.RETRY_LATER + this.acquiredResources.set(acquiredResources) + return PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN + } + + fun out(record: ObjectNode) { + val recordMessageData: ObjectNode = Jsons.objectNode() + for (fieldName in streamFieldNames) { + recordMessageData.set(fieldName, record[fieldName] ?: Jsons.nullNode()) + } + outputConsumer.accept( + AirbyteRecordMessage() + .withStream(stream.name) + .withNamespace(stream.namespace) + .withData(recordMessageData), + ) + } + + val streamFieldNames: List = stream.fields.map { it.id } + + override fun releaseResources() { + acquiredResources.getAndSet(null)?.close() + } +} + +/** JDBC implementation of [PartitionReader] which reads the [partition] in its entirety. */ +class JdbcNonResumablePartitionReader

>( + partition: P, +) : JdbcPartitionReader

(partition) { + + val runComplete = AtomicBoolean(false) + val numRecords = AtomicLong() + + override suspend fun run() { + selectQuerier + .executeQuery( + q = partition.nonResumableQuery, + parameters = SelectQuerier.Parameters(streamState.fetchSize), + ) + .use { result: SelectQuerier.Result -> + for (record in result) { + out(record) + numRecords.incrementAndGet() + } + } + runComplete.set(true) + } + + override fun checkpoint(): PartitionReadCheckpoint { + // Sanity check. + if (!runComplete.get()) throw RuntimeException("cannot checkpoint non-resumable read") + // The run method executed to completion without a LIMIT clause. + // This implies that the partition boundary has been reached. + return PartitionReadCheckpoint(partition.completeState, numRecords.get()) + } +} + +/** + * JDBC implementation of [PartitionReader] which reads as much as possible of the [partition], in + * order, before timing out. + */ +class JdbcResumablePartitionReader

>( + partition: P, +) : JdbcPartitionReader

(partition) { + + val incumbentLimit = AtomicLong() + val numRecords = AtomicLong() + val lastRecord = AtomicReference(null) + val runComplete = AtomicBoolean(false) + + override suspend fun run() { + val fetchSize: Int = streamState.fetchSizeOrDefault + val limit: Long = streamState.limit + incumbentLimit.set(limit) + selectQuerier + .executeQuery( + q = partition.resumableQuery(limit), + parameters = SelectQuerier.Parameters(fetchSize), + ) + .use { result: SelectQuerier.Result -> + for (record in result) { + out(record) + lastRecord.set(record) + // Check activity periodically to handle timeout. + if (numRecords.incrementAndGet() % fetchSize == 0L) { + coroutineContext.ensureActive() + } + } + } + runComplete.set(true) + } + + override fun checkpoint(): PartitionReadCheckpoint { + if (runComplete.get() && numRecords.get() < streamState.limit) { + // The run method executed to completion with a LIMIT clause which was not reached. + return PartitionReadCheckpoint(partition.completeState, numRecords.get()) + } + // The run method ended because of either the LIMIT or the timeout. + // Adjust the LIMIT value so that it grows or shrinks to try to fit the timeout. + if (incumbentLimit.get() > 0L) { + if (runComplete.get() && streamState.limit <= incumbentLimit.get()) { + // Increase the limit clause for the next PartitionReader, because it's too small. + // If it had been bigger then run might have executed for longer. + streamState.updateLimitState { it.up } + } + if (!runComplete.get() && incumbentLimit.get() <= streamState.limit) { + // Decrease the limit clause for the next PartitionReader, because it's too big. + // If it had been smaller then run might have completed in time. + streamState.updateLimitState { it.down } + } + } + val checkpointState: OpaqueStateValue = partition.incompleteState(lastRecord.get()!!) 
+ return PartitionReadCheckpoint(checkpointState, numRecords.get()) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt new file mode 100644 index 0000000000000..7c371d6f41b6d --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt @@ -0,0 +1,240 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.output.OutputConsumer +import io.airbyte.cdk.util.Jsons +import io.github.oshai.kotlinlogging.KotlinLogging +import java.util.concurrent.atomic.AtomicReference +import kotlin.random.Random + +/** Base class for JDBC implementations of [PartitionsCreator]. */ +sealed class JdbcPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + val partition: P, + val partitionFactory: JdbcPartitionFactory, +) : PartitionsCreator { + private val log = KotlinLogging.logger {} + + val streamState: S = partition.streamState + val stream: Stream = streamState.stream + val sharedState: A = streamState.sharedState + val configuration: JdbcSourceConfiguration = sharedState.configuration + val outputConsumer: OutputConsumer = sharedState.outputConsumer + val selectQuerier: SelectQuerier = sharedState.selectQuerier + + private val acquiredResources = AtomicReference() + + /** Calling [close] releases the resources acquired for the [JdbcPartitionsCreator]. 
*/ + fun interface AcquiredResources : AutoCloseable + + override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus { + val acquiredResources: AcquiredResources = + partition.tryAcquireResourcesForCreator() + ?: return PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER + this.acquiredResources.set(acquiredResources) + return PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN + } + + override fun releaseResources() { + acquiredResources.getAndSet(null)?.close() + } + + fun ensureCursorUpperBound() { + val cursorUpperBoundQuery: SelectQuery = + (partition as JdbcCursorPartition<*>).cursorUpperBoundQuery + if (streamState.cursorUpperBound != null) { + return + } + log.info { "Querying maximum cursor column value." } + val record: ObjectNode? = + selectQuerier.executeQuery(cursorUpperBoundQuery).use { + if (it.hasNext()) it.next() else null + } + if (record == null) { + streamState.cursorUpperBound = Jsons.nullNode() + return + } + val cursorUpperBound: JsonNode? = record.fields().asSequence().firstOrNull()?.value + if (cursorUpperBound == null) { + log.warn { "No cursor column value found in '${stream.label}'." } + return + } + if (cursorUpperBound.isNull) { + log.warn { "Maximum cursor column value in '${stream.label}' is NULL." } + return + } + log.info { "Maximum cursor column value in '${stream.label}' is '$cursorUpperBound'." } + streamState.cursorUpperBound = cursorUpperBound + } + + /** Collects a sample of rows in the unsplit partition. */ + fun collectSample( + recordMapper: (ObjectNode) -> T, + ): Sample { + val values = mutableListOf() + var previousWeight = 0L + for (sampleRateInvPow2 in listOf(16, 8, 0)) { + val sampleRateInv: Long = 1L shl sampleRateInvPow2 + log.info { "Sampling stream '${stream.label}' at rate 1 / $sampleRateInv." } + // First, try sampling the table at a rate of one every 2^16 = 65_536 rows. 
+ // If that's not enough to produce the desired number of sampled rows (1024 by default) + // then try sampling at a higher rate of one every 2^8 = 256 rows. + // If that's still not enough, don't sample at all. + values.clear() + val samplingQuery: SelectQuery = partition.samplingQuery(sampleRateInvPow2) + selectQuerier.executeQuery(samplingQuery).use { + for (record in it) { + values.add(recordMapper(record)) + } + } + if (values.size < sharedState.maxSampleSize) { + previousWeight = sampleRateInv * values.size / sharedState.maxSampleSize + continue + } + val kind: Sample.Kind = + when (sampleRateInvPow2) { + 16 -> Sample.Kind.LARGE + 8 -> Sample.Kind.MEDIUM + else -> Sample.Kind.SMALL + } + log.info { "Sampled ${values.size} rows in ${kind.name} stream '${stream.label}'." } + return Sample(values, kind, previousWeight.coerceAtLeast(sampleRateInv)) + } + val kind: Sample.Kind = if (values.isEmpty()) Sample.Kind.EMPTY else Sample.Kind.TINY + log.info { "Sampled ${values.size} rows in ${kind.name} stream '${stream.label}'." } + return Sample(values, kind, if (values.isEmpty()) 0L else 1L) + } +} + +/** Sequential JDBC implementation of [PartitionsCreator]. */ +class JdbcSequentialPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partition: P, + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreator(partition, partitionFactory) { + private val log = KotlinLogging.logger {} + + override suspend fun run(): List { + // Ensure that the cursor upper bound is known, if required. + if (partition is JdbcCursorPartition<*>) { + ensureCursorUpperBound() + if (streamState.cursorUpperBound?.isNull == true) { + log.info { "Maximum cursor column value query found that the table was empty." 
} + return listOf() + } + } + if (streamState.fetchSize == null) { + if (sharedState.withSampling) { + val rowByteSizeSample: Sample = + collectSample(sharedState.rowByteSizeEstimator()::apply) + val expectedTableByteSize: Long = + rowByteSizeSample.sampledValues.sum() * rowByteSizeSample.valueWeight + log.info { "Table memory size estimated at ${expectedTableByteSize shr 20} MiB." } + if (rowByteSizeSample.kind == Sample.Kind.EMPTY) { + log.info { "Sampling query found that the table was empty." } + return listOf() + } + streamState.fetchSize = + sharedState.jdbcFetchSizeEstimator().apply(rowByteSizeSample) + } else { + // TODO: adaptive fetchSize computation? + } + } + // Handle edge case where the partition cannot be split. + if (partition !is JdbcSplittablePartition<*>) { + log.warn { + "Table cannot be read by sequential partition reader because it cannot be split." + } + return listOf(JdbcNonResumablePartitionReader(partition)) + } + // Happy path. + log.info { "Table will be read by sequential partition reader(s)." } + return listOf(JdbcResumablePartitionReader(partition)) + } +} + +/** Concurrent JDBC implementation of [PartitionsCreator]. */ +class JdbcConcurrentPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partition: P, + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreator(partition, partitionFactory) { + private val log = KotlinLogging.logger {} + + override suspend fun run(): List { + // Ensure that the cursor upper bound is known, if required. + if (partition is JdbcCursorPartition<*>) { + ensureCursorUpperBound() + if (streamState.cursorUpperBound?.isNull == true) { + log.info { "Maximum cursor column value query found that the table was empty." } + return listOf() + } + } + // Handle edge case where the table can't be sampled. + if (!sharedState.withSampling) { + log.warn { + "Table cannot be read by concurrent partition readers because it cannot be sampled." 
+ } + // TODO: adaptive fetchSize computation? + return listOf(JdbcNonResumablePartitionReader(partition)) + } + // Sample the table for partition split boundaries and for record byte sizes. + val sample: Sample> = collectSample { record: ObjectNode -> + val boundary: OpaqueStateValue? = + (partition as? JdbcSplittablePartition<*>)?.incompleteState(record) + val rowByteSize: Long = sharedState.rowByteSizeEstimator().apply(record) + boundary to rowByteSize + } + if (sample.kind == Sample.Kind.EMPTY) { + log.info { "Sampling query found that the table was empty." } + return listOf() + } + val rowByteSizeSample: Sample = sample.map { (_, rowByteSize: Long) -> rowByteSize } + streamState.fetchSize = sharedState.jdbcFetchSizeEstimator().apply(rowByteSizeSample) + val expectedTableByteSize: Long = rowByteSizeSample.sampledValues.sum() * sample.valueWeight + log.info { "Table memory size estimated at ${expectedTableByteSize shr 20} MiB." } + // Handle edge case where the table can't be split. + if (partition !is JdbcSplittablePartition<*>) { + log.warn { + "Table cannot be read by concurrent partition readers because it cannot be split." + } + return listOf(JdbcNonResumablePartitionReader(partition)) + } + // Happy path. + log.info { "Target partition size is ${sharedState.targetPartitionByteSize shr 20} MiB." } + val secondarySamplingRate: Double = + if (expectedTableByteSize <= sharedState.targetPartitionByteSize) { + 0.0 + } else { + val expectedPartitionByteSize: Long = + expectedTableByteSize / sharedState.maxSampleSize + if (expectedPartitionByteSize < sharedState.targetPartitionByteSize) { + expectedPartitionByteSize.toDouble() / sharedState.targetPartitionByteSize + } else { + 1.0 + } + } + val random = Random(expectedTableByteSize) // RNG output is repeatable. 
+ val splitBoundaries: List = + sample.sampledValues + .filter { random.nextDouble() < secondarySamplingRate } + .mapNotNull { (splitBoundary: OpaqueStateValue?, _) -> splitBoundary } + .distinct() + val partitions: List> = partitionFactory.split(partition, splitBoundaries) + log.info { "Table will be read by ${partitions.size} concurrent partition reader(s)." } + return partitions.map { JdbcNonResumablePartitionReader(it) } + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt new file mode 100644 index 0000000000000..382dd38e822e5 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.OpaqueStateValue +import io.micronaut.context.annotation.Requires +import jakarta.inject.Singleton + +/** Base class for JDBC implementations of [PartitionsCreatorFactory]. */ +sealed class JdbcPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + val partitionFactory: JdbcPartitionFactory, +) : PartitionsCreatorFactory { + + override fun make( + stateQuerier: StateQuerier, + feed: Feed, + ): PartitionsCreator { + val opaqueStateValue: OpaqueStateValue? = stateQuerier.current(feed) + return when (feed) { + is Global -> CreateNoPartitions + is Stream -> { + val partition: P? = partitionFactory.create(feed, opaqueStateValue) + if (partition == null) { + CreateNoPartitions + } else { + partitionsCreator(partition) + } + } + } + } + + abstract fun partitionsCreator(partition: P): JdbcPartitionsCreator +} + +/** Sequential JDBC implementation of [PartitionsCreatorFactory]. 
*/ +@Singleton +@Requires(property = MODE_PROPERTY, value = "sequential") +class JdbcSequentialPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreatorFactory(partitionFactory) { + + override fun partitionsCreator(partition: P): JdbcPartitionsCreator = + JdbcSequentialPartitionsCreator(partition, partitionFactory) +} + +/** Concurrent JDBC implementation of [PartitionsCreatorFactory]. */ +@Singleton +@Requires(property = MODE_PROPERTY, value = "concurrent") +class JdbcConcurrentPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreatorFactory(partitionFactory) { + + override fun partitionsCreator(partition: P): JdbcPartitionsCreator = + JdbcConcurrentPartitionsCreator(partition, partitionFactory) +} + +const val JDBC_PROPERTY_PREFIX = "airbyte.connector.extract.jdbc" + +private const val MODE_PROPERTY = "$JDBC_PROPERTY_PREFIX.mode" diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt new file mode 100644 index 0000000000000..83e9d8275b9bc --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.output.OutputConsumer +import io.micronaut.context.annotation.DefaultImplementation + +/** + * Encapsulates database-specific state, both constant or transient, common to all partitions. + * + * Implementations should be thread-safe. 
+ */ +@DefaultImplementation(DefaultJdbcSharedState::class) +interface JdbcSharedState { + + /** Configuration for the JDBC source connector. */ + val configuration: JdbcSourceConfiguration + + /** Where the records get dumped into. */ + val outputConsumer: OutputConsumer + + /** Queries the database. */ + val selectQuerier: SelectQuerier + + /** Is sampling the streams a good idea? */ + val withSampling: Boolean + + /** Sample size limit. */ + val maxSampleSize: Int + + /** Targeted memory footprint of a partition, in bytes. */ + val targetPartitionByteSize: Long + + /** Creates a new instance of a [JdbcFetchSizeEstimator]. */ + fun jdbcFetchSizeEstimator(): JdbcFetchSizeEstimator + + fun interface JdbcFetchSizeEstimator { + /** Estimates a good JDBC fetchSize value based on a [rowByteSizeSample]. */ + fun apply(rowByteSizeSample: Sample): Int + } + + /** Creates a new instance of a [RowByteSizeEstimator]. */ + fun rowByteSizeEstimator(): RowByteSizeEstimator + + fun interface RowByteSizeEstimator { + /** Estimates the memory footprint of a row based on its corresponding [record]. */ + fun apply(record: ObjectNode): Long + } + + /** Tries to acquire global resources for [JdbcPartitionsCreator]. */ + fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? + + /** Tries to acquire global resources for [JdbcPartitionReader]. */ + fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt new file mode 100644 index 0000000000000..872340c4877c6 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode + +/** + * Encapsulates database-specific transient state for a particular [stream]. + * + * Implementations should be thread-safe. + */ +interface JdbcStreamState { + + val stream: Stream + + /** The transient state shared by all partitions. Includes global resources. */ + val sharedState: A + + /** Value to use as upper bound for the cursor column. */ + var cursorUpperBound: JsonNode? + + /** Value to use for JDBC fetchSize, if specified. */ + var fetchSize: Int? + + /** Same as [fetchSize], but falls back to a default value. */ + val fetchSizeOrDefault: Int + + /** Value to use for the LIMIT clause in resumable reads, if applicable. */ + val limit: Long + + /** Adjusts the [limit] value up or down. */ + fun updateLimitState(fn: (LimitState) -> LimitState) + + /** Resets the transient state to its initial setting. */ + fun reset() +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt deleted file mode 100644 index 4d73e339b3c8b..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import io.github.oshai.kotlinlogging.KotlinLogging -import kotlin.math.max -import kotlin.math.min - -/** - * [MemoryFetchSizeEstimator] is used to estimate the value of the JDBC fetchSize parameter to fill - * up a portion of the JVM heap defined by [MEM_CAPACITY_RATIO]. 
- */ -class MemoryFetchSizeEstimator( - val maxMemoryBytes: Long, - val maxConcurrency: Int, -) { - private val log = KotlinLogging.logger {} - - fun apply(rowByteSizeSample: Sample): Int { - val maxRowBytes: Long = rowByteSizeSample.sampledValues.maxOrNull() ?: 0L - log.info { - "maximum row size in ${rowByteSizeSample.kind.name} table is $maxRowBytes bytes" - } - val targetMemoryUse: Long = (maxMemoryBytes * MEM_CAPACITY_RATIO).toLong() - if (listOf(maxRowBytes, targetMemoryUse, maxConcurrency.toLong()).any { it <= 0L }) { - return DEFAULT_FETCH_SIZE - } - val targetMemoryUsePerQuery: Long = targetMemoryUse / maxConcurrency - log.info { - "targeting a maximum of $targetMemoryUsePerQuery bytes " + - "for each of up to $maxConcurrency queries" - } - val maxRowsFetchedPerQuery: Long = targetMemoryUsePerQuery / maxRowBytes - return max( - FETCH_SIZE_LOWER_BOUND, - min( - maxRowsFetchedPerQuery, - FETCH_SIZE_UPPER_BOUND.toLong(), - ) - .toInt(), - ) - } - - companion object { - const val FETCH_SIZE_LOWER_BOUND: Int = 10 - const val DEFAULT_FETCH_SIZE: Int = 1_000 - const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000 - - // We're targeting use of 60% of the available memory in order to allow - // for some headroom for other garbage collection. - const val MEM_CAPACITY_RATIO: Double = 0.6 - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt index 059061ec2740a..eed60db50ca7d 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt @@ -1,6 +1,7 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.read +/** Convenience object for maintaining sampled data and its accompanying metadata. 
*/ data class Sample( val sampledValues: List, val kind: Kind, @@ -10,7 +11,7 @@ data class Sample( enum class Kind { EMPTY, // the table is empty; - TINY, // the table has less rows than the target sample size; + TINY, // the table has fewer rows than the target sample size; SMALL, // collecting the sample still requires a full table scan; MEDIUM, // collecting the sample is possible while sampling at ~0.3%; LARGE, // collecting the sample is possible while sampling most aggressively. diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt index ca84de7a91220..93ebb28f4e148 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt @@ -71,12 +71,17 @@ class JdbcSelectQuerier( var isReady = false var hasNext = false + var hasLoggedResultsReceived = false override fun hasNext(): Boolean { // hasNext() is idempotent if (isReady) return hasNext // Advance to the next row to become ready again. hasNext = rs!!.next() + if (!hasLoggedResultsReceived) { + log.info { "Received results from server." 
} + hasLoggedResultsReceived = true + } if (!hasNext) { close() } @@ -107,7 +112,10 @@ class JdbcSelectQuerier( isReady = true hasNext = false try { - rs?.close() + if (rs != null) { + log.info { "Closing ${q.sql}" } + rs!!.close() + } } finally { rs = null try { diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt index 132b987da38e9..86498d4388e1d 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt @@ -26,7 +26,9 @@ sealed interface SelectNode { data class SelectColumns( override val columns: List, -) : SelectNode +) : SelectNode { + constructor(vararg columns: Field) : this(columns.toList()) +} data class SelectColumnMaxValue( val column: Field, @@ -87,6 +89,11 @@ sealed interface WhereClauseLeafNode : WhereClauseNode { val bindingValue: JsonNode } +data class GreaterOrEqual( + override val column: Field, + override val bindingValue: JsonNode, +) : WhereClauseLeafNode + data class Greater( override val column: Field, override val bindingValue: JsonNode, @@ -97,12 +104,12 @@ data class LesserOrEqual( override val bindingValue: JsonNode, ) : WhereClauseLeafNode -data class Equal( +data class Lesser( override val column: Field, override val bindingValue: JsonNode, ) : WhereClauseLeafNode -data class Lesser( +data class Equal( override val column: Field, override val bindingValue: JsonNode, ) : WhereClauseLeafNode @@ -111,7 +118,9 @@ sealed interface OrderByNode data class OrderBy( val columns: List, -) : OrderByNode +) : OrderByNode { + constructor(vararg columns: Field) : this(columns.toList()) +} data object NoOrderBy : OrderByNode diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt 
b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt deleted file mode 100644 index f163e54145184..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.AirbyteRecordMessage -import java.util.concurrent.atomic.AtomicBoolean -import java.util.concurrent.atomic.AtomicLong -import java.util.concurrent.atomic.AtomicReference -import kotlin.coroutines.coroutineContext -import kotlinx.coroutines.ensureActive - -/** Default implementation of [PartitionReader] for streams in JDBC sources. */ -class StreamPartitionReader( - val ctx: StreamReadContext, - val input: Input, - val parameters: Parameters, -) : PartitionReader { - sealed interface Input - - data class SnapshotInput( - val primaryKey: List, - val primaryKeyLowerBound: List?, - val primaryKeyUpperBound: List?, - ) : Input - - data class SnapshotWithCursorInput( - val primaryKey: List, - val primaryKeyLowerBound: List?, - val primaryKeyUpperBound: List?, - val cursor: Field, - val cursorUpperBound: JsonNode, - ) : Input - - data class CursorIncrementalInput( - val cursor: Field, - val cursorLowerBound: JsonNode, - val cursorUpperBound: JsonNode, - ) : Input - - data class Parameters( - val preferResumable: Boolean, - ) - - override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus = - if (ctx.querySemaphore.tryAcquire()) { - PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN - } else { - PartitionReader.TryAcquireResourcesStatus.RETRY_LATER - } - - override fun releaseResources() { - ctx.querySemaphore.release() - } - - val resumable: Boolean = - 
parameters.preferResumable && - when (input) { - is SnapshotInput -> input.primaryKey.isNotEmpty() - is SnapshotWithCursorInput -> input.primaryKey.isNotEmpty() - is CursorIncrementalInput -> true - } - - val incumbentTransientState = AtomicReference() - val numRecords = AtomicLong() - val lastRecord = AtomicReference(null) - val runComplete = AtomicBoolean(false) - - override suspend fun run() { - // Store the transient state at the start of the run for use in checkpoint(). - val transientState = - TransientState(ctx.transientLimitState.get(), ctx.transientFetchSize.get()) - incumbentTransientState.set(transientState) - // Build the query. - val querySpec: SelectQuerySpec = - input.querySpec( - ctx.stream, - isOrdered = resumable, - limit = transientState.limit.takeIf { resumable }, - ) - val query: SelectQuery = ctx.selectQueryGenerator.generate(querySpec.optimize()) - val streamFieldNames: List = ctx.stream.fields.map { it.id } - val querierParameters = SelectQuerier.Parameters(fetchSize = transientState.fetchSize) - // Execute the query. - ctx.selectQuerier.executeQuery(query, querierParameters).use { result: SelectQuerier.Result - -> - for (record in result) { - val dataRecord: JsonNode = - Jsons.objectNode().apply { - for (fieldName in streamFieldNames) { - set(fieldName, record[fieldName] ?: Jsons.nullNode()) - } - } - ctx.outputConsumer.accept( - AirbyteRecordMessage() - .withStream(ctx.stream.name) - .withNamespace(ctx.stream.namespace) - .withData(dataRecord), - ) - lastRecord.set(record) - numRecords.incrementAndGet() - // If progress can be checkpointed at any time, - // check activity periodically to handle timeout. 
- if (!resumable) continue - if (numRecords.get() % transientState.fetchSizeOrLowerBound != 0L) continue - coroutineContext.ensureActive() - } - } - runComplete.set(true) - } - - override fun checkpoint(): PartitionReadCheckpoint { - val checkpointState: CheckpointStreamState - val transientState: TransientState = incumbentTransientState.get() - if (!runComplete.get()) { - // Sanity check. - if (!resumable) throw RuntimeException("cannot checkpoint non-resumable read") - // The run method execution was interrupted. - checkpointState = input.checkpoint(lastRecord.get()) - // Decrease the limit clause for the next PartitionReader, because it's too big. - // If it had been smaller then run might have completed in time. - ctx.transientLimitState.update { - if (transientState.limitState.current <= it.current) it.down else it - } - } else if (resumable) { - // The run method executed to completion with a LIMIT clause. - // The partition boundary may or may not have been reached. - // If the number of records read is less than the LIMIT clause, - // then it certainly has. - checkpointState = - if (numRecords.get() < transientState.limit) { - input.checkpoint() - } else { - input.checkpoint(lastRecord.get()) - } - // Increase the limit clause for the next PartitionReader, because it's too small. - // If it had been bigger then run might have executed for longer. - ctx.transientLimitState.update { - if (it.current <= transientState.limitState.current) it.up else it - } - } else { - // The run method executed to completion without a LIMIT clause. - // This implies that the partition boundary has been reached. 
- checkpointState = input.checkpoint() - } - return PartitionReadCheckpoint(checkpointState.opaqueStateValue(), numRecords.get()) - } - - inner class TransientState( - val limitState: LimitState, - val fetchSize: Int?, - ) { - val fetchSizeOrLowerBound: Int - get() = fetchSize ?: MemoryFetchSizeEstimator.FETCH_SIZE_LOWER_BOUND - - /** Value to use for the LIMIT clause, if applicable. */ - val limit: Long - get() = fetchSizeOrLowerBound * limitState.current - } -} - -/** Converts a [StreamPartitionReader.Input] into a [SelectQuerySpec]. */ -fun StreamPartitionReader.Input.querySpec( - stream: Stream, - isOrdered: Boolean, - limit: Long?, -): SelectQuerySpec = - when (this) { - is StreamPartitionReader.SnapshotInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = primaryKey, - checkpointLowerBound = primaryKeyLowerBound, - checkpointUpperBound = primaryKeyUpperBound, - isOrdered, - limit, - ) - is StreamPartitionReader.SnapshotWithCursorInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = primaryKey, - checkpointLowerBound = primaryKeyLowerBound, - checkpointUpperBound = primaryKeyUpperBound, - isOrdered, - limit, - ) - is StreamPartitionReader.CursorIncrementalInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = listOf(cursor), - checkpointLowerBound = listOf(cursorLowerBound), - checkpointUpperBound = listOf(cursorUpperBound), - isOrdered, - limit, - ) - } - -private fun querySpecForStreamPartitionReader( - stream: Stream, - checkpointColumns: List, - checkpointLowerBound: List?, - checkpointUpperBound: List?, - isOrdered: Boolean, - limit: Long?, -): SelectQuerySpec { - val selectColumns: List = - if (isOrdered) { - stream.fields + checkpointColumns - } else { - stream.fields - } - val zippedLowerBound: List> = - checkpointLowerBound?.let { checkpointColumns.zip(it) } ?: listOf() - val lowerBoundDisj: List = - zippedLowerBound.mapIndexed { idx: Int, (gtCol: Field, gtValue: JsonNode) -> - val 
lastLeaf: WhereClauseLeafNode = Greater(gtCol, gtValue) - And( - zippedLowerBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> - Equal(eqCol, eqValue) - } + listOf(lastLeaf), - ) - } - val zippedUpperBound: List> = - checkpointUpperBound?.let { checkpointColumns.zip(it) } ?: listOf() - val upperBoundDisj: List = - zippedUpperBound.mapIndexed { idx: Int, (leqCol: Field, leqValue: JsonNode) -> - val lastLeaf: WhereClauseLeafNode = - if (idx < zippedUpperBound.size - 1) { - Lesser(leqCol, leqValue) - } else { - LesserOrEqual(leqCol, leqValue) - } - And( - zippedUpperBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> - Equal(eqCol, eqValue) - } + listOf(lastLeaf), - ) - } - return SelectQuerySpec( - SelectColumns(selectColumns), - From(stream.name, stream.namespace), - Where(And(Or(lowerBoundDisj), Or(upperBoundDisj))), - if (isOrdered) OrderBy(checkpointColumns) else NoOrderBy, - if (limit == null) NoLimit else Limit(limit), - ) -} - -/** - * Generates a [CheckpointStreamState] using the [StreamPartitionReader.Input] initial state and, if - * provided, the last record read by the [StreamPartitionReader]. When not provided, the partition - * is presumed to have been read in its entirety. - */ -fun StreamPartitionReader.Input.checkpoint(row: ObjectNode? 
= null): CheckpointStreamState { - fun getRowValue(field: Field): JsonNode = row?.get(field.id) ?: Jsons.nullNode() - return when (this) { - is StreamPartitionReader.SnapshotInput -> - if (row != null) { - SnapshotCheckpoint(primaryKey, primaryKey.map(::getRowValue)) - } else if (primaryKeyUpperBound != null) { - SnapshotCheckpoint(primaryKey, primaryKeyUpperBound) - } else { - SnapshotCompleted - } - is StreamPartitionReader.SnapshotWithCursorInput -> - if (row != null) { - SnapshotWithCursorCheckpoint( - primaryKey, - primaryKey.map(::getRowValue), - cursor, - cursorUpperBound, - ) - } else if (primaryKeyUpperBound != null) { - SnapshotWithCursorCheckpoint( - primaryKey, - primaryKeyUpperBound, - cursor, - cursorUpperBound, - ) - } else { - CursorIncrementalCheckpoint(cursor, cursorUpperBound) - } - is StreamPartitionReader.CursorIncrementalInput -> - if (row == null) { - CursorIncrementalCheckpoint(cursor, cursorUpperBound) - } else { - CursorIncrementalCheckpoint(cursor, getRowValue(cursor)) - } - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt deleted file mode 100644 index b0ca55db9865e..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.ConfigErrorException -import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.discover.Field -import io.airbyte.protocol.models.v0.SyncMode - -/** Default implementation of [PartitionsCreator] for streams in JDBC sources. 
*/ -class StreamPartitionsCreator( - val ctx: StreamReadContext, - val input: Input, - val parameters: Parameters, - val readerParameters: StreamPartitionReader.Parameters, -) : PartitionsCreator { - sealed interface Input - - data object NoStart : Input - - data class SnapshotColdStart( - val primaryKey: List, - ) : Input - - data class SnapshotWithCursorColdStart( - val primaryKey: List, - val cursor: Field, - ) : Input - - data class CursorIncrementalColdStart( - val cursor: Field, - val cursorLowerBound: JsonNode, - ) : Input - - data class SnapshotWarmStart( - val primaryKey: List, - val primaryKeyLowerBound: List, - ) : Input - - data class SnapshotWithCursorWarmStart( - val primaryKey: List, - val primaryKeyLowerBound: List, - val cursor: Field, - val cursorUpperBound: JsonNode, - ) : Input - - data class CursorIncrementalWarmStart( - val cursor: Field, - val cursorLowerBound: JsonNode, - val cursorUpperBound: JsonNode, - ) : Input - - data class Parameters( - val preferParallelized: Boolean, - val tableSampleSize: Int = 1024, - val throughputBytesPerSecond: Long = 10L * 1024L * 1024L, - ) - - override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus = - // Running this PartitionsCreator may not always involve JDBC queries. - // In those cases, the semaphore will be released very soon after, so this is OK. 
- if (ctx.querySemaphore.tryAcquire()) { - PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN - } else { - PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER - } - - override fun releaseResources() { - ctx.querySemaphore.release() - } - - override suspend fun run(): List = - input.partitionReaderInputs().map { StreamPartitionReader(ctx, it, readerParameters) } - - fun Input.partitionReaderInputs(): List { - return when (this) { - is NoStart -> listOf() - is SnapshotColdStart -> - StreamPartitionReader.SnapshotInput( - primaryKey = primaryKey, - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - ) - .split() - is SnapshotWithCursorColdStart -> - StreamPartitionReader.SnapshotWithCursorInput( - primaryKey = primaryKey, - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - cursor = cursor, - cursorUpperBound = utils.computeCursorUpperBound(cursor) ?: return listOf(), - ) - .split() - is CursorIncrementalColdStart -> - StreamPartitionReader.CursorIncrementalInput( - cursor = cursor, - cursorLowerBound = cursorLowerBound, - cursorUpperBound = utils.computeCursorUpperBound(cursor) ?: return listOf(), - ) - .split() - is SnapshotWarmStart -> - StreamPartitionReader.SnapshotInput( - primaryKey = primaryKey, - primaryKeyLowerBound = primaryKeyLowerBound, - primaryKeyUpperBound = null, - ) - .split() - is SnapshotWithCursorWarmStart -> - StreamPartitionReader.SnapshotWithCursorInput( - primaryKey = primaryKey, - primaryKeyLowerBound = primaryKeyLowerBound, - primaryKeyUpperBound = null, - cursor = cursor, - cursorUpperBound = cursorUpperBound, - ) - .split() - is CursorIncrementalWarmStart -> - StreamPartitionReader.CursorIncrementalInput( - cursor = cursor, - cursorLowerBound = cursorLowerBound, - cursorUpperBound = cursorUpperBound, - ) - .split() - } - } - - fun StreamPartitionReader.SnapshotInput.split(): List = - utils.split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> - copy(primaryKeyLowerBound = lb, 
primaryKeyUpperBound = ub) - } - - fun StreamPartitionReader.SnapshotWithCursorInput.split(): - List = - utils.split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> - copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) - } - - fun StreamPartitionReader.CursorIncrementalInput.split(): - List = - utils.split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).map { (lb, ub) -> - copy(cursorLowerBound = lb!!.first(), cursorUpperBound = ub!!.first()) - } - - private val utils = StreamPartitionsCreatorUtils(ctx, parameters) -} - -/** Converts a nullable [OpaqueStateValue] into an input for [StreamPartitionsCreator]. */ -fun OpaqueStateValue?.streamPartitionsCreatorInput( - ctx: StreamReadContext, -): StreamPartitionsCreator.Input { - val checkpoint: CheckpointStreamState? = checkpoint(ctx) - if (checkpoint == null && this != null) { - ctx.resetStream() - } - return checkpoint.streamPartitionsCreatorInput(ctx) -} - -/** Converts a nullable [CheckpointStreamState] into an input for [StreamPartitionsCreator]. */ -fun CheckpointStreamState?.streamPartitionsCreatorInput( - ctx: StreamReadContext, -): StreamPartitionsCreator.Input { - if (this == null) { - val pkChosenFromCatalog: List = ctx.stream.configuredPrimaryKey ?: listOf() - if (ctx.stream.configuredSyncMode == SyncMode.FULL_REFRESH || ctx.configuration.global) { - return StreamPartitionsCreator.SnapshotColdStart(pkChosenFromCatalog) - } - val cursorChosenFromCatalog: Field = - ctx.stream.configuredCursor as? 
Field ?: throw ConfigErrorException("no cursor") - return StreamPartitionsCreator.SnapshotWithCursorColdStart( - pkChosenFromCatalog, - cursorChosenFromCatalog, - ) - } - return when (this) { - SnapshotCompleted -> StreamPartitionsCreator.NoStart - is SnapshotCheckpoint -> - StreamPartitionsCreator.SnapshotWarmStart( - primaryKey, - primaryKeyCheckpoint, - ) - is SnapshotWithCursorCheckpoint -> - StreamPartitionsCreator.SnapshotWithCursorWarmStart( - primaryKey, - primaryKeyCheckpoint, - cursor, - cursorUpperBound, - ) - is CursorIncrementalCheckpoint -> - when (val cursorUpperBound: JsonNode? = ctx.transientCursorUpperBoundState.get()) { - null -> - StreamPartitionsCreator.CursorIncrementalColdStart( - cursor, - cursorCheckpoint, - ) - else -> - if (cursorCheckpoint == cursorUpperBound) { - StreamPartitionsCreator.NoStart - } else { - StreamPartitionsCreator.CursorIncrementalWarmStart( - cursor, - cursorCheckpoint, - cursorUpperBound, - ) - } - } - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt deleted file mode 100644 index 16bdec7d71439..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.core.JsonGenerator -import com.fasterxml.jackson.databind.JsonNode -import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.ConfigErrorException -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.util.Jsons -import io.github.oshai.kotlinlogging.KotlinLogging -import java.io.OutputStream -import kotlin.random.Random - -/** Utilities for [StreamPartitionsCreator] that don't rely directly on its input state. 
*/ -class StreamPartitionsCreatorUtils( - val ctx: StreamReadContext, - val parameters: StreamPartitionsCreator.Parameters, -) { - fun split( - input: StreamPartitionReader.Input, - globalLowerBound: List?, - globalUpperBound: List?, - ): List?, List?>> { - // Collect a sample from the unsplit partition of this table. - // Each sampled row is mapped to the values of the order fields - // and to the approximate byte size in memory of the row. - val unsplitQuerySpec: SelectQuerySpec = - input.querySpec(ctx.stream, isOrdered = true, limit = null) - val checkpointColumns: List = (unsplitQuerySpec.orderBy as OrderBy).columns - val rowByteSizeEstimator: (ObjectNode) -> Long = rowByteSizeEstimator() - val sample: Sample, Long>> by lazy { - log.info { "Sampling stream '${ctx.stream.label}'" } - collectSample(unsplitQuerySpec) { record: ObjectNode -> - val checkpointValues: List = - checkpointColumns.map { record[it.id] ?: Jsons.nullNode() } - checkpointValues to rowByteSizeEstimator(record) - } - } - // Ensure that the JDBC fetchSize parameter value for this table is set. - // Compute it using the sample. - if (ctx.transientFetchSize.get() == null) { - val rowByteSizeSample: Sample = - sample.map { (_, rowByteSize: Long) -> rowByteSize } - val maxMemoryBytes: Long = Runtime.getRuntime().maxMemory() - val fetchSizeEstimator = - MemoryFetchSizeEstimator(maxMemoryBytes, ctx.configuration.maxConcurrency) - ctx.transientFetchSize.update { fetchSizeEstimator.apply(rowByteSizeSample) } - } - // Compute partition split boundaries. - // First, check if splitting can or should be done, and exit if that isn't the case. - if (checkpointColumns.isEmpty() || !parameters.preferParallelized) { - log.info { - "not attempting to create more than one partition for '${ctx.stream.label}'" - } - return listOf(globalLowerBound to globalUpperBound) - } - // At this point, try to split the partition defined by - // ]globalLowerBound, globalUpperBound]. 
Each of these splits should be processed within the - // targeted amount of time defined in the configuration. This estimate is very imprecise: - // the sampling is almost certainly going to be biased, the throughput is wildly dependent - // on many uncontrollable factors, etc. - val splitBoundaries: List> = computeSplitBoundaries(sample) - if (splitBoundaries.isEmpty()) { - log.info { "creating one partition for remaining data in '${ctx.stream.label}" } - } else { - log.info { - "split remaining data in '${ctx.stream.label} " + - "into ${splitBoundaries.size + 1} partitions" - } - } - val lbs: List?> = listOf(globalLowerBound) + splitBoundaries - val ubs: List?> = splitBoundaries + listOf(globalUpperBound) - return lbs.zip(ubs) - } - - fun rowByteSizeEstimator(): (ObjectNode) -> Long { - val countingOutputStream = - object : OutputStream() { - var counter: Long = 0L - - override fun write(b: Int) { - counter++ - } - } - val jsonGenerator: JsonGenerator = Jsons.createGenerator(countingOutputStream) - val fieldOverheadEstimate = 16L - return { record: ObjectNode -> - countingOutputStream.counter = 0L - Jsons.writeValue(jsonGenerator, record) - val rowOverheadBytes: Long = - fieldOverheadEstimate * record.fields().asSequence().count() - countingOutputStream.counter + rowOverheadBytes - } - } - - /** Computes the max value for the cursor column, used as an upper bound during this sync. */ - fun computeCursorUpperBound(cursor: Field): JsonNode? 
{ - val querySpec = - SelectQuerySpec( - SelectColumnMaxValue(cursor), - From(ctx.stream.name, ctx.stream.namespace), - ) - val q: SelectQuery = ctx.selectQueryGenerator.generate(querySpec.optimize()) - val record: ObjectNode = - ctx.selectQuerier.executeQuery(q).use { if (it.hasNext()) it.next() else return null } - val value: JsonNode = record[cursor.id] ?: Jsons.nullNode() - if (value.isNull) { - throw ConfigErrorException("NULL value found for cursor ${cursor.id}") - } - return ctx.transientCursorUpperBoundState.update { value } - } - - /** Computes the partition split boundaries from the given sample. */ - private fun computeSplitBoundaries( - sample: Sample, Long>>, - ): List> { - val expectedTableByteSize: Long = - sample.sampledValues.sumOf { (_, rowByteSize: Long) -> - rowByteSize * sample.valueWeight - } - log.info { - "remaining data in '${ctx.stream.label}' " + - "is estimated at ${expectedTableByteSize shr 20} MiB" - } - val streamThroughputBytesPerSecond: Long = - parameters.throughputBytesPerSecond / ctx.configuration.maxConcurrency - val targetCheckpointByteSize: Long = - streamThroughputBytesPerSecond * ctx.configuration.checkpointTargetInterval.seconds - log.info { - "target partition size for '${ctx.stream.label}' " + - "is ${targetCheckpointByteSize shr 20} MiB" - } - val secondarySamplingRate: Double = - if (expectedTableByteSize <= targetCheckpointByteSize) { - 0.0 - } else { - val expectedPartitionByteSize: Long = - expectedTableByteSize / parameters.tableSampleSize - if (expectedPartitionByteSize < targetCheckpointByteSize) { - expectedPartitionByteSize.toDouble() / targetCheckpointByteSize - } else { - 1.0 - } - } - val random = Random(expectedTableByteSize) // RNG output is repeatable. - return sample.sampledValues - .filter { random.nextDouble() < secondarySamplingRate } - .map { (splitBoundary: List, _) -> splitBoundary } - } - - /** Collects a sample of rows in the unsplit partition. 
*/ - fun collectSample( - querySpec: SelectQuerySpec, - rowFn: (ObjectNode) -> T, - ): Sample { - val values = mutableListOf() - var previousWeight = 0L - for (sampleRateInvPow2 in listOf(16, 8, 0)) { - // First, try sampling the table at a rate of one every 2^16 = 65_536 rows. - // If that's not enough to produce the desired number of sampled rows (1024 by default) - // then try sampling at a higher rate of one every 2^8 = 256 rows. - // If that's still not enough, don't sample at all. - values.clear() - val fromSample = - FromSample( - ctx.stream.name, - ctx.stream.namespace, - sampleRateInvPow2, - parameters.tableSampleSize, - ) - val sampledQuerySpec: SelectQuerySpec = querySpec.copy(from = fromSample) - val q: SelectQuery = ctx.selectQueryGenerator.generate(sampledQuerySpec.optimize()) - ctx.selectQuerier.executeQuery(q).use { for (record in it) values.add(rowFn(record)) } - if (values.size < parameters.tableSampleSize) { - previousWeight = (fromSample.sampleRateInv * values.size) / fromSample.sampleSize - continue - } - val kind: Sample.Kind = - when (sampleRateInvPow2) { - 16 -> Sample.Kind.LARGE - 8 -> Sample.Kind.MEDIUM - else -> Sample.Kind.SMALL - } - log.info { "sampled ${values.size} rows in ${kind.name} stream ${ctx.stream.label}." } - return Sample(values, kind, previousWeight.coerceAtLeast(fromSample.sampleRateInv)) - } - val kind: Sample.Kind = if (values.isEmpty()) Sample.Kind.EMPTY else Sample.Kind.TINY - log.info { "sampled ${values.size} rows in ${kind.name} stream ${ctx.stream.label}." 
} - return Sample(values, kind, if (values.isEmpty()) 0L else 1L) - } - - private val log = KotlinLogging.logger {} -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt deleted file mode 100644 index bde696168bfed..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.command.JdbcSourceConfiguration -import io.airbyte.cdk.output.CatalogValidationFailureHandler -import io.airbyte.cdk.output.OutputConsumer -import io.airbyte.cdk.output.ResetStream -import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair -import jakarta.inject.Singleton -import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.ConcurrentMap -import java.util.concurrent.atomic.AtomicReference -import kotlinx.coroutines.sync.Semaphore - -/** - * A [StreamReadContextManager] may be injected in a [io.airbyte.cdk.read.PartitionsCreatorFactory] - * to provide it, and the[io.airbyte.cdk.read.PartitionsCreator] and - * [io.airbyte.cdk.read.PartitionReader] instances it creates, with a set of global singletons - * useful for implementing stream READs for a JDBC source. - * - * For each stream in the configured catalog, these global singletons are packaged in a - * [StreamReadContext] which bundles them with the corresponding [Stream] as well as a couple - * [TransientState] instances which hold mutable metadata which is _transient_, transient in the - * sense that it is not persisted in an Airbyte STATE message. 
- */ -@Singleton -class StreamReadContextManager( - val configuration: JdbcSourceConfiguration, - val handler: CatalogValidationFailureHandler, - val selectQueryGenerator: SelectQueryGenerator, - val selectQuerier: SelectQuerier, - val outputConsumer: OutputConsumer, -) { - private val map: ConcurrentMap = - ConcurrentHashMap() - - private val globalSemaphore = Semaphore(configuration.maxConcurrency) - - operator fun get(stream: Stream): StreamReadContext = - map.getOrPut(stream.namePair) { - StreamReadContext( - configuration, - handler, - selectQueryGenerator, - selectQuerier, - globalSemaphore, - outputConsumer, - stream, - ) - } -} - -class StreamReadContext( - val configuration: JdbcSourceConfiguration, - val handler: CatalogValidationFailureHandler, - val selectQueryGenerator: SelectQueryGenerator, - val selectQuerier: SelectQuerier, - val querySemaphore: Semaphore, - val outputConsumer: OutputConsumer, - val stream: Stream, -) { - val transientLimitState: TransientState = TransientState(LimitState.minimum) - - val transientCursorUpperBoundState: TransientState = TransientState(null) - - val transientFetchSize: TransientState = TransientState(null) - - fun resetStream() { - handler.accept(ResetStream(stream.name, stream.namespace)) - transientLimitState.reset() - transientCursorUpperBoundState.reset() - transientFetchSize.reset() - } -} - -class TransientState( - val initialState: T, -) { - private val ref: AtomicReference = AtomicReference(initialState) - - fun get(): T = ref.get() - - fun reset() { - ref.set(initialState) - } - - fun update(fn: (T) -> T): T = ref.updateAndGet(fn) -} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt similarity index 85% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt rename to 
airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt index f8355036e9974..a2c56900e2ab7 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt @@ -1,11 +1,11 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.discover -import io.airbyte.cdk.fakesource.FakeSourceConfiguration -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject -import io.airbyte.cdk.fakesource.FakeSourceOperations import io.airbyte.cdk.h2.H2TestFixture +import io.airbyte.cdk.h2source.H2SourceConfiguration +import io.airbyte.cdk.h2source.H2SourceConfigurationFactory +import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject +import io.airbyte.cdk.h2source.H2SourceOperations import java.sql.JDBCType import org.junit.jupiter.api.Assertions import org.junit.jupiter.api.Test @@ -17,16 +17,16 @@ class JdbcMetadataQuerierTest { h2.execute("CREATE TABLE kv (k INT PRIMARY KEY, v VARCHAR(60))") } - val factory = JdbcMetadataQuerier.Factory(FakeSourceOperations(), FakeSourceOperations()) + val factory = JdbcMetadataQuerier.Factory(H2SourceOperations(), H2SourceOperations()) @Test fun test() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) + val config: H2SourceConfiguration = H2SourceConfigurationFactory().make(configPojo) factory.session(config).use { mdq: MetadataQuerier -> Assertions.assertEquals(listOf("PUBLIC"), mdq.streamNamespaces()) Assertions.assertEquals(listOf("KV"), mdq.streamNames("PUBLIC")) diff --git 
a/airbyte-cdk/bulk/core/base/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt similarity index 100% rename from airbyte-cdk/bulk/core/base/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt similarity index 81% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt index 28fd695763fec..271723e628c2c 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.h2source import io.airbyte.cdk.command.SyncsTestFixture import io.airbyte.cdk.h2.H2TestFixture @@ -10,16 +10,16 @@ import java.sql.Statement import org.junit.jupiter.api.Test import org.testcontainers.Testcontainers -class FakeSourceIntegrationTest { +class H2SourceIntegrationTest { @Test fun testSpec() { - SyncsTestFixture.testSpec("fakesource/expected-spec.json") + SyncsTestFixture.testSpec("h2source/expected-spec.json") } @Test fun testCheckFailBadConfig() { SyncsTestFixture.testCheck( - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = -1 database = "" }, @@ -31,7 +31,7 @@ class FakeSourceIntegrationTest { fun testCheckFailNoDatabase() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database + "_garbage" } @@ -43,7 +43,7 @@ class FakeSourceIntegrationTest { fun testCheckFailNoTables() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } @@ -56,7 +56,7 @@ class FakeSourceIntegrationTest { H2TestFixture().use { h2: H2TestFixture -> h2.createConnection().use(Companion::prelude) val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } @@ -71,7 +71,7 @@ class FakeSourceIntegrationTest { Testcontainers.exposeHostPorts(h2.port) SshBastionContainer(tunnelingToHostPort = h2.port).use { ssh: SshBastionContainer -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port @@ -90,11 +90,11 @@ class FakeSourceIntegrationTest { H2TestFixture().use { h2: H2TestFixture -> 
h2.createConnection().use(Companion::prelude) val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - SyncsTestFixture.testDiscover(configPojo, "fakesource/expected-cursor-catalog.json") + SyncsTestFixture.testDiscover(configPojo, "h2source/expected-cursor-catalog.json") } } @@ -102,7 +102,7 @@ class FakeSourceIntegrationTest { fun testReadGlobal() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database setCursorMethodValue(CdcCursor) @@ -112,10 +112,10 @@ class FakeSourceIntegrationTest { configPojo, h2::createConnection, Companion::prelude, - "fakesource/expected-cdc-catalog.json", - "fakesource/cdc-catalog.json", + "h2source/expected-cdc-catalog.json", + "h2source/cdc-catalog.json", SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-global-cold-start.json", + "h2source/expected-messages-global-cold-start.json", ), ) } @@ -125,7 +125,7 @@ class FakeSourceIntegrationTest { fun testReadStreams() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database resumablePreferred = true @@ -134,13 +134,13 @@ class FakeSourceIntegrationTest { configPojo, h2::createConnection, Companion::prelude, - "fakesource/expected-cursor-catalog.json", - "fakesource/cursor-catalog.json", + "h2source/expected-cursor-catalog.json", + "h2source/cursor-catalog.json", SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-stream-cold-start.json", + "h2source/expected-messages-stream-cold-start.json", ), SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-stream-warm-start.json", + 
"h2source/expected-messages-stream-warm-start.json", ), ) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt similarity index 73% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt index e7a9b765ea7eb..336e8cf284b05 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt @@ -1,9 +1,9 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.cdk.jdbc -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject import io.airbyte.cdk.h2.H2TestFixture +import io.airbyte.cdk.h2source.H2SourceConfigurationFactory +import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject import io.airbyte.cdk.ssh.SshBastionContainer import io.airbyte.cdk.testcontainers.DOCKER_HOST_FROM_WITHIN_CONTAINER import org.junit.jupiter.api.Assertions @@ -22,37 +22,37 @@ class JdbcConnectionFactoryTest { @Test fun testVanilla() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - val factory = JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo)) + val factory = JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo)) Assertions.assertEquals("H2", factory.get().metaData.databaseProductName) } @Test fun testSshKeyAuth() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port database = h2.database setTunnelMethodValue(sshBastion.outerKeyAuthTunnelMethod) } - val factory = JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo)) + val factory = JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo)) Assertions.assertEquals("H2", factory.get().metaData.databaseProductName) } @Test fun testSshPasswordAuth() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port database = h2.database setTunnelMethodValue(sshBastion.outerPasswordAuthTunnelMethod) } - val factory = JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo)) + val factory = 
JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo)) Assertions.assertEquals("H2", factory.get().metaData.databaseProductName) } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt new file mode 100644 index 0000000000000..70228817de8e8 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.read.Sample.Kind +import io.airbyte.cdk.read.TestFixtures.sharedState +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultJdbcFetchSizeEstimatorTest { + + @Test + fun testSingleSmall() { + val sample = Sample(listOf(10L, 20L, 30L), Kind.SMALL, valueWeight = 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 1) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(14_000, estimator.apply(sample)) + } + + @Test + fun testTwoSmall() { + val sample = Sample(listOf(10L, 20L, 30L), Kind.SMALL, valueWeight = 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 2) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(7_000, estimator.apply(sample)) + } + + @Test + fun testEmpty() { + val sample = Sample(listOf(), Kind.EMPTY, 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 2) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(sharedState.constants.defaultFetchSize, estimator.apply(sample)) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt 
b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt new file mode 100644 index 0000000000000..0f7e05158752b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.output.InvalidCursor +import io.airbyte.cdk.output.InvalidPrimaryKey +import io.airbyte.cdk.output.ResetStream +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.assertJsonEquals +import io.airbyte.cdk.read.TestFixtures.assertQueryEquals +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.record +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultJdbcPartitionFactoryTest { + + val cursorValue = LocalDate.parse("2024-08-19") + + @Test + fun testColdStartUnsplittableSnapshot() { + val stream = stream(withPK = false, withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue = null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, 
stream.namespace)) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + } + + @Test + fun testColdStartUnsplittableSnapshotWithCursor() { + val stream = stream(withPK = false) + val factory = sharedState().factory() + val result = factory.create(stream, null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotWithCursorPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorValue) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(ts, partition.cursor) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorValue)) + } + + @Test + fun testColdStartSplittableSnapshot() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue = null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = result as 
DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + NoWhere, + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + NoWhere, + OrderBy(id), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + partition.incompleteState(record(pk = 22)).assertJsonEquals(opaqueStateValue(pk = 22)) + // Check split output + val rawSplits: List = + factory.split(partition, listOf(opaqueStateValue(pk = 22), opaqueStateValue(pk = 44))) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + sanityCheck(stream, factory, it) + Assertions.assertIterableEquals(listOf(id), it.checkpointColumns) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertNull(splits[0].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[0].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[1].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[1].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[2].lowerBound) + Assertions.assertNull(splits[2].upperBound) + } + + @Test + fun 
testColdStartSplittableSnapshotWithCursor() { + val stream = stream() + val factory = sharedState().factory() + val result = factory.create(stream, null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcSplittableSnapshotWithCursorPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorValue) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + NoWhere, + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + NoWhere, + OrderBy(id) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorValue)) + partition + .incompleteState(record(pk = 22)) + .assertJsonEquals(opaqueStateValue(pk = 22, cursor = cursorValue)) + // Check split output + val rawSplits: List = + factory.split(partition, listOf(opaqueStateValue(pk = 22), opaqueStateValue(pk = 44))) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + 
sanityCheck(stream, factory, it) + Assertions.assertIterableEquals(listOf(id), it.checkpointColumns) + Assertions.assertEquals(ts, it.cursor) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), it.cursorUpperBound) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertNull(splits[0].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[0].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[1].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[1].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[2].lowerBound) + Assertions.assertNull(splits[2].upperBound) + } + + @Test + fun testInvalidPrimaryKey() { + val stream = stream(withPK = false, withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22)) + factory.assertFailures( + InvalidPrimaryKey(stream.name, stream.namespace, listOf(id.id)), + ResetStream(stream.name, stream.namespace), + ) + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + } + + @Test + fun testInvalidCursor() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures( + InvalidCursor(stream.name, stream.namespace, ts.id), + ResetStream(stream.name, stream.namespace), + ) + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = result as DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + } + + @Test + 
fun testWarmStartSnapshot() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = result as DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))) + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + partition.incompleteState(record(pk = 10)).assertJsonEquals(opaqueStateValue(pk = 10)) + // Check full refresh termination criteria + val finalResult = factory.create(stream, opaqueStateValue()) + factory.assertFailures() + Assertions.assertNull(finalResult) + } + + @Test + fun testWarmStartSnapshotWithCursor() { + val stream = stream() + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22, cursor = cursorValue)) + factory.assertFailures() + 
Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcSplittableSnapshotWithCursorPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorValue)) + partition + .incompleteState(record(pk = 44)) + .assertJsonEquals(opaqueStateValue(pk = 44, cursor = cursorValue)) + // Check snapshot termination criteria and transition to cursor-based incremental + val finalResult = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures() + Assertions.assertTrue(finalResult is DefaultJdbcCursorIncrementalPartition) + val finalPartition = finalResult as DefaultJdbcCursorIncrementalPartition + sanityCheck(stream, factory, finalPartition) + Assertions.assertEquals(ts, finalPartition.cursor) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), 
finalPartition.cursorLowerBound) + } + + @Test + fun testCursorIncremental() { + val stream = stream(withPK = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as DefaultJdbcCursorIncrementalPartition + val cursorUpperBound = cursorValue.plusMonths(1) + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertIterableEquals(listOf(ts), partition.checkpointColumns) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), partition.cursorLowerBound) + Assertions.assertIterableEquals(listOf(partition.cursorLowerBound), partition.lowerBound) + Assertions.assertEquals(LocalDateCodec.encode(cursorUpperBound), partition.cursorUpperBound) + Assertions.assertIterableEquals(listOf(partition.cursorUpperBound), partition.upperBound) + Assertions.assertTrue(partition.isLowerBoundIncluded) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + OrderBy(ts), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) 
+ .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, + ), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + OrderBy(ts) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorUpperBound)) + partition + .incompleteState(record(cursor = cursorValue.plusDays(1))) + .assertJsonEquals(opaqueStateValue(cursor = cursorValue.plusDays(1))) + // Check that subsequent non-terminal partition includes the lower bound + val nextResult = factory.create(stream, opaqueStateValue(cursor = cursorValue.plusDays(1))) + factory.assertFailures() + Assertions.assertTrue(nextResult is DefaultJdbcCursorIncrementalPartition) + val nextPartition = nextResult as DefaultJdbcCursorIncrementalPartition + sanityCheck(stream, factory, nextPartition) + Assertions.assertTrue(nextPartition.isLowerBoundIncluded) + // Check termination criteria + val finalResult = factory.create(stream, opaqueStateValue(cursor = cursorUpperBound)) + factory.assertFailures() + Assertions.assertNull(finalResult) + // Check split output + val boundary1 = cursorValue.plusDays(1) + val boundary2 = cursorValue.plusDays(2) + val rawSplits: List = + factory.split( + partition, + listOf(opaqueStateValue(cursor = boundary1), opaqueStateValue(cursor = boundary2)), + ) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + sanityCheck(stream, factory, it) + Assertions.assertEquals(ts, it.cursor) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), splits[0].cursorLowerBound) + Assertions.assertTrue(splits[0].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(boundary1), 
splits[0].cursorUpperBound) + Assertions.assertEquals(LocalDateCodec.encode(boundary1), splits[1].cursorLowerBound) + Assertions.assertFalse(splits[1].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(boundary2), splits[1].cursorUpperBound) + Assertions.assertEquals(LocalDateCodec.encode(boundary2), splits[2].cursorLowerBound) + Assertions.assertFalse(splits[2].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(cursorUpperBound), splits[2].cursorUpperBound) + } + + fun sanityCheck( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + partition: DefaultJdbcPartition, + ) { + Assertions.assertEquals(stream, partition.stream) + Assertions.assertEquals(stream, partition.streamState.stream) + Assertions.assertEquals(factory.sharedState, partition.streamState.sharedState) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt new file mode 100644 index 0000000000000..61f61da5d2687 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.util.Jsons +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultRowByteSizeEstimatorTest { + + fun estimate(jsonRecord: String): Int = + sharedState().rowByteSizeEstimator().apply(Jsons.readTree(jsonRecord) as ObjectNode).toInt() + + @Test + fun testZero() { + Assertions.assertEquals(18, estimate("""{}""")) + } + + @Test + fun testOne() { + Assertions.assertEquals(34, estimate("""{"one":1}""")) + } + + @Test + fun testTwo() { + Assertions.assertEquals(51, estimate("""{"one":1,"two":2}""")) + } + + @Test + fun testThree() { + Assertions.assertEquals(68, estimate("""{"one":1,"two":2,"three":3}""")) + } + + @Test + fun testFour() { + Assertions.assertEquals(90, estimate("""{"one":1,"two":2,"three":3,"four":"four"}""")) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt new file mode 100644 index 0000000000000..d4076c9965307 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.output.BufferingOutputConsumer +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import kotlinx.coroutines.CancellationException +import kotlinx.coroutines.delay +import kotlinx.coroutines.runBlocking +import kotlinx.coroutines.withTimeoutOrNull +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class JdbcPartitionReaderTest { + + val cursorLowerBound = LocalDate.parse("2024-08-01") + val cursorCheckpoint = LocalDate.parse("2024-08-02") + val cursorUpperBound = LocalDate.parse("2024-08-05") + + @Test + fun testNonResumable() { + // Generate partition + val stream = stream(withPK = false) + val sharedState = + sharedState( + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorLowerBound)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)), + ) + ), + ), + SelectQuerier.Parameters(fetchSize = 2), + """{"id":1,"ts":"2024-08-01","msg":"hello"}""", + """{"id":2,"ts":"2024-08-02","msg":"how"}""", + """{"id":3,"ts":"2024-08-03","msg":"are"}""", + """{"id":4,"ts":"2024-08-04","msg":"you"}""", + """{"id":5,"ts":"2024-08-05","msg":"today"}""", + ) + ) + ) + val factory = sharedState.factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorLowerBound)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as 
DefaultJdbcCursorIncrementalPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + partition.streamState.fetchSize = 2 + // Generate reader + val reader = JdbcNonResumablePartitionReader(partition) + // Acquire resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN, + reader.tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + // Run + runBlocking { reader.run() } + // Checkpoint + Assertions.assertEquals( + PartitionReadCheckpoint(opaqueStateValue(cursor = cursorUpperBound), 5), + reader.checkpoint(), + ) + // Check output + Assertions.assertEquals( + "hello how are you today", + (sharedState.outputConsumer as BufferingOutputConsumer) + .records() + .map { it.data["msg"].asText() } + .joinToString(separator = " ") + ) + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + reader.releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + } + + @Test + fun testResumable() { + // Generate partition + val stream = stream(withPK = false) + val sharedState = + sharedState( + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorLowerBound)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)), + ) + ), + OrderBy(ts), + Limit(4), + ), + SelectQuerier.Parameters(fetchSize = 2), + """{"id":1,"ts":"2024-08-01","msg":"hello"}""", + """{"id":2,"ts":"2024-08-02","msg":"how"}""", + """{"id":3,"ts":"2024-08-03","msg":"are"}""", + 
"""{"id":4,"ts":"2024-08-04","msg":"you"}""", + ) + ) + ) + val factory = sharedState.factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorLowerBound)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as DefaultJdbcCursorIncrementalPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + partition.streamState.fetchSize = 2 + partition.streamState.updateLimitState { it.up } // so we don't hit the limit + // Generate reader + val reader = JdbcResumablePartitionReader(partition) + // Acquire resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN, + reader.tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + // Run and simulate timing out + runBlocking { + withTimeoutOrNull(1) { + try { + delay(100) + } catch (_: CancellationException) { + // swallow + } + reader.run() + } + } + // Checkpoint + Assertions.assertEquals( + PartitionReadCheckpoint(opaqueStateValue(cursor = cursorCheckpoint), 2), + reader.checkpoint(), + ) + // Check output + Assertions.assertEquals( + "hello how", + (sharedState.outputConsumer as BufferingOutputConsumer) + .records() + .map { it.data["msg"].asText() } + .joinToString(separator = " ") + ) + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + reader.releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt 
b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt new file mode 100644 index 0000000000000..225ddcc02bee4 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import kotlinx.coroutines.runBlocking +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class JdbcPartitionsCreatorTest { + + @Test + fun testConcurrentSnapshotWithCursor() { + val stream = stream() + + val sharedState = + sharedState( + constants = + DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + // absurdly low value to create many partitions + expectedThroughputBytesPerSecond = 1L, + ), + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumnMaxValue(ts), + From(stream().name, stream().namespace), + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"max":"$cursorUpperBound"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + NoWhere, + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + 
), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + NoWhere, + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedPartitions = 5 // adjust this as needed based on inputs + val expectedFetchSize = 681 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue = null).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List = + concurrentPartitions(stream, factory, readers) + val streamState: DefaultJdbcStreamState = partitions.first().streamState + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + streamState.cursorUpperBound + ) + Assertions.assertEquals(expectedFetchSize, streamState.fetchSize) + Assertions.assertEquals(expectedPartitions, partitions.size) + Assertions.assertIterableEquals(listOf(id), partitions.first().checkpointColumns) + Assertions.assertNull(partitions.first().lowerBound) + for (i in 1..(expectedPartitions - 1)) { + Assertions.assertIterableEquals(partitions[i - 1].upperBound, partitions[i].lowerBound) + Assertions.assertIterableEquals(listOf(id), partitions[i].checkpointColumns) + } + Assertions.assertNull(partitions.last().upperBound) + } + + @Test + fun testConcurrentSnapshot() { + val stream = stream(withCursor = false) + val sharedState = + sharedState( + constants = + DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + // absurdly low value to create many partitions + expectedThroughputBytesPerSecond = 
1L, + ), + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedPartitions = 5 // adjust this as needed based on inputs + val expectedFetchSize = 681 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List = + concurrentPartitions(stream, factory, readers) + val streamState: DefaultJdbcStreamState = partitions.first().streamState + Assertions.assertNull(streamState.cursorUpperBound) + Assertions.assertEquals(expectedFetchSize, streamState.fetchSize) + Assertions.assertEquals(expectedPartitions, partitions.size) + Assertions.assertIterableEquals(listOf(id), partitions.first().checkpointColumns) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partitions.first().lowerBound) + for (i in 1..(expectedPartitions - 1)) { + Assertions.assertIterableEquals(partitions[i - 1].upperBound, 
partitions[i].lowerBound) + Assertions.assertIterableEquals(listOf(id), partitions[i].checkpointColumns) + } + Assertions.assertNull(partitions.last().upperBound) + } + + @Test + fun testConcurrentSnapshotWithoutSampling() { + val stream = stream(withCursor = false) + val sharedState = sharedState() + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List = + concurrentPartitions(stream, factory, readers) + // No sampling means no splitting. + Assertions.assertEquals(1, partitions.size) + Assertions.assertIterableEquals( + stream.configuredPrimaryKey, + partitions.first().checkpointColumns, + ) + Assertions.assertEquals(listOf(IntCodec.encode(22)), partitions.first().lowerBound) + Assertions.assertNull(partitions.first().upperBound) + } + + @Test + fun testColdStartSequentialSnapshot() { + val stream = stream(withCursor = false) + val sharedState = sharedState() + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcSplittableSnapshotPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertNull(readerPartition.streamState.cursorUpperBound) + Assertions.assertNull(readerPartition.streamState.fetchSize) + Assertions.assertIterableEquals( + stream.configuredPrimaryKey, + readerPartition.checkpointColumns, + ) + Assertions.assertEquals(listOf(IntCodec.encode(22)), readerPartition.lowerBound) + Assertions.assertNull(readerPartition.upperBound) + } + + @Test + fun testColdStartSequentialSnapshotWithSampling() { + val stream = stream(withCursor = false) + val sharedState = + sharedState( + constants = + 
DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + ), + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedFetchSize = 674 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcSplittableSnapshotPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertNull(readerPartition.streamState.cursorUpperBound) + Assertions.assertEquals(expectedFetchSize, readerPartition.streamState.fetchSize) + Assertions.assertIterableEquals(listOf(id), readerPartition.checkpointColumns) + Assertions.assertEquals(listOf(IntCodec.encode(22)), readerPartition.lowerBound) + Assertions.assertNull(readerPartition.upperBound) + } + + @Test + fun testColdStartCursorIncrementalSequential() { + val stream = stream() + val sharedState = + sharedState( + mockedQueries = + arrayOf( + 
TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumnMaxValue(ts), + From(stream().name, stream().namespace), + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"max":"$cursorUpperBound"}""", + ), + ) + ) + val factory = sharedState.factory() + val initialPartition = + factory.create(stream, opaqueStateValue(cursor = cursorCheckpoint)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcCursorIncrementalPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.streamState.cursorUpperBound, + ) + Assertions.assertNull(readerPartition.streamState.fetchSize) + Assertions.assertEquals(ts, readerPartition.cursor) + Assertions.assertEquals( + LocalDateCodec.encode(cursorCheckpoint), + readerPartition.cursorLowerBound, + ) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.cursorUpperBound, + ) + } + + @Test + fun testWarmStartCursorIncrementalSequentialWithSampling() { + val stream = stream() + val sharedState = + sharedState( + constants = DefaultJdbcSharedState.Constants(withSampling = true), + // The JdbcSequentialPartitionsCreator is not expected to query anything. + mockedQueries = arrayOf() + ) + val factory = sharedState.factory() + run { + // This warm start is particularly warm; the stream state has some transient state. 
+ val streamState: DefaultJdbcStreamState = factory.streamState(stream) + streamState.fetchSize = 1234 + streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + } + val initialPartition = + factory.create(stream, opaqueStateValue(cursor = cursorCheckpoint)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcCursorIncrementalPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertEquals(ts, readerPartition.cursor) + Assertions.assertEquals( + LocalDateCodec.encode(cursorCheckpoint), + readerPartition.cursorLowerBound, + ) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.cursorUpperBound, + ) + } + + val cursorCheckpoint = LocalDate.parse("2024-08-02") + val cursorUpperBound = LocalDate.parse("2024-08-05") + + inline fun concurrentPartitions( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + readers: List + ): List { + Assertions.assertTrue(readers.isNotEmpty()) + val typedReaders = readers.filterIsInstance>() + Assertions.assertIterableEquals(readers, typedReaders) + for (reader in typedReaders) { + Assertions.assertTrue(reader.partition is T) + Assertions.assertEquals(stream, reader.stream) + Assertions.assertEquals(factory.streamState(stream), reader.partition.streamState) + } + return typedReaders.map { it.partition as T } + } + + inline fun sequentialPartition( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + readers: List + ): T { + Assertions.assertTrue(readers.firstOrNull() is JdbcResumablePartitionReader<*>) + Assertions.assertNull(readers.getOrNull(1)) + val reader = readers.first() as JdbcResumablePartitionReader<*> + Assertions.assertTrue(reader.partition is T) + val partition = reader.partition as T + Assertions.assertEquals(stream, reader.stream) + Assertions.assertEquals(factory.streamState(stream), partition.streamState) + return partition + } 
+ + /** Asserts that the receiver `is DefaultJdbcPartition` (which fails for a null receiver) and returns it cast to the non-null type. */ + fun DefaultJdbcPartition?.asPartition(): DefaultJdbcPartition { + Assertions.assertTrue(this is DefaultJdbcPartition) + return this as DefaultJdbcPartition + } + + /** Drives the receiver through tryAcquireResources(), run() and releaseResources() in that order. Asserts that tryAcquireResources() reports READY_TO_RUN and that the shared semaphore has maxConcurrency permits available before acquiring, maxConcurrency - 1 after acquiring and after running, and maxConcurrency again after releasing. Returns whatever run() produced. */ + fun JdbcPartitionsCreator + .runInTest(): List { + val sharedState: DefaultJdbcSharedState = sharedState + // Acquire resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN, + tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + sharedState.semaphore.availablePermits, + ) + // Run + val partitionReaders: List = runBlocking { run() } + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + sharedState.semaphore.availablePermits, + ) + releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + sharedState.semaphore.availablePermits, + ) + // Return result + return partitionReaders + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt similarity index 86% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt index 869b2e35a41a9..c0136171b940f 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt @@ -3,10 +3,10 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.node.ObjectNode import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.fakesource.FakeSourceConfiguration -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import
io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject import io.airbyte.cdk.h2.H2TestFixture +import io.airbyte.cdk.h2source.H2SourceConfiguration +import io.airbyte.cdk.h2source.H2SourceConfigurationFactory +import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject import io.airbyte.cdk.jdbc.IntFieldType import io.airbyte.cdk.jdbc.JdbcConnectionFactory import io.airbyte.cdk.jdbc.StringFieldType @@ -79,12 +79,12 @@ class JdbcSelectQuerierTest { q: SelectQuery, vararg expected: String, ) { - val configPojo: FakeSourceConfigurationJsonObject = - FakeSourceConfigurationJsonObject().apply { + val configPojo: H2SourceConfigurationJsonObject = + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) + val config: H2SourceConfiguration = H2SourceConfigurationFactory().make(configPojo) val querier: SelectQuerier = JdbcSelectQuerier(JdbcConnectionFactory(config)) val actual: List = querier.executeQuery(q).use { it.asSequence().toList() } Assertions.assertIterableEquals(expected.toList().map(Jsons::readTree), actual) diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt new file mode 100644 index 0000000000000..cbea4d9464807 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.TestClockFactory +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.jdbc.IntFieldType +import io.airbyte.cdk.jdbc.LocalDateFieldType +import io.airbyte.cdk.jdbc.StringFieldType +import io.airbyte.cdk.output.BufferingCatalogValidationFailureHandler +import io.airbyte.cdk.output.BufferingOutputConsumer +import io.airbyte.cdk.output.CatalogValidationFailure +import io.airbyte.cdk.ssh.SshConnectionOptions +import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration +import io.airbyte.cdk.util.Jsons +import io.airbyte.protocol.models.v0.SyncMode +import java.time.Duration +import java.time.LocalDate +import org.junit.jupiter.api.Assertions + +object TestFixtures { + + val id = Field("id", IntFieldType) + val ts = Field("ts", LocalDateFieldType) + val msg = Field("msg", StringFieldType) + + fun stream( + withPK: Boolean = true, + withCursor: Boolean = true, + ) = + Stream( + name = "events", + namespace = "test", + fields = listOf(id, ts, msg), + configuredSyncMode = if (withCursor) SyncMode.INCREMENTAL else SyncMode.FULL_REFRESH, + configuredPrimaryKey = listOf(id).takeIf { withPK }, + configuredCursor = ts.takeIf { withCursor }, + ) + + fun opaqueStateValue( + pk: Int? = null, + cursor: LocalDate? = null, + ): OpaqueStateValue = + Jsons.readTree( + listOf( + """"primary_key":""" + if (pk == null) "{}" else """{"${id.id}":$pk }""", + """"cursors":""" + if (cursor == null) "{}" else """{"${ts.id}":"$cursor"} """, + ) + .joinToString(",", "{", "}") + ) + + fun record( + pk: Int? = null, + cursor: LocalDate? 
= null, + ): ObjectNode = + Jsons.readTree( + listOfNotNull( + """ "${id.id}" : $pk """.takeIf { pk != null }, + """ "${ts.id}" : "$cursor" """.takeIf { cursor != null }, + ) + .joinToString(",", "{", "}") + ) as ObjectNode + + fun sharedState( + global: Boolean = false, + checkpointTargetInterval: Duration = Duration.ofMinutes(1), + maxConcurrency: Int = 10, + maxMemoryBytesForTesting: Long = 1_000_000L, + constants: DefaultJdbcSharedState.Constants = DefaultJdbcSharedState.Constants(), + vararg mockedQueries: MockedQuery, + ) = + DefaultJdbcSharedState( + StubbedJdbcSourceConfiguration(global, checkpointTargetInterval, maxConcurrency), + BufferingOutputConsumer(TestClockFactory().fixed()), + MockSelectQuerier(ArrayDeque(mockedQueries.toList())), + constants.copy(maxMemoryBytesForTesting = maxMemoryBytesForTesting) + ) + + fun DefaultJdbcSharedState.factory() = + DefaultJdbcPartitionFactory( + this, + BufferingCatalogValidationFailureHandler(), + MockSelectQueryGenerator + ) + + fun DefaultJdbcPartitionFactory.assertFailures(vararg failures: CatalogValidationFailure) { + Assertions.assertIterableEquals( + failures.toList(), + (handler as BufferingCatalogValidationFailureHandler).get(), + ) + } + + fun SelectQuery.assertQueryEquals(expected: SelectQuerySpec) { + Assertions.assertEquals(expected.toString(), this.sql) + } + + fun JsonNode.assertJsonEquals(expected: String) { + Assertions.assertEquals(expected, this.toString()) + } + + fun JsonNode.assertJsonEquals(expected: JsonNode) { + Assertions.assertEquals(expected.toString(), this.toString()) + } + + class StubbedJdbcSourceConfiguration( + override val global: Boolean, + override val checkpointTargetInterval: Duration, + override val maxConcurrency: Int, + ) : JdbcSourceConfiguration { + override val realHost: String + get() = TODO("Not yet implemented") + override val jdbcUrlFmt: String + get() = TODO("Not yet implemented") + override val jdbcProperties: Map + get() = TODO("Not yet implemented") + override val 
schemas: Set
+            get() = TODO("Not yet implemented")
+        override val realPort: Int
+            get() = TODO("Not yet implemented")
+        override val sshTunnel: SshTunnelMethodConfiguration
+            get() = TODO("Not yet implemented")
+        override val sshConnectionOptions: SshConnectionOptions
+            get() = TODO("Not yet implemented")
+        override val resourceAcquisitionHeartbeat: Duration
+            get() = TODO("Not yet implemented")
+    }
+
+    /**
+     * [SelectQuerier] test double which replays [mockedQueries] in first-in, first-out order.
+     *
+     * Every [executeQuery] call removes the next [MockedQuery] from the deque, asserts that the
+     * call matches its expectations and returns its canned rows.
+     */
+    class MockSelectQuerier(val mockedQueries: ArrayDeque<MockedQuery>) : SelectQuerier {
+
+        /**
+         * Checks [q] and [parameters] against the next [MockedQuery] and returns a result which
+         * iterates over [MockedQuery.results]. Closing the result is a no-op.
+         *
+         * Fails the test when no mocked query is left, when `q.sql` differs from the string form
+         * of the expected [SelectQuerySpec], or when [parameters] differ from the expected ones.
+         */
+        override fun executeQuery(
+            q: SelectQuery,
+            parameters: SelectQuerier.Parameters
+        ): SelectQuerier.Result {
+            val mockedQuery: MockedQuery? = mockedQueries.removeFirstOrNull()
+            // A null here means more queries were issued than were mocked.
+            Assertions.assertNotNull(mockedQuery, q.sql)
+            // JUnit takes (expected, actual): the mock holds the expectation, the call is the
+            // actual value. Swapping them would garble the failure message.
+            Assertions.assertEquals(mockedQuery!!.expectedQuerySpec.toString(), q.sql)
+            Assertions.assertEquals(mockedQuery.expectedParameters, parameters, q.sql)
+            return object : SelectQuerier.Result {
+                val wrapped: Iterator<ObjectNode> = mockedQuery.results.iterator()
+                override fun hasNext(): Boolean = wrapped.hasNext()
+                override fun next(): ObjectNode = wrapped.next()
+                override fun close() {}
+            }
+        }
+    }
+
+    /**
+     * One expected [SelectQuerier.executeQuery] call together with the rows to answer it with.
+     *
+     * @property expectedQuerySpec spec whose `toString()` is compared with the SQL of the call.
+     * @property expectedParameters parameters the call is expected to be made with.
+     * @property results rows returned to the caller, in iteration order.
+     */
+    data class MockedQuery(
+        val expectedQuerySpec: SelectQuerySpec,
+        val expectedParameters: SelectQuerier.Parameters,
+        val results: List<ObjectNode>
+    ) {
+        /** Convenience constructor which parses each of [rows] as a JSON object. */
+        constructor(
+            expectedQuerySpec: SelectQuerySpec,
+            expectedParameters: SelectQuerier.Parameters,
+            vararg rows: String,
+        ) : this(
+            expectedQuerySpec,
+            expectedParameters,
+            rows.map { Jsons.readTree(it) as ObjectNode },
+        )
+    }
+
+    /**
+     * [SelectQueryGenerator] which uses the string form of the [SelectQuerySpec] as the SQL text
+     * and passes empty lists for the remaining [SelectQuery] constructor arguments, so that
+     * [MockSelectQuerier] can compare specs by their string form.
+     */
+    object MockSelectQueryGenerator : SelectQueryGenerator {
+        override fun generate(ast: SelectQuerySpec): SelectQuery =
+            SelectQuery(ast.toString(), listOf(), listOf())
+    }
+}
diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json similarity index 86% rename from airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json rename to
airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json index 7fd5bc286a51a..20c8b3b07b22e 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json @@ -6,21 +6,23 @@ "json_schema": { "type": "object", "properties": { - "MSG": { - "type": "string" - }, "ID": { - "type": "string" + "type": "string", + "contentEncoding": "base64" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" + }, + "MSG": { + "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["ID", "TS"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, "namespace": "PUBLIC" @@ -36,17 +38,18 @@ "json_schema": { "type": "object", "properties": { - "V": { - "type": "string" - }, "K": { "type": "number", "airbyte_type": "integer" + }, + "V": { + "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["K"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, "namespace": "PUBLIC" diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json similarity index 90% rename from airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json index f1b4850c1fe1f..8ea7c7ce9e03c 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json @@ -6,20 +6,22 @@ "json_schema": { "type": "object", "properties": { - "MSG": { - "type": "string" - }, "ID": { - 
"type": "string" + "type": "string", + "contentEncoding": "base64" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" + }, + "MSG": { + "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, @@ -36,16 +38,17 @@ "json_schema": { "type": "object", "properties": { - "V": { - "type": "string" - }, "K": { "type": "number", "airbyte_type": "integer" + }, + "V": { + "type": "string" } } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cdc-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cdc-catalog.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cdc-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cdc-catalog.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cursor-catalog.json similarity index 94% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cursor-catalog.json index 361331109c391..6238a25d21529 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cursor-catalog.json @@ -24,6 +24,7 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], 
"source_defined_primary_key": [["ID"]], "is_resumable": true, @@ -48,6 +49,7 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-global-cold-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-global-cold-start.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-global-cold-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-global-cold-start.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-cold-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-cold-start.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-cold-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-cold-start.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json similarity index 61% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json index 6d6e5adf7c8ba..a9323f871874b 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json @@ -27,6 +27,40 @@ } } }, + { + "type": 
"RECORD", + "record": { + "namespace": "PUBLIC", + "stream": "EVENTS", + "data": { + "ID": "3VWqE0Hrb7TV5BOEP2wN+g==", + "TS": "2024-04-30T00:00:00.000000-04:00", + "MSG": null + }, + "emitted_at": 3133641600000 + } + }, + { + "type": "STATE", + "state": { + "type": "STREAM", + "stream": { + "stream_descriptor": { + "name": "EVENTS", + "namespace": "PUBLIC" + }, + "stream_state": { + "primary_key": {}, + "cursors": { + "TS": "2024-04-30T00:00:00.000000-04:00" + } + } + }, + "sourceStats": { + "recordCount": 2.0 + } + } + }, { "type": "TRACE", "trace": { diff --git a/airbyte-cdk/bulk/core/base/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt similarity index 100% rename from airbyte-cdk/bulk/core/base/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt similarity index 64% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt index fd5887caf2486..fac1b555a7927 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt @@ -1,10 +1,13 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.fakesource +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.h2source import io.airbyte.cdk.AirbyteSourceRunner /** A fake source database connector, vaguely compatible with the H2 database. 
*/ -class FakeSource { +class H2Source { fun main(args: Array) { AirbyteSourceRunner.run(*args) } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt new file mode 100644 index 0000000000000..1fe026241887a --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt @@ -0,0 +1,61 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.h2source + +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.SourceConfiguration +import io.airbyte.cdk.command.SourceConfigurationFactory +import io.airbyte.cdk.ssh.SshConnectionOptions +import io.airbyte.cdk.ssh.SshNoTunnelMethod +import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration +import io.micronaut.context.annotation.Requires +import io.micronaut.context.annotation.Secondary +import io.micronaut.context.env.Environment +import jakarta.inject.Singleton +import java.time.Duration + +/** [SourceConfiguration] implementation for [H2Source]. */ +data class H2SourceConfiguration( + override val realHost: String, + override val realPort: Int, + override val sshTunnel: SshTunnelMethodConfiguration, + override val sshConnectionOptions: SshConnectionOptions, + override val jdbcUrlFmt: String, + override val schemas: Set, + val cursor: CursorConfiguration, + val resumablePreferred: Boolean, + override val maxConcurrency: Int, + override val checkpointTargetInterval: Duration, +) : JdbcSourceConfiguration { + override val global: Boolean = cursor is CdcCursor + override val jdbcProperties: Map = mapOf() + + override val resourceAcquisitionHeartbeat: Duration + get() = Duration.ofMillis(10) +} + +/** [SourceConfigurationFactory] implementation for [H2Source]. 
*/
+@Singleton
+@Requires(env = [Environment.TEST])
+@Secondary
+class H2SourceConfigurationFactory :
+    SourceConfigurationFactory<H2SourceConfigurationJsonObject, H2SourceConfiguration> {
+    /**
+     * Maps the deserialized [pojo] onto an [H2SourceConfiguration].
+     *
+     * Defaults applied here:
+     * - no tunnel method: [SshNoTunnelMethod];
+     * - null or empty schema list: the single schema `PUBLIC`;
+     * - no cursor configuration: [UserDefinedCursor];
+     * - `resumablePreferred` is true unless it is explicitly `false`;
+     * - `maxConcurrency` is always 1;
+     * - a null, zero or negative `timeout`: a checkpoint target interval of 100 days.
+     *
+     * A non-null `timeout` is parsed with [Duration.parse]; nothing here catches the exception
+     * thrown for malformed text.
+     */
+    override fun makeWithoutExceptionHandling(
+        pojo: H2SourceConfigurationJsonObject,
+    ): H2SourceConfiguration {
+        val sshConnectionOptions: SshConnectionOptions =
+            SshConnectionOptions.fromAdditionalProperties(pojo.getAdditionalProperties())
+        // `timeout` is declared nullable, and Duration.parse(null) throws a
+        // NullPointerException, so a null value takes the same default as a non-positive one.
+        val checkpointTargetInterval: Duration =
+            pojo.timeout?.let { Duration.parse(it) }?.takeIf { it.isPositive }
+                ?: Duration.ofDays(100L)
+        return H2SourceConfiguration(
+            realHost = pojo.host,
+            realPort = pojo.port,
+            sshTunnel = pojo.getTunnelMethodValue() ?: SshNoTunnelMethod,
+            sshConnectionOptions = sshConnectionOptions,
+            jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:${pojo.database}",
+            schemas = pojo.schemas?.takeUnless { it.isEmpty() }?.toSet() ?: setOf("PUBLIC"),
+            cursor = pojo.getCursorConfigurationValue() ?: UserDefinedCursor,
+            resumablePreferred = pojo.resumablePreferred != false,
+            maxConcurrency = 1,
+            checkpointTargetInterval = checkpointTargetInterval,
+        )
+    }
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt new file mode 100644 index 0000000000000..f5289e1fab7ae --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt @@ -0,0 +1,162 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
*/ +package io.airbyte.cdk.h2source + +import com.fasterxml.jackson.annotation.JsonAnyGetter +import com.fasterxml.jackson.annotation.JsonAnySetter +import com.fasterxml.jackson.annotation.JsonGetter +import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.annotation.JsonProperty +import com.fasterxml.jackson.annotation.JsonPropertyDescription +import com.fasterxml.jackson.annotation.JsonPropertyOrder +import com.fasterxml.jackson.annotation.JsonSetter +import com.fasterxml.jackson.annotation.JsonSubTypes +import com.fasterxml.jackson.annotation.JsonTypeInfo +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaArrayWithUniqueItems +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaDefault +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaDescription +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaInject +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings +import io.airbyte.cdk.ConfigErrorException +import io.airbyte.cdk.command.CONNECTOR_CONFIG_PREFIX +import io.airbyte.cdk.command.ConfigurationJsonObjectBase +import io.airbyte.cdk.ssh.MicronautPropertiesFriendlySshTunnelMethodConfigurationJsonObject +import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration +import io.micronaut.context.annotation.ConfigurationBuilder +import io.micronaut.context.annotation.ConfigurationProperties +import io.micronaut.context.annotation.Secondary +import jakarta.inject.Singleton + +/** [ConfigurationJsonObjectBase] implementation for [H2Source]. 
*/ +@JsonSchemaTitle("Test Source Spec") +@JsonPropertyOrder( + value = + [ + "host", + "port", + "database", + "schemas", + "tunnel_method", + "cursor", + ], +) +@Singleton +@Secondary +@ConfigurationProperties(CONNECTOR_CONFIG_PREFIX) +@SuppressFBWarnings(value = ["NP_NONNULL_RETURN_VIOLATION"], justification = "Micronaut DI") +class H2SourceConfigurationJsonObject : ConfigurationJsonObjectBase() { + @JsonProperty("host") + @JsonSchemaTitle("Host") + @JsonSchemaInject(json = """{"order":1}""") + @JsonSchemaDefault("localhost") + @JsonPropertyDescription("Hostname of the database.") + var host: String = "localhost" + + @JsonProperty("port") + @JsonSchemaTitle("Port") + @JsonSchemaInject(json = """{"order":2,"minimum": 0,"maximum": 65536}""") + @JsonSchemaDefault("9092") + @JsonPropertyDescription("Port of the database.") + var port: Int = 9092 + + @JsonProperty("database") + @JsonSchemaTitle("Database") + @JsonPropertyDescription("Name of the database.") + @JsonSchemaInject(json = """{"order":3}""") + lateinit var database: String + + @JsonProperty("schemas") + @JsonSchemaTitle("Schemas") + @JsonSchemaArrayWithUniqueItems("schemas") + @JsonPropertyDescription("The list of schemas to sync from. Defaults to PUBLIC.") + @JsonSchemaInject(json = """{"order":4,"minItems":1,"uniqueItems":true}""") + var schemas: List? = null + + @JsonIgnore + @ConfigurationBuilder(configurationPrefix = "tunnel_method") + val tunnelMethod = MicronautPropertiesFriendlySshTunnelMethodConfigurationJsonObject() + + @JsonIgnore var tunnelMethodJson: SshTunnelMethodConfiguration? = null + + @JsonSetter("tunnel_method") + fun setTunnelMethodValue(value: SshTunnelMethodConfiguration?) 
{ + tunnelMethodJson = value + } + + @JsonGetter("tunnel_method") + @JsonSchemaTitle("SSH Tunnel Method") + @JsonPropertyDescription( + "Whether to initiate an SSH tunnel before connecting to the database," + + " and if so, which kind of authentication to use.", + ) + @JsonSchemaInject(json = """{"order":5}""") + fun getTunnelMethodValue(): SshTunnelMethodConfiguration? = + tunnelMethodJson ?: tunnelMethod.asSshTunnelMethod() + + @JsonIgnore + @ConfigurationBuilder(configurationPrefix = "cursor") + val cursor = MicronautPropertiesFriendlyCursorConfiguration() + + @JsonIgnore var cursorJson: CursorConfiguration? = null + + @JsonSetter("cursor") + fun setCursorMethodValue(value: CursorConfiguration?) { + cursorJson = value + } + + @JsonGetter("cursor") + @JsonSchemaTitle("Update Method") + @JsonPropertyDescription("Configures how data is extracted from the database.") + @JsonSchemaInject(json = """{"order":6,"display_type":"radio"}""") + fun getCursorConfigurationValue(): CursorConfiguration? = + cursorJson ?: cursor.asCursorConfiguration() + + @JsonProperty("resumable_preferred") + @JsonSchemaDefault("true") + @JsonSchemaInject(json = """{"order":7,"display_type":"check"}""") + var resumablePreferred: Boolean? = true + + @JsonProperty("timeout") + @JsonSchemaDefault("PT0S") + @JsonSchemaInject(json = """{"order":8}""") + var timeout: String? 
= "PT0S" + + @JsonIgnore var additionalPropertiesMap = mutableMapOf() + + @JsonAnyGetter fun getAdditionalProperties(): Map = additionalPropertiesMap + + @JsonAnySetter + fun setAdditionalProperty( + name: String, + value: Any, + ) { + additionalPropertiesMap[name] = value + } +} + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "cursor_method") +@JsonSubTypes( + JsonSubTypes.Type(value = UserDefinedCursor::class, name = "user_defined"), + JsonSubTypes.Type(value = CdcCursor::class, name = "cdc"), +) +@JsonSchemaTitle("Update Method") +@JsonSchemaDescription("Configures how data is extracted from the database.") +sealed interface CursorConfiguration + +@JsonSchemaTitle("Scan Changes with User Defined Cursor") +data object UserDefinedCursor : CursorConfiguration + +@JsonSchemaTitle("Read Changes using Change Data Capture (CDC)") +data object CdcCursor : CursorConfiguration + +@ConfigurationProperties("$CONNECTOR_CONFIG_PREFIX.cursor") +class MicronautPropertiesFriendlyCursorConfiguration { + var cursorMethod: String = "user_defined" + + fun asCursorConfiguration(): CursorConfiguration = + when (cursorMethod) { + "user_defined" -> UserDefinedCursor + "cdc" -> CdcCursor + else -> throw ConfigErrorException("invalid value $cursorMethod") + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt similarity index 95% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt index 9adeec57816be..b561dcef6ee1d 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt @@ 
-1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.h2source import io.airbyte.cdk.discover.FieldType import io.airbyte.cdk.discover.JdbcMetadataQuerier @@ -34,6 +34,7 @@ import io.airbyte.cdk.read.From import io.airbyte.cdk.read.FromNode import io.airbyte.cdk.read.FromSample import io.airbyte.cdk.read.Greater +import io.airbyte.cdk.read.GreaterOrEqual import io.airbyte.cdk.read.Lesser import io.airbyte.cdk.read.LesserOrEqual import io.airbyte.cdk.read.Limit @@ -61,11 +62,11 @@ import io.micronaut.context.env.Environment import jakarta.inject.Singleton import java.sql.JDBCType -/** Stateless connector-specific logic for [FakeSource]. */ +/** Stateless connector-specific logic for [H2Source]. */ @Singleton @Requires(env = [Environment.TEST]) @Secondary -class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGenerator { +class H2SourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGenerator { override fun toFieldType(c: JdbcMetadataQuerier.ColumnMetadata): FieldType = when (c.type.jdbcType) { JDBCType.BIT, @@ -148,6 +149,7 @@ class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGen is And -> conj.map { it.sql() }.joinToString(") AND (", "(", ")") is Or -> disj.map { it.sql() }.joinToString(") OR (", "(", ")") is Equal -> "${column.id} = ?" + is GreaterOrEqual -> "${column.id} >= ?" is Greater -> "${column.id} > ?" is LesserOrEqual -> "${column.id} <= ?" is Lesser -> "${column.id} < ?" 
@@ -179,7 +181,7 @@ class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGen is Or -> disj.flatMap { it.bindings() } is WhereClauseLeafNode -> { val type = column.type as LosslessJdbcFieldType<*, *> - listOf(io.airbyte.cdk.read.SelectQuery.Binding(bindingValue, type)) + listOf(SelectQuery.Binding(bindingValue, type)) } } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml new file mode 100644 index 0000000000000..52d72019ae547 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml @@ -0,0 +1,6 @@ +--- +airbyte: + connector: + extract: + jdbc: + mode: sequential diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json new file mode 100644 index 0000000000000..1656ebc276c36 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json @@ -0,0 +1,212 @@ +{ + "documentationUrl": "https://docs.airbyte.com", + "connectionSpecification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Source Spec", + "type": "object", + "additionalProperties": true, + "properties": { + "host": { + "type": "string", + "default": "localhost", + "description": "Hostname of the database.", + "title": "Host", + "order": 1 + }, + "port": { + "type": "integer", + "default": 9092, + "description": "Port of the database.", + "title": "Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "database": { + "type": "string", + "description": "Name of the database.", + "title": "Database", + "order": 3 + }, + "schemas": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The list of schemas to sync from. 
Defaults to PUBLIC.", + "title": "Schemas", + "order": 4, + "minItems": 1, + "uniqueItems": true + }, + "tunnel_method": { + "oneOf": [ + { + "title": "No Tunnel", + "type": "object", + "additionalProperties": true, + "description": "No ssh tunnel needed to connect to database", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["NO_TUNNEL"], + "default": "NO_TUNNEL" + } + }, + "required": ["tunnel_method"] + }, + { + "title": "SSH Key Authentication", + "type": "object", + "additionalProperties": true, + "description": "Connect through a jump server tunnel host using username and ssh key", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["SSH_KEY_AUTH"], + "default": "SSH_KEY_AUTH" + }, + "tunnel_host": { + "type": "string", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "title": "SSH Tunnel Jump Server Host", + "order": 1 + }, + "tunnel_port": { + "type": "integer", + "default": 22, + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "title": "SSH Connection Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "tunnel_user": { + "type": "string", + "description": "OS-level username for logging into the jump server host", + "title": "SSH Login Username", + "order": 3 + }, + "ssh_key": { + "type": "string", + "description": "OS-level user account ssh key credentials in RSA PEM format ( created with ssh-keygen -t rsa -m PEM -f myuser_rsa )", + "title": "SSH Private Key", + "order": 4, + "multiline": true, + "airbyte_secret": true + } + }, + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "ssh_key" + ] + }, + { + "title": "Password Authentication", + "type": "object", + "additionalProperties": true, + "description": "Connect through a jump server tunnel host using username and password authentication", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["SSH_PASSWORD_AUTH"], + 
"default": "SSH_PASSWORD_AUTH" + }, + "tunnel_host": { + "type": "string", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "title": "SSH Tunnel Jump Server Host", + "order": 1 + }, + "tunnel_port": { + "type": "integer", + "default": 22, + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "title": "SSH Connection Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "tunnel_user": { + "type": "string", + "description": "OS-level username for logging into the jump server host", + "title": "SSH Login Username", + "order": 3 + }, + "tunnel_user_password": { + "type": "string", + "description": "OS-level password for logging into the jump server host", + "title": "Password", + "order": 4, + "airbyte_secret": true + } + }, + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "tunnel_user_password" + ] + } + ], + "description": "Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.", + "title": "SSH Tunnel Method", + "order": 5, + "type": "object" + }, + "cursor": { + "oneOf": [ + { + "title": "Scan Changes with User Defined Cursor", + "type": "object", + "additionalProperties": true, + "description": "Configures how data is extracted from the database.", + "properties": { + "cursor_method": { + "type": "string", + "enum": ["user_defined"], + "default": "user_defined" + } + }, + "required": ["cursor_method"] + }, + { + "title": "Read Changes using Change Data Capture (CDC)", + "type": "object", + "additionalProperties": true, + "description": "Configures how data is extracted from the database.", + "properties": { + "cursor_method": { + "type": "string", + "enum": ["cdc"], + "default": "cdc" + } + }, + "required": ["cursor_method"] + } + ], + "description": "Configures how data is extracted from the database.", + "title": "Update Method", + "order": 6, + "display_type": "radio", + "type": 
"object" + }, + "resumable_preferred": { + "type": "boolean", + "default": true, + "order": 7, + "display_type": "check" + }, + "timeout": { + "type": "string", + "default": "PT0S", + "order": 8 + } + }, + "required": ["host", "port", "database"] + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml new file mode 100644 index 0000000000000..e136b2dbbb9ee --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml @@ -0,0 +1,4 @@ +--- +data: + dockerRepository: "airbyte/h2-source" + documentationUrl: "https://docs.airbyte.com"