From f3fc79a9bd09414fad53f2d0194abe170926f933 Mon Sep 17 00:00:00 2001
From: Marius Posta
Date: Tue, 20 Aug 2024 11:28:43 -0400
Subject: [PATCH 01/11] extract-jdbc: add GreaterOrEqual

---
 .../cdk/fakesource/FakeSourceOperations.kt    |  2 ++
 .../expected-messages-stream-warm-start.json  | 34 +++++++++++++++++++
 .../io/airbyte/cdk/read/SelectQuerySpec.kt    | 17 +++++++---
 3 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt
index 9adeec57816be..7c8af4fad02b4 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt
+++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt
@@ -34,6 +34,7 @@ import io.airbyte.cdk.read.From
 import io.airbyte.cdk.read.FromNode
 import io.airbyte.cdk.read.FromSample
 import io.airbyte.cdk.read.Greater
+import io.airbyte.cdk.read.GreaterOrEqual
 import io.airbyte.cdk.read.Lesser
 import io.airbyte.cdk.read.LesserOrEqual
 import io.airbyte.cdk.read.Limit
@@ -148,6 +149,7 @@ class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGen
             is And -> conj.map { it.sql() }.joinToString(") AND (", "(", ")")
             is Or -> disj.map { it.sql() }.joinToString(") OR (", "(", ")")
             is Equal -> "${column.id} = ?"
+            is GreaterOrEqual -> "${column.id} >= ?"
             is Greater -> "${column.id} > ?"
             is LesserOrEqual -> "${column.id} <= ?"
             is Lesser -> "${column.id} < ?"
diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json
index 6d6e5adf7c8ba..a9323f871874b 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json
+++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json
@@ -27,6 +27,40 @@
       }
     }
   },
+  {
+    "type": "RECORD",
+    "record": {
+      "namespace": "PUBLIC",
+      "stream": "EVENTS",
+      "data": {
+        "ID": "3VWqE0Hrb7TV5BOEP2wN+g==",
+        "TS": "2024-04-30T00:00:00.000000-04:00",
+        "MSG": null
+      },
+      "emitted_at": 3133641600000
+    }
+  },
+  {
+    "type": "STATE",
+    "state": {
+      "type": "STREAM",
+      "stream": {
+        "stream_descriptor": {
+          "name": "EVENTS",
+          "namespace": "PUBLIC"
+        },
+        "stream_state": {
+          "primary_key": {},
+          "cursors": {
+            "TS": "2024-04-30T00:00:00.000000-04:00"
+          }
+        }
+      },
+      "sourceStats": {
+        "recordCount": 2.0
+      }
+    }
+  },
   {
     "type": "TRACE",
     "trace": {
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt
index 132b987da38e9..86498d4388e1d 100644
--- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerySpec.kt
@@ -26,7 +26,9 @@ sealed interface SelectNode {
 
 data class SelectColumns(
     override val columns: List<Field>,
-) : SelectNode
+) : SelectNode {
+    constructor(vararg columns: Field) : this(columns.toList())
+}
 
 data class SelectColumnMaxValue(
     val column: Field,
@@ -87,6 +89,11 @@ sealed interface WhereClauseLeafNode : WhereClauseNode {
     val bindingValue: JsonNode
 }
 
+data class GreaterOrEqual(
+    override val column: Field,
+    override val bindingValue: JsonNode,
+) : WhereClauseLeafNode
+
 data class Greater(
     override val column: Field,
     override val bindingValue: JsonNode,
@@ -97,12 +104,12 @@ data class LesserOrEqual(
     override val bindingValue: JsonNode,
 ) : WhereClauseLeafNode
 
-data class Equal(
+data class Lesser(
     override val column: Field,
     override val bindingValue: JsonNode,
 ) : WhereClauseLeafNode
 
-data class Lesser(
+data class Equal(
     override val column: Field,
     override val bindingValue: JsonNode,
 ) : WhereClauseLeafNode
@@ -111,7 +118,9 @@ sealed interface OrderByNode
 
 data class OrderBy(
     val columns: List<Field>,
-) : OrderByNode
+) : OrderByNode {
+    constructor(vararg columns: Field) : this(columns.toList())
+}
 
 data object NoOrderBy : OrderByNode
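The new leaf composes with the existing WHERE-clause nodes just like Greater and LesserOrEqual do. As a minimal sketch of the intent (ts, lowerBound and upperBound are hypothetical values, not part of the patch):

    // Closed-below, closed-above slice on a cursor column.
    // `And` is assumed to wrap a list of clauses, as in the reader code below.
    val clause: WhereClauseNode =
        And(listOf(GreaterOrEqual(ts, lowerBound), LesserOrEqual(ts, upperBound)))
    // The FakeSourceOperations generator above renders this as:
    //   (TS >= ?) AND (TS <= ?)

The next patch puts GreaterOrEqual to use when resuming cursor-based reads.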
From 6c36470e20b3c8f12f5a12eb3975c7b9515e2aa7 Mon Sep 17 00:00:00 2001
From: Marius Posta
Date: Tue, 20 Aug 2024 11:28:44 -0400
Subject: [PATCH 02/11] extract-jdbc: cursor incremental query includes lower
 bound

---
 .../cdk/read/StreamPartitionsCreatorUtilsTest.kt   |  1 +
 .../io/airbyte/cdk/read/StreamPartitionReader.kt   | 12 +++++++++++-
 .../io/airbyte/cdk/read/StreamPartitionsCreator.kt | 12 ++++++++++--
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt
index 5be288d662286..68ec98799b98a 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt
+++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt
@@ -147,6 +147,7 @@ class StreamPartitionsCreatorUtilsTest {
             StreamPartitionReader.CursorIncrementalInput(
                 cursor = k,
                 cursorLowerBound = Jsons.numberNode(1),
+                isLowerBoundIncluded = false,
                 cursorUpperBound = Jsons.numberNode(4),
             )
         val splits: List<Pair<List<JsonNode>?, List<JsonNode>?>> =
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt
index f163e54145184..56deee1ba98bd 100644
--- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt
@@ -37,6 +37,7 @@ class StreamPartitionReader(
     data class CursorIncrementalInput(
         val cursor: Field,
        val cursorLowerBound: JsonNode,
+        val isLowerBoundIncluded: Boolean,
        val cursorUpperBound: JsonNode,
     ) : Input

@@ -173,6 +174,7 @@ fun StreamPartitionReader.Input.querySpec(
             stream,
             checkpointColumns = primaryKey,
             checkpointLowerBound = primaryKeyLowerBound,
+            isLowerBoundIncluded = false,
             checkpointUpperBound = primaryKeyUpperBound,
             isOrdered,
             limit,
@@ -182,6 +184,7 @@
             stream,
             checkpointColumns = primaryKey,
             checkpointLowerBound = primaryKeyLowerBound,
+            isLowerBoundIncluded = false,
             checkpointUpperBound = primaryKeyUpperBound,
             isOrdered,
             limit,
@@ -191,6 +194,7 @@
             stream,
             checkpointColumns = listOf(cursor),
             checkpointLowerBound = listOf(cursorLowerBound),
+            isLowerBoundIncluded = isLowerBoundIncluded,
             checkpointUpperBound = listOf(cursorUpperBound),
             isOrdered,
             limit,
@@ -201,6 +205,7 @@ private fun querySpecForStreamPartitionReader(
     stream: Stream,
     checkpointColumns: List<Field>,
     checkpointLowerBound: List<JsonNode>?,
+    isLowerBoundIncluded: Boolean,
     checkpointUpperBound: List<JsonNode>?,
     isOrdered: Boolean,
     limit: Long?,
@@ -215,7 +220,12 @@ private fun querySpecForStreamPartitionReader(
         checkpointLowerBound?.let { checkpointColumns.zip(it) } ?: listOf()
     val lowerBoundDisj: List<WhereClauseNode> =
         zippedLowerBound.mapIndexed { idx: Int, (gtCol: Field, gtValue: JsonNode) ->
-            val lastLeaf: WhereClauseLeafNode = Greater(gtCol, gtValue)
+            val lastLeaf: WhereClauseLeafNode =
+                if (isLowerBoundIncluded && idx == checkpointColumns.size - 1) {
+                    GreaterOrEqual(gtCol, gtValue)
+                } else {
+                    Greater(gtCol, gtValue)
+                }
             And(
                 zippedLowerBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) ->
                     Equal(eqCol, eqValue)
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt
index b0ca55db9865e..4188878557f0e 100644
--- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt
@@ -95,6 +95,7 @@ class StreamPartitionsCreator(
                 StreamPartitionReader.CursorIncrementalInput(
                         cursor = cursor,
                         cursorLowerBound = cursorLowerBound,
+                        isLowerBoundIncluded = true,
                         cursorUpperBound = utils.computeCursorUpperBound(cursor) ?: return listOf(),
                     )
                     .split()
@@ -118,6 +119,7 @@
                 StreamPartitionReader.CursorIncrementalInput(
                         cursor = cursor,
                         cursorLowerBound = cursorLowerBound,
+                        isLowerBoundIncluded = true,
                         cursorUpperBound = cursorUpperBound,
                    )
                     .split()
@@ -137,8 +139,14 @@
 
     fun StreamPartitionReader.CursorIncrementalInput.split(): List<StreamPartitionReader.CursorIncrementalInput> =
-        utils.split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).map { (lb, ub) ->
-            copy(cursorLowerBound = lb!!.first(), cursorUpperBound = ub!!.first())
+        utils.split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).mapIndexed {
+            idx: Int,
+            (lb, ub) ->
+            copy(
+                cursorLowerBound = lb!!.first(),
+                isLowerBoundIncluded = idx == 0,
+                cursorUpperBound = ub!!.first(),
+            )
         }
 
     private val utils = StreamPartitionsCreatorUtils(ctx, parameters)
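With isLowerBoundIncluded, a resumed cursor read re-includes the checkpointed cursor value, so rows sharing that value are no longer skipped; only the first sub-partition produced by split() keeps the closed lower bound, and every later one starts strictly after its predecessor, so rows on the split points are read exactly once. A sketch reusing the test fixture's names, with hypothetical bound values:

    val firstSplit = StreamPartitionReader.CursorIncrementalInput(
        cursor = k,
        cursorLowerBound = Jsons.numberNode(1),
        isLowerBoundIncluded = true, // only the first split keeps a closed lower bound
        cursorUpperBound = Jsons.numberNode(4),
    )
    // querySpec(...) now yields WHERE (K >= ?) AND (K <= ?) for this split,
    // and (K > ?) AND (K <= ?) for every subsequent one.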
From 3131abd625a2ba349622f1780ea689871f9e326f Mon Sep 17 00:00:00 2001
From: Marius Posta
Date: Tue, 20 Aug 2024 11:28:45 -0400
Subject: [PATCH 03/11] extract-jdbc: better handling of NULL cursor values

---
 .../io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt
index 16bdec7d71439..6d0fb7dda799b 100644
--- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt
@@ -4,7 +4,6 @@ package io.airbyte.cdk.read
 import com.fasterxml.jackson.core.JsonGenerator
 import com.fasterxml.jackson.databind.JsonNode
 import com.fasterxml.jackson.databind.node.ObjectNode
-import io.airbyte.cdk.ConfigErrorException
 import io.airbyte.cdk.discover.Field
 import io.airbyte.cdk.util.Jsons
 import io.github.oshai.kotlinlogging.KotlinLogging
@@ -105,7 +104,9 @@ class StreamPartitionsCreatorUtils(
             ctx.selectQuerier.executeQuery(q).use { if (it.hasNext()) it.next() else return null }
         val value: JsonNode = record[cursor.id] ?: Jsons.nullNode()
         if (value.isNull) {
-            throw ConfigErrorException("NULL value found for cursor ${cursor.id}")
+            // Either the table is empty, or its cursor column values are all NULL.
+            // In both cases, there is nothing to be done.
+            return null
         }
         return ctx.transientCursorUpperBoundState.update { value }
     }

From 53e69bb0605ceeb44b86acd6436a6c57b24758a3 Mon Sep 17 00:00:00 2001
From: Marius Posta
Date: Tue, 20 Aug 2024 11:28:46 -0400
Subject: [PATCH 04/11] bulk-cdk-core-extract: rewrite commentary for
 MetadataQuerier

---
 .../io/airbyte/cdk/discover/MetadataQuerier.kt | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt
index 65b4f1e04534c..d671576cc76fc 100644
--- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt
+++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/MetadataQuerier.kt
@@ -3,23 +3,22 @@ package io.airbyte.cdk.discover
 
 import io.airbyte.cdk.command.SourceConfiguration
 
-/** A very thin abstraction around JDBC metadata queries. */
+/** An abstraction for a catalog discovery session. */
 interface MetadataQuerier : AutoCloseable {
-    /**
-     * Queries the information_schema for all table names in the schemas specified by the connector
-     * configuration.
-     */
+
+    /** Returns all available namespaces. */
     fun streamNamespaces(): List<String>
 
+    /** Returns all available stream names in the given namespace. */
     fun streamNames(streamNamespace: String?): List<String>
 
-    /** Executes a SELECT * on the table, discards the results, and extracts all column metadata. */
+    /** Returns all available fields in the given stream. */
     fun fields(
         streamName: String,
         streamNamespace: String?,
     ): List<Field>
 
-    /** Queries the information_schema for any primary key on the given table. */
+    /** Returns the primary key for the given stream, if it exists; empty list otherwise.
*/ fun primaryKey( streamName: String, streamNamespace: String?, From 371f7faa4fa0273605aa9e52a0b97f8751cecad2 Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:47 -0400 Subject: [PATCH 05/11] bulk-cdk-core-extract: replace AirbyteStreamDecorator with AirbyteStreamFactory --- .../cdk/discover/AirbyteStreamDecorator.kt | 52 --------- .../cdk/discover/AirbyteStreamFactory.kt | 27 +++++ .../airbyte/cdk/discover/DiscoverOperation.kt | 52 ++------- .../airbyte/cdk/discover/DiscoveredStream.kt | 12 +++ .../cdk/fakesource/FakeSourceDiscoverTest.kt | 4 + .../resources/fakesource/cdc-catalog.json | 6 +- .../fakesource/expected-cursor-catalog.json | 2 + .../src/test/resources/read/cdc-catalog.json | 6 +- .../test/resources/read/cursor-catalog.json | 2 + .../discover/JdbcAirbyteStreamDecorator.kt | 55 ---------- .../cdk/discover/JdbcAirbyteStreamFactory.kt | 100 ++++++++++++++++++ 11 files changed, 163 insertions(+), 155 deletions(-) delete mode 100644 airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt create mode 100644 airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt create mode 100644 airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoveredStream.kt delete mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt deleted file mode 100644 index 4553510cdcf9c..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamDecorator.kt +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.discover - -import io.airbyte.protocol.models.v0.AirbyteStream - -/** - * Stateless object for building an [AirbyteStream] during DISCOVER. - * - * [DefaultAirbyteStreamDecorator] is the sane default implementation, to be replaced with - * connector-specific implementations when required. - */ -interface AirbyteStreamDecorator { - /** Connector-specific [AirbyteStream] decoration logic for GLOBAL-state streams. */ - fun decorateGlobal(airbyteStream: AirbyteStream) - - /** - * Connector-specific [AirbyteStream] decoration logic for STREAM-state streams for which at - * least one discovered field can be used as a user-defined cursor in incremental syncs. - */ - fun decorateNonGlobal(airbyteStream: AirbyteStream) - - /** - * Connector-specific [AirbyteStream] decoration logic for STREAM-state streams for which no - * discovered field can be used as a user-defined cursor in incremental syncs. - */ - fun decorateNonGlobalNoCursor(airbyteStream: AirbyteStream) - - /** - * Can the field be used as part of a primary key? - * - * For this to be possible, - * 1. the field needs to be part of a key as defined by the source, - * 2. and its values must be deserializable from the checkpoint persisted in an Airbyte state - * message. - * - * This method does not determine (1), of course, because the source keys are defined in the - * source database itself and are retrieved via [MetadataQuerier.primaryKey]. 
Instead, this
- * method determines (2) based on the type information of the field, typically the [FieldType]
- * objects. For instance if the [Field.type] does not map to a [LosslessFieldType] then the
- * field can't reliably round-trip checkpoint values during a resumable initial sync.
- */
-    fun isPossiblePrimaryKeyElement(field: Field): Boolean
-
-    /**
-     * Can the field be used as a cursor in a cursor-based incremental sync?
-     *
-     * This predicate is like [isPossiblePrimaryKeyElement] but tighter: in addition to being able
-     * to round-trip the column values, we need to be able to query the max value from the source at
-     * the start of the sync.
-     */
-    fun isPossibleCursor(field: Field): Boolean
-}
diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt
new file mode 100644
index 0000000000000..93705c4d25122
--- /dev/null
+++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/AirbyteStreamFactory.kt
@@ -0,0 +1,27 @@
+/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
+package io.airbyte.cdk.discover
+
+import io.airbyte.protocol.models.Field as AirbyteField
+import io.airbyte.protocol.models.v0.AirbyteStream
+import io.airbyte.protocol.models.v0.CatalogHelpers
+
+/** Stateless object for building an [AirbyteStream] during DISCOVER. */
+interface AirbyteStreamFactory {
+    /** Connector-specific [AirbyteStream] creation logic for GLOBAL-state streams. */
+    fun createGlobal(discoveredStream: DiscoveredStream): AirbyteStream
+
+    /** Connector-specific [AirbyteStream] creation logic for STREAM-state streams. */
+    fun createNonGlobal(discoveredStream: DiscoveredStream): AirbyteStream
+
+    companion object {
+
+        fun createAirbyteStream(discoveredStream: DiscoveredStream): AirbyteStream =
+            CatalogHelpers.createAirbyteStream(
+                discoveredStream.name,
+                discoveredStream.namespace,
+                discoveredStream.columns.map {
+                    AirbyteField.of(it.id, it.type.airbyteType.asJsonSchemaType())
+                },
+            )
+    }
+}
diff --git a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt
index 7884aef6d66dc..90f1732e6dd96 100644
--- a/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt
+++ b/airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/discover/DiscoverOperation.kt
@@ -4,10 +4,8 @@ package io.airbyte.cdk.discover
 import io.airbyte.cdk.Operation
 import io.airbyte.cdk.command.SourceConfiguration
 import io.airbyte.cdk.output.OutputConsumer
-import io.airbyte.protocol.models.Field as AirbyteField
 import io.airbyte.protocol.models.v0.AirbyteCatalog
 import io.airbyte.protocol.models.v0.AirbyteStream
-import io.airbyte.protocol.models.v0.CatalogHelpers
 import io.github.oshai.kotlinlogging.KotlinLogging
 import io.micronaut.context.annotation.Requires
 import jakarta.inject.Singleton
@@ -18,7 +16,7 @@ class DiscoverOperation(
     val config: SourceConfiguration,
     val metadataQuerierFactory: MetadataQuerier.Factory,
-    val airbyteStreamDecorator: AirbyteStreamDecorator,
+    val airbyteStreamFactory: AirbyteStreamFactory,
     val outputConsumer: OutputConsumer,
 ) : Operation {
     private val log = KotlinLogging.logger {}
@@ -39,50 +37,16 @@ class DiscoverOperation(
                 }
                 val primaryKey: List<List<String>> = metadataQuerier.primaryKey(name, namespace)
                 val discoveredStream =
                    DiscoveredStream(name, namespace, fields, primaryKey)
-                airbyteStreams.add(toAirbyteStream(discoveredStream))
+                val airbyteStream: AirbyteStream =
+                    if (config.global) {
+                        airbyteStreamFactory.createGlobal(discoveredStream)
+                    } else {
+                        airbyteStreamFactory.createNonGlobal(discoveredStream)
+                    }
+                airbyteStreams.add(airbyteStream)
             }
         }
     }
     outputConsumer.accept(AirbyteCatalog().withStreams(airbyteStreams))
 }
-
-    fun toAirbyteStream(discoveredStream: DiscoveredStream): AirbyteStream {
-        val allColumnsByID: Map<String, Field> = discoveredStream.columns.associateBy { it.id }
-        val airbyteStream: AirbyteStream =
-            CatalogHelpers.createAirbyteStream(
-                discoveredStream.name,
-                discoveredStream.namespace,
-                discoveredStream.columns.map {
-                    AirbyteField.of(it.id, it.type.airbyteType.asJsonSchemaType())
-                },
-            )
-        val isValidPK: Boolean =
-            discoveredStream.primaryKeyColumnIDs.all { idComponents: List<String> ->
-                val id: String = idComponents.joinToString(separator = ".")
-                val field: Field? = allColumnsByID[id]
-                field != null && airbyteStreamDecorator.isPossiblePrimaryKeyElement(field)
-            }
-        airbyteStream.withSourceDefinedPrimaryKey(
-            if (isValidPK) discoveredStream.primaryKeyColumnIDs else listOf(),
-        )
-        airbyteStream.isResumable = airbyteStream.sourceDefinedPrimaryKey.isNotEmpty()
-        if (config.global) {
-            // There is a global feed of incremental records, like CDC.
-            airbyteStreamDecorator.decorateGlobal(airbyteStream)
-        } else if (discoveredStream.columns.any { airbyteStreamDecorator.isPossibleCursor(it) }) {
-            // There is one field whose values can be round-tripped and aggregated by MAX.
-            airbyteStreamDecorator.decorateNonGlobal(airbyteStream)
-        } else {
-            // There is no such field.
-            airbyteStreamDecorator.decorateNonGlobalNoCursor(airbyteStream)
-        }
-        return airbyteStream
-    }
-
-    data class DiscoveredStream(
-        val name: String,
-        val namespace: String?,
-        val columns: List<Field>,
-        val primaryKeyColumnIDs: List<List<String>>,
-    )
 }
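With the decorator gone, a connector needing non-default DISCOVER behavior now supplies its own factory bean instead of overriding decoration hooks. A minimal sketch of such an implementation; MySourceStreamFactory is illustrative and not part of the patch, and it relies only on the AirbyteStreamFactory API introduced above:

    @Singleton
    class MySourceStreamFactory : AirbyteStreamFactory {
        // Global (CDC-style) streams support incremental syncs with a source-defined cursor.
        override fun createGlobal(discoveredStream: DiscoveredStream): AirbyteStream =
            AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply {
                supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)
                sourceDefinedCursor = true
            }

        // Non-global streams in this hypothetical source only support full refresh.
        override fun createNonGlobal(discoveredStream: DiscoveredStream): AirbyteStream =
            AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply {
                supportedSyncModes = listOf(SyncMode.FULL_REFRESH)
                sourceDefinedCursor = false
            }
    }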
+ */ + +package io.airbyte.cdk.discover + +data class DiscoveredStream( + val name: String, + val namespace: String?, + val columns: List, + val primaryKeyColumnIDs: List>, +) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt index d7ef4de471d7f..f82b98af7a1c7 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt @@ -33,6 +33,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(EVENTS_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("ID"))) .withIsResumable(true) val kv = @@ -41,6 +42,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(KV_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("K"))) .withIsResumable(true) val expected = AirbyteCatalog().withStreams(listOf(events, kv)) @@ -60,6 +62,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(EVENTS_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("ID"))) .withIsResumable(true) val kv = @@ -68,6 +71,7 @@ class FakeSourceDiscoverTest { .withNamespace("PUBLIC") .withJsonSchema(Jsons.readTree(KV_SCHEMA)) .withSupportedSyncModes(listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedCursor(false) .withSourceDefinedPrimaryKey(listOf(listOf("K"))) .withIsResumable(true) val expected = AirbyteCatalog().withStreams(listOf(events, kv)) diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json index 0b4c5b8d3af91..20c8b3b07b22e 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json @@ -21,7 +21,8 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["ID", "TS"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, "namespace": "PUBLIC" @@ -47,7 +48,8 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["K"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, "namespace": "PUBLIC" diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json index 361331109c391..6238a25d21529 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json @@ -24,6 +24,7 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, @@ -48,6 +49,7 @@ } }, 
"supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json index 7fd5bc286a51a..08eec4fcc91c1 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json @@ -20,7 +20,8 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["ID", "TS"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, "namespace": "PUBLIC" @@ -46,7 +47,8 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], - "default_cursor_field": ["K"], + "source_defined_cursor": false, + "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, "namespace": "PUBLIC" diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json index f1b4850c1fe1f..92d203e0fb205 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json @@ -20,6 +20,7 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["ID"]], "is_resumable": true, @@ -46,6 +47,7 @@ } }, "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, "default_cursor_field": [], "source_defined_primary_key": [["K"]], "is_resumable": true, diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt deleted file mode 100644 index 070b85314d76f..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamDecorator.kt +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.discover - -import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.jdbc.BinaryStreamFieldType -import io.airbyte.cdk.jdbc.BooleanFieldType -import io.airbyte.cdk.jdbc.CharacterStreamFieldType -import io.airbyte.cdk.jdbc.ClobFieldType -import io.airbyte.cdk.jdbc.JsonStringFieldType -import io.airbyte.cdk.jdbc.NCharacterStreamFieldType -import io.airbyte.cdk.jdbc.NClobFieldType -import io.airbyte.protocol.models.v0.AirbyteStream -import io.airbyte.protocol.models.v0.SyncMode -import jakarta.inject.Singleton - -@Singleton -class JdbcAirbyteStreamDecorator : AirbyteStreamDecorator { - override fun decorateGlobal(airbyteStream: AirbyteStream) { - airbyteStream.apply { - supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) - (jsonSchema["properties"] as ObjectNode).apply { - for (metaField in CommonMetaField.entries) { - set(metaField.id, metaField.type.airbyteType.asJsonSchema()) - } - } - defaultCursorField = listOf(CommonMetaField.CDC_LSN.id) - sourceDefinedCursor = true - } - } - - override fun decorateNonGlobal(airbyteStream: AirbyteStream) { - airbyteStream.apply { - supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) - } - } - - override fun decorateNonGlobalNoCursor(airbyteStream: AirbyteStream) { - airbyteStream.apply { supportedSyncModes = listOf(SyncMode.FULL_REFRESH) } - } - - override fun isPossiblePrimaryKeyElement(field: Field): Boolean = - when (field.type) { - !is LosslessFieldType -> false - BinaryStreamFieldType, - CharacterStreamFieldType, - NCharacterStreamFieldType, - ClobFieldType, - NClobFieldType, - JsonStringFieldType, -> false - else -> true - } - - override fun isPossibleCursor(field: Field): Boolean = - isPossiblePrimaryKeyElement(field) && field.type !is BooleanFieldType -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt new file mode 100644 index 0000000000000..8818a0380773b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/discover/JdbcAirbyteStreamFactory.kt @@ -0,0 +1,100 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/
+package io.airbyte.cdk.discover
+
+import com.fasterxml.jackson.databind.node.ObjectNode
+import io.airbyte.cdk.jdbc.BinaryStreamFieldType
+import io.airbyte.cdk.jdbc.BooleanFieldType
+import io.airbyte.cdk.jdbc.CharacterStreamFieldType
+import io.airbyte.cdk.jdbc.ClobFieldType
+import io.airbyte.cdk.jdbc.JsonStringFieldType
+import io.airbyte.cdk.jdbc.NCharacterStreamFieldType
+import io.airbyte.cdk.jdbc.NClobFieldType
+import io.airbyte.protocol.models.v0.SyncMode
+import jakarta.inject.Singleton
+
+@Singleton
+class JdbcAirbyteStreamFactory : AirbyteStreamFactory {
+
+    override fun createGlobal(discoveredStream: DiscoveredStream) =
+        AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply {
+            supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)
+            (jsonSchema["properties"] as ObjectNode).apply {
+                for (metaField in CommonMetaField.entries) {
+                    set(metaField.id, metaField.type.airbyteType.asJsonSchema())
+                }
+            }
+            defaultCursorField = listOf(CommonMetaField.CDC_LSN.id)
+            sourceDefinedCursor = true
+            if (hasValidPrimaryKey(discoveredStream)) {
+                sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs
+                isResumable = true
+            }
+        }
+
+    override fun createNonGlobal(discoveredStream: DiscoveredStream) =
+        AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply {
+            if (hasCursorFields(discoveredStream)) {
+                supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)
+            } else {
+                supportedSyncModes = listOf(SyncMode.FULL_REFRESH)
+            }
+            sourceDefinedCursor = false
+            if (hasValidPrimaryKey(discoveredStream)) {
+                sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs
+                isResumable = true
+            }
+        }
+
+    /** Does the [discoveredStream] have a field that could serve as a cursor? */
+    fun hasCursorFields(discoveredStream: DiscoveredStream): Boolean =
+        discoveredStream.columns.any(::isPossibleCursor)
+
+    /** Does the [discoveredStream] have a valid primary key declared? */
+    fun hasValidPrimaryKey(discoveredStream: DiscoveredStream): Boolean {
+        if (discoveredStream.primaryKeyColumnIDs.isEmpty()) {
+            return false
+        }
+        val allColumnsByID: Map<String, Field> = discoveredStream.columns.associateBy { it.id }
+        return discoveredStream.primaryKeyColumnIDs.all { idComponents: List<String> ->
+            val id: String = idComponents.joinToString(separator = ".")
+            val field: Field? = allColumnsByID[id]
+            field != null && isPossiblePrimaryKeyElement(field)
+        }
+    }
+
+    /**
+     * Can the field be used as part of a primary key?
+     *
+     * For this to be possible,
+     * 1. the field needs to be part of a key as defined by the source,
+     * 2. and its values must be deserializable from the checkpoint persisted in an Airbyte state
+     * message.
+     *
+     * This method does not determine (1), of course, because the source keys are defined in the
+     * source database itself and are retrieved via [MetadataQuerier.primaryKey]. Instead, this
+     * method determines (2) based on the type information of the field, typically the [FieldType]
+     * objects. For instance if the [Field.type] does not map to a [LosslessFieldType] then the
+     * field can't reliably round-trip checkpoint values during a resumable initial sync.
+     */
+    fun isPossiblePrimaryKeyElement(field: Field): Boolean =
+        when (field.type) {
+            !is LosslessFieldType -> false
+            BinaryStreamFieldType,
+            CharacterStreamFieldType,
+            NCharacterStreamFieldType,
+            ClobFieldType,
+            NClobFieldType,
+            JsonStringFieldType, -> false
+            else -> true
+        }
+
+    /**
+     * Can the field be used as a cursor in a cursor-based incremental sync?
+ * + * This predicate is like [isPossiblePrimaryKeyElement] but tighter: in addition to being able + * to round-trip the column values, we need to be able to query the max value from the source at + * the start of the sync. + */ + fun isPossibleCursor(field: Field): Boolean = + isPossiblePrimaryKeyElement(field) && field.type !is BooleanFieldType +} From 596a7b53fe4b13e72eade59ebe847922d6593510 Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:48 -0400 Subject: [PATCH 06/11] extract-jdbc: add and adopt JdbcSharedState and JdbcStreamState --- .../read/StreamPartitionsCreatorUtilsTest.kt | 38 +++--- .../bulk/toolkits/extract-jdbc/build.gradle | 3 + .../airbyte/cdk/read/CheckpointStreamState.kt | 35 ++--- .../cdk/read/DefaultJdbcFetchSizeEstimator.kt | 40 ++++++ .../cdk/read/DefaultJdbcSharedState.kt | 103 +++++++++++++++ .../cdk/read/DefaultJdbcStreamState.kt | 53 ++++++++ .../cdk/read/DefaultJdbcStreamStateValue.kt | 63 +++++++++ .../cdk/read/DefaultRowByteSizeEstimator.kt | 40 ++++++ .../io/airbyte/cdk/read/JdbcSharedState.kt | 59 +++++++++ .../io/airbyte/cdk/read/JdbcStreamState.kt | 38 ++++++ .../cdk/read/MemoryFetchSizeEstimator.kt | 52 -------- .../main/kotlin/io/airbyte/cdk/read/Sample.kt | 3 +- .../io/airbyte/cdk/read/SelectQuerier.kt | 10 +- .../airbyte/cdk/read/StreamPartitionReader.kt | 45 ++++--- .../cdk/read/StreamPartitionsCreator.kt | 22 ++-- .../cdk/read/StreamPartitionsCreatorUtils.kt | 11 +- .../io/airbyte/cdk/read/StreamReadContext.kt | 64 ++++----- .../read/DefaultJdbcFetchSizeEstimatorTest.kt | 37 ++++++ .../read/DefaultRowByteSizeEstimatorTest.kt | 42 ++++++ .../io/airbyte/cdk/read/TestFixtures.kt | 121 ++++++++++++++++++ 20 files changed, 705 insertions(+), 174 deletions(-) create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt delete mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt index 68ec98799b98a..5add648d116a8 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt +++ 
b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt @@ -14,7 +14,6 @@ import io.airbyte.cdk.jdbc.JdbcConnectionFactory import io.airbyte.cdk.jdbc.StringFieldType import io.airbyte.cdk.output.BufferingCatalogValidationFailureHandler import io.airbyte.cdk.output.BufferingOutputConsumer -import io.airbyte.cdk.read.MemoryFetchSizeEstimator.Companion.DEFAULT_FETCH_SIZE import io.airbyte.cdk.read.Sample.Kind import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.SyncMode @@ -99,29 +98,13 @@ class StreamPartitionsCreatorUtilsTest { Assertions.assertEquals(Kind.SMALL, sample.kind) } - @Test - fun testMemoryFetchSizeEstimator() { - Assertions.assertEquals( - 14000, - MemoryFetchSizeEstimator(700_000, 1).apply(Sample(listOf(10, 20, 30), Kind.SMALL, 0L)), - ) - Assertions.assertEquals( - 7000, - MemoryFetchSizeEstimator(700_000, 2).apply(Sample(listOf(10, 20, 30), Kind.SMALL, 0L)), - ) - Assertions.assertEquals( - DEFAULT_FETCH_SIZE, - MemoryFetchSizeEstimator(700_000, 2).apply(Sample(listOf(), Kind.MEDIUM, 0L)), - ) - } - @Test fun testCursorUpperBound() { val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) utils.computeCursorUpperBound(k) Assertions.assertEquals( "5", - utils.ctx.transientCursorUpperBoundState.get()?.toString(), + utils.ctx.streamState.cursorUpperBound?.toString(), ) } @@ -166,13 +149,26 @@ class StreamPartitionsCreatorUtilsTest { timeout = "PT1S" } val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) + val sharedState = + DefaultJdbcSharedState( + config, + BufferingOutputConsumer(TestClockFactory().fixed()), + JdbcSelectQuerier(JdbcConnectionFactory(config)), + withSampling = true, + maxSampleSize = 1024, + expectedThroughputBytesPerSecond = 10 * 1024 * 1024, + minFetchSize = 10, + defaultFetchSize = 1_000, + maxFetchSize = 10_000_000, + memoryCapacityRatio = 0.6, + estimatedRecordOverheadBytes = 16, + estimatedFieldOverheadBytes = 16, + ) val ctxManager = StreamReadContextManager( - config, + sharedState, BufferingCatalogValidationFailureHandler(), FakeSourceOperations(), - JdbcSelectQuerier(JdbcConnectionFactory(config)), - BufferingOutputConsumer(TestClockFactory().fixed()), ) val ctx = ctxManager[stream] ctx.resetStream() diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle index 83f004146c5ee..df08dd0c8b84e 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle @@ -1,4 +1,7 @@ dependencies { implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-base') implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-extract') + + testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')) + testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-extract')) } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt index 0e51c8cd5eaaf..f03700f63c81a 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt @@ -1,7 +1,6 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/
 package io.airbyte.cdk.read
 
-import com.fasterxml.jackson.annotation.JsonProperty
 import com.fasterxml.jackson.databind.JsonNode
 import io.airbyte.cdk.command.OpaqueStateValue
 import io.airbyte.cdk.discover.Field
@@ -38,22 +37,20 @@ data class CursorIncrementalCheckpoint(
 ) : CheckpointStreamState
 
 /** Serializes a [CheckpointStreamState] into an [OpaqueStateValue]. */
-fun CheckpointStreamState.opaqueStateValue(): OpaqueStateValue = Jsons.valueToTree(jsonValue())
-
-private fun CheckpointStreamState.jsonValue(): StreamStateJsonValue =
+fun CheckpointStreamState.opaqueStateValue(): OpaqueStateValue =
     when (this) {
-        SnapshotCompleted -> StreamStateJsonValue()
+        SnapshotCompleted -> DefaultJdbcStreamStateValue.snapshotCompleted
         is SnapshotCheckpoint ->
-            StreamStateJsonValue(
-                primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(),
-            )
+            DefaultJdbcStreamStateValue.snapshotCheckpoint(primaryKey, primaryKeyCheckpoint)
         is SnapshotWithCursorCheckpoint ->
-            StreamStateJsonValue(
-                primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(),
-                cursors = mapOf(cursor.id to cursorUpperBound),
+            DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint(
+                primaryKey,
+                primaryKeyCheckpoint,
+                cursor,
+                cursorUpperBound
             )
         is CursorIncrementalCheckpoint ->
-            StreamStateJsonValue(cursors = mapOf(cursor.id to cursorCheckpoint))
+            DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint(cursor, cursorCheckpoint)
     }
 
 /**
@@ -64,20 +61,10 @@ fun OpaqueStateValue?.checkpoint(ctx: StreamReadContext): CheckpointStreamState?
     if (this == null) {
         null
     } else {
-        Jsons.treeToValue(this, StreamStateJsonValue::class.java).checkpoint(ctx)
+        Jsons.treeToValue(this, DefaultJdbcStreamStateValue::class.java).checkpoint(ctx)
     }
 
-/**
- * [StreamStateJsonValue] is like [CheckpointStreamState] but configuration- and catalog-agnostic.
- * This is the object which is used for de/serializing Airbyte STATE message values from/to
- * [OpaqueStateValue]s.
- */
-data class StreamStateJsonValue(
-    @JsonProperty("primary_key") val primaryKey: Map<String, JsonNode> = mapOf(),
-    @JsonProperty("cursors") val cursors: Map<String, JsonNode> = mapOf(),
-)
-
-private fun StreamStateJsonValue.checkpoint(ctx: StreamReadContext): CheckpointStreamState? {
+private fun DefaultJdbcStreamStateValue.checkpoint(ctx: StreamReadContext): CheckpointStreamState? {
     val pkMap: Map<Field, JsonNode> = run {
         if (primaryKey.isEmpty()) {
             return@run mapOf()
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt
new file mode 100644
index 0000000000000..af103b75fde8f
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimator.kt
@@ -0,0 +1,40 @@
+/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
+package io.airbyte.cdk.read
+
+import io.github.oshai.kotlinlogging.KotlinLogging
+
+/**
+ * [DefaultJdbcFetchSizeEstimator] estimates the value of the JDBC fetchSize parameter needed to
+ * fill up the portion of the JVM heap defined by [memoryCapacityRatio].
+ */ +class DefaultJdbcFetchSizeEstimator( + val maxMemoryBytes: Long, + val maxConcurrency: Int, + val minFetchSize: Int, + val defaultFetchSize: Int, + val maxFetchSize: Int, + val memoryCapacityRatio: Double, +) : JdbcSharedState.JdbcFetchSizeEstimator { + private val log = KotlinLogging.logger {} + + override fun apply(rowByteSizeSample: Sample): Int { + val maxRowBytes: Long = rowByteSizeSample.sampledValues.maxOrNull() ?: 0L + log.info { + "Maximum row size in ${rowByteSizeSample.kind.name} table is $maxRowBytes bytes." + } + val targetMemoryUse: Long = (maxMemoryBytes * memoryCapacityRatio).toLong() + if (listOf(maxRowBytes, targetMemoryUse, maxConcurrency.toLong()).any { it <= 0L }) { + return defaultFetchSize + } + val targetMemoryUsePerQuery: Long = targetMemoryUse / maxConcurrency + log.info { + "Targeting a maximum of $targetMemoryUsePerQuery bytes " + + "for each of up to $maxConcurrency queries." + } + val maxRowsFetchedPerQuery: Long = targetMemoryUsePerQuery / maxRowBytes + return maxRowsFetchedPerQuery + .coerceAtLeast(minFetchSize.toLong()) + .coerceAtMost(maxFetchSize.toLong()) + .toInt() + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt new file mode 100644 index 0000000000000..346d314e0acf5 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.MetadataYamlPropertySource.Companion.PROPERTY_PREFIX +import io.airbyte.cdk.output.OutputConsumer +import io.micronaut.context.annotation.Value +import jakarta.inject.Singleton +import kotlinx.coroutines.sync.Semaphore + +/** Default implementation of [JdbcSharedState]. */ +@Singleton +class DefaultJdbcSharedState( + override val configuration: JdbcSourceConfiguration, + override val outputConsumer: OutputConsumer, + override val selectQuerier: SelectQuerier, + @Value("\${$PROPERTY_PREFIX.jdbc.with-sampling:$WITH_SAMPLING}") + override val withSampling: Boolean, + @Value("\${$PROPERTY_PREFIX.jdbc.table-sample-size:$TABLE_SAMPLE_SIZE}") + override val maxSampleSize: Int, + /** How many bytes per second we can expect the database to send to the connector. */ + @Value("\${$PROPERTY_PREFIX.jdbc.throughput-bytes-per-second:$THROUGHPUT_BYTES_PER_SECOND}") + val expectedThroughputBytesPerSecond: Long, + /** Smallest possible fetchSize value. */ + @Value("\${$PROPERTY_PREFIX.jdbc.min-fetch-size:$FETCH_SIZE_LOWER_BOUND}") + val minFetchSize: Int, + /** Default fetchSize value, in absence of any other estimate. */ + @Value("\${$PROPERTY_PREFIX.jdbc.default-fetch-size:$DEFAULT_FETCH_SIZE}") + val defaultFetchSize: Int, + /** Largest possible fetchSize value. */ + @Value("\${$PROPERTY_PREFIX.jdbc.max-fetch-size:$FETCH_SIZE_UPPER_BOUND}") + val maxFetchSize: Int, + /** How much of the JVM heap can we fill up with [java.sql.ResultSet] data. */ + @Value("\${$PROPERTY_PREFIX.jdbc.memory-capacity-ratio:$MEM_CAPACITY_RATIO}") + val memoryCapacityRatio: Double, + /** Estimated bytes used as overhead for each row in a [java.sql.ResultSet]. 
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt
new file mode 100644
index 0000000000000..346d314e0acf5
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.read
+
+import io.airbyte.cdk.command.JdbcSourceConfiguration
+import io.airbyte.cdk.command.MetadataYamlPropertySource.Companion.PROPERTY_PREFIX
+import io.airbyte.cdk.output.OutputConsumer
+import io.micronaut.context.annotation.Value
+import jakarta.inject.Singleton
+import kotlinx.coroutines.sync.Semaphore
+
+/** Default implementation of [JdbcSharedState]. */
+@Singleton
+class DefaultJdbcSharedState(
+    override val configuration: JdbcSourceConfiguration,
+    override val outputConsumer: OutputConsumer,
+    override val selectQuerier: SelectQuerier,
+    @Value("\${$PROPERTY_PREFIX.jdbc.with-sampling:$WITH_SAMPLING}")
+    override val withSampling: Boolean,
+    @Value("\${$PROPERTY_PREFIX.jdbc.table-sample-size:$TABLE_SAMPLE_SIZE}")
+    override val maxSampleSize: Int,
+    /** How many bytes per second we can expect the database to send to the connector. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.throughput-bytes-per-second:$THROUGHPUT_BYTES_PER_SECOND}")
+    val expectedThroughputBytesPerSecond: Long,
+    /** Smallest possible fetchSize value. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.min-fetch-size:$FETCH_SIZE_LOWER_BOUND}")
+    val minFetchSize: Int,
+    /** Default fetchSize value, in absence of any other estimate. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.default-fetch-size:$DEFAULT_FETCH_SIZE}")
+    val defaultFetchSize: Int,
+    /** Largest possible fetchSize value. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.max-fetch-size:$FETCH_SIZE_UPPER_BOUND}")
+    val maxFetchSize: Int,
+    /** How much of the JVM heap can we fill up with [java.sql.ResultSet] data. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.memory-capacity-ratio:$MEM_CAPACITY_RATIO}")
+    val memoryCapacityRatio: Double,
+    /** Estimated bytes used as overhead for each row in a [java.sql.ResultSet]. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.estimated-record-overhead-bytes:$RECORD_OVERHEAD_BYTES}")
+    val estimatedRecordOverheadBytes: Long,
+    /** Estimated bytes used as overhead for each column value in a [java.sql.ResultSet]. */
+    @Value("\${$PROPERTY_PREFIX.jdbc.estimated-field-overhead-bytes:$FIELD_OVERHEAD_BYTES}")
+    val estimatedFieldOverheadBytes: Long,
+    /** Overrides the JVM heap capacity to provide determinism in tests. */
+    val maxMemoryBytesForTesting: Long? = null
+) : JdbcSharedState {
+
+    val maxPartitionThroughputBytesPerSecond: Long =
+        expectedThroughputBytesPerSecond / configuration.maxConcurrency
+
+    override val targetPartitionByteSize: Long =
+        maxPartitionThroughputBytesPerSecond * configuration.checkpointTargetInterval.seconds
+
+    override fun jdbcFetchSizeEstimator(): JdbcSharedState.JdbcFetchSizeEstimator =
+        DefaultJdbcFetchSizeEstimator(
+            maxMemoryBytes = maxMemoryBytesForTesting ?: Runtime.getRuntime().maxMemory(),
+            configuration.maxConcurrency,
+            minFetchSize,
+            defaultFetchSize,
+            maxFetchSize,
+            memoryCapacityRatio,
+        )
+
+    override fun rowByteSizeEstimator(): JdbcSharedState.RowByteSizeEstimator =
+        DefaultRowByteSizeEstimator(estimatedRecordOverheadBytes, estimatedFieldOverheadBytes)
+
+    internal val semaphore = Semaphore(configuration.maxConcurrency)
+
+    override fun tryAcquireResourcesForCreator(): StreamPartitionsCreator.AcquiredResources? =
+        if (semaphore.tryAcquire()) {
+            StreamPartitionsCreator.AcquiredResources { semaphore.release() }
+        } else {
+            null
+        }
+
+    override fun tryAcquireResourcesForReader(): StreamPartitionReader.AcquiredResources? =
+        if (semaphore.tryAcquire()) {
+            StreamPartitionReader.AcquiredResources { semaphore.release() }
+        } else {
+            null
+        }
+
+    companion object {
+
+        // Sampling defaults.
+        internal const val WITH_SAMPLING: Boolean = false
+        internal const val TABLE_SAMPLE_SIZE: Int = 1024
+        internal const val THROUGHPUT_BYTES_PER_SECOND: Long = 10L shl 20
+
+        // fetchSize defaults
+        internal const val FETCH_SIZE_LOWER_BOUND: Int = 10
+        internal const val DEFAULT_FETCH_SIZE: Int = 1_000
+        internal const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000
+
+        // Memory estimate defaults.
+        internal const val RECORD_OVERHEAD_BYTES = 16L
+        internal const val FIELD_OVERHEAD_BYTES = 16L
+        // We're targeting use of 60% of the available memory in order to allow
+        // for some headroom for other garbage collection.
+        internal const val MEM_CAPACITY_RATIO: Double = 0.6
+    }
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt
new file mode 100644
index 0000000000000..88e4712b7189a
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.read
+
+import com.fasterxml.jackson.databind.JsonNode
+import java.util.concurrent.atomic.AtomicReference
+
+/** Default implementation of [JdbcStreamState]. */
+class DefaultJdbcStreamState(
+    override val sharedState: DefaultJdbcSharedState,
+    override val stream: Stream,
+) : JdbcStreamState<DefaultJdbcSharedState> {
+
+    override var cursorUpperBound: JsonNode?
+        get() = transient.get().cursorUpperBound
+        set(value) {
+            transient.updateAndGet { it.copy(cursorUpperBound = value) }
+        }
+
+    override var fetchSize: Int?
+        get() = transient.get().fetchSize
+        set(value) {
+            transient.updateAndGet { it.copy(fetchSize = value) }
+        }
+
+    override val fetchSizeOrDefault: Int
+        get() = fetchSize ?: sharedState.defaultFetchSize
+
+    override val limit: Long
+        get() = fetchSizeOrDefault * transient.get().limitState.current
+
+    private val transient = AtomicReference(Transient.initial)
+
+    override fun updateLimitState(fn: (LimitState) -> LimitState) {
+        transient.updateAndGet { it.copy(limitState = fn(it.limitState)) }
+    }
+
+    override fun reset() {
+        transient.set(Transient.initial)
+    }
+
+    private data class Transient(
+        val fetchSize: Int?,
+        val limitState: LimitState,
+        val cursorUpperBound: JsonNode?,
+    ) {
+        companion object {
+            val initial = Transient(fetchSize = null, LimitState.minimum, cursorUpperBound = null)
+        }
+    }
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt
new file mode 100644
index 0000000000000..ee4ac3f398767
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamStateValue.kt
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.read
+
+import com.fasterxml.jackson.annotation.JsonProperty
+import com.fasterxml.jackson.databind.JsonNode
+import io.airbyte.cdk.command.OpaqueStateValue
+import io.airbyte.cdk.discover.Field
+import io.airbyte.cdk.util.Jsons
+
+/**
+ * [DefaultJdbcStreamStateValue] is used by [DefaultJdbcPartitionFactory] for deserializing an
+ * [OpaqueStateValue] into a [DefaultJdbcPartition]. The latter is able to, in turn, serialize a
+ * partition boundary into an [OpaqueStateValue] with [DefaultJdbcStreamStateValue].
+ */
+data class DefaultJdbcStreamStateValue(
+    @JsonProperty("primary_key") val primaryKey: Map<String, JsonNode> = mapOf(),
+    @JsonProperty("cursors") val cursors: Map<String, JsonNode> = mapOf(),
+) {
+    companion object {
+        /** Value representing the completion of a FULL_REFRESH snapshot. */
+        val snapshotCompleted: OpaqueStateValue
+            get() = Jsons.valueToTree(DefaultJdbcStreamStateValue())
+
+        /** Value representing the progress of an ongoing snapshot not involving cursor columns. */
+        fun snapshotCheckpoint(
+            primaryKey: List<Field>,
+            primaryKeyCheckpoint: List<JsonNode>,
+        ): OpaqueStateValue =
+            Jsons.valueToTree(
+                DefaultJdbcStreamStateValue(
+                    primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(),
+                )
+            )
+
+        /** Value representing the progress of an ongoing snapshot involving cursor columns. */
+        fun snapshotWithCursorCheckpoint(
+            primaryKey: List<Field>,
+            primaryKeyCheckpoint: List<JsonNode>,
+            cursor: Field,
+            cursorUpperBound: JsonNode,
+        ): OpaqueStateValue =
+            Jsons.valueToTree(
+                DefaultJdbcStreamStateValue(
+                    primaryKey = primaryKey.map { it.id }.zip(primaryKeyCheckpoint).toMap(),
+                    cursors = mapOf(cursor.id to cursorUpperBound),
+                )
+            )
+
+        /** Value representing the progress of an ongoing incremental cursor read.
*/ + fun cursorIncrementalCheckpoint( + cursor: Field, + cursorCheckpoint: JsonNode, + ): OpaqueStateValue = + Jsons.valueToTree( + DefaultJdbcStreamStateValue( + cursors = mapOf(cursor.id to cursorCheckpoint), + ) + ) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt new file mode 100644 index 0000000000000..7694472f9ad06 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimator.kt @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.core.JsonGenerator +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.util.Jsons +import java.io.OutputStream + +/** Estimates the in-memory byte size of a table row based on its [ObjectNode] representation. */ +class DefaultRowByteSizeEstimator( + val estimatedRecordOverheadBytes: Long, + val estimatedFieldOverheadBytes: Long, +) : JdbcSharedState.RowByteSizeEstimator { + private var counter: Long = 0L + + override fun apply(record: ObjectNode): Long { + counter = 0L + Jsons.writeValue(jsonGenerator, record) + // The counter value includes the byte count on field name encodings; subtract this. + // We don't want the estimate to depend on the column name lengths. + val adjustedFieldOverheadBytes: Long = + record.fields().asSequence().sumOf { (fieldName: String, _) -> + val fieldNameOvercount: Int = ",\"".length + fieldName.length + "\":".length + estimatedFieldOverheadBytes - fieldNameOvercount + } + return estimatedRecordOverheadBytes + counter + adjustedFieldOverheadBytes + } + + private val countingOutputStream = + object : OutputStream() { + override fun write(b: Int) { + counter++ + } + } + + private val jsonGenerator: JsonGenerator = Jsons.createGenerator(countingOutputStream) +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt new file mode 100644 index 0000000000000..30c779e5df69d --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.output.OutputConsumer +import io.micronaut.context.annotation.DefaultImplementation + +/** + * Encapsulates database-specific state, both constant or transient, common to all partitions. + * + * Implementations should be thread-safe. + */ +@DefaultImplementation(DefaultJdbcSharedState::class) +interface JdbcSharedState { + + /** Configuration for the JDBC source connector. */ + val configuration: JdbcSourceConfiguration + + /** Where the records get dumped into. */ + val outputConsumer: OutputConsumer + + /** Queries the database. */ + val selectQuerier: SelectQuerier + + /** Is sampling the streams a good idea? */ + val withSampling: Boolean + + /** Sample size limit. */ + val maxSampleSize: Int + + /** Targeted memory footprint of a partition, in bytes. */ + val targetPartitionByteSize: Long + + /** Creates a new instance of a [JdbcFetchSizeEstimator]. 
+     */
+    fun jdbcFetchSizeEstimator(): JdbcFetchSizeEstimator
+
+    fun interface JdbcFetchSizeEstimator {
+        /** Estimates a good JDBC fetchSize value based on a [rowByteSizeSample]. */
+        fun apply(rowByteSizeSample: Sample<Long>): Int
+    }
+
+    /** Creates a new instance of a [RowByteSizeEstimator]. */
+    fun rowByteSizeEstimator(): RowByteSizeEstimator
+
+    fun interface RowByteSizeEstimator {
+        /** Estimates the memory footprint of a row based on its corresponding [record]. */
+        fun apply(record: ObjectNode): Long
+    }
+
+    /** Tries to acquire global resources for [StreamPartitionsCreator]. */
+    fun tryAcquireResourcesForCreator(): StreamPartitionsCreator.AcquiredResources?
+
+    /** Tries to acquire global resources for [StreamPartitionReader]. */
+    fun tryAcquireResourcesForReader(): StreamPartitionReader.AcquiredResources?
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt
new file mode 100644
index 0000000000000..872340c4877c6
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcStreamState.kt
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.read
+
+import com.fasterxml.jackson.databind.JsonNode
+
+/**
+ * Encapsulates database-specific transient state for a particular [stream].
+ *
+ * Implementations should be thread-safe.
+ */
+interface JdbcStreamState<A : JdbcSharedState> {
+
+    val stream: Stream
+
+    /** The state shared by all partitions. Includes global resources. */
+    val sharedState: A
+
+    /** Value to use as upper bound for the cursor column. */
+    var cursorUpperBound: JsonNode?
+
+    /** Value to use for the JDBC fetchSize, if specified. */
+    var fetchSize: Int?
+
+    /** Same as [fetchSize], but falls back to a default value. */
+    val fetchSizeOrDefault: Int
+
+    /** Value to use for the LIMIT clause in resumable reads, if applicable. */
+    val limit: Long
+
+    /** Adjusts the [limit] value up or down. */
+    fun updateLimitState(fn: (LimitState) -> LimitState)
+
+    /** Resets the transient state to its initial setting. */
+    fun reset()
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt
deleted file mode 100644
index 4d73e339b3c8b..0000000000000
--- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/MemoryFetchSizeEstimator.kt
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
-package io.airbyte.cdk.read
-
-import io.github.oshai.kotlinlogging.KotlinLogging
-import kotlin.math.max
-import kotlin.math.min
-
-/**
- * [MemoryFetchSizeEstimator] is used to estimate the value of the JDBC fetchSize parameter to fill
- * up a portion of the JVM heap defined by [MEM_CAPACITY_RATIO].
- */ -class MemoryFetchSizeEstimator( - val maxMemoryBytes: Long, - val maxConcurrency: Int, -) { - private val log = KotlinLogging.logger {} - - fun apply(rowByteSizeSample: Sample): Int { - val maxRowBytes: Long = rowByteSizeSample.sampledValues.maxOrNull() ?: 0L - log.info { - "maximum row size in ${rowByteSizeSample.kind.name} table is $maxRowBytes bytes" - } - val targetMemoryUse: Long = (maxMemoryBytes * MEM_CAPACITY_RATIO).toLong() - if (listOf(maxRowBytes, targetMemoryUse, maxConcurrency.toLong()).any { it <= 0L }) { - return DEFAULT_FETCH_SIZE - } - val targetMemoryUsePerQuery: Long = targetMemoryUse / maxConcurrency - log.info { - "targeting a maximum of $targetMemoryUsePerQuery bytes " + - "for each of up to $maxConcurrency queries" - } - val maxRowsFetchedPerQuery: Long = targetMemoryUsePerQuery / maxRowBytes - return max( - FETCH_SIZE_LOWER_BOUND, - min( - maxRowsFetchedPerQuery, - FETCH_SIZE_UPPER_BOUND.toLong(), - ) - .toInt(), - ) - } - - companion object { - const val FETCH_SIZE_LOWER_BOUND: Int = 10 - const val DEFAULT_FETCH_SIZE: Int = 1_000 - const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000 - - // We're targeting use of 60% of the available memory in order to allow - // for some headroom for other garbage collection. - const val MEM_CAPACITY_RATIO: Double = 0.6 - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt index 059061ec2740a..eed60db50ca7d 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/Sample.kt @@ -1,6 +1,7 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.read +/** Convenience object for maintaining sampled data and its accompanying metadata. */ data class Sample( val sampledValues: List, val kind: Kind, @@ -10,7 +11,7 @@ data class Sample( enum class Kind { EMPTY, // the table is empty; - TINY, // the table has less rows than the target sample size; + TINY, // the table has fewer rows than the target sample size; SMALL, // collecting the sample still requires a full table scan; MEDIUM, // collecting the sample is possible while sampling at ~0.3%; LARGE, // collecting the sample is possible while sampling most aggressively. diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt index ca84de7a91220..93ebb28f4e148 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/SelectQuerier.kt @@ -71,12 +71,17 @@ class JdbcSelectQuerier( var isReady = false var hasNext = false + var hasLoggedResultsReceived = false override fun hasNext(): Boolean { // hasNext() is idempotent if (isReady) return hasNext // Advance to the next row to become ready again. hasNext = rs!!.next() + if (!hasLoggedResultsReceived) { + log.info { "Received results from server." 
} + hasLoggedResultsReceived = true + } if (!hasNext) { close() } @@ -107,7 +112,10 @@ class JdbcSelectQuerier( isReady = true hasNext = false try { - rs?.close() + if (rs != null) { + log.info { "Closing ${q.sql}" } + rs!!.close() + } } finally { rs = null try { diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt index 56deee1ba98bd..77a776ff94cc4 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt @@ -4,6 +4,7 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.ObjectNode import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.read.StreamPartitionsCreator.AcquiredResources import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.AirbyteRecordMessage import java.util.concurrent.atomic.AtomicBoolean @@ -45,15 +46,20 @@ class StreamPartitionReader( val preferResumable: Boolean, ) - override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus = - if (ctx.querySemaphore.tryAcquire()) { - PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN - } else { - PartitionReader.TryAcquireResourcesStatus.RETRY_LATER - } + fun interface AcquiredResources : AutoCloseable + + val acquiredResources = AtomicReference(null) + + override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus { + val acquiredResources: AcquiredResources = + ctx.sharedState.tryAcquireResourcesForReader() + ?: return PartitionReader.TryAcquireResourcesStatus.RETRY_LATER + this.acquiredResources.set(acquiredResources) + return PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN + } override fun releaseResources() { - ctx.querySemaphore.release() + acquiredResources.getAndSet(null)?.close() } val resumable: Boolean = @@ -72,7 +78,7 @@ class StreamPartitionReader( override suspend fun run() { // Store the transient state at the start of the run for use in checkpoint(). val transientState = - TransientState(ctx.transientLimitState.get(), ctx.transientFetchSize.get()) + TransientState(ctx.streamState.limit, ctx.streamState.fetchSizeOrDefault) incumbentTransientState.set(transientState) // Build the query. val querySpec: SelectQuerySpec = @@ -105,7 +111,7 @@ class StreamPartitionReader( // If progress can be checkpointed at any time, // check activity periodically to handle timeout. if (!resumable) continue - if (numRecords.get() % transientState.fetchSizeOrLowerBound != 0L) continue + if (numRecords.get() % transientState.fetchSize != 0L) continue coroutineContext.ensureActive() } } @@ -122,8 +128,8 @@ class StreamPartitionReader( checkpointState = input.checkpoint(lastRecord.get()) // Decrease the limit clause for the next PartitionReader, because it's too big. // If it had been smaller then run might have completed in time. - ctx.transientLimitState.update { - if (transientState.limitState.current <= it.current) it.down else it + if (transientState.limit <= ctx.streamState.limit) { + ctx.streamState.updateLimitState { it.down } } } else if (resumable) { // The run method executed to completion with a LIMIT clause. @@ -138,8 +144,8 @@ class StreamPartitionReader( } // Increase the limit clause for the next PartitionReader, because it's too small. 
// If it had been bigger then run might have executed for longer. - ctx.transientLimitState.update { - if (it.current <= transientState.limitState.current) it.up else it + if (ctx.streamState.limit <= transientState.limit) { + ctx.streamState.updateLimitState { it.up } } } else { // The run method executed to completion without a LIMIT clause. @@ -150,16 +156,9 @@ class StreamPartitionReader( } inner class TransientState( - val limitState: LimitState, - val fetchSize: Int?, - ) { - val fetchSizeOrLowerBound: Int - get() = fetchSize ?: MemoryFetchSizeEstimator.FETCH_SIZE_LOWER_BOUND - - /** Value to use for the LIMIT clause, if applicable. */ - val limit: Long - get() = fetchSizeOrLowerBound * limitState.current - } + val limit: Long, + val fetchSize: Int, + ) } /** Converts a [StreamPartitionReader.Input] into a [SelectQuerySpec]. */ diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt index 4188878557f0e..f3cbbe6502b9c 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt @@ -5,7 +5,9 @@ import com.fasterxml.jackson.databind.JsonNode import io.airbyte.cdk.ConfigErrorException import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.read.StreamPartitionReader.AcquiredResources import io.airbyte.protocol.models.v0.SyncMode +import java.util.concurrent.atomic.AtomicReference /** Default implementation of [PartitionsCreator] for streams in JDBC sources. */ class StreamPartitionsCreator( @@ -56,17 +58,21 @@ class StreamPartitionsCreator( val throughputBytesPerSecond: Long = 10L * 1024L * 1024L, ) - override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus = + val acquiredResources = AtomicReference(null) + fun interface AcquiredResources : AutoCloseable + + override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus { // Running this PartitionsCreator may not always involve JDBC queries. // In those cases, the semaphore will be released very soon after, so this is OK. - if (ctx.querySemaphore.tryAcquire()) { - PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN - } else { - PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER - } + val acquiredResources: AcquiredResources = + ctx.sharedState.tryAcquireResourcesForCreator() + ?: return PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER + this.acquiredResources.set(acquiredResources) + return PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN + } override fun releaseResources() { - ctx.querySemaphore.release() + acquiredResources.getAndSet(null)?.close() } override suspend fun run(): List = @@ -194,7 +200,7 @@ fun CheckpointStreamState?.streamPartitionsCreatorInput( cursorUpperBound, ) is CursorIncrementalCheckpoint -> - when (val cursorUpperBound: JsonNode? = ctx.transientCursorUpperBoundState.get()) { + when (val cursorUpperBound: JsonNode? 
= ctx.streamState.cursorUpperBound) { null -> StreamPartitionsCreator.CursorIncrementalColdStart( cursor, diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt index 6d0fb7dda799b..c5b858858b26a 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt @@ -37,13 +37,11 @@ class StreamPartitionsCreatorUtils( } // Ensure that the JDBC fetchSize parameter value for this table is set. // Compute it using the sample. - if (ctx.transientFetchSize.get() == null) { + if (ctx.streamState.fetchSize == null) { val rowByteSizeSample: Sample = sample.map { (_, rowByteSize: Long) -> rowByteSize } - val maxMemoryBytes: Long = Runtime.getRuntime().maxMemory() - val fetchSizeEstimator = - MemoryFetchSizeEstimator(maxMemoryBytes, ctx.configuration.maxConcurrency) - ctx.transientFetchSize.update { fetchSizeEstimator.apply(rowByteSizeSample) } + val fetchSizeEstimator = ctx.sharedState.jdbcFetchSizeEstimator() + ctx.streamState.fetchSize = fetchSizeEstimator.apply(rowByteSizeSample) } // Compute partition split boundaries. // First, check if splitting can or should be done, and exit if that isn't the case. @@ -108,7 +106,8 @@ class StreamPartitionsCreatorUtils( // In both cases, there is nothing to be done. return null } - return ctx.transientCursorUpperBoundState.update { value } + ctx.streamState.cursorUpperBound = value + return value } /** Computes the partition split boundaries from the given sample. */ diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt index bde696168bfed..327fccea063dd 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt @@ -1,7 +1,6 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.cdk.read -import com.fasterxml.jackson.databind.JsonNode import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.output.CatalogValidationFailureHandler import io.airbyte.cdk.output.OutputConsumer @@ -10,8 +9,6 @@ import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair import jakarta.inject.Singleton import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.ConcurrentMap -import java.util.concurrent.atomic.AtomicReference -import kotlinx.coroutines.sync.Semaphore /** * A [StreamReadContextManager] may be injected in a [io.airbyte.cdk.read.PartitionsCreatorFactory] @@ -26,64 +23,55 @@ import kotlinx.coroutines.sync.Semaphore */ @Singleton class StreamReadContextManager( - val configuration: JdbcSourceConfiguration, + val sharedState: JdbcSharedState, val handler: CatalogValidationFailureHandler, val selectQueryGenerator: SelectQueryGenerator, - val selectQuerier: SelectQuerier, - val outputConsumer: OutputConsumer, ) { + + val configuration: JdbcSourceConfiguration + get() = sharedState.configuration + + val outputConsumer: OutputConsumer + get() = sharedState.outputConsumer + + val selectQuerier: SelectQuerier + get() = sharedState.selectQuerier + private val map: ConcurrentMap = ConcurrentHashMap() - private val globalSemaphore = Semaphore(configuration.maxConcurrency) - operator fun get(stream: Stream): StreamReadContext = map.getOrPut(stream.namePair) { StreamReadContext( - configuration, handler, selectQueryGenerator, - selectQuerier, - globalSemaphore, - outputConsumer, - stream, + DefaultJdbcStreamState(sharedState as DefaultJdbcSharedState, stream), ) } } class StreamReadContext( - val configuration: JdbcSourceConfiguration, val handler: CatalogValidationFailureHandler, val selectQueryGenerator: SelectQueryGenerator, - val selectQuerier: SelectQuerier, - val querySemaphore: Semaphore, - val outputConsumer: OutputConsumer, - val stream: Stream, + val streamState: JdbcStreamState<*>, ) { - val transientLimitState: TransientState = TransientState(LimitState.minimum) - - val transientCursorUpperBoundState: TransientState = TransientState(null) + val sharedState: JdbcSharedState + get() = streamState.sharedState - val transientFetchSize: TransientState = TransientState(null) + val stream: Stream + get() = streamState.stream - fun resetStream() { - handler.accept(ResetStream(stream.name, stream.namespace)) - transientLimitState.reset() - transientCursorUpperBoundState.reset() - transientFetchSize.reset() - } -} + val configuration: JdbcSourceConfiguration + get() = sharedState.configuration -class TransientState( - val initialState: T, -) { - private val ref: AtomicReference = AtomicReference(initialState) + val outputConsumer: OutputConsumer + get() = sharedState.outputConsumer - fun get(): T = ref.get() + val selectQuerier: SelectQuerier + get() = sharedState.selectQuerier - fun reset() { - ref.set(initialState) + fun resetStream() { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() } - - fun update(fn: (T) -> T): T = ref.updateAndGet(fn) } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt new file mode 100644 index 0000000000000..59e7d98ec3997 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt @@ -0,0 +1,37 @@ +/* + * 
Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.read.Sample.Kind +import io.airbyte.cdk.read.TestFixtures.sharedState +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultJdbcFetchSizeEstimatorTest { + + @Test + fun testSingleSmall() { + val sample = Sample(listOf(10L, 20L, 30L), Kind.SMALL, valueWeight = 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 1) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(14_000, estimator.apply(sample)) + } + + @Test + fun testTwoSmall() { + val sample = Sample(listOf(10L, 20L, 30L), Kind.SMALL, valueWeight = 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 2) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(7_000, estimator.apply(sample)) + } + + @Test + fun testEmpty() { + val sample = Sample(listOf(), Kind.EMPTY, 0L) + val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 2) + val estimator = sharedState.jdbcFetchSizeEstimator() + Assertions.assertEquals(sharedState.defaultFetchSize, estimator.apply(sample)) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt new file mode 100644 index 0000000000000..61f61da5d2687 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultRowByteSizeEstimatorTest.kt @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.util.Jsons +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultRowByteSizeEstimatorTest { + + fun estimate(jsonRecord: String): Int = + sharedState().rowByteSizeEstimator().apply(Jsons.readTree(jsonRecord) as ObjectNode).toInt() + + @Test + fun testZero() { + Assertions.assertEquals(18, estimate("""{}""")) + } + + @Test + fun testOne() { + Assertions.assertEquals(34, estimate("""{"one":1}""")) + } + + @Test + fun testTwo() { + Assertions.assertEquals(51, estimate("""{"one":1,"two":2}""")) + } + + @Test + fun testThree() { + Assertions.assertEquals(68, estimate("""{"one":1,"two":2,"three":3}""")) + } + + @Test + fun testFour() { + Assertions.assertEquals(90, estimate("""{"one":1,"two":2,"three":3,"four":"four"}""")) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt new file mode 100644 index 0000000000000..19b392bd39aac --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.TestClockFactory +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.output.BufferingOutputConsumer +import io.airbyte.cdk.ssh.SshConnectionOptions +import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration +import io.airbyte.cdk.util.Jsons +import java.time.Duration +import org.junit.jupiter.api.Assertions + +object TestFixtures { + + fun sharedState( + global: Boolean = false, + checkpointTargetInterval: Duration = Duration.ofMinutes(1), + maxConcurrency: Int = 10, + withSampling: Boolean = false, + maxSampleSize: Int = DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + expectedThroughputBytesPerSecond: Long = DefaultJdbcSharedState.THROUGHPUT_BYTES_PER_SECOND, + minFetchSize: Int = DefaultJdbcSharedState.FETCH_SIZE_LOWER_BOUND, + defaultFetchSize: Int = DefaultJdbcSharedState.DEFAULT_FETCH_SIZE, + maxFetchSize: Int = DefaultJdbcSharedState.FETCH_SIZE_UPPER_BOUND, + memoryCapacityRatio: Double = DefaultJdbcSharedState.MEM_CAPACITY_RATIO, + estimatedRecordOverheadBytes: Long = DefaultJdbcSharedState.RECORD_OVERHEAD_BYTES, + estimatedFieldOverheadBytes: Long = DefaultJdbcSharedState.FIELD_OVERHEAD_BYTES, + maxMemoryBytesForTesting: Long = 1_000_000, + vararg mockedQueries: MockedQuery, + ) = + DefaultJdbcSharedState( + StubbedJdbcSourceConfiguration(global, checkpointTargetInterval, maxConcurrency), + BufferingOutputConsumer(TestClockFactory().fixed()), + MockSelectQuerier(ArrayDeque(mockedQueries.toList())), + withSampling, + maxSampleSize, + expectedThroughputBytesPerSecond, + minFetchSize, + defaultFetchSize, + maxFetchSize, + memoryCapacityRatio, + estimatedRecordOverheadBytes, + estimatedFieldOverheadBytes, + maxMemoryBytesForTesting, + ) + + fun SelectQuery.assertQueryEquals(expected: SelectQuerySpec) { + Assertions.assertEquals(expected.toString(), this.sql) + } + + fun JsonNode.assertJsonEquals(expected: String) { + Assertions.assertEquals(expected, this.toString()) + } + + fun JsonNode.assertJsonEquals(expected: JsonNode) { + Assertions.assertEquals(expected.toString(), this.toString()) + } + + class StubbedJdbcSourceConfiguration( + override val global: Boolean, + override val checkpointTargetInterval: Duration, + override val maxConcurrency: Int, + ) : JdbcSourceConfiguration { + override val realHost: String + get() = TODO("Not yet implemented") + override val jdbcUrlFmt: String + get() = TODO("Not yet implemented") + override val jdbcProperties: Map + get() = TODO("Not yet implemented") + override val schemas: Set + get() = TODO("Not yet implemented") + override val realPort: Int + get() = TODO("Not yet implemented") + override val sshTunnel: SshTunnelMethodConfiguration + get() = TODO("Not yet implemented") + override val sshConnectionOptions: SshConnectionOptions + get() = TODO("Not yet implemented") + override val resourceAcquisitionHeartbeat: Duration + get() = TODO("Not yet implemented") + } + + class MockSelectQuerier(val mockedQueries: ArrayDeque) : SelectQuerier { + + override fun executeQuery( + q: SelectQuery, + parameters: SelectQuerier.Parameters + ): SelectQuerier.Result { + val mockedQuery: MockedQuery? 
= mockedQueries.removeFirstOrNull() + Assertions.assertNotNull(mockedQuery, q.sql) + Assertions.assertEquals(q.sql, mockedQuery!!.expectedQuerySpec.toString()) + Assertions.assertEquals(parameters, mockedQuery.expectedParameters, q.sql) + return object : SelectQuerier.Result { + val wrapped: Iterator = mockedQuery.results.iterator() + override fun hasNext(): Boolean = wrapped.hasNext() + override fun next(): ObjectNode = wrapped.next() + override fun close() {} + } + } + } + + data class MockedQuery( + val expectedQuerySpec: SelectQuerySpec, + val expectedParameters: SelectQuerier.Parameters, + val results: List + ) { + constructor( + expectedQuerySpec: SelectQuerySpec, + expectedParameters: SelectQuerier.Parameters, + vararg rows: String, + ) : this( + expectedQuerySpec, + expectedParameters, + rows.map { Jsons.readTree(it) as ObjectNode }, + ) + } +} From f0fbc3640a8b9b132c2798da0cbc9a3d36ece93f Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:49 -0400 Subject: [PATCH 07/11] extract-jdbc: refactor PartitionReader and PartitionsCreator impls --- .../FakeSourcePartitionsCreatorFactory.kt | 19 +- .../read/StreamPartitionsCreatorUtilsTest.kt | 177 ------------ .../airbyte/cdk/read/CheckpointStreamState.kt | 44 +-- .../airbyte/cdk/read/StreamPartitionReader.kt | 223 ++++++++------ .../cdk/read/StreamPartitionsCreator.kt | 271 ++++++++++++++++-- .../cdk/read/StreamPartitionsCreatorUtils.kt | 193 ------------- ...Context.kt => StreamReadContextManager.kt} | 43 +-- 7 files changed, 415 insertions(+), 555 deletions(-) delete mode 100644 airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt delete mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt rename airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/{StreamReadContext.kt => StreamReadContextManager.kt} (53%) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt index e8b4466d10d63..dd44ae464b1b0 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt @@ -5,14 +5,13 @@ import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.read.CreateNoPartitions import io.airbyte.cdk.read.Feed import io.airbyte.cdk.read.Global +import io.airbyte.cdk.read.JdbcStreamState import io.airbyte.cdk.read.PartitionsCreator import io.airbyte.cdk.read.PartitionsCreatorFactory import io.airbyte.cdk.read.StateQuerier import io.airbyte.cdk.read.Stream -import io.airbyte.cdk.read.StreamPartitionReader -import io.airbyte.cdk.read.StreamPartitionsCreator -import io.airbyte.cdk.read.StreamReadContext import io.airbyte.cdk.read.StreamReadContextManager +import io.airbyte.cdk.read.StreamSequentialPartitionsCreator import io.airbyte.cdk.read.streamPartitionsCreatorInput import jakarta.inject.Singleton @@ -28,12 +27,14 @@ class FakeSourcePartitionsCreatorFactory( return when (feed) { is Global -> CreateNoPartitions is Stream -> { - val ctx: StreamReadContext = streamReadContextManager[feed] - StreamPartitionsCreator( - ctx, - opaqueStateValue.streamPartitionsCreatorInput(ctx), - StreamPartitionsCreator.Parameters(preferParallelized = false), - 
StreamPartitionReader.Parameters(preferResumable = false), + val streamState: JdbcStreamState<*> = streamReadContextManager[feed] + StreamSequentialPartitionsCreator( + streamReadContextManager.selectQueryGenerator, + streamState, + opaqueStateValue.streamPartitionsCreatorInput( + streamReadContextManager.handler, + streamState, + ) ) } } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt deleted file mode 100644 index 5add648d116a8..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtilsTest.kt +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.TestClockFactory -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.fakesource.FakeSourceConfiguration -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject -import io.airbyte.cdk.fakesource.FakeSourceOperations -import io.airbyte.cdk.h2.H2TestFixture -import io.airbyte.cdk.jdbc.IntFieldType -import io.airbyte.cdk.jdbc.JdbcConnectionFactory -import io.airbyte.cdk.jdbc.StringFieldType -import io.airbyte.cdk.output.BufferingCatalogValidationFailureHandler -import io.airbyte.cdk.output.BufferingOutputConsumer -import io.airbyte.cdk.read.Sample.Kind -import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.SyncMode -import org.junit.jupiter.api.Assertions -import org.junit.jupiter.api.Test - -class StreamPartitionsCreatorUtilsTest { - val h2 = H2TestFixture() - - init { - h2.execute( - """CREATE TABLE kv ( - |k INT PRIMARY KEY, - |v VARCHAR(60)) - | - """ - .trimMargin() - .replace('\n', ' '), - ) - h2.execute( - "INSERT INTO kv (k, v) " + - "VALUES (1, 'foo'), (2, 'bar'), (3, NULL), (4, 'baz'), (5, 'quux');", - ) - } - - val k = Field("k", IntFieldType) - val v = Field("v", StringFieldType) - - val stream = - Stream( - name = "kv", - namespace = "public", - fields = listOf(k, v), - configuredSyncMode = SyncMode.FULL_REFRESH, - configuredPrimaryKey = listOf(k), - configuredCursor = null, - ) - - val querySpec = - SelectQuerySpec( - SelectColumns(listOf(k)), - From("kv", "public"), - orderBy = OrderBy(listOf(k)), - ) - - val testParameters = - StreamPartitionsCreator.Parameters( - preferParallelized = true, - tableSampleSize = 2, - throughputBytesPerSecond = 10L, - ) - - @Test - fun testCollectSample() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.SMALL, sample.kind) - } - - @Test - fun testCollectTinySample() { - val utils: StreamPartitionsCreatorUtils = - createUtils(testParameters.copy(tableSampleSize = 100)) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.TINY, sample.kind) - } - - @Test - fun testCollectEmptySample() { - h2.execute("TRUNCATE TABLE kv") - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.EMPTY, sample.kind) - } - - @Test - fun testCollectSampleInLargeTable() { - h2.execute("INSERT INTO kv(k, v) SELECT X, NULL FROM SYSTEM_RANGE(6, 100000)") - val utils: StreamPartitionsCreatorUtils = - createUtils(testParameters.copy(tableSampleSize 
= 100)) - val sample = utils.collectSample(querySpec) {} - Assertions.assertEquals(Kind.SMALL, sample.kind) - } - - @Test - fun testCursorUpperBound() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - utils.computeCursorUpperBound(k) - Assertions.assertEquals( - "5", - utils.ctx.streamState.cursorUpperBound?.toString(), - ) - } - - @Test - fun testSplitPrimaryKey() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val input = - StreamPartitionReader.SnapshotInput( - primaryKey = listOf(k), - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - ) - val splits: List?, List?>> = - utils.split(input, input.primaryKeyLowerBound, input.primaryKeyUpperBound) - val actual: String = splits.joinToString { (l, r) -> "]${l?.first()}, ${r?.first()}]" } - Assertions.assertEquals("]null, 1], ]1, 2], ]2, null]", actual) - } - - @Test - fun testSplitCursor() { - val utils: StreamPartitionsCreatorUtils = createUtils(testParameters) - val input = - StreamPartitionReader.CursorIncrementalInput( - cursor = k, - cursorLowerBound = Jsons.numberNode(1), - isLowerBoundIncluded = false, - cursorUpperBound = Jsons.numberNode(4), - ) - val splits: List?, List?>> = - utils.split(input, listOf(input.cursorLowerBound), listOf(input.cursorUpperBound)) - val actual: String = splits.joinToString { (l, r) -> "]${l?.first()}, ${r?.first()}]" } - Assertions.assertEquals("]1, 2], ]2, 4]", actual) - } - - private fun createUtils( - params: StreamPartitionsCreator.Parameters, - ): StreamPartitionsCreatorUtils { - val configPojo: FakeSourceConfigurationJsonObject = - FakeSourceConfigurationJsonObject().apply { - port = h2.port - database = h2.database - timeout = "PT1S" - } - val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) - val sharedState = - DefaultJdbcSharedState( - config, - BufferingOutputConsumer(TestClockFactory().fixed()), - JdbcSelectQuerier(JdbcConnectionFactory(config)), - withSampling = true, - maxSampleSize = 1024, - expectedThroughputBytesPerSecond = 10 * 1024 * 1024, - minFetchSize = 10, - defaultFetchSize = 1_000, - maxFetchSize = 10_000_000, - memoryCapacityRatio = 0.6, - estimatedRecordOverheadBytes = 16, - estimatedFieldOverheadBytes = 16, - ) - val ctxManager = - StreamReadContextManager( - sharedState, - BufferingCatalogValidationFailureHandler(), - FakeSourceOperations(), - ) - val ctx = ctxManager[stream] - ctx.resetStream() - return StreamPartitionsCreatorUtils(ctx, params) - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt index f03700f63c81a..db8bbfab94a78 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt @@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.JsonNode import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.discover.Field import io.airbyte.cdk.discover.FieldOrMetaField +import io.airbyte.cdk.output.CatalogValidationFailureHandler import io.airbyte.cdk.output.InvalidCursor import io.airbyte.cdk.output.InvalidPrimaryKey import io.airbyte.cdk.util.Jsons @@ -55,28 +56,33 @@ fun CheckpointStreamState.opaqueStateValue(): OpaqueStateValue = /** * Deserializes a nullable [OpaqueStateValue] into a nullable [CheckpointStreamState] based on the - * 
current [StreamReadContext], which contains the configuration and the catalog. + * current [JdbcStreamState], which contains the configuration and the catalog. */ -fun OpaqueStateValue?.checkpoint(ctx: StreamReadContext): CheckpointStreamState? = +fun OpaqueStateValue?.checkpoint( + handler: CatalogValidationFailureHandler, + streamState: JdbcStreamState<*>, +): CheckpointStreamState? = if (this == null) { null } else { - Jsons.treeToValue(this, DefaultJdbcStreamStateValue::class.java).checkpoint(ctx) + Jsons.treeToValue(this, DefaultJdbcStreamStateValue::class.java) + .checkpoint(handler, streamState) } -private fun DefaultJdbcStreamStateValue.checkpoint(ctx: StreamReadContext): CheckpointStreamState? { +private fun DefaultJdbcStreamStateValue.checkpoint( + handler: CatalogValidationFailureHandler, + streamState: JdbcStreamState<*>, +): CheckpointStreamState? { + val sharedState: JdbcSharedState = streamState.sharedState + val stream: Stream = streamState.stream val pkMap: Map = run { if (primaryKey.isEmpty()) { return@run mapOf() } - val pk: List = ctx.stream.configuredPrimaryKey ?: listOf() + val pk: List = stream.configuredPrimaryKey ?: listOf() if (primaryKey.keys != pk.map { it.id }.toSet()) { - ctx.handler.accept( - InvalidPrimaryKey( - ctx.stream.name, - ctx.stream.namespace, - primaryKey.keys.toList(), - ), + handler.accept( + InvalidPrimaryKey(stream.name, stream.namespace, primaryKey.keys.toList()), ) return null } @@ -87,23 +93,27 @@ private fun DefaultJdbcStreamStateValue.checkpoint(ctx: StreamReadContext): Chec return@run null } if (cursors.size > 1) { - ctx.handler.accept( - InvalidCursor(ctx.stream.name, ctx.stream.namespace, cursors.keys.toString()), + handler.accept( + InvalidCursor( + streamState.stream.name, + streamState.stream.namespace, + cursors.keys.toString() + ), ) return null } val cursorLabel: String = cursors.keys.first() - val cursor: FieldOrMetaField? = ctx.stream.fields.find { it.id == cursorLabel } + val cursor: FieldOrMetaField? = stream.fields.find { it.id == cursorLabel } if (cursor !is Field) { - ctx.handler.accept( - InvalidCursor(ctx.stream.name, ctx.stream.namespace, cursorLabel), + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursorLabel), ) return null } cursor to cursors[cursorLabel]!! 
} val isCursorBasedIncremental: Boolean = - ctx.stream.configuredSyncMode == SyncMode.INCREMENTAL && !ctx.configuration.global + stream.configuredSyncMode == SyncMode.INCREMENTAL && !sharedState.configuration.global return if (cursorPair == null) { if (isCursorBasedIncremental) { diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt index 77a776ff94cc4..4203d59f2fc1a 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt @@ -3,8 +3,9 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.read.StreamPartitionsCreator.AcquiredResources +import io.airbyte.cdk.output.OutputConsumer import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.AirbyteRecordMessage import java.util.concurrent.atomic.AtomicBoolean @@ -13,19 +14,28 @@ import java.util.concurrent.atomic.AtomicReference import kotlin.coroutines.coroutineContext import kotlinx.coroutines.ensureActive -/** Default implementation of [PartitionReader] for streams in JDBC sources. */ -class StreamPartitionReader( - val ctx: StreamReadContext, +/** Base class for JDBC implementations of [PartitionReader]. */ +sealed class StreamPartitionReader( + val streamState: JdbcStreamState<*>, val input: Input, - val parameters: Parameters, ) : PartitionReader { - sealed interface Input + val stream: Stream = streamState.stream + val sharedState: JdbcSharedState = streamState.sharedState + val outputConsumer: OutputConsumer = sharedState.outputConsumer + val selectQuerier: SelectQuerier = sharedState.selectQuerier + + sealed interface Input { + val resumable: Boolean + } data class SnapshotInput( val primaryKey: List, val primaryKeyLowerBound: List?, val primaryKeyUpperBound: List?, - ) : Input + ) : Input { + override val resumable: Boolean + get() = primaryKey.isNotEmpty() + } data class SnapshotWithCursorInput( val primaryKey: List, @@ -33,132 +43,161 @@ class StreamPartitionReader( val primaryKeyUpperBound: List?, val cursor: Field, val cursorUpperBound: JsonNode, - ) : Input + ) : Input { + override val resumable: Boolean + get() = primaryKey.isNotEmpty() + } data class CursorIncrementalInput( val cursor: Field, val cursorLowerBound: JsonNode, val isLowerBoundIncluded: Boolean, val cursorUpperBound: JsonNode, - ) : Input + ) : Input { + override val resumable: Boolean + get() = true + } - data class Parameters( - val preferResumable: Boolean, - ) + private val acquiredResources = AtomicReference() fun interface AcquiredResources : AutoCloseable - val acquiredResources = AtomicReference(null) - override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus { val acquiredResources: AcquiredResources = - ctx.sharedState.tryAcquireResourcesForReader() + sharedState.tryAcquireResourcesForReader() ?: return PartitionReader.TryAcquireResourcesStatus.RETRY_LATER this.acquiredResources.set(acquiredResources) return PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN } + fun out(record: ObjectNode) { + val recordMessageData: ObjectNode = Jsons.objectNode() + for (fieldName in streamFieldNames) { + 
recordMessageData.set(fieldName, record[fieldName] ?: Jsons.nullNode()) + } + outputConsumer.accept( + AirbyteRecordMessage() + .withStream(stream.name) + .withNamespace(stream.namespace) + .withData(recordMessageData), + ) + } + + val streamFieldNames: List = stream.fields.map { it.id } + override fun releaseResources() { acquiredResources.getAndSet(null)?.close() } +} + +/** JDBC implementation of [PartitionReader] which reads the [input] in its entirety. */ +class StreamNonResumablePartitionReader( + val selectQueryGenerator: SelectQueryGenerator, + streamState: JdbcStreamState<*>, + input: Input, +) : StreamPartitionReader(streamState, input) { + + val runComplete = AtomicBoolean(false) + val numRecords = AtomicLong() - val resumable: Boolean = - parameters.preferResumable && - when (input) { - is SnapshotInput -> input.primaryKey.isNotEmpty() - is SnapshotWithCursorInput -> input.primaryKey.isNotEmpty() - is CursorIncrementalInput -> true + override suspend fun run() { + val querySpec: SelectQuerySpec = + input.querySpec( + stream, + isOrdered = false, + limit = null, + ) + val query: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) + selectQuerier + .executeQuery( + q = query, + parameters = SelectQuerier.Parameters(streamState.fetchSize), + ) + .use { result: SelectQuerier.Result -> + for (record in result) { + out(record) + numRecords.incrementAndGet() + } } + runComplete.set(true) + } - val incumbentTransientState = AtomicReference() + override fun checkpoint(): PartitionReadCheckpoint { + // Sanity check. + if (!runComplete.get()) throw RuntimeException("cannot checkpoint non-resumable read") + // The run method executed to completion without a LIMIT clause. + // This implies that the partition boundary has been reached. + return PartitionReadCheckpoint(input.checkpoint().opaqueStateValue(), numRecords.get()) + } +} + +/** + * JDBC implementation of [PartitionReader] which reads as much as possible of the [input], in + * order, before timing out. + */ +class StreamResumablePartitionReader( + val selectQueryGenerator: SelectQueryGenerator, + streamState: JdbcStreamState<*>, + input: Input, +) : StreamPartitionReader(streamState, input) { + + val incumbentLimit = AtomicLong() val numRecords = AtomicLong() val lastRecord = AtomicReference(null) val runComplete = AtomicBoolean(false) override suspend fun run() { - // Store the transient state at the start of the run for use in checkpoint(). - val transientState = - TransientState(ctx.streamState.limit, ctx.streamState.fetchSizeOrDefault) - incumbentTransientState.set(transientState) - // Build the query. + val fetchSize: Int = streamState.fetchSizeOrDefault + val limit: Long = streamState.limit + incumbentLimit.set(limit) val querySpec: SelectQuerySpec = input.querySpec( - ctx.stream, - isOrdered = resumable, - limit = transientState.limit.takeIf { resumable }, + stream, + isOrdered = true, + limit = limit, ) - val query: SelectQuery = ctx.selectQueryGenerator.generate(querySpec.optimize()) - val streamFieldNames: List = ctx.stream.fields.map { it.id } - val querierParameters = SelectQuerier.Parameters(fetchSize = transientState.fetchSize) - // Execute the query. 
- ctx.selectQuerier.executeQuery(query, querierParameters).use { result: SelectQuerier.Result - -> - for (record in result) { - val dataRecord: JsonNode = - Jsons.objectNode().apply { - for (fieldName in streamFieldNames) { - set(fieldName, record[fieldName] ?: Jsons.nullNode()) - } + val query: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) + selectQuerier + .executeQuery( + q = query, + parameters = SelectQuerier.Parameters(fetchSize), + ) + .use { result: SelectQuerier.Result -> + for (record in result) { + out(record) + lastRecord.set(record) + // Check activity periodically to handle timeout. + if (numRecords.incrementAndGet() % fetchSize == 0L) { + coroutineContext.ensureActive() } - ctx.outputConsumer.accept( - AirbyteRecordMessage() - .withStream(ctx.stream.name) - .withNamespace(ctx.stream.namespace) - .withData(dataRecord), - ) - lastRecord.set(record) - numRecords.incrementAndGet() - // If progress can be checkpointed at any time, - // check activity periodically to handle timeout. - if (!resumable) continue - if (numRecords.get() % transientState.fetchSize != 0L) continue - coroutineContext.ensureActive() + } } - } runComplete.set(true) } override fun checkpoint(): PartitionReadCheckpoint { - val checkpointState: CheckpointStreamState - val transientState: TransientState = incumbentTransientState.get() - if (!runComplete.get()) { - // Sanity check. - if (!resumable) throw RuntimeException("cannot checkpoint non-resumable read") - // The run method execution was interrupted. - checkpointState = input.checkpoint(lastRecord.get()) - // Decrease the limit clause for the next PartitionReader, because it's too big. - // If it had been smaller then run might have completed in time. - if (transientState.limit <= ctx.streamState.limit) { - ctx.streamState.updateLimitState { it.down } + if (runComplete.get() && numRecords.get() < streamState.limit) { + // The run method executed to completion with a LIMIT clause which was not reached. + return PartitionReadCheckpoint(input.checkpoint().opaqueStateValue(), numRecords.get()) + } + // The run method ended because of either the LIMIT or the timeout. + // Adjust the LIMIT value so that it grows or shrinks to try to fit the timeout. + if (incumbentLimit.get() > 0L) { + if (runComplete.get() && streamState.limit <= incumbentLimit.get()) { + // Increase the limit clause for the next PartitionReader, because it's too small. + // If it had been bigger then run might have executed for longer. + streamState.updateLimitState { it.up } } - } else if (resumable) { - // The run method executed to completion with a LIMIT clause. - // The partition boundary may or may not have been reached. - // If the number of records read is less than the LIMIT clause, - // then it certainly has. - checkpointState = - if (numRecords.get() < transientState.limit) { - input.checkpoint() - } else { - input.checkpoint(lastRecord.get()) - } - // Increase the limit clause for the next PartitionReader, because it's too small. - // If it had been bigger then run might have executed for longer. - if (ctx.streamState.limit <= transientState.limit) { - ctx.streamState.updateLimitState { it.up } + if (!runComplete.get() && incumbentLimit.get() <= streamState.limit) { + // Decrease the limit clause for the next PartitionReader, because it's too big. + // If it had been smaller then run might have completed in time. + streamState.updateLimitState { it.down } } - } else { - // The run method executed to completion without a LIMIT clause. 
- // This implies that the partition boundary has been reached. - checkpointState = input.checkpoint() } - return PartitionReadCheckpoint(checkpointState.opaqueStateValue(), numRecords.get()) + val checkpointState: OpaqueStateValue = + input.checkpoint(lastRecord.get()!!).opaqueStateValue() + return PartitionReadCheckpoint(checkpointState, numRecords.get()) } - - inner class TransientState( - val limit: Long, - val fetchSize: Int, - ) } /** Converts a [StreamPartitionReader.Input] into a [SelectQuerySpec]. */ diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt index f3cbbe6502b9c..e3635baac2617 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt @@ -2,20 +2,33 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode import io.airbyte.cdk.ConfigErrorException +import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.read.StreamPartitionReader.AcquiredResources +import io.airbyte.cdk.output.CatalogValidationFailureHandler +import io.airbyte.cdk.output.OutputConsumer +import io.airbyte.cdk.output.ResetStream +import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.SyncMode +import io.github.oshai.kotlinlogging.KotlinLogging import java.util.concurrent.atomic.AtomicReference +import kotlin.random.Random -/** Default implementation of [PartitionsCreator] for streams in JDBC sources. */ -class StreamPartitionsCreator( - val ctx: StreamReadContext, +/** Base class for JDBC implementations of [PartitionsCreator]. */ +sealed class StreamPartitionsCreator( + val selectQueryGenerator: SelectQueryGenerator, + val streamState: JdbcStreamState<*>, val input: Input, - val parameters: Parameters, - val readerParameters: StreamPartitionReader.Parameters, ) : PartitionsCreator { + private val log = KotlinLogging.logger {} + + val stream: Stream = streamState.stream + val sharedState: JdbcSharedState = streamState.sharedState + val outputConsumer: OutputConsumer = sharedState.outputConsumer + val selectQuerier: SelectQuerier = sharedState.selectQuerier + sealed interface Input data object NoStart : Input @@ -58,25 +71,23 @@ class StreamPartitionsCreator( val throughputBytesPerSecond: Long = 10L * 1024L * 1024L, ) - val acquiredResources = AtomicReference(null) + private val acquiredResources = AtomicReference() + + /** Calling [close] releases the resources acquired for the [StreamPartitionsCreator]. */ fun interface AcquiredResources : AutoCloseable override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus { - // Running this PartitionsCreator may not always involve JDBC queries. - // In those cases, the semaphore will be released very soon after, so this is OK. 
val acquiredResources: AcquiredResources = - ctx.sharedState.tryAcquireResourcesForCreator() + sharedState.tryAcquireResourcesForCreator() ?: return PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER this.acquiredResources.set(acquiredResources) return PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN } - override fun releaseResources() { - acquiredResources.getAndSet(null)?.close() - } - override suspend fun run(): List = - input.partitionReaderInputs().map { StreamPartitionReader(ctx, it, readerParameters) } + input.partitionReaderInputs().map { createReader(it) } + + abstract fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader fun Input.partitionReaderInputs(): List { return when (this) { @@ -94,7 +105,7 @@ class StreamPartitionsCreator( primaryKeyLowerBound = null, primaryKeyUpperBound = null, cursor = cursor, - cursorUpperBound = utils.computeCursorUpperBound(cursor) ?: return listOf(), + cursorUpperBound = ensureCursorUpperBound(cursor) ?: return listOf(), ) .split() is CursorIncrementalColdStart -> @@ -102,7 +113,7 @@ class StreamPartitionsCreator( cursor = cursor, cursorLowerBound = cursorLowerBound, isLowerBoundIncluded = true, - cursorUpperBound = utils.computeCursorUpperBound(cursor) ?: return listOf(), + cursorUpperBound = ensureCursorUpperBound(cursor) ?: return listOf(), ) .split() is SnapshotWarmStart -> @@ -133,19 +144,19 @@ class StreamPartitionsCreator( } fun StreamPartitionReader.SnapshotInput.split(): List = - utils.split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> + split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) } fun StreamPartitionReader.SnapshotWithCursorInput.split(): List = - utils.split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> + split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) } fun StreamPartitionReader.CursorIncrementalInput.split(): List = - utils.split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).mapIndexed { + split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).mapIndexed { idx: Int, (lb, ub) -> copy( @@ -155,31 +166,231 @@ class StreamPartitionsCreator( ) } - private val utils = StreamPartitionsCreatorUtils(ctx, parameters) + abstract fun split( + input: StreamPartitionReader.Input, + globalLowerBound: List?, + globalUpperBound: List?, + ): List?, List?>> + + override fun releaseResources() { + acquiredResources.getAndSet(null)?.close() + } + + fun ensureCursorUpperBound(cursor: Field): JsonNode? { + if (streamState.cursorUpperBound != null) { + return streamState.cursorUpperBound + } + val querySpec = + SelectQuerySpec( + SelectColumnMaxValue(cursor), + From(stream.name, stream.namespace), + ) + val cursorUpperBoundQuery: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) + log.info { "Querying maximum cursor column value." } + val record: ObjectNode? = + selectQuerier.executeQuery(cursorUpperBoundQuery).use { + if (it.hasNext()) it.next() else null + } + val cursorUpperBound: JsonNode? = record?.fields()?.asSequence()?.firstOrNull()?.value + if (cursorUpperBound == null) { + streamState.cursorUpperBound = Jsons.nullNode() + log.warn { "No cursor column value found in '${stream.label}'." } + return null + } + streamState.cursorUpperBound = cursorUpperBound + if (cursorUpperBound.isNull) { + log.warn { "Maximum cursor column value in '${stream.label}' is NULL." 
}
+            return null
+        }
+        log.info { "Maximum cursor column value in '${stream.label}' is '$cursorUpperBound'." }
+        return cursorUpperBound
+    }
+
+    /** Collects a sample of rows in the unsplit partition. */
+    fun <T> collectSample(
+        querySpec: SelectQuerySpec,
+        recordMapper: (ObjectNode) -> T,
+    ): Sample<T> {
+        val values = mutableListOf<T>()
+        var previousWeight = 0L
+        for (sampleRateInvPow2 in listOf(16, 8, 0)) {
+            val sampleRateInv: Long = 1L shl sampleRateInvPow2
+            log.info { "Sampling stream '${stream.label}' at rate 1 / $sampleRateInv." }
+            // First, try sampling the table at a rate of one every 2^16 = 65_536 rows.
+            // If that's not enough to produce the desired number of sampled rows (1024 by
+            // default), then try sampling at a higher rate of one every 2^8 = 256 rows.
+            // If that's still not enough, don't sample at all.
+            values.clear()
+            val fromSample =
+                FromSample(
+                    stream.name,
+                    stream.namespace,
+                    sampleRateInvPow2,
+                    sharedState.maxSampleSize,
+                )
+            val sampledQuerySpec: SelectQuerySpec = querySpec.copy(from = fromSample)
+            val samplingQuery: SelectQuery =
+                selectQueryGenerator.generate(sampledQuerySpec.optimize())
+            selectQuerier.executeQuery(samplingQuery).use {
+                for (record in it) {
+                    values.add(recordMapper(record))
+                }
+            }
+            if (values.size < sharedState.maxSampleSize) {
+                previousWeight = sampleRateInv * values.size / sharedState.maxSampleSize
+                continue
+            }
+            val kind: Sample.Kind =
+                when (sampleRateInvPow2) {
+                    16 -> Sample.Kind.LARGE
+                    8 -> Sample.Kind.MEDIUM
+                    else -> Sample.Kind.SMALL
+                }
+            log.info { "Sampled ${values.size} rows in ${kind.name} stream '${stream.label}'." }
+            return Sample(values, kind, previousWeight.coerceAtLeast(sampleRateInv))
+        }
+        val kind: Sample.Kind = if (values.isEmpty()) Sample.Kind.EMPTY else Sample.Kind.TINY
+        log.info { "Sampled ${values.size} rows in ${kind.name} stream '${stream.label}'." }
+        return Sample(values, kind, if (values.isEmpty()) 0L else 1L)
+    }
+}
+
+/** Sequential JDBC implementation of [PartitionsCreator]. */
+class StreamSequentialPartitionsCreator(
+    selectQueryGenerator: SelectQueryGenerator,
+    streamState: JdbcStreamState<*>,
+    input: Input,
+) : StreamPartitionsCreator(selectQueryGenerator, streamState, input) {
+    private val log = KotlinLogging.logger {}
+
+    override fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader {
+        // Handle edge case where the partition cannot be split.
+        if (!input.resumable) {
+            log.warn {
+                "Table cannot be read by sequential partition reader because it cannot be split."
+            }
+            return StreamNonResumablePartitionReader(selectQueryGenerator, streamState, input)
+        }
+        // Happy path.
+        log.info { "Table will be read by sequential partition reader(s)." }
+        return StreamResumablePartitionReader(selectQueryGenerator, streamState, input)
+    }
+
+    override fun split(
+        input: StreamPartitionReader.Input,
+        globalLowerBound: List<JsonNode>?,
+        globalUpperBound: List<JsonNode>?
+    ): List<Pair<List<JsonNode>?, List<JsonNode>?>> {
+        return listOf(globalLowerBound to globalUpperBound)
+    }
+}
+
+/**
+ * Concurrent JDBC implementation of [PartitionsCreator].
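+ *
+ * Split boundaries are drawn from the row sample collected by [collectSample]. As an
+ * illustration with hypothetical numbers: for a table whose memory size is estimated at
+ * 1 GiB, a 100 MiB target partition size, and a full sample of 1024 rows, the expected
+ * partition size per sampled row is 1 GiB / 1024 = 1 MiB, so each sampled row is kept as
+ * a split boundary with probability 1 MiB / 100 MiB, i.e. about one in a hundred,
+ * yielding roughly ten concurrent partition readers.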
*/ +class StreamConcurrentPartitionsCreator( + selectQueryGenerator: SelectQueryGenerator, + streamState: JdbcStreamState<*>, + input: Input, +) : StreamPartitionsCreator(selectQueryGenerator, streamState, input) { + private val log = KotlinLogging.logger {} + + override fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader = + StreamNonResumablePartitionReader(selectQueryGenerator, streamState, input) + + override fun split( + input: StreamPartitionReader.Input, + globalLowerBound: List?, + globalUpperBound: List? + ): List?, List?>> { + // Handle edge case where the table can't be sampled. + if (!sharedState.withSampling) { + log.warn { + "Table cannot be read by concurrent partition readers because it cannot be sampled." + } + // TODO: adaptive fetchSize computation? + return listOf(globalLowerBound to globalUpperBound) + } + // Sample the table for partition split boundaries and for record byte sizes. + val unsplitQuerySpec: SelectQuerySpec = + input.querySpec(stream, isOrdered = true, limit = null) + val checkpointColumns: List = (unsplitQuerySpec.orderBy as OrderBy).columns + val sample: Sample, Long>> = + collectSample(unsplitQuerySpec) { record: ObjectNode -> + val checkpointValues: List = + checkpointColumns.map { record[it.id] ?: Jsons.nullNode() } + val rowByteSize: Long = sharedState.rowByteSizeEstimator().apply(record) + checkpointValues to rowByteSize + } + if (sample.kind == Sample.Kind.EMPTY) { + log.info { "Sampling query found that the table was empty." } + return listOf() + } + val rowByteSizeSample: Sample = sample.map { (_, rowByteSize: Long) -> rowByteSize } + streamState.fetchSize = sharedState.jdbcFetchSizeEstimator().apply(rowByteSizeSample) + val expectedTableByteSize: Long = rowByteSizeSample.sampledValues.sum() * sample.valueWeight + log.info { "Table memory size estimated at ${expectedTableByteSize shr 20} MiB." } + // Handle edge case where the table can't be split. + if (!input.resumable) { + log.warn { + "Table cannot be read by concurrent partition readers because it cannot be split." + } + return listOf(globalLowerBound to globalUpperBound) + } + // Happy path. + log.info { "Target partition size is ${sharedState.targetPartitionByteSize shr 20} MiB." } + val secondarySamplingRate: Double = + if (expectedTableByteSize <= sharedState.targetPartitionByteSize) { + 0.0 + } else { + val expectedPartitionByteSize: Long = + expectedTableByteSize / sharedState.maxSampleSize + if (expectedPartitionByteSize < sharedState.targetPartitionByteSize) { + expectedPartitionByteSize.toDouble() / sharedState.targetPartitionByteSize + } else { + 1.0 + } + } + val random = Random(expectedTableByteSize) // RNG output is repeatable. + val innerSplitBoundaries: List> = + sample.sampledValues + .filter { random.nextDouble() < secondarySamplingRate } + .map { (splitBoundary: List, _) -> splitBoundary } + .distinct() + log.info { + "Table will be read by ${innerSplitBoundaries.size + 1} concurrent partition reader(s)." + } + val lbs: List?> = listOf(globalLowerBound) + innerSplitBoundaries + val ubs: List?> = innerSplitBoundaries + listOf(globalUpperBound) + return lbs.zip(ubs) + } } /** Converts a nullable [OpaqueStateValue] into an input for [StreamPartitionsCreator]. */ fun OpaqueStateValue?.streamPartitionsCreatorInput( - ctx: StreamReadContext, + handler: CatalogValidationFailureHandler, + streamState: JdbcStreamState<*>, ): StreamPartitionsCreator.Input { - val checkpoint: CheckpointStreamState? = checkpoint(ctx) + val checkpoint: CheckpointStreamState? 
= checkpoint(handler, streamState) if (checkpoint == null && this != null) { - ctx.resetStream() + handler.accept(ResetStream(streamState.stream.name, streamState.stream.namespace)) + streamState.reset() } - return checkpoint.streamPartitionsCreatorInput(ctx) + return checkpoint.streamPartitionsCreatorInput(streamState) } /** Converts a nullable [CheckpointStreamState] into an input for [StreamPartitionsCreator]. */ fun CheckpointStreamState?.streamPartitionsCreatorInput( - ctx: StreamReadContext, + streamState: JdbcStreamState<*>, ): StreamPartitionsCreator.Input { + val stream: Stream = streamState.stream + val sharedState: JdbcSharedState = streamState.sharedState + val configuration: JdbcSourceConfiguration = sharedState.configuration if (this == null) { - val pkChosenFromCatalog: List = ctx.stream.configuredPrimaryKey ?: listOf() - if (ctx.stream.configuredSyncMode == SyncMode.FULL_REFRESH || ctx.configuration.global) { + val pkChosenFromCatalog: List = stream.configuredPrimaryKey ?: listOf() + if (stream.configuredSyncMode == SyncMode.FULL_REFRESH || configuration.global) { return StreamPartitionsCreator.SnapshotColdStart(pkChosenFromCatalog) } val cursorChosenFromCatalog: Field = - ctx.stream.configuredCursor as? Field ?: throw ConfigErrorException("no cursor") + stream.configuredCursor as? Field ?: throw ConfigErrorException("no cursor") return StreamPartitionsCreator.SnapshotWithCursorColdStart( pkChosenFromCatalog, cursorChosenFromCatalog, @@ -200,7 +411,7 @@ fun CheckpointStreamState?.streamPartitionsCreatorInput( cursorUpperBound, ) is CursorIncrementalCheckpoint -> - when (val cursorUpperBound: JsonNode? = ctx.streamState.cursorUpperBound) { + when (val cursorUpperBound: JsonNode? = streamState.cursorUpperBound) { null -> StreamPartitionsCreator.CursorIncrementalColdStart( cursor, diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt deleted file mode 100644 index c5b858858b26a..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreatorUtils.kt +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.core.JsonGenerator -import com.fasterxml.jackson.databind.JsonNode -import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.util.Jsons -import io.github.oshai.kotlinlogging.KotlinLogging -import java.io.OutputStream -import kotlin.random.Random - -/** Utilities for [StreamPartitionsCreator] that don't rely directly on its input state. */ -class StreamPartitionsCreatorUtils( - val ctx: StreamReadContext, - val parameters: StreamPartitionsCreator.Parameters, -) { - fun split( - input: StreamPartitionReader.Input, - globalLowerBound: List?, - globalUpperBound: List?, - ): List?, List?>> { - // Collect a sample from the unsplit partition of this table. - // Each sampled row is mapped to the values of the order fields - // and to the approximate byte size in memory of the row. 
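
Both the new JdbcConcurrentPartitionsCreator above and the deleted StreamPartitionsCreatorUtils.split below turn a list of inner split points into contiguous ranges with the same zip idiom. A minimal standalone illustration, with a hypothetical function name and plain Ints in place of JsonNode tuples:

    // Inner boundaries [b1, b2] over the range ]lb, ub] yield the sub-ranges
    // ]lb, b1], ]b1, b2], ]b2, ub]; null stands for an unbounded endpoint.
    fun <T> zipBounds(lb: T?, ub: T?, inners: List<T>): List<Pair<T?, T?>> {
        val lbs: List<T?> = listOf(lb) + inners
        val ubs: List<T?> = inners + listOf(ub)
        return lbs.zip(ubs)
    }

    fun main() {
        println(zipBounds(null, null, listOf(10, 20)))
        // [(null, 10), (10, 20), (20, null)]
    }
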
- val unsplitQuerySpec: SelectQuerySpec = - input.querySpec(ctx.stream, isOrdered = true, limit = null) - val checkpointColumns: List = (unsplitQuerySpec.orderBy as OrderBy).columns - val rowByteSizeEstimator: (ObjectNode) -> Long = rowByteSizeEstimator() - val sample: Sample, Long>> by lazy { - log.info { "Sampling stream '${ctx.stream.label}'" } - collectSample(unsplitQuerySpec) { record: ObjectNode -> - val checkpointValues: List = - checkpointColumns.map { record[it.id] ?: Jsons.nullNode() } - checkpointValues to rowByteSizeEstimator(record) - } - } - // Ensure that the JDBC fetchSize parameter value for this table is set. - // Compute it using the sample. - if (ctx.streamState.fetchSize == null) { - val rowByteSizeSample: Sample = - sample.map { (_, rowByteSize: Long) -> rowByteSize } - val fetchSizeEstimator = ctx.sharedState.jdbcFetchSizeEstimator() - ctx.streamState.fetchSize = fetchSizeEstimator.apply(rowByteSizeSample) - } - // Compute partition split boundaries. - // First, check if splitting can or should be done, and exit if that isn't the case. - if (checkpointColumns.isEmpty() || !parameters.preferParallelized) { - log.info { - "not attempting to create more than one partition for '${ctx.stream.label}'" - } - return listOf(globalLowerBound to globalUpperBound) - } - // At this point, try to split the partition defined by - // ]globalLowerBound, globalUpperBound]. Each of these splits should be processed within the - // targeted amount of time defined in the configuration. This estimate is very imprecise: - // the sampling is almost certainly going to be biased, the throughput is wildly dependent - // on many uncontrollable factors, etc. - val splitBoundaries: List> = computeSplitBoundaries(sample) - if (splitBoundaries.isEmpty()) { - log.info { "creating one partition for remaining data in '${ctx.stream.label}" } - } else { - log.info { - "split remaining data in '${ctx.stream.label} " + - "into ${splitBoundaries.size + 1} partitions" - } - } - val lbs: List?> = listOf(globalLowerBound) + splitBoundaries - val ubs: List?> = splitBoundaries + listOf(globalUpperBound) - return lbs.zip(ubs) - } - - fun rowByteSizeEstimator(): (ObjectNode) -> Long { - val countingOutputStream = - object : OutputStream() { - var counter: Long = 0L - - override fun write(b: Int) { - counter++ - } - } - val jsonGenerator: JsonGenerator = Jsons.createGenerator(countingOutputStream) - val fieldOverheadEstimate = 16L - return { record: ObjectNode -> - countingOutputStream.counter = 0L - Jsons.writeValue(jsonGenerator, record) - val rowOverheadBytes: Long = - fieldOverheadEstimate * record.fields().asSequence().count() - countingOutputStream.counter + rowOverheadBytes - } - } - - /** Computes the max value for the cursor column, used as an upper bound during this sync. */ - fun computeCursorUpperBound(cursor: Field): JsonNode? { - val querySpec = - SelectQuerySpec( - SelectColumnMaxValue(cursor), - From(ctx.stream.name, ctx.stream.namespace), - ) - val q: SelectQuery = ctx.selectQueryGenerator.generate(querySpec.optimize()) - val record: ObjectNode = - ctx.selectQuerier.executeQuery(q).use { if (it.hasNext()) it.next() else return null } - val value: JsonNode = record[cursor.id] ?: Jsons.nullNode() - if (value.isNull) { - // Either the table is empty, or its cursor column values are all NULL. - // In both cases, there is nothing to be done. - return null - } - ctx.streamState.cursorUpperBound = value - return value - } - - /** Computes the partition split boundaries from the given sample. 
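
The rowByteSizeEstimator deleted here (the concurrent creator now reaches it through sharedState.rowByteSizeEstimator()) measures a record by serializing it into a sink that only counts bytes, then adding a fixed per-field overhead. A self-contained sketch of the same trick, assuming only Jackson:

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.databind.node.ObjectNode
    import java.io.OutputStream

    // Serialize into a byte-counting sink instead of materializing the JSON,
    // then add 16 bytes per field (the same overhead estimate as above) to
    // approximate in-memory object bookkeeping.
    fun rowByteSizeEstimator(mapper: ObjectMapper): (ObjectNode) -> Long {
        val countingSink = object : OutputStream() {
            var counter: Long = 0L
            override fun write(b: Int) { counter++ }
        }
        val generator = mapper.createGenerator(countingSink)
        val fieldOverheadBytes = 16L
        return { record: ObjectNode ->
            countingSink.counter = 0L
            mapper.writeValue(generator, record)
            countingSink.counter + fieldOverheadBytes * record.size()
        }
    }
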
*/ - private fun computeSplitBoundaries( - sample: Sample, Long>>, - ): List> { - val expectedTableByteSize: Long = - sample.sampledValues.sumOf { (_, rowByteSize: Long) -> - rowByteSize * sample.valueWeight - } - log.info { - "remaining data in '${ctx.stream.label}' " + - "is estimated at ${expectedTableByteSize shr 20} MiB" - } - val streamThroughputBytesPerSecond: Long = - parameters.throughputBytesPerSecond / ctx.configuration.maxConcurrency - val targetCheckpointByteSize: Long = - streamThroughputBytesPerSecond * ctx.configuration.checkpointTargetInterval.seconds - log.info { - "target partition size for '${ctx.stream.label}' " + - "is ${targetCheckpointByteSize shr 20} MiB" - } - val secondarySamplingRate: Double = - if (expectedTableByteSize <= targetCheckpointByteSize) { - 0.0 - } else { - val expectedPartitionByteSize: Long = - expectedTableByteSize / parameters.tableSampleSize - if (expectedPartitionByteSize < targetCheckpointByteSize) { - expectedPartitionByteSize.toDouble() / targetCheckpointByteSize - } else { - 1.0 - } - } - val random = Random(expectedTableByteSize) // RNG output is repeatable. - return sample.sampledValues - .filter { random.nextDouble() < secondarySamplingRate } - .map { (splitBoundary: List, _) -> splitBoundary } - } - - /** Collects a sample of rows in the unsplit partition. */ - fun collectSample( - querySpec: SelectQuerySpec, - rowFn: (ObjectNode) -> T, - ): Sample { - val values = mutableListOf() - var previousWeight = 0L - for (sampleRateInvPow2 in listOf(16, 8, 0)) { - // First, try sampling the table at a rate of one every 2^16 = 65_536 rows. - // If that's not enough to produce the desired number of sampled rows (1024 by default) - // then try sampling at a higher rate of one every 2^8 = 256 rows. - // If that's still not enough, don't sample at all. - values.clear() - val fromSample = - FromSample( - ctx.stream.name, - ctx.stream.namespace, - sampleRateInvPow2, - parameters.tableSampleSize, - ) - val sampledQuerySpec: SelectQuerySpec = querySpec.copy(from = fromSample) - val q: SelectQuery = ctx.selectQueryGenerator.generate(sampledQuerySpec.optimize()) - ctx.selectQuerier.executeQuery(q).use { for (record in it) values.add(rowFn(record)) } - if (values.size < parameters.tableSampleSize) { - previousWeight = (fromSample.sampleRateInv * values.size) / fromSample.sampleSize - continue - } - val kind: Sample.Kind = - when (sampleRateInvPow2) { - 16 -> Sample.Kind.LARGE - 8 -> Sample.Kind.MEDIUM - else -> Sample.Kind.SMALL - } - log.info { "sampled ${values.size} rows in ${kind.name} stream ${ctx.stream.label}." } - return Sample(values, kind, previousWeight.coerceAtLeast(fromSample.sampleRateInv)) - } - val kind: Sample.Kind = if (values.isEmpty()) Sample.Kind.EMPTY else Sample.Kind.TINY - log.info { "sampled ${values.size} rows in ${kind.name} stream ${ctx.stream.label}." 
} - return Sample(values, kind, if (values.isEmpty()) 0L else 1L) - } - - private val log = KotlinLogging.logger {} -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt similarity index 53% rename from airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt index 327fccea063dd..888964dc89f71 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContext.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt @@ -4,7 +4,6 @@ package io.airbyte.cdk.read import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.output.CatalogValidationFailureHandler import io.airbyte.cdk.output.OutputConsumer -import io.airbyte.cdk.output.ResetStream import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair import jakarta.inject.Singleton import java.util.concurrent.ConcurrentHashMap @@ -17,9 +16,9 @@ import java.util.concurrent.ConcurrentMap * useful for implementing stream READs for a JDBC source. * * For each stream in the configured catalog, these global singletons are packaged in a - * [StreamReadContext] which bundles them with the corresponding [Stream] as well as a couple - * [TransientState] instances which hold mutable metadata which is _transient_, transient in the - * sense that it is not persisted in an Airbyte STATE message. + * [JdbcStreamState] which bundles them with the corresponding [Stream] as well as mutable metadata + * which is _transient_, transient in the sense that it is not persisted in an Airbyte STATE + * message. 
*/ @Singleton class StreamReadContextManager( @@ -37,41 +36,11 @@ class StreamReadContextManager( val selectQuerier: SelectQuerier get() = sharedState.selectQuerier - private val map: ConcurrentMap = + private val map: ConcurrentMap> = ConcurrentHashMap() - operator fun get(stream: Stream): StreamReadContext = + operator fun get(stream: Stream): JdbcStreamState<*> = map.getOrPut(stream.namePair) { - StreamReadContext( - handler, - selectQueryGenerator, - DefaultJdbcStreamState(sharedState as DefaultJdbcSharedState, stream), - ) + DefaultJdbcStreamState(sharedState as DefaultJdbcSharedState, stream) } } - -class StreamReadContext( - val handler: CatalogValidationFailureHandler, - val selectQueryGenerator: SelectQueryGenerator, - val streamState: JdbcStreamState<*>, -) { - val sharedState: JdbcSharedState - get() = streamState.sharedState - - val stream: Stream - get() = streamState.stream - - val configuration: JdbcSourceConfiguration - get() = sharedState.configuration - - val outputConsumer: OutputConsumer - get() = sharedState.outputConsumer - - val selectQuerier: SelectQuerier - get() = sharedState.selectQuerier - - fun resetStream() { - handler.accept(ResetStream(stream.name, stream.namespace)) - streamState.reset() - } -} From 415260320ac4aeef1428b1e67ba434dc76a035e1 Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:51 -0400 Subject: [PATCH 08/11] extract-jdbc: rename PartitionsCreator and PartitionReader impls --- .../FakeSourcePartitionsCreatorFactory.kt | 4 +- .../cdk/read/DefaultJdbcSharedState.kt | 8 +- ...titionReader.kt => JdbcPartitionReader.kt} | 34 ++++---- ...onsCreator.kt => JdbcPartitionsCreator.kt} | 78 +++++++++---------- .../io/airbyte/cdk/read/JdbcSharedState.kt | 4 +- 5 files changed, 64 insertions(+), 64 deletions(-) rename airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/{StreamPartitionReader.kt => JdbcPartitionReader.kt} (92%) rename airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/{StreamPartitionsCreator.kt => JdbcPartitionsCreator.kt} (86%) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt index dd44ae464b1b0..c55c63bb4787a 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt @@ -5,13 +5,13 @@ import io.airbyte.cdk.command.OpaqueStateValue import io.airbyte.cdk.read.CreateNoPartitions import io.airbyte.cdk.read.Feed import io.airbyte.cdk.read.Global +import io.airbyte.cdk.read.JdbcSequentialPartitionsCreator import io.airbyte.cdk.read.JdbcStreamState import io.airbyte.cdk.read.PartitionsCreator import io.airbyte.cdk.read.PartitionsCreatorFactory import io.airbyte.cdk.read.StateQuerier import io.airbyte.cdk.read.Stream import io.airbyte.cdk.read.StreamReadContextManager -import io.airbyte.cdk.read.StreamSequentialPartitionsCreator import io.airbyte.cdk.read.streamPartitionsCreatorInput import jakarta.inject.Singleton @@ -28,7 +28,7 @@ class FakeSourcePartitionsCreatorFactory( is Global -> CreateNoPartitions is Stream -> { val streamState: JdbcStreamState<*> = streamReadContextManager[feed] - StreamSequentialPartitionsCreator( + JdbcSequentialPartitionsCreator( streamReadContextManager.selectQueryGenerator, streamState, 
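
With StreamReadContext gone, the manager above reduces to a lazily-populated, thread-safe registry of per-stream mutable state. The pattern in isolation, with hypothetical, simplified types:

    import java.util.concurrent.ConcurrentHashMap

    // One shared state object per key, created on first access and then reused
    // by every partition creator and reader for that stream.
    class StreamStateRegistry<K : Any, V : Any>(private val create: (K) -> V) {
        private val map = ConcurrentHashMap<K, V>()
        operator fun get(key: K): V = map.getOrPut(key) { create(key) }
    }

    fun main() {
        val registry = StreamStateRegistry<String, MutableList<Long>> { mutableListOf() }
        registry["PUBLIC.EVENTS"].add(42L)
        println(registry["PUBLIC.EVENTS"]) // [42]: the same list on every access
    }
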
opaqueStateValue.streamPartitionsCreatorInput( diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt index 346d314e0acf5..86fb0f2cf8e69 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt @@ -67,16 +67,16 @@ class DefaultJdbcSharedState( internal val semaphore = Semaphore(configuration.maxConcurrency) - override fun tryAcquireResourcesForCreator(): StreamPartitionsCreator.AcquiredResources? = + override fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? = if (semaphore.tryAcquire()) { - StreamPartitionsCreator.AcquiredResources { semaphore.release() } + JdbcPartitionsCreator.AcquiredResources { semaphore.release() } } else { null } - override fun tryAcquireResourcesForReader(): StreamPartitionReader.AcquiredResources? = + override fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? = if (semaphore.tryAcquire()) { - StreamPartitionReader.AcquiredResources { semaphore.release() } + JdbcPartitionReader.AcquiredResources { semaphore.release() } } else { null } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt similarity index 92% rename from airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt index 4203d59f2fc1a..d11ac97d370e0 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionReader.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt @@ -15,7 +15,7 @@ import kotlin.coroutines.coroutineContext import kotlinx.coroutines.ensureActive /** Base class for JDBC implementations of [PartitionReader]. */ -sealed class StreamPartitionReader( +sealed class JdbcPartitionReader( val streamState: JdbcStreamState<*>, val input: Input, ) : PartitionReader { @@ -91,11 +91,11 @@ sealed class StreamPartitionReader( } /** JDBC implementation of [PartitionReader] which reads the [input] in its entirety. */ -class StreamNonResumablePartitionReader( +class JdbcNonResumablePartitionReader( val selectQueryGenerator: SelectQueryGenerator, streamState: JdbcStreamState<*>, input: Input, -) : StreamPartitionReader(streamState, input) { +) : JdbcPartitionReader(streamState, input) { val runComplete = AtomicBoolean(false) val numRecords = AtomicLong() @@ -135,11 +135,11 @@ class StreamNonResumablePartitionReader( * JDBC implementation of [PartitionReader] which reads as much as possible of the [input], in * order, before timing out. */ -class StreamResumablePartitionReader( +class JdbcResumablePartitionReader( val selectQueryGenerator: SelectQueryGenerator, streamState: JdbcStreamState<*>, input: Input, -) : StreamPartitionReader(streamState, input) { +) : JdbcPartitionReader(streamState, input) { val incumbentLimit = AtomicLong() val numRecords = AtomicLong() @@ -200,14 +200,14 @@ class StreamResumablePartitionReader( } } -/** Converts a [StreamPartitionReader.Input] into a [SelectQuerySpec]. 
*/ -fun StreamPartitionReader.Input.querySpec( +/** Converts a [JdbcPartitionReader.Input] into a [SelectQuerySpec]. */ +fun JdbcPartitionReader.Input.querySpec( stream: Stream, isOrdered: Boolean, limit: Long?, ): SelectQuerySpec = when (this) { - is StreamPartitionReader.SnapshotInput -> + is JdbcPartitionReader.SnapshotInput -> querySpecForStreamPartitionReader( stream, checkpointColumns = primaryKey, @@ -217,7 +217,7 @@ fun StreamPartitionReader.Input.querySpec( isOrdered, limit, ) - is StreamPartitionReader.SnapshotWithCursorInput -> + is JdbcPartitionReader.SnapshotWithCursorInput -> querySpecForStreamPartitionReader( stream, checkpointColumns = primaryKey, @@ -227,7 +227,7 @@ fun StreamPartitionReader.Input.querySpec( isOrdered, limit, ) - is StreamPartitionReader.CursorIncrementalInput -> + is JdbcPartitionReader.CursorIncrementalInput -> querySpecForStreamPartitionReader( stream, checkpointColumns = listOf(cursor), @@ -296,14 +296,14 @@ private fun querySpecForStreamPartitionReader( } /** - * Generates a [CheckpointStreamState] using the [StreamPartitionReader.Input] initial state and, if - * provided, the last record read by the [StreamPartitionReader]. When not provided, the partition - * is presumed to have been read in its entirety. + * Generates a [CheckpointStreamState] using the [JdbcPartitionReader.Input] initial state and, if + * provided, the last record read by the [JdbcPartitionReader]. When not provided, the partition is + * presumed to have been read in its entirety. */ -fun StreamPartitionReader.Input.checkpoint(row: ObjectNode? = null): CheckpointStreamState { +fun JdbcPartitionReader.Input.checkpoint(row: ObjectNode? = null): CheckpointStreamState { fun getRowValue(field: Field): JsonNode = row?.get(field.id) ?: Jsons.nullNode() return when (this) { - is StreamPartitionReader.SnapshotInput -> + is JdbcPartitionReader.SnapshotInput -> if (row != null) { SnapshotCheckpoint(primaryKey, primaryKey.map(::getRowValue)) } else if (primaryKeyUpperBound != null) { @@ -311,7 +311,7 @@ fun StreamPartitionReader.Input.checkpoint(row: ObjectNode? = null): CheckpointS } else { SnapshotCompleted } - is StreamPartitionReader.SnapshotWithCursorInput -> + is JdbcPartitionReader.SnapshotWithCursorInput -> if (row != null) { SnapshotWithCursorCheckpoint( primaryKey, @@ -329,7 +329,7 @@ fun StreamPartitionReader.Input.checkpoint(row: ObjectNode? = null): CheckpointS } else { CursorIncrementalCheckpoint(cursor, cursorUpperBound) } - is StreamPartitionReader.CursorIncrementalInput -> + is JdbcPartitionReader.CursorIncrementalInput -> if (row == null) { CursorIncrementalCheckpoint(cursor, cursorUpperBound) } else { diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt similarity index 86% rename from airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt index e3635baac2617..ab47bb5f1c0a1 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamPartitionsCreator.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt @@ -17,7 +17,7 @@ import java.util.concurrent.atomic.AtomicReference import kotlin.random.Random /** Base class for JDBC implementations of [PartitionsCreator]. 
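
The checkpoint extension shown above is what makes resumable reads possible, and a hypothetical usage sketch may help; idField is an assumed Field and lastRecord an assumed ObjectNode from the result set, neither part of the patch:

    // A snapshot partition over the primary-key range ] -inf, {ID: 100} ]:
    val input = JdbcPartitionReader.SnapshotInput(
        primaryKey = listOf(idField),
        primaryKeyLowerBound = null,
        primaryKeyUpperBound = listOf(Jsons.numberNode(100)),
    )
    // Reader timed out after emitting lastRecord = {"ID": 42, ...}:
    val midRun: CheckpointStreamState = input.checkpoint(lastRecord)
    // -> SnapshotCheckpoint([ID], [42]); the next reader resumes past ID 42.
    // Reader drained the partition (no row passed in):
    val done: CheckpointStreamState = input.checkpoint()
    // -> SnapshotCheckpoint([ID], [100]); the upper bound becomes the checkpoint.
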
*/ -sealed class StreamPartitionsCreator( +sealed class JdbcPartitionsCreator( val selectQueryGenerator: SelectQueryGenerator, val streamState: JdbcStreamState<*>, val input: Input, @@ -73,7 +73,7 @@ sealed class StreamPartitionsCreator( private val acquiredResources = AtomicReference() - /** Calling [close] releases the resources acquired for the [StreamPartitionsCreator]. */ + /** Calling [close] releases the resources acquired for the [JdbcPartitionsCreator]. */ fun interface AcquiredResources : AutoCloseable override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus { @@ -87,20 +87,20 @@ sealed class StreamPartitionsCreator( override suspend fun run(): List = input.partitionReaderInputs().map { createReader(it) } - abstract fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader + abstract fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader - fun Input.partitionReaderInputs(): List { + fun Input.partitionReaderInputs(): List { return when (this) { is NoStart -> listOf() is SnapshotColdStart -> - StreamPartitionReader.SnapshotInput( + JdbcPartitionReader.SnapshotInput( primaryKey = primaryKey, primaryKeyLowerBound = null, primaryKeyUpperBound = null, ) .split() is SnapshotWithCursorColdStart -> - StreamPartitionReader.SnapshotWithCursorInput( + JdbcPartitionReader.SnapshotWithCursorInput( primaryKey = primaryKey, primaryKeyLowerBound = null, primaryKeyUpperBound = null, @@ -109,7 +109,7 @@ sealed class StreamPartitionsCreator( ) .split() is CursorIncrementalColdStart -> - StreamPartitionReader.CursorIncrementalInput( + JdbcPartitionReader.CursorIncrementalInput( cursor = cursor, cursorLowerBound = cursorLowerBound, isLowerBoundIncluded = true, @@ -117,14 +117,14 @@ sealed class StreamPartitionsCreator( ) .split() is SnapshotWarmStart -> - StreamPartitionReader.SnapshotInput( + JdbcPartitionReader.SnapshotInput( primaryKey = primaryKey, primaryKeyLowerBound = primaryKeyLowerBound, primaryKeyUpperBound = null, ) .split() is SnapshotWithCursorWarmStart -> - StreamPartitionReader.SnapshotWithCursorInput( + JdbcPartitionReader.SnapshotWithCursorInput( primaryKey = primaryKey, primaryKeyLowerBound = primaryKeyLowerBound, primaryKeyUpperBound = null, @@ -133,7 +133,7 @@ sealed class StreamPartitionsCreator( ) .split() is CursorIncrementalWarmStart -> - StreamPartitionReader.CursorIncrementalInput( + JdbcPartitionReader.CursorIncrementalInput( cursor = cursor, cursorLowerBound = cursorLowerBound, isLowerBoundIncluded = true, @@ -143,19 +143,19 @@ sealed class StreamPartitionsCreator( } } - fun StreamPartitionReader.SnapshotInput.split(): List = + fun JdbcPartitionReader.SnapshotInput.split(): List = split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) } - fun StreamPartitionReader.SnapshotWithCursorInput.split(): - List = + fun JdbcPartitionReader.SnapshotWithCursorInput.split(): + List = split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) } - fun StreamPartitionReader.CursorIncrementalInput.split(): - List = + fun JdbcPartitionReader.CursorIncrementalInput.split(): + List = split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).mapIndexed { idx: Int, (lb, ub) -> @@ -167,7 +167,7 @@ sealed class StreamPartitionsCreator( } abstract fun split( - input: StreamPartitionReader.Input, + input: JdbcPartitionReader.Input, globalLowerBound: List?, globalUpperBound: List?, ): 
List?, List?>> @@ -256,28 +256,28 @@ sealed class StreamPartitionsCreator( } /** Sequential JDBC implementation of [PartitionsCreator]. */ -class StreamSequentialPartitionsCreator( +class JdbcSequentialPartitionsCreator( selectQueryGenerator: SelectQueryGenerator, streamState: JdbcStreamState<*>, input: Input, -) : StreamPartitionsCreator(selectQueryGenerator, streamState, input) { +) : JdbcPartitionsCreator(selectQueryGenerator, streamState, input) { private val log = KotlinLogging.logger {} - override fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader { + override fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader { // Handle edge case where the partition cannot be split. if (!input.resumable) { log.warn { "Table cannot be read by sequential partition reader because it cannot be split." } - return StreamNonResumablePartitionReader(selectQueryGenerator, streamState, input) + return JdbcNonResumablePartitionReader(selectQueryGenerator, streamState, input) } // Happy path. log.info { "Table will be read by sequential partition reader(s)." } - return StreamResumablePartitionReader(selectQueryGenerator, streamState, input) + return JdbcResumablePartitionReader(selectQueryGenerator, streamState, input) } override fun split( - input: StreamPartitionReader.Input, + input: JdbcPartitionReader.Input, globalLowerBound: List?, globalUpperBound: List? ): List?, List?>> { @@ -286,18 +286,18 @@ class StreamSequentialPartitionsCreator( } /** Concurrent JDBC implementation of [PartitionsCreator]. */ -class StreamConcurrentPartitionsCreator( +class JdbcConcurrentPartitionsCreator( selectQueryGenerator: SelectQueryGenerator, streamState: JdbcStreamState<*>, input: Input, -) : StreamPartitionsCreator(selectQueryGenerator, streamState, input) { +) : JdbcPartitionsCreator(selectQueryGenerator, streamState, input) { private val log = KotlinLogging.logger {} - override fun createReader(input: StreamPartitionReader.Input): StreamPartitionReader = - StreamNonResumablePartitionReader(selectQueryGenerator, streamState, input) + override fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader = + JdbcNonResumablePartitionReader(selectQueryGenerator, streamState, input) override fun split( - input: StreamPartitionReader.Input, + input: JdbcPartitionReader.Input, globalLowerBound: List?, globalUpperBound: List? ): List?, List?>> { @@ -364,11 +364,11 @@ class StreamConcurrentPartitionsCreator( } } -/** Converts a nullable [OpaqueStateValue] into an input for [StreamPartitionsCreator]. */ +/** Converts a nullable [OpaqueStateValue] into an input for [JdbcPartitionsCreator]. */ fun OpaqueStateValue?.streamPartitionsCreatorInput( handler: CatalogValidationFailureHandler, streamState: JdbcStreamState<*>, -): StreamPartitionsCreator.Input { +): JdbcPartitionsCreator.Input { val checkpoint: CheckpointStreamState? = checkpoint(handler, streamState) if (checkpoint == null && this != null) { handler.accept(ResetStream(streamState.stream.name, streamState.stream.namespace)) @@ -377,34 +377,34 @@ fun OpaqueStateValue?.streamPartitionsCreatorInput( return checkpoint.streamPartitionsCreatorInput(streamState) } -/** Converts a nullable [CheckpointStreamState] into an input for [StreamPartitionsCreator]. */ +/** Converts a nullable [CheckpointStreamState] into an input for [JdbcPartitionsCreator]. 
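
The secondarySamplingRate computation in JdbcConcurrentPartitionsCreator.split deserves a gloss: each of the (up to maxSampleSize) sampled rows stands for roughly expectedTableByteSize / maxSampleSize bytes, so keeping a row as a split point with probability expectedPartitionByteSize / targetPartitionByteSize yields partitions near the target size in expectation. A standalone restatement with a hypothetical helper name:

    import kotlin.random.Random

    fun <T> pickSplitBoundaries(
        sampledValues: List<T>,        // in checkpoint-column order
        expectedTableByteSize: Long,
        targetPartitionByteSize: Long,
        maxSampleSize: Int,
    ): List<T> {
        val rate: Double =
            if (expectedTableByteSize <= targetPartitionByteSize) {
                0.0 // the whole table fits in one partition: no split points
            } else {
                val expectedPartitionByteSize = expectedTableByteSize / maxSampleSize
                if (expectedPartitionByteSize < targetPartitionByteSize) {
                    expectedPartitionByteSize.toDouble() / targetPartitionByteSize
                } else {
                    1.0 // even one sampling interval exceeds the target: keep them all
                }
            }
        val random = Random(expectedTableByteSize) // seeded, so retries are repeatable
        return sampledValues.filter { random.nextDouble() < rate }.distinct()
    }
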
*/ fun CheckpointStreamState?.streamPartitionsCreatorInput( streamState: JdbcStreamState<*>, -): StreamPartitionsCreator.Input { +): JdbcPartitionsCreator.Input { val stream: Stream = streamState.stream val sharedState: JdbcSharedState = streamState.sharedState val configuration: JdbcSourceConfiguration = sharedState.configuration if (this == null) { val pkChosenFromCatalog: List = stream.configuredPrimaryKey ?: listOf() if (stream.configuredSyncMode == SyncMode.FULL_REFRESH || configuration.global) { - return StreamPartitionsCreator.SnapshotColdStart(pkChosenFromCatalog) + return JdbcPartitionsCreator.SnapshotColdStart(pkChosenFromCatalog) } val cursorChosenFromCatalog: Field = stream.configuredCursor as? Field ?: throw ConfigErrorException("no cursor") - return StreamPartitionsCreator.SnapshotWithCursorColdStart( + return JdbcPartitionsCreator.SnapshotWithCursorColdStart( pkChosenFromCatalog, cursorChosenFromCatalog, ) } return when (this) { - SnapshotCompleted -> StreamPartitionsCreator.NoStart + SnapshotCompleted -> JdbcPartitionsCreator.NoStart is SnapshotCheckpoint -> - StreamPartitionsCreator.SnapshotWarmStart( + JdbcPartitionsCreator.SnapshotWarmStart( primaryKey, primaryKeyCheckpoint, ) is SnapshotWithCursorCheckpoint -> - StreamPartitionsCreator.SnapshotWithCursorWarmStart( + JdbcPartitionsCreator.SnapshotWithCursorWarmStart( primaryKey, primaryKeyCheckpoint, cursor, @@ -413,15 +413,15 @@ fun CheckpointStreamState?.streamPartitionsCreatorInput( is CursorIncrementalCheckpoint -> when (val cursorUpperBound: JsonNode? = streamState.cursorUpperBound) { null -> - StreamPartitionsCreator.CursorIncrementalColdStart( + JdbcPartitionsCreator.CursorIncrementalColdStart( cursor, cursorCheckpoint, ) else -> if (cursorCheckpoint == cursorUpperBound) { - StreamPartitionsCreator.NoStart + JdbcPartitionsCreator.NoStart } else { - StreamPartitionsCreator.CursorIncrementalWarmStart( + JdbcPartitionsCreator.CursorIncrementalWarmStart( cursor, cursorCheckpoint, cursorUpperBound, diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt index 30c779e5df69d..83e9d8275b9bc 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcSharedState.kt @@ -52,8 +52,8 @@ interface JdbcSharedState { } /** Tries to acquire global resources for [JdbcPartitionsCreator]. */ - fun tryAcquireResourcesForCreator(): StreamPartitionsCreator.AcquiredResources? + fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? /** Tries to acquire global resources for [JdbcPartitionReader]. */ - fun tryAcquireResourcesForReader(): StreamPartitionReader.AcquiredResources? + fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? 
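
For implementors of the two hooks above: the default implementation shown earlier backs both with a single global semaphore sized by maxConcurrency, handing the permit back through the AutoCloseable. A minimal sketch of that shape, with assumed names:

    import java.util.concurrent.Semaphore

    fun interface Acquired : AutoCloseable // stand-in for the AcquiredResources fun interfaces

    class GlobalResourcePool(maxConcurrency: Int) {
        private val semaphore = Semaphore(maxConcurrency)

        // Non-blocking: a null result maps to RETRY_LATER in the creators.
        fun tryAcquire(): Acquired? =
            if (semaphore.tryAcquire()) Acquired { semaphore.release() } else null
    }

    fun main() {
        val pool = GlobalResourcePool(maxConcurrency = 1)
        val first = pool.tryAcquire()  // permit granted
        val second = pool.tryAcquire() // null: pool exhausted
        println(second == null)        // true
        first?.close()                 // permit returned
    }
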
} From a37b4b37b1af4a6467ae0e64820c826f6d2111f6 Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:52 -0400 Subject: [PATCH 09/11] extract-jdbc: add and adopt JdbcPartition and JdbcPartitionFactory --- .../FakeSourcePartitionsCreatorFactory.kt | 42 -- .../extract/src/test/resources/metadata.yaml | 2 + .../airbyte/cdk/read/CheckpointStreamState.kt | 141 ----- .../airbyte/cdk/read/DefaultJdbcPartition.kt | 291 ++++++++++ .../cdk/read/DefaultJdbcPartitionFactory.kt | 276 ++++++++++ .../io/airbyte/cdk/read/JdbcPartition.kt | 54 ++ .../airbyte/cdk/read/JdbcPartitionFactory.kt | 32 ++ .../airbyte/cdk/read/JdbcPartitionReader.kt | 228 +------- .../airbyte/cdk/read/JdbcPartitionsCreator.kt | 376 ++++--------- .../cdk/read/JdbcPartitionsCreatorFactory.kt | 72 +++ .../cdk/read/StreamReadContextManager.kt | 46 -- .../read/DefaultJdbcPartitionFactoryTest.kt | 512 ++++++++++++++++++ .../cdk/read/JdbcPartitionReaderTest.kt | 195 +++++++ .../cdk/read/JdbcPartitionsCreatorTest.kt | 418 ++++++++++++++ .../io/airbyte/cdk/read/TestFixtures.kt | 69 +++ 15 files changed, 2032 insertions(+), 722 deletions(-) delete mode 100644 airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt delete mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt delete mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt deleted file mode 100644 index c55c63bb4787a..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourcePartitionsCreatorFactory.kt +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource - -import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.read.CreateNoPartitions -import io.airbyte.cdk.read.Feed -import io.airbyte.cdk.read.Global -import io.airbyte.cdk.read.JdbcSequentialPartitionsCreator -import io.airbyte.cdk.read.JdbcStreamState -import io.airbyte.cdk.read.PartitionsCreator -import io.airbyte.cdk.read.PartitionsCreatorFactory -import io.airbyte.cdk.read.StateQuerier -import io.airbyte.cdk.read.Stream -import io.airbyte.cdk.read.StreamReadContextManager -import io.airbyte.cdk.read.streamPartitionsCreatorInput -import jakarta.inject.Singleton - -@Singleton -class FakeSourcePartitionsCreatorFactory( - val streamReadContextManager: StreamReadContextManager, -) : PartitionsCreatorFactory { - override fun make( - stateQuerier: StateQuerier, - feed: Feed, - ): PartitionsCreator { - val opaqueStateValue: OpaqueStateValue? = stateQuerier.current(feed) - return when (feed) { - is Global -> CreateNoPartitions - is Stream -> { - val streamState: JdbcStreamState<*> = streamReadContextManager[feed] - JdbcSequentialPartitionsCreator( - streamReadContextManager.selectQueryGenerator, - streamState, - opaqueStateValue.streamPartitionsCreatorInput( - streamReadContextManager.handler, - streamState, - ) - ) - } - } - } -} diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml b/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml index aff5a4b3c71cc..3dfe5d2711421 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml @@ -2,3 +2,5 @@ data: dockerRepository: "airbyte/fake-source" documentationUrl: "https://docs.airbyte.com" + jdbc: + mode: sequential diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt deleted file mode 100644 index db8bbfab94a78..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/CheckpointStreamState.kt +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import com.fasterxml.jackson.databind.JsonNode -import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.discover.FieldOrMetaField -import io.airbyte.cdk.output.CatalogValidationFailureHandler -import io.airbyte.cdk.output.InvalidCursor -import io.airbyte.cdk.output.InvalidPrimaryKey -import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.SyncMode - -/** - * [CheckpointStreamState] is the type used to represent state checkpoints for source connectors - * which make use of this package. This maps to the value of an Airbyte STATE message of type - * STREAM, interpreted using the provided configuration and configured catalog. 
- */ -sealed interface CheckpointStreamState - -data object SnapshotCompleted : CheckpointStreamState - -data class SnapshotCheckpoint( - val primaryKey: List, - val primaryKeyCheckpoint: List, -) : CheckpointStreamState - -data class SnapshotWithCursorCheckpoint( - val primaryKey: List, - val primaryKeyCheckpoint: List, - val cursor: Field, - val cursorUpperBound: JsonNode, -) : CheckpointStreamState - -data class CursorIncrementalCheckpoint( - val cursor: Field, - val cursorCheckpoint: JsonNode, -) : CheckpointStreamState - -/** Serializes a [CheckpointStreamState] into an [OpaqueStateValue]. */ -fun CheckpointStreamState.opaqueStateValue(): OpaqueStateValue = - when (this) { - SnapshotCompleted -> DefaultJdbcStreamStateValue.snapshotCompleted - is SnapshotCheckpoint -> - DefaultJdbcStreamStateValue.snapshotCheckpoint(primaryKey, primaryKeyCheckpoint) - is SnapshotWithCursorCheckpoint -> - DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint( - primaryKey, - primaryKeyCheckpoint, - cursor, - cursorUpperBound - ) - is CursorIncrementalCheckpoint -> - DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint(cursor, cursorCheckpoint) - } - -/** - * Deserializes a nullable [OpaqueStateValue] into a nullable [CheckpointStreamState] based on the - * current [JdbcStreamState], which contains the configuration and the catalog. - */ -fun OpaqueStateValue?.checkpoint( - handler: CatalogValidationFailureHandler, - streamState: JdbcStreamState<*>, -): CheckpointStreamState? = - if (this == null) { - null - } else { - Jsons.treeToValue(this, DefaultJdbcStreamStateValue::class.java) - .checkpoint(handler, streamState) - } - -private fun DefaultJdbcStreamStateValue.checkpoint( - handler: CatalogValidationFailureHandler, - streamState: JdbcStreamState<*>, -): CheckpointStreamState? { - val sharedState: JdbcSharedState = streamState.sharedState - val stream: Stream = streamState.stream - val pkMap: Map = run { - if (primaryKey.isEmpty()) { - return@run mapOf() - } - val pk: List = stream.configuredPrimaryKey ?: listOf() - if (primaryKey.keys != pk.map { it.id }.toSet()) { - handler.accept( - InvalidPrimaryKey(stream.name, stream.namespace, primaryKey.keys.toList()), - ) - return null - } - pk.associateWith { primaryKey[it.id]!! } - } - val cursorPair: Pair? = run { - if (cursors.isEmpty()) { - return@run null - } - if (cursors.size > 1) { - handler.accept( - InvalidCursor( - streamState.stream.name, - streamState.stream.namespace, - cursors.keys.toString() - ), - ) - return null - } - val cursorLabel: String = cursors.keys.first() - val cursor: FieldOrMetaField? = stream.fields.find { it.id == cursorLabel } - if (cursor !is Field) { - handler.accept( - InvalidCursor(stream.name, stream.namespace, cursorLabel), - ) - return null - } - cursor to cursors[cursorLabel]!! 
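
As orientation while reading this deletion: the four concrete checkpoint shapes map onto sync situations as follows, and patch 09 re-derives exactly these cases inside DefaultJdbcPartitionFactory.create. The values below are hypothetical; idField and tsField are assumed Fields, ts an assumed JsonNode:

    // Full refresh finished; nothing to resume:
    val done: CheckpointStreamState = SnapshotCompleted
    // Mid-snapshot in a pure full refresh; resume past primary key ID = 42:
    val snapshot = SnapshotCheckpoint(listOf(idField), listOf(Jsons.numberNode(42)))
    // Mid-snapshot ahead of an incremental sync; the cursor's upper bound is pinned:
    val snapshotWithCursor =
        SnapshotWithCursorCheckpoint(listOf(idField), listOf(Jsons.numberNode(42)), tsField, ts)
    // Cursor-based incremental in flight; resume from the saved cursor value:
    val incremental = CursorIncrementalCheckpoint(tsField, ts)
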
- } - val isCursorBasedIncremental: Boolean = - stream.configuredSyncMode == SyncMode.INCREMENTAL && !sharedState.configuration.global - - return if (cursorPair == null) { - if (isCursorBasedIncremental) { - null - } else if (pkMap.isEmpty()) { - SnapshotCompleted - } else { - SnapshotCheckpoint(pkMap.keys.toList(), pkMap.values.toList()) - } - } else { - val (cursor: Field, cursorCheckpoint: JsonNode) = cursorPair - if (!isCursorBasedIncremental) { - null - } else if (pkMap.isEmpty()) { - CursorIncrementalCheckpoint(cursor, cursorCheckpoint) - } else { - SnapshotWithCursorCheckpoint( - pkMap.keys.toList(), - pkMap.values.toList(), - cursor, - cursorCheckpoint, - ) - } - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt new file mode 100644 index 0000000000000..a87f883b3d5ce --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartition.kt @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.util.Jsons + +/** Base class for default implementations of [JdbcPartition]. */ +sealed class DefaultJdbcPartition( + val selectQueryGenerator: SelectQueryGenerator, + final override val streamState: DefaultJdbcStreamState, +) : JdbcPartition { + val stream: Stream = streamState.stream + val from = From(stream.name, stream.namespace) +} + +/** Base class for default implementations of [JdbcPartition] for unsplittable partitions. */ +sealed class DefaultJdbcUnsplittablePartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, +) : DefaultJdbcPartition(selectQueryGenerator, streamState) { + + override val nonResumableQuery: SelectQuery + get() = selectQueryGenerator.generate(nonResumableQuerySpec.optimize()) + + val nonResumableQuerySpec = SelectQuerySpec(SelectColumns(stream.fields), from) + + override fun samplingQuery(sampleRateInvPow2: Int): SelectQuery { + val sampleSize: Int = streamState.sharedState.maxSampleSize + val querySpec = + SelectQuerySpec( + SelectColumns(stream.fields), + FromSample(stream.name, stream.namespace, sampleRateInvPow2, sampleSize), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } +} + +/** Default implementation of a [JdbcPartition] for an unsplittable snapshot partition. */ +class DefaultJdbcUnsplittableSnapshotPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, +) : DefaultJdbcUnsplittablePartition(selectQueryGenerator, streamState) { + + override val completeState: OpaqueStateValue = DefaultJdbcStreamStateValue.snapshotCompleted +} + +/** + * Default implementation of a [JdbcPartition] for an unsplittable snapshot partition preceding a + * cursor-based incremental sync. 
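
The unsplittable partitions above expose only two queries, and it may help to see roughly what they evaluate to; the SQL below is illustrative only, since actual rendering depends on the connector's SelectQueryGenerator:

    // For an unsplittable snapshot partition of PUBLIC.EVENTS:
    //
    //   nonResumableQuery   ~  SELECT <all stream fields> FROM "PUBLIC"."EVENTS"
    //   samplingQuery(16)   ~  the same SELECT over a 1/65536 pseudo-random
    //                          sample of the table, capped at maxSampleSize rows
    //
    // There is no resumableQuery: with no checkpoint columns there is no total
    // order to resume from, so readers must drain the partition in one pass.
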
+ */ +class DefaultJdbcUnsplittableSnapshotWithCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + val cursor: Field, +) : + DefaultJdbcUnsplittablePartition(selectQueryGenerator, streamState), + JdbcCursorPartition { + + override val completeState: OpaqueStateValue + get() = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = streamState.cursorUpperBound!! + ) + + override val cursorUpperBoundQuery: SelectQuery + get() = selectQueryGenerator.generate(cursorUpperBoundQuerySpec.optimize()) + + val cursorUpperBoundQuerySpec = SelectQuerySpec(SelectColumnMaxValue(cursor), from) +} + +/** Base class for default implementations of [JdbcPartition] for splittable partitions. */ +sealed class DefaultJdbcSplittablePartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + val checkpointColumns: List, +) : + DefaultJdbcPartition(selectQueryGenerator, streamState), + JdbcSplittablePartition { + abstract val lowerBound: List? + abstract val upperBound: List? + + override val nonResumableQuery: SelectQuery + get() = selectQueryGenerator.generate(nonResumableQuerySpec.optimize()) + + val nonResumableQuerySpec: SelectQuerySpec + get() = SelectQuerySpec(SelectColumns(stream.fields), from, where) + + override fun resumableQuery(limit: Long): SelectQuery { + val querySpec = + SelectQuerySpec( + SelectColumns((stream.fields + checkpointColumns).distinct()), + from, + where, + OrderBy(checkpointColumns), + Limit(limit), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } + + override fun samplingQuery(sampleRateInvPow2: Int): SelectQuery { + val sampleSize: Int = streamState.sharedState.maxSampleSize + val querySpec = + SelectQuerySpec( + SelectColumns(stream.fields + checkpointColumns), + FromSample(stream.name, stream.namespace, sampleRateInvPow2, sampleSize), + where, + OrderBy(checkpointColumns), + ) + return selectQueryGenerator.generate(querySpec.optimize()) + } + + val where: Where + get() { + val zippedLowerBound: List> = + lowerBound?.let { checkpointColumns.zip(it) } ?: listOf() + val lowerBoundDisj: List = + zippedLowerBound.mapIndexed { idx: Int, (gtCol: Field, gtValue: JsonNode) -> + val lastLeaf: WhereClauseLeafNode = + if (isLowerBoundIncluded && idx == checkpointColumns.size - 1) { + GreaterOrEqual(gtCol, gtValue) + } else { + Greater(gtCol, gtValue) + } + And( + zippedLowerBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> + Equal(eqCol, eqValue) + } + listOf(lastLeaf), + ) + } + val zippedUpperBound: List> = + upperBound?.let { checkpointColumns.zip(it) } ?: listOf() + val upperBoundDisj: List = + zippedUpperBound.mapIndexed { idx: Int, (leqCol: Field, leqValue: JsonNode) -> + val lastLeaf: WhereClauseLeafNode = + if (idx < zippedUpperBound.size - 1) { + Lesser(leqCol, leqValue) + } else { + LesserOrEqual(leqCol, leqValue) + } + And( + zippedUpperBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> + Equal(eqCol, eqValue) + } + listOf(lastLeaf), + ) + } + return Where(And(Or(lowerBoundDisj), Or(upperBoundDisj))) + } + + open val isLowerBoundIncluded: Boolean = false +} + +/** Default implementation of a [JdbcPartition] for a splittable snapshot partition. 
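
The where builder above encodes a lexicographic tuple comparison over the checkpoint columns. Spelled out for a two-column key (k1, k2) with lower bound (x, y) and upper bound (u, v), all names and values assumed:

    // Lower bound, exclusive by default:
    //     (k1 > x) OR (k1 = x AND k2 > y)
    // Upper bound, always inclusive on the last column:
    //     (k1 < u) OR (k1 = u AND k2 <= v)
    //
    // so a generator would render the Where node roughly as
    //
    //     WHERE (((k1 > ?)) OR ((k1 = ?) AND (k2 > ?)))
    //       AND (((k1 < ?)) OR ((k1 = ?) AND (k2 <= ?)))
    //
    // With isLowerBoundIncluded = true (cursor partitions resuming from a saved
    // cursor value), only the innermost > relaxes to >=, so rows equal to the
    // saved value are read again rather than skipped.
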
*/ +class DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + primaryKey: List, + override val lowerBound: List?, + override val upperBound: List?, +) : DefaultJdbcSplittablePartition(selectQueryGenerator, streamState, primaryKey) { + + override val completeState: OpaqueStateValue + get() = + when (upperBound) { + null -> DefaultJdbcStreamStateValue.snapshotCompleted + else -> + DefaultJdbcStreamStateValue.snapshotCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = upperBound, + ) + } + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.snapshotCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = checkpointColumns.map { lastRecord[it.id] ?: Jsons.nullNode() }, + ) +} + +/** + * Default implementation of a [JdbcPartition] for a splittable partition involving cursor columns. + */ +sealed class DefaultJdbcCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + checkpointColumns: List, + val cursor: Field, + private val explicitCursorUpperBound: JsonNode?, +) : + DefaultJdbcSplittablePartition(selectQueryGenerator, streamState, checkpointColumns), + JdbcCursorPartition { + + val cursorUpperBound: JsonNode + get() = explicitCursorUpperBound ?: streamState.cursorUpperBound!! + + override val cursorUpperBoundQuery: SelectQuery + get() = selectQueryGenerator.generate(cursorUpperBoundQuerySpec.optimize()) + + val cursorUpperBoundQuerySpec = SelectQuerySpec(SelectColumnMaxValue(cursor), from) +} + +/** + * Default implementation of a [JdbcPartition] for a splittable snapshot partition preceding a + * cursor-based incremental sync. + */ +class DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + primaryKey: List, + override val lowerBound: List?, + override val upperBound: List?, + cursor: Field, + cursorUpperBound: JsonNode?, +) : + DefaultJdbcCursorPartition( + selectQueryGenerator, + streamState, + primaryKey, + cursor, + cursorUpperBound + ) { + + override val completeState: OpaqueStateValue + get() = + when (upperBound) { + null -> + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorUpperBound + ) + else -> + DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = upperBound, + cursor, + cursorUpperBound, + ) + } + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.snapshotWithCursorCheckpoint( + primaryKey = checkpointColumns, + primaryKeyCheckpoint = checkpointColumns.map { lastRecord[it.id] ?: Jsons.nullNode() }, + cursor, + cursorUpperBound, + ) +} + +/** + * Default implementation of a [JdbcPartition] for a cursor incremental partition. These are always + * splittable. 
+ */ +class DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator: SelectQueryGenerator, + streamState: DefaultJdbcStreamState, + cursor: Field, + val cursorLowerBound: JsonNode, + override val isLowerBoundIncluded: Boolean, + cursorUpperBound: JsonNode?, +) : + DefaultJdbcCursorPartition( + selectQueryGenerator, + streamState, + listOf(cursor), + cursor, + cursorUpperBound + ) { + + override val lowerBound: List = listOf(cursorLowerBound) + override val upperBound: List + get() = listOf(cursorUpperBound) + + override val completeState: OpaqueStateValue + get() = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = cursorUpperBound, + ) + + override fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue = + DefaultJdbcStreamStateValue.cursorIncrementalCheckpoint( + cursor, + cursorCheckpoint = lastRecord[cursor.id] ?: Jsons.nullNode(), + ) +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt new file mode 100644 index 0000000000000..e324756517103 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactory.kt @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.JsonNode +import io.airbyte.cdk.ConfigErrorException +import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.discover.FieldOrMetaField +import io.airbyte.cdk.output.CatalogValidationFailureHandler +import io.airbyte.cdk.output.InvalidCursor +import io.airbyte.cdk.output.InvalidPrimaryKey +import io.airbyte.cdk.output.ResetStream +import io.airbyte.cdk.util.Jsons +import io.airbyte.protocol.models.v0.SyncMode +import jakarta.inject.Singleton +import java.util.concurrent.ConcurrentHashMap + +/** Default implementation of [JdbcPartitionFactory]. */ +@Singleton +class DefaultJdbcPartitionFactory( + override val sharedState: DefaultJdbcSharedState, + val handler: CatalogValidationFailureHandler, + val selectQueryGenerator: SelectQueryGenerator, +) : + JdbcPartitionFactory< + DefaultJdbcSharedState, + DefaultJdbcStreamState, + DefaultJdbcPartition, + > { + + private val streamStates = ConcurrentHashMap() + + override fun streamState(stream: Stream): DefaultJdbcStreamState = + streamStates.getOrPut(stream.label) { DefaultJdbcStreamState(sharedState, stream) } + + override fun create( + stream: Stream, + opaqueStateValue: OpaqueStateValue?, + ): DefaultJdbcPartition? { + val streamState: DefaultJdbcStreamState = streamState(stream) + if (opaqueStateValue == null) { + return coldStart(streamState) + } + val sv: DefaultJdbcStreamStateValue = + Jsons.treeToValue(opaqueStateValue, DefaultJdbcStreamStateValue::class.java) + val pkMap: Map = + sv.pkMap(stream) + ?: run { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + return coldStart(streamState) + } + val cursorPair: Pair? 
= + if (sv.cursors.isEmpty()) { + null + } else { + sv.cursorPair(stream) + ?: run { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + return coldStart(streamState) + } + } + + val isCursorBasedIncremental: Boolean = + stream.configuredSyncMode == SyncMode.INCREMENTAL && !configuration.global + + return if (cursorPair == null) { + if (isCursorBasedIncremental) { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + coldStart(streamState) + } else if (pkMap.isEmpty()) { + // Snapshot complete. + null + } else { + // Snapshot ongoing. + DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + primaryKey = pkMap.keys.toList(), + lowerBound = pkMap.values.toList(), + upperBound = null + ) + } + } else { + val (cursor: Field, cursorCheckpoint: JsonNode) = cursorPair + if (!isCursorBasedIncremental) { + handler.accept(ResetStream(stream.name, stream.namespace)) + streamState.reset() + coldStart(streamState) + } else if (pkMap.isNotEmpty()) { + // Snapshot ongoing. + DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + primaryKey = pkMap.keys.toList(), + lowerBound = pkMap.values.toList(), + upperBound = null, + cursor, + cursorUpperBound = cursorCheckpoint, + ) + } else if (cursorCheckpoint == streamState.cursorUpperBound) { + // Incremental complete. + null + } else { + // Incremental ongoing. + DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator, + streamState, + cursor, + cursorLowerBound = cursorCheckpoint, + isLowerBoundIncluded = true, + cursorUpperBound = streamState.cursorUpperBound, + ) + } + } + } + + private fun DefaultJdbcStreamStateValue.pkMap(stream: Stream): Map? { + if (primaryKey.isEmpty()) { + return mapOf() + } + val fields: List = stream.configuredPrimaryKey ?: listOf() + if (primaryKey.keys != fields.map { it.id }.toSet()) { + handler.accept( + InvalidPrimaryKey(stream.name, stream.namespace, primaryKey.keys.toList()), + ) + return null + } + return fields.associateWith { primaryKey[it.id]!! } + } + + private fun DefaultJdbcStreamStateValue.cursorPair(stream: Stream): Pair? { + if (cursors.size > 1) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursors.keys.toString()), + ) + return null + } + val cursorLabel: String = cursors.keys.first() + val cursor: FieldOrMetaField? = stream.fields.find { it.id == cursorLabel } + if (cursor !is Field) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursorLabel), + ) + return null + } + if (stream.configuredCursor != cursor) { + handler.accept( + InvalidCursor(stream.name, stream.namespace, cursorLabel), + ) + return null + } + return cursor to cursors[cursorLabel]!! + } + + private fun coldStart(streamState: DefaultJdbcStreamState): DefaultJdbcPartition { + val stream: Stream = streamState.stream + val pkChosenFromCatalog: List = stream.configuredPrimaryKey ?: listOf() + if (stream.configuredSyncMode == SyncMode.FULL_REFRESH || configuration.global) { + if (pkChosenFromCatalog.isEmpty()) { + return DefaultJdbcUnsplittableSnapshotPartition( + selectQueryGenerator, + streamState, + ) + } + return DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + pkChosenFromCatalog, + lowerBound = null, + upperBound = null, + ) + } + val cursorChosenFromCatalog: Field = + stream.configuredCursor as? 
Field ?: throw ConfigErrorException("no cursor") + if (pkChosenFromCatalog.isEmpty()) { + return DefaultJdbcUnsplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + cursorChosenFromCatalog + ) + } + return DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + pkChosenFromCatalog, + lowerBound = null, + upperBound = null, + cursorChosenFromCatalog, + cursorUpperBound = null, + ) + } + + val configuration: JdbcSourceConfiguration = sharedState.configuration + + override fun split( + unsplitPartition: DefaultJdbcPartition, + opaqueStateValues: List + ): List { + val splitPartitionBoundaries: List by lazy { + opaqueStateValues.map { Jsons.treeToValue(it, DefaultJdbcStreamStateValue::class.java) } + } + return when (unsplitPartition) { + is DefaultJdbcSplittableSnapshotPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcSplittableSnapshotWithCursorPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcCursorIncrementalPartition -> + unsplitPartition.split(splitPartitionBoundaries) + is DefaultJdbcUnsplittableSnapshotPartition -> listOf(unsplitPartition) + is DefaultJdbcUnsplittableSnapshotWithCursorPartition -> listOf(unsplitPartition) + } + } + + private fun DefaultJdbcSplittableSnapshotPartition.split( + splitPointValues: List + ): List { + val inners: List> = + splitPointValues.mapNotNull { it.pkMap(streamState.stream)?.values?.toList() } + val lbs: List?> = listOf(lowerBound) + inners + val ubs: List?> = inners + listOf(upperBound) + return lbs.zip(ubs).map { (lowerBound, upperBound) -> + DefaultJdbcSplittableSnapshotPartition( + selectQueryGenerator, + streamState, + primaryKey = checkpointColumns, + lowerBound, + upperBound, + ) + } + } + + private fun DefaultJdbcSplittableSnapshotWithCursorPartition.split( + splitPointValues: List + ): List { + val inners: List> = + splitPointValues.mapNotNull { it.pkMap(streamState.stream)?.values?.toList() } + val lbs: List?> = listOf(lowerBound) + inners + val ubs: List?> = inners + listOf(upperBound) + return lbs.zip(ubs).map { (lowerBound, upperBound) -> + DefaultJdbcSplittableSnapshotWithCursorPartition( + selectQueryGenerator, + streamState, + primaryKey = checkpointColumns, + lowerBound, + upperBound, + cursor, + cursorUpperBound, + ) + } + } + + private fun DefaultJdbcCursorIncrementalPartition.split( + splitPointValues: List + ): List { + val inners: List = splitPointValues.mapNotNull { it.cursorPair(stream)?.second } + val lbs: List = listOf(cursorLowerBound) + inners + val ubs: List = inners + listOf(cursorUpperBound) + return lbs.zip(ubs).mapIndexed { idx: Int, (lowerBound, upperBound) -> + DefaultJdbcCursorIncrementalPartition( + selectQueryGenerator, + streamState, + cursor, + lowerBound, + isLowerBoundIncluded = idx == 0, + upperBound, + ) + } + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt new file mode 100644 index 0000000000000..e6bd519911aa1 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartition.kt @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.cdk.command.OpaqueStateValue + +/** + * Encapsulates database-specific aspects relating to a JDBC stream partition, consumed by + * [JdbcPartitionReader] and friends. + */ +interface JdbcPartition> { + + /** The partition's stream's transient state, including parameters like fetchSize, etc. */ + val streamState: S + + /** Query which produces all records in the partition in no particular order. */ + val nonResumableQuery: SelectQuery + + /** State value to emit when the partition is read in its entirety. */ + val completeState: OpaqueStateValue + + /** Query which samples records in the partition at the rate of 2^-[sampleRateInvPow2]. */ + fun samplingQuery(sampleRateInvPow2: Int): SelectQuery + + /** Tries to acquire resources for [JdbcPartitionsCreator]. */ + fun tryAcquireResourcesForCreator(): JdbcPartitionsCreator.AcquiredResources? = + // Acquire global resources by default. + streamState.sharedState.tryAcquireResourcesForCreator() + + /** Tries to acquire resources for [JdbcPartitionReader]. */ + fun tryAcquireResourcesForReader(): JdbcPartitionReader.AcquiredResources? = + // Acquire global resources by default. + streamState.sharedState.tryAcquireResourcesForReader() +} + +/** A [JdbcPartition] which can be subdivided. */ +interface JdbcSplittablePartition> : JdbcPartition { + + /** Query which produces a subset of records at the beginning of the partition. */ + fun resumableQuery(limit: Long): SelectQuery + + /** State value to emit when the partition is read up to (and including) [lastRecord]. */ + fun incompleteState(lastRecord: ObjectNode): OpaqueStateValue +} + +/** A [JdbcPartition] which allows cursor-based incremental reads. */ +interface JdbcCursorPartition> : JdbcPartition { + + /** Query which produces the current maximum cursor value in the stream. */ + val cursorUpperBoundQuery: SelectQuery +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt new file mode 100644 index 0000000000000..6d885654b8c3b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionFactory.kt @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.OpaqueStateValue +import io.micronaut.context.annotation.DefaultImplementation + +/** Encapsulates database-specific logic turning [OpaqueStateValue] into [JdbcPartition]. */ +@DefaultImplementation(DefaultJdbcPartitionFactory::class) +interface JdbcPartitionFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +> { + + /** The state shared by all partitions. Includes global resources. */ + val sharedState: A + + /** Get or create the [JdbcStreamState] for a [stream]. */ + fun streamState(stream: Stream): S + + /** + * Deserializes [opaqueStateValue] and creates a [JdbcPartition] instance corresponding to all + * remaining unread data in the [stream], if any; null otherwise. + */ + fun create(stream: Stream, opaqueStateValue: OpaqueStateValue?): P? + + /** Subdivides the [unsplitPartition] by splitting at the [opaqueStateValues], if possible. */ + fun split(unsplitPartition: P, opaqueStateValues: List): List
<P>
+} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt index d11ac97d370e0..77c2944befc1f 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionReader.kt @@ -4,7 +4,6 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.ObjectNode import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.discover.Field import io.airbyte.cdk.output.OutputConsumer import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.AirbyteRecordMessage @@ -15,56 +14,24 @@ import kotlin.coroutines.coroutineContext import kotlinx.coroutines.ensureActive /** Base class for JDBC implementations of [PartitionReader]. */ -sealed class JdbcPartitionReader( - val streamState: JdbcStreamState<*>, - val input: Input, +sealed class JdbcPartitionReader
<P : JdbcPartition<*>,
>( + val partition: P, ) : PartitionReader { + + val streamState: JdbcStreamState<*> = partition.streamState val stream: Stream = streamState.stream val sharedState: JdbcSharedState = streamState.sharedState val outputConsumer: OutputConsumer = sharedState.outputConsumer val selectQuerier: SelectQuerier = sharedState.selectQuerier - sealed interface Input { - val resumable: Boolean - } - - data class SnapshotInput( - val primaryKey: List, - val primaryKeyLowerBound: List?, - val primaryKeyUpperBound: List?, - ) : Input { - override val resumable: Boolean - get() = primaryKey.isNotEmpty() - } - - data class SnapshotWithCursorInput( - val primaryKey: List, - val primaryKeyLowerBound: List?, - val primaryKeyUpperBound: List?, - val cursor: Field, - val cursorUpperBound: JsonNode, - ) : Input { - override val resumable: Boolean - get() = primaryKey.isNotEmpty() - } - - data class CursorIncrementalInput( - val cursor: Field, - val cursorLowerBound: JsonNode, - val isLowerBoundIncluded: Boolean, - val cursorUpperBound: JsonNode, - ) : Input { - override val resumable: Boolean - get() = true - } - private val acquiredResources = AtomicReference() + /** Calling [close] releases the resources acquired for the [JdbcPartitionReader]. */ fun interface AcquiredResources : AutoCloseable override fun tryAcquireResources(): PartitionReader.TryAcquireResourcesStatus { val acquiredResources: AcquiredResources = - sharedState.tryAcquireResourcesForReader() + partition.tryAcquireResourcesForReader() ?: return PartitionReader.TryAcquireResourcesStatus.RETRY_LATER this.acquiredResources.set(acquiredResources) return PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN @@ -90,27 +57,18 @@ sealed class JdbcPartitionReader( } } -/** JDBC implementation of [PartitionReader] which reads the [input] in its entirety. */ -class JdbcNonResumablePartitionReader( - val selectQueryGenerator: SelectQueryGenerator, - streamState: JdbcStreamState<*>, - input: Input, -) : JdbcPartitionReader(streamState, input) { +/** JDBC implementation of [PartitionReader] which reads the [partition] in its entirety. */ +class JdbcNonResumablePartitionReader
<P : JdbcPartition<*>,
>( + partition: P, +) : JdbcPartitionReader
<P>
(partition) { val runComplete = AtomicBoolean(false) val numRecords = AtomicLong() override suspend fun run() { - val querySpec: SelectQuerySpec = - input.querySpec( - stream, - isOrdered = false, - limit = null, - ) - val query: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) selectQuerier .executeQuery( - q = query, + q = partition.nonResumableQuery, parameters = SelectQuerier.Parameters(streamState.fetchSize), ) .use { result: SelectQuerier.Result -> @@ -127,19 +85,17 @@ class JdbcNonResumablePartitionReader( if (!runComplete.get()) throw RuntimeException("cannot checkpoint non-resumable read") // The run method executed to completion without a LIMIT clause. // This implies that the partition boundary has been reached. - return PartitionReadCheckpoint(input.checkpoint().opaqueStateValue(), numRecords.get()) + return PartitionReadCheckpoint(partition.completeState, numRecords.get()) } } /** - * JDBC implementation of [PartitionReader] which reads as much as possible of the [input], in + * JDBC implementation of [PartitionReader] which reads as much as possible of the [partition], in * order, before timing out. */ -class JdbcResumablePartitionReader( - val selectQueryGenerator: SelectQueryGenerator, - streamState: JdbcStreamState<*>, - input: Input, -) : JdbcPartitionReader(streamState, input) { +class JdbcResumablePartitionReader
<P : JdbcSplittablePartition<*>,
>( + partition: P, +) : JdbcPartitionReader
<P>
(partition) { val incumbentLimit = AtomicLong() val numRecords = AtomicLong() @@ -150,16 +106,9 @@ class JdbcResumablePartitionReader( val fetchSize: Int = streamState.fetchSizeOrDefault val limit: Long = streamState.limit incumbentLimit.set(limit) - val querySpec: SelectQuerySpec = - input.querySpec( - stream, - isOrdered = true, - limit = limit, - ) - val query: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) selectQuerier .executeQuery( - q = query, + q = partition.resumableQuery(limit), parameters = SelectQuerier.Parameters(fetchSize), ) .use { result: SelectQuerier.Result -> @@ -178,7 +127,7 @@ class JdbcResumablePartitionReader( override fun checkpoint(): PartitionReadCheckpoint { if (runComplete.get() && numRecords.get() < streamState.limit) { // The run method executed to completion with a LIMIT clause which was not reached. - return PartitionReadCheckpoint(input.checkpoint().opaqueStateValue(), numRecords.get()) + return PartitionReadCheckpoint(partition.completeState, numRecords.get()) } // The run method ended because of either the LIMIT or the timeout. // Adjust the LIMIT value so that it grows or shrinks to try to fit the timeout. @@ -194,146 +143,7 @@ class JdbcResumablePartitionReader( streamState.updateLimitState { it.down } } } - val checkpointState: OpaqueStateValue = - input.checkpoint(lastRecord.get()!!).opaqueStateValue() + val checkpointState: OpaqueStateValue = partition.incompleteState(lastRecord.get()!!) return PartitionReadCheckpoint(checkpointState, numRecords.get()) } } - -/** Converts a [JdbcPartitionReader.Input] into a [SelectQuerySpec]. */ -fun JdbcPartitionReader.Input.querySpec( - stream: Stream, - isOrdered: Boolean, - limit: Long?, -): SelectQuerySpec = - when (this) { - is JdbcPartitionReader.SnapshotInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = primaryKey, - checkpointLowerBound = primaryKeyLowerBound, - isLowerBoundIncluded = false, - checkpointUpperBound = primaryKeyUpperBound, - isOrdered, - limit, - ) - is JdbcPartitionReader.SnapshotWithCursorInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = primaryKey, - checkpointLowerBound = primaryKeyLowerBound, - isLowerBoundIncluded = false, - checkpointUpperBound = primaryKeyUpperBound, - isOrdered, - limit, - ) - is JdbcPartitionReader.CursorIncrementalInput -> - querySpecForStreamPartitionReader( - stream, - checkpointColumns = listOf(cursor), - checkpointLowerBound = listOf(cursorLowerBound), - isLowerBoundIncluded = isLowerBoundIncluded, - checkpointUpperBound = listOf(cursorUpperBound), - isOrdered, - limit, - ) - } - -private fun querySpecForStreamPartitionReader( - stream: Stream, - checkpointColumns: List, - checkpointLowerBound: List?, - isLowerBoundIncluded: Boolean, - checkpointUpperBound: List?, - isOrdered: Boolean, - limit: Long?, -): SelectQuerySpec { - val selectColumns: List = - if (isOrdered) { - stream.fields + checkpointColumns - } else { - stream.fields - } - val zippedLowerBound: List> = - checkpointLowerBound?.let { checkpointColumns.zip(it) } ?: listOf() - val lowerBoundDisj: List = - zippedLowerBound.mapIndexed { idx: Int, (gtCol: Field, gtValue: JsonNode) -> - val lastLeaf: WhereClauseLeafNode = - if (isLowerBoundIncluded && idx == checkpointColumns.size - 1) { - GreaterOrEqual(gtCol, gtValue) - } else { - Greater(gtCol, gtValue) - } - And( - zippedLowerBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> - Equal(eqCol, eqValue) - } + listOf(lastLeaf), - ) - } - val zippedUpperBound: List> = - 
checkpointUpperBound?.let { checkpointColumns.zip(it) } ?: listOf() - val upperBoundDisj: List = - zippedUpperBound.mapIndexed { idx: Int, (leqCol: Field, leqValue: JsonNode) -> - val lastLeaf: WhereClauseLeafNode = - if (idx < zippedUpperBound.size - 1) { - Lesser(leqCol, leqValue) - } else { - LesserOrEqual(leqCol, leqValue) - } - And( - zippedUpperBound.take(idx).map { (eqCol: Field, eqValue: JsonNode) -> - Equal(eqCol, eqValue) - } + listOf(lastLeaf), - ) - } - return SelectQuerySpec( - SelectColumns(selectColumns), - From(stream.name, stream.namespace), - Where(And(Or(lowerBoundDisj), Or(upperBoundDisj))), - if (isOrdered) OrderBy(checkpointColumns) else NoOrderBy, - if (limit == null) NoLimit else Limit(limit), - ) -} - -/** - * Generates a [CheckpointStreamState] using the [JdbcPartitionReader.Input] initial state and, if - * provided, the last record read by the [JdbcPartitionReader]. When not provided, the partition is - * presumed to have been read in its entirety. - */ -fun JdbcPartitionReader.Input.checkpoint(row: ObjectNode? = null): CheckpointStreamState { - fun getRowValue(field: Field): JsonNode = row?.get(field.id) ?: Jsons.nullNode() - return when (this) { - is JdbcPartitionReader.SnapshotInput -> - if (row != null) { - SnapshotCheckpoint(primaryKey, primaryKey.map(::getRowValue)) - } else if (primaryKeyUpperBound != null) { - SnapshotCheckpoint(primaryKey, primaryKeyUpperBound) - } else { - SnapshotCompleted - } - is JdbcPartitionReader.SnapshotWithCursorInput -> - if (row != null) { - SnapshotWithCursorCheckpoint( - primaryKey, - primaryKey.map(::getRowValue), - cursor, - cursorUpperBound, - ) - } else if (primaryKeyUpperBound != null) { - SnapshotWithCursorCheckpoint( - primaryKey, - primaryKeyUpperBound, - cursor, - cursorUpperBound, - ) - } else { - CursorIncrementalCheckpoint(cursor, cursorUpperBound) - } - is JdbcPartitionReader.CursorIncrementalInput -> - if (row == null) { - CursorIncrementalCheckpoint(cursor, cursorUpperBound) - } else { - CursorIncrementalCheckpoint(cursor, getRowValue(cursor)) - } - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt index ab47bb5f1c0a1..7c371d6f41b6d 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreator.kt @@ -3,74 +3,32 @@ package io.airbyte.cdk.read import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.ObjectNode -import io.airbyte.cdk.ConfigErrorException import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.command.OpaqueStateValue -import io.airbyte.cdk.discover.Field -import io.airbyte.cdk.output.CatalogValidationFailureHandler import io.airbyte.cdk.output.OutputConsumer -import io.airbyte.cdk.output.ResetStream import io.airbyte.cdk.util.Jsons -import io.airbyte.protocol.models.v0.SyncMode import io.github.oshai.kotlinlogging.KotlinLogging import java.util.concurrent.atomic.AtomicReference import kotlin.random.Random /** Base class for JDBC implementations of [PartitionsCreator]. 
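 * The sequential and concurrent subclasses below differ mainly in the readers they
 * produce: sequential creation favors a single resumable reader, while concurrent
 * creation samples the table and fans out to non-resumable readers.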
*/ -sealed class JdbcPartitionsCreator( - val selectQueryGenerator: SelectQueryGenerator, - val streamState: JdbcStreamState<*>, - val input: Input, +sealed class JdbcPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + val partition: P, + val partitionFactory: JdbcPartitionFactory, ) : PartitionsCreator { private val log = KotlinLogging.logger {} + val streamState: S = partition.streamState val stream: Stream = streamState.stream - val sharedState: JdbcSharedState = streamState.sharedState + val sharedState: A = streamState.sharedState + val configuration: JdbcSourceConfiguration = sharedState.configuration val outputConsumer: OutputConsumer = sharedState.outputConsumer val selectQuerier: SelectQuerier = sharedState.selectQuerier - sealed interface Input - - data object NoStart : Input - - data class SnapshotColdStart( - val primaryKey: List, - ) : Input - - data class SnapshotWithCursorColdStart( - val primaryKey: List, - val cursor: Field, - ) : Input - - data class CursorIncrementalColdStart( - val cursor: Field, - val cursorLowerBound: JsonNode, - ) : Input - - data class SnapshotWarmStart( - val primaryKey: List, - val primaryKeyLowerBound: List, - ) : Input - - data class SnapshotWithCursorWarmStart( - val primaryKey: List, - val primaryKeyLowerBound: List, - val cursor: Field, - val cursorUpperBound: JsonNode, - ) : Input - - data class CursorIncrementalWarmStart( - val cursor: Field, - val cursorLowerBound: JsonNode, - val cursorUpperBound: JsonNode, - ) : Input - - data class Parameters( - val preferParallelized: Boolean, - val tableSampleSize: Int = 1024, - val throughputBytesPerSecond: Long = 10L * 1024L * 1024L, - ) - private val acquiredResources = AtomicReference() /** Calling [close] releases the resources acquired for the [JdbcPartitionsCreator]. 
*/ @@ -78,137 +36,46 @@ sealed class JdbcPartitionsCreator( override fun tryAcquireResources(): PartitionsCreator.TryAcquireResourcesStatus { val acquiredResources: AcquiredResources = - sharedState.tryAcquireResourcesForCreator() + partition.tryAcquireResourcesForCreator() ?: return PartitionsCreator.TryAcquireResourcesStatus.RETRY_LATER this.acquiredResources.set(acquiredResources) return PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN } - override suspend fun run(): List = - input.partitionReaderInputs().map { createReader(it) } - - abstract fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader - - fun Input.partitionReaderInputs(): List { - return when (this) { - is NoStart -> listOf() - is SnapshotColdStart -> - JdbcPartitionReader.SnapshotInput( - primaryKey = primaryKey, - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - ) - .split() - is SnapshotWithCursorColdStart -> - JdbcPartitionReader.SnapshotWithCursorInput( - primaryKey = primaryKey, - primaryKeyLowerBound = null, - primaryKeyUpperBound = null, - cursor = cursor, - cursorUpperBound = ensureCursorUpperBound(cursor) ?: return listOf(), - ) - .split() - is CursorIncrementalColdStart -> - JdbcPartitionReader.CursorIncrementalInput( - cursor = cursor, - cursorLowerBound = cursorLowerBound, - isLowerBoundIncluded = true, - cursorUpperBound = ensureCursorUpperBound(cursor) ?: return listOf(), - ) - .split() - is SnapshotWarmStart -> - JdbcPartitionReader.SnapshotInput( - primaryKey = primaryKey, - primaryKeyLowerBound = primaryKeyLowerBound, - primaryKeyUpperBound = null, - ) - .split() - is SnapshotWithCursorWarmStart -> - JdbcPartitionReader.SnapshotWithCursorInput( - primaryKey = primaryKey, - primaryKeyLowerBound = primaryKeyLowerBound, - primaryKeyUpperBound = null, - cursor = cursor, - cursorUpperBound = cursorUpperBound, - ) - .split() - is CursorIncrementalWarmStart -> - JdbcPartitionReader.CursorIncrementalInput( - cursor = cursor, - cursorLowerBound = cursorLowerBound, - isLowerBoundIncluded = true, - cursorUpperBound = cursorUpperBound, - ) - .split() - } - } - - fun JdbcPartitionReader.SnapshotInput.split(): List = - split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> - copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) - } - - fun JdbcPartitionReader.SnapshotWithCursorInput.split(): - List = - split(this, primaryKeyLowerBound, primaryKeyUpperBound).map { (lb, ub) -> - copy(primaryKeyLowerBound = lb, primaryKeyUpperBound = ub) - } - - fun JdbcPartitionReader.CursorIncrementalInput.split(): - List = - split(this, listOf(cursorLowerBound), listOf(cursorUpperBound)).mapIndexed { - idx: Int, - (lb, ub) -> - copy( - cursorLowerBound = lb!!.first(), - isLowerBoundIncluded = idx == 0, - cursorUpperBound = ub!!.first(), - ) - } - - abstract fun split( - input: JdbcPartitionReader.Input, - globalLowerBound: List?, - globalUpperBound: List?, - ): List?, List?>> - override fun releaseResources() { acquiredResources.getAndSet(null)?.close() } - fun ensureCursorUpperBound(cursor: Field): JsonNode? 
{ + fun ensureCursorUpperBound() { + val cursorUpperBoundQuery: SelectQuery = + (partition as JdbcCursorPartition<*>).cursorUpperBoundQuery if (streamState.cursorUpperBound != null) { - return streamState.cursorUpperBound + return } - val querySpec = - SelectQuerySpec( - SelectColumnMaxValue(cursor), - From(stream.name, stream.namespace), - ) - val cursorUpperBoundQuery: SelectQuery = selectQueryGenerator.generate(querySpec.optimize()) log.info { "Querying maximum cursor column value." } val record: ObjectNode? = selectQuerier.executeQuery(cursorUpperBoundQuery).use { if (it.hasNext()) it.next() else null } - val cursorUpperBound: JsonNode? = record?.fields()?.asSequence()?.firstOrNull()?.value - if (cursorUpperBound == null) { + if (record == null) { streamState.cursorUpperBound = Jsons.nullNode() + return + } + val cursorUpperBound: JsonNode? = record.fields().asSequence().firstOrNull()?.value + if (cursorUpperBound == null) { log.warn { "No cursor column value found in '${stream.label}'." } - return null + return } - streamState.cursorUpperBound = cursorUpperBound if (cursorUpperBound.isNull) { log.warn { "Maximum cursor column value in '${stream.label}' is NULL." } - return null + return } log.info { "Maximum cursor column value in '${stream.label}' is '$cursorUpperBound'." } - return cursorUpperBound + streamState.cursorUpperBound = cursorUpperBound } /** Collects a sample of rows in the unsplit partition. */ fun collectSample( - querySpec: SelectQuerySpec, recordMapper: (ObjectNode) -> T, ): Sample { val values = mutableListOf() @@ -221,16 +88,7 @@ sealed class JdbcPartitionsCreator( // then try sampling at a higher rate of one every 2^8 = 256 rows. // If that's still not enough, don't sample at all. values.clear() - val fromSample = - FromSample( - stream.name, - stream.namespace, - sampleRateInvPow2, - sharedState.maxSampleSize, - ) - val sampledQuerySpec: SelectQuerySpec = querySpec.copy(from = fromSample) - val samplingQuery: SelectQuery = - selectQueryGenerator.generate(sampledQuerySpec.optimize()) + val samplingQuery: SelectQuery = partition.samplingQuery(sampleRateInvPow2) selectQuerier.executeQuery(samplingQuery).use { for (record in it) { values.add(recordMapper(record)) @@ -256,70 +114,90 @@ sealed class JdbcPartitionsCreator( } /** Sequential JDBC implementation of [PartitionsCreator]. */ -class JdbcSequentialPartitionsCreator( - selectQueryGenerator: SelectQueryGenerator, - streamState: JdbcStreamState<*>, - input: Input, -) : JdbcPartitionsCreator(selectQueryGenerator, streamState, input) { +class JdbcSequentialPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partition: P, + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreator(partition, partitionFactory) { private val log = KotlinLogging.logger {} - override fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader { + override suspend fun run(): List { + // Ensure that the cursor upper bound is known, if required. + if (partition is JdbcCursorPartition<*>) { + ensureCursorUpperBound() + if (streamState.cursorUpperBound?.isNull == true) { + log.info { "Maximum cursor column value query found that the table was empty." 
} + return listOf() + } + } + if (streamState.fetchSize == null) { + if (sharedState.withSampling) { + val rowByteSizeSample: Sample = + collectSample(sharedState.rowByteSizeEstimator()::apply) + val expectedTableByteSize: Long = + rowByteSizeSample.sampledValues.sum() * rowByteSizeSample.valueWeight + log.info { "Table memory size estimated at ${expectedTableByteSize shr 20} MiB." } + if (rowByteSizeSample.kind == Sample.Kind.EMPTY) { + log.info { "Sampling query found that the table was empty." } + return listOf() + } + streamState.fetchSize = + sharedState.jdbcFetchSizeEstimator().apply(rowByteSizeSample) + } else { + // TODO: adaptive fetchSize computation? + } + } // Handle edge case where the partition cannot be split. - if (!input.resumable) { + if (partition !is JdbcSplittablePartition<*>) { log.warn { "Table cannot be read by sequential partition reader because it cannot be split." } - return JdbcNonResumablePartitionReader(selectQueryGenerator, streamState, input) + return listOf(JdbcNonResumablePartitionReader(partition)) } // Happy path. log.info { "Table will be read by sequential partition reader(s)." } - return JdbcResumablePartitionReader(selectQueryGenerator, streamState, input) - } - - override fun split( - input: JdbcPartitionReader.Input, - globalLowerBound: List?, - globalUpperBound: List? - ): List?, List?>> { - return listOf(globalLowerBound to globalUpperBound) + return listOf(JdbcResumablePartitionReader(partition)) } } /** Concurrent JDBC implementation of [PartitionsCreator]. */ -class JdbcConcurrentPartitionsCreator( - selectQueryGenerator: SelectQueryGenerator, - streamState: JdbcStreamState<*>, - input: Input, -) : JdbcPartitionsCreator(selectQueryGenerator, streamState, input) { +class JdbcConcurrentPartitionsCreator< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partition: P, + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreator(partition, partitionFactory) { private val log = KotlinLogging.logger {} - override fun createReader(input: JdbcPartitionReader.Input): JdbcPartitionReader = - JdbcNonResumablePartitionReader(selectQueryGenerator, streamState, input) - - override fun split( - input: JdbcPartitionReader.Input, - globalLowerBound: List?, - globalUpperBound: List? - ): List?, List?>> { + override suspend fun run(): List { + // Ensure that the cursor upper bound is known, if required. + if (partition is JdbcCursorPartition<*>) { + ensureCursorUpperBound() + if (streamState.cursorUpperBound?.isNull == true) { + log.info { "Maximum cursor column value query found that the table was empty." } + return listOf() + } + } // Handle edge case where the table can't be sampled. if (!sharedState.withSampling) { log.warn { "Table cannot be read by concurrent partition readers because it cannot be sampled." } // TODO: adaptive fetchSize computation? - return listOf(globalLowerBound to globalUpperBound) + return listOf(JdbcNonResumablePartitionReader(partition)) } // Sample the table for partition split boundaries and for record byte sizes. 
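// Editorial sketch of the boundary-thinning arithmetic implemented below; the names
// desiredPartitionCount and sampleSize are illustrative, and only the sample itself
// and targetPartitionByteSize come from the surrounding code:
//
//     val desiredPartitionCount: Long =
//         expectedTableByteSize / sharedState.targetPartitionByteSize // e.g. 10 GiB / 1 GiB = 10
//     // Keep each sampled boundary with probability ~ desiredPartitionCount / sampleSize,
//     // so that roughly one split boundary survives per target-sized partition.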
- val unsplitQuerySpec: SelectQuerySpec = - input.querySpec(stream, isOrdered = true, limit = null) - val checkpointColumns: List = (unsplitQuerySpec.orderBy as OrderBy).columns - val sample: Sample, Long>> = - collectSample(unsplitQuerySpec) { record: ObjectNode -> - val checkpointValues: List = - checkpointColumns.map { record[it.id] ?: Jsons.nullNode() } - val rowByteSize: Long = sharedState.rowByteSizeEstimator().apply(record) - checkpointValues to rowByteSize - } + val sample: Sample> = collectSample { record: ObjectNode -> + val boundary: OpaqueStateValue? = + (partition as? JdbcSplittablePartition<*>)?.incompleteState(record) + val rowByteSize: Long = sharedState.rowByteSizeEstimator().apply(record) + boundary to rowByteSize + } if (sample.kind == Sample.Kind.EMPTY) { log.info { "Sampling query found that the table was empty." } return listOf() @@ -329,11 +207,11 @@ class JdbcConcurrentPartitionsCreator( val expectedTableByteSize: Long = rowByteSizeSample.sampledValues.sum() * sample.valueWeight log.info { "Table memory size estimated at ${expectedTableByteSize shr 20} MiB." } // Handle edge case where the table can't be split. - if (!input.resumable) { + if (partition !is JdbcSplittablePartition<*>) { log.warn { "Table cannot be read by concurrent partition readers because it cannot be split." } - return listOf(globalLowerBound to globalUpperBound) + return listOf(JdbcNonResumablePartitionReader(partition)) } // Happy path. log.info { "Target partition size is ${sharedState.targetPartitionByteSize shr 20} MiB." } @@ -350,83 +228,13 @@ class JdbcConcurrentPartitionsCreator( } } val random = Random(expectedTableByteSize) // RNG output is repeatable. - val innerSplitBoundaries: List> = + val splitBoundaries: List = sample.sampledValues .filter { random.nextDouble() < secondarySamplingRate } - .map { (splitBoundary: List, _) -> splitBoundary } + .mapNotNull { (splitBoundary: OpaqueStateValue?, _) -> splitBoundary } .distinct() - log.info { - "Table will be read by ${innerSplitBoundaries.size + 1} concurrent partition reader(s)." - } - val lbs: List?> = listOf(globalLowerBound) + innerSplitBoundaries - val ubs: List?> = innerSplitBoundaries + listOf(globalUpperBound) - return lbs.zip(ubs) - } -} - -/** Converts a nullable [OpaqueStateValue] into an input for [JdbcPartitionsCreator]. */ -fun OpaqueStateValue?.streamPartitionsCreatorInput( - handler: CatalogValidationFailureHandler, - streamState: JdbcStreamState<*>, -): JdbcPartitionsCreator.Input { - val checkpoint: CheckpointStreamState? = checkpoint(handler, streamState) - if (checkpoint == null && this != null) { - handler.accept(ResetStream(streamState.stream.name, streamState.stream.namespace)) - streamState.reset() - } - return checkpoint.streamPartitionsCreatorInput(streamState) -} - -/** Converts a nullable [CheckpointStreamState] into an input for [JdbcPartitionsCreator]. */ -fun CheckpointStreamState?.streamPartitionsCreatorInput( - streamState: JdbcStreamState<*>, -): JdbcPartitionsCreator.Input { - val stream: Stream = streamState.stream - val sharedState: JdbcSharedState = streamState.sharedState - val configuration: JdbcSourceConfiguration = sharedState.configuration - if (this == null) { - val pkChosenFromCatalog: List = stream.configuredPrimaryKey ?: listOf() - if (stream.configuredSyncMode == SyncMode.FULL_REFRESH || configuration.global) { - return JdbcPartitionsCreator.SnapshotColdStart(pkChosenFromCatalog) - } - val cursorChosenFromCatalog: Field = - stream.configuredCursor as? 
Field ?: throw ConfigErrorException("no cursor") - return JdbcPartitionsCreator.SnapshotWithCursorColdStart( - pkChosenFromCatalog, - cursorChosenFromCatalog, - ) - } - return when (this) { - SnapshotCompleted -> JdbcPartitionsCreator.NoStart - is SnapshotCheckpoint -> - JdbcPartitionsCreator.SnapshotWarmStart( - primaryKey, - primaryKeyCheckpoint, - ) - is SnapshotWithCursorCheckpoint -> - JdbcPartitionsCreator.SnapshotWithCursorWarmStart( - primaryKey, - primaryKeyCheckpoint, - cursor, - cursorUpperBound, - ) - is CursorIncrementalCheckpoint -> - when (val cursorUpperBound: JsonNode? = streamState.cursorUpperBound) { - null -> - JdbcPartitionsCreator.CursorIncrementalColdStart( - cursor, - cursorCheckpoint, - ) - else -> - if (cursorCheckpoint == cursorUpperBound) { - JdbcPartitionsCreator.NoStart - } else { - JdbcPartitionsCreator.CursorIncrementalWarmStart( - cursor, - cursorCheckpoint, - cursorUpperBound, - ) - } - } + val partitions: List> = partitionFactory.split(partition, splitBoundaries) + log.info { "Table will be read by ${partitions.size} concurrent partition reader(s)." } + return partitions.map { JdbcNonResumablePartitionReader(it) } } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt new file mode 100644 index 0000000000000..0d88ae2b81893 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.command.MetadataYamlPropertySource.Companion.PROPERTY_PREFIX +import io.airbyte.cdk.command.OpaqueStateValue +import io.micronaut.context.annotation.Requires +import jakarta.inject.Singleton + +/** Base class for JDBC implementations of [PartitionsCreatorFactory]. */ +sealed class JdbcPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + val partitionFactory: JdbcPartitionFactory, +) : PartitionsCreatorFactory { + + override fun make( + stateQuerier: StateQuerier, + feed: Feed, + ): PartitionsCreator { + val opaqueStateValue: OpaqueStateValue? = stateQuerier.current(feed) + return when (feed) { + is Global -> CreateNoPartitions + is Stream -> { + val partition: P? = partitionFactory.create(feed, opaqueStateValue) + if (partition == null) { + CreateNoPartitions + } else { + partitionsCreator(partition) + } + } + } + } + + abstract fun partitionsCreator(partition: P): JdbcPartitionsCreator +} + +/** Sequential JDBC implementation of [PartitionsCreatorFactory]. */ +@Singleton +@Requires(property = MODE_PROPERTY, value = "sequential") +class JdbcSequentialPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreatorFactory(partitionFactory) { + + override fun partitionsCreator(partition: P): JdbcPartitionsCreator = + JdbcSequentialPartitionsCreator(partition, partitionFactory) +} + +/** Concurrent JDBC implementation of [PartitionsCreatorFactory]. 
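 * Selected at runtime when [MODE_PROPERTY] is set to "concurrent", mirroring the
 * sequential variant above.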
*/ +@Singleton +@Requires(property = MODE_PROPERTY, value = "concurrent") +class JdbcConcurrentPartitionsCreatorFactory< + A : JdbcSharedState, + S : JdbcStreamState, + P : JdbcPartition, +>( + partitionFactory: JdbcPartitionFactory, +) : JdbcPartitionsCreatorFactory(partitionFactory) { + + override fun partitionsCreator(partition: P): JdbcPartitionsCreator = + JdbcConcurrentPartitionsCreator(partition, partitionFactory) +} + +private const val MODE_PROPERTY = "$PROPERTY_PREFIX.jdbc.mode" diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt deleted file mode 100644 index 888964dc89f71..0000000000000 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/StreamReadContextManager.kt +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.read - -import io.airbyte.cdk.command.JdbcSourceConfiguration -import io.airbyte.cdk.output.CatalogValidationFailureHandler -import io.airbyte.cdk.output.OutputConsumer -import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair -import jakarta.inject.Singleton -import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.ConcurrentMap - -/** - * A [StreamReadContextManager] may be injected in a [io.airbyte.cdk.read.PartitionsCreatorFactory] - * to provide it, and the[io.airbyte.cdk.read.PartitionsCreator] and - * [io.airbyte.cdk.read.PartitionReader] instances it creates, with a set of global singletons - * useful for implementing stream READs for a JDBC source. - * - * For each stream in the configured catalog, these global singletons are packaged in a - * [JdbcStreamState] which bundles them with the corresponding [Stream] as well as mutable metadata - * which is _transient_, transient in the sense that it is not persisted in an Airbyte STATE - * message. - */ -@Singleton -class StreamReadContextManager( - val sharedState: JdbcSharedState, - val handler: CatalogValidationFailureHandler, - val selectQueryGenerator: SelectQueryGenerator, -) { - - val configuration: JdbcSourceConfiguration - get() = sharedState.configuration - - val outputConsumer: OutputConsumer - get() = sharedState.outputConsumer - - val selectQuerier: SelectQuerier - get() = sharedState.selectQuerier - - private val map: ConcurrentMap> = - ConcurrentHashMap() - - operator fun get(stream: Stream): JdbcStreamState<*> = - map.getOrPut(stream.namePair) { - DefaultJdbcStreamState(sharedState as DefaultJdbcSharedState, stream) - } -} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt new file mode 100644 index 0000000000000..a311b1b0748c9 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.output.InvalidCursor +import io.airbyte.cdk.output.InvalidPrimaryKey +import io.airbyte.cdk.output.ResetStream +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.assertJsonEquals +import io.airbyte.cdk.read.TestFixtures.assertQueryEquals +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.record +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class DefaultJdbcPartitionFactoryTest { + + val cursorValue = LocalDate.parse("2024-08-19") + + @Test + fun testColdStartUnsplittableSnapshot() { + val stream = stream(withPK = false, withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue = null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + } + + @Test + fun testColdStartUnsplittableSnapshotWithCursor() { + val stream = stream(withPK = false) + val factory = sharedState().factory() + val result = factory.create(stream, null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotWithCursorPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorValue) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(ts, partition.cursor) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorValue)) + } + + @Test + fun testColdStartSplittableSnapshot() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue = null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = 
result as DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + NoWhere, + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + NoWhere, + OrderBy(id), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + partition.incompleteState(record(pk = 22)).assertJsonEquals(opaqueStateValue(pk = 22)) + // Check split output + val rawSplits: List = + factory.split(partition, listOf(opaqueStateValue(pk = 22), opaqueStateValue(pk = 44))) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + sanityCheck(stream, factory, it) + Assertions.assertIterableEquals(listOf(id), it.checkpointColumns) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertNull(splits[0].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[0].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[1].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[1].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[2].lowerBound) + Assertions.assertNull(splits[2].upperBound) + } + + @Test + fun testColdStartSplittableSnapshotWithCursor() { + val stream = stream() + val factory = sharedState().factory() + val result = factory.create(stream, null) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcSplittableSnapshotWithCursorPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorValue) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec(SelectColumns(id, ts, msg), From(stream.name, stream.namespace)) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + NoWhere, + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + NoWhere, + OrderBy(id) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = 
cursorValue)) + partition + .incompleteState(record(pk = 22)) + .assertJsonEquals(opaqueStateValue(pk = 22, cursor = cursorValue)) + // Check split output + val rawSplits: List = + factory.split(partition, listOf(opaqueStateValue(pk = 22), opaqueStateValue(pk = 44))) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + sanityCheck(stream, factory, it) + Assertions.assertIterableEquals(listOf(id), it.checkpointColumns) + Assertions.assertEquals(ts, it.cursor) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), it.cursorUpperBound) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertNull(splits[0].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[0].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), splits[1].lowerBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[1].upperBound) + Assertions.assertIterableEquals(listOf(IntCodec.encode(44)), splits[2].lowerBound) + Assertions.assertNull(splits[2].upperBound) + } + + @Test + fun testInvalidPrimaryKey() { + val stream = stream(withPK = false, withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22)) + factory.assertFailures( + InvalidPrimaryKey(stream.name, stream.namespace, listOf(id.id)), + ResetStream(stream.name, stream.namespace), + ) + Assertions.assertTrue(result is DefaultJdbcUnsplittableSnapshotPartition) + val partition = result as DefaultJdbcUnsplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + } + + @Test + fun testInvalidCursor() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures( + InvalidCursor(stream.name, stream.namespace, ts.id), + ResetStream(stream.name, stream.namespace), + ) + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = result as DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertNull(partition.lowerBound) + Assertions.assertNull(partition.upperBound) + } + + @Test + fun testWarmStartSnapshot() { + val stream = stream(withCursor = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotPartition) + val partition = result as DefaultJdbcSplittableSnapshotPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))) + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + 
SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue()) + partition.incompleteState(record(pk = 10)).assertJsonEquals(opaqueStateValue(pk = 10)) + // Check full refresh termination criteria + val finalResult = factory.create(stream, opaqueStateValue()) + factory.assertFailures() + Assertions.assertNull(finalResult) + } + + @Test + fun testWarmStartSnapshotWithCursor() { + val stream = stream() + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(pk = 22, cursor = cursorValue)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcSplittableSnapshotWithCursorPartition) + val partition = result as DefaultJdbcSplittableSnapshotWithCursorPartition + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(listOf(id), partition.checkpointColumns) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partition.lowerBound) + Assertions.assertNull(partition.upperBound) + // Check query generation + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorValue)) + partition + .incompleteState(record(pk = 44)) + .assertJsonEquals(opaqueStateValue(pk = 44, cursor = cursorValue)) + // Check snapshot termination criteria and transition to cursor-based incremental + val finalResult = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures() + Assertions.assertTrue(finalResult is DefaultJdbcCursorIncrementalPartition) + val finalPartition = finalResult as DefaultJdbcCursorIncrementalPartition + sanityCheck(stream, factory, finalPartition) + Assertions.assertEquals(ts, finalPartition.cursor) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), finalPartition.cursorLowerBound) + } + + @Test + fun testCursorIncremental() { + val stream = stream(withPK = false) + val factory = sharedState().factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorValue)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as DefaultJdbcCursorIncrementalPartition + val cursorUpperBound = cursorValue.plusMonths(1) + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + // Check partition properties + sanityCheck(stream, factory, partition) + Assertions.assertEquals(ts, partition.cursor) + Assertions.assertIterableEquals(listOf(ts), partition.checkpointColumns) + 
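// Note the closed interval checked below: the warm-start read is bounded by
// `ts >= lower AND ts <= upper`, since a prior sync may have checkpointed part-way
// through rows sharing the same cursor value.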
Assertions.assertEquals(LocalDateCodec.encode(cursorValue), partition.cursorLowerBound) + Assertions.assertIterableEquals(listOf(partition.cursorLowerBound), partition.lowerBound) + Assertions.assertEquals(LocalDateCodec.encode(cursorUpperBound), partition.cursorUpperBound) + Assertions.assertIterableEquals(listOf(partition.cursorUpperBound), partition.upperBound) + Assertions.assertTrue(partition.isLowerBoundIncluded) + // Check query generation + partition.cursorUpperBoundQuery.assertQueryEquals( + SelectQuerySpec(SelectColumnMaxValue(ts), From(stream.name, stream.namespace)) + ) + partition.nonResumableQuery.assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + ) + ) + partition + .resumableQuery(limit = 10L) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + OrderBy(ts), + Limit(10L) + ) + ) + partition + .samplingQuery(sampleRateInvPow2 = 8) + .assertQueryEquals( + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream.name, + stream.namespace, + sampleRateInvPow2 = 8, + DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + ), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorValue)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)) + ), + ), + OrderBy(ts) + ) + ) + // Check state generation + partition.completeState.assertJsonEquals(opaqueStateValue(cursor = cursorUpperBound)) + partition + .incompleteState(record(cursor = cursorValue.plusDays(1))) + .assertJsonEquals(opaqueStateValue(cursor = cursorValue.plusDays(1))) + // Check that subsequent non-terminal partition includes the lower bound + val nextResult = factory.create(stream, opaqueStateValue(cursor = cursorValue.plusDays(1))) + factory.assertFailures() + Assertions.assertTrue(nextResult is DefaultJdbcCursorIncrementalPartition) + val nextPartition = nextResult as DefaultJdbcCursorIncrementalPartition + sanityCheck(stream, factory, nextPartition) + Assertions.assertTrue(nextPartition.isLowerBoundIncluded) + // Check termination criteria + val finalResult = factory.create(stream, opaqueStateValue(cursor = cursorUpperBound)) + factory.assertFailures() + Assertions.assertNull(finalResult) + // Check split output + val boundary1 = cursorValue.plusDays(1) + val boundary2 = cursorValue.plusDays(2) + val rawSplits: List = + factory.split( + partition, + listOf(opaqueStateValue(cursor = boundary1), opaqueStateValue(cursor = boundary2)), + ) + val splits: List = + rawSplits.filterIsInstance() + Assertions.assertIterableEquals(rawSplits, splits) + splits.forEach { + sanityCheck(stream, factory, it) + Assertions.assertEquals(ts, it.cursor) + } + Assertions.assertEquals(3, splits.size) + Assertions.assertEquals(LocalDateCodec.encode(cursorValue), splits[0].cursorLowerBound) + Assertions.assertTrue(splits[0].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(boundary1), splits[0].cursorUpperBound) + Assertions.assertEquals(LocalDateCodec.encode(boundary1), splits[1].cursorLowerBound) + Assertions.assertFalse(splits[1].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(boundary2), splits[1].cursorUpperBound) + Assertions.assertEquals(LocalDateCodec.encode(boundary2), 
splits[2].cursorLowerBound) + Assertions.assertFalse(splits[2].isLowerBoundIncluded) + Assertions.assertEquals(LocalDateCodec.encode(cursorUpperBound), splits[2].cursorUpperBound) + } + + fun sanityCheck( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + partition: DefaultJdbcPartition, + ) { + Assertions.assertEquals(stream, partition.stream) + Assertions.assertEquals(stream, partition.streamState.stream) + Assertions.assertEquals(factory.sharedState, partition.streamState.sharedState) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt new file mode 100644 index 0000000000000..d4076c9965307 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionReaderTest.kt @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.output.BufferingOutputConsumer +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import kotlinx.coroutines.CancellationException +import kotlinx.coroutines.delay +import kotlinx.coroutines.runBlocking +import kotlinx.coroutines.withTimeoutOrNull +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class JdbcPartitionReaderTest { + + val cursorLowerBound = LocalDate.parse("2024-08-01") + val cursorCheckpoint = LocalDate.parse("2024-08-02") + val cursorUpperBound = LocalDate.parse("2024-08-05") + + @Test + fun testNonResumable() { + // Generate partition + val stream = stream(withPK = false) + val sharedState = + sharedState( + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorLowerBound)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)), + ) + ), + ), + SelectQuerier.Parameters(fetchSize = 2), + """{"id":1,"ts":"2024-08-01","msg":"hello"}""", + """{"id":2,"ts":"2024-08-02","msg":"how"}""", + """{"id":3,"ts":"2024-08-03","msg":"are"}""", + """{"id":4,"ts":"2024-08-04","msg":"you"}""", + """{"id":5,"ts":"2024-08-05","msg":"today"}""", + ) + ) + ) + val factory = sharedState.factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorLowerBound)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as DefaultJdbcCursorIncrementalPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + partition.streamState.fetchSize = 2 + // Generate reader + val reader = JdbcNonResumablePartitionReader(partition) + // Acquire resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN, + reader.tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, 
+ factory.sharedState.semaphore.availablePermits, + ) + // Run + runBlocking { reader.run() } + // Checkpoint + Assertions.assertEquals( + PartitionReadCheckpoint(opaqueStateValue(cursor = cursorUpperBound), 5), + reader.checkpoint(), + ) + // Check output + Assertions.assertEquals( + "hello how are you today", + (sharedState.outputConsumer as BufferingOutputConsumer) + .records() + .map { it.data["msg"].asText() } + .joinToString(separator = " ") + ) + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + reader.releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + } + + @Test + fun testResumable() { + // Generate partition + val stream = stream(withPK = false) + val sharedState = + sharedState( + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + SelectQuerySpec( + SelectColumns(id, ts, msg), + From(stream.name, stream.namespace), + Where( + And( + GreaterOrEqual(ts, LocalDateCodec.encode(cursorLowerBound)), + LesserOrEqual(ts, LocalDateCodec.encode(cursorUpperBound)), + ) + ), + OrderBy(ts), + Limit(4), + ), + SelectQuerier.Parameters(fetchSize = 2), + """{"id":1,"ts":"2024-08-01","msg":"hello"}""", + """{"id":2,"ts":"2024-08-02","msg":"how"}""", + """{"id":3,"ts":"2024-08-03","msg":"are"}""", + """{"id":4,"ts":"2024-08-04","msg":"you"}""", + ) + ) + ) + val factory = sharedState.factory() + val result = factory.create(stream, opaqueStateValue(cursor = cursorLowerBound)) + factory.assertFailures() + Assertions.assertTrue(result is DefaultJdbcCursorIncrementalPartition) + val partition = result as DefaultJdbcCursorIncrementalPartition + partition.streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + partition.streamState.fetchSize = 2 + partition.streamState.updateLimitState { it.up } // so we don't hit the limit + // Generate reader + val reader = JdbcResumablePartitionReader(partition) + // Acquire resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionReader.TryAcquireResourcesStatus.READY_TO_RUN, + reader.tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + // Run and simulate timing out + runBlocking { + withTimeoutOrNull(1) { + try { + delay(100) + } catch (_: CancellationException) { + // swallow + } + reader.run() + } + } + // Checkpoint + Assertions.assertEquals( + PartitionReadCheckpoint(opaqueStateValue(cursor = cursorCheckpoint), 2), + reader.checkpoint(), + ) + // Check output + Assertions.assertEquals( + "hello how", + (sharedState.outputConsumer as BufferingOutputConsumer) + .records() + .map { it.data["msg"].asText() } + .joinToString(separator = " ") + ) + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + factory.sharedState.semaphore.availablePermits, + ) + reader.releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + factory.sharedState.semaphore.availablePermits, + ) + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt new file mode 100644 index 0000000000000..7273d2cff04dd --- 
/dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.read + +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.LocalDateCodec +import io.airbyte.cdk.read.TestFixtures.assertFailures +import io.airbyte.cdk.read.TestFixtures.factory +import io.airbyte.cdk.read.TestFixtures.id +import io.airbyte.cdk.read.TestFixtures.msg +import io.airbyte.cdk.read.TestFixtures.opaqueStateValue +import io.airbyte.cdk.read.TestFixtures.sharedState +import io.airbyte.cdk.read.TestFixtures.stream +import io.airbyte.cdk.read.TestFixtures.ts +import java.time.LocalDate +import kotlinx.coroutines.runBlocking +import org.junit.jupiter.api.Assertions +import org.junit.jupiter.api.Test + +class JdbcPartitionsCreatorTest { + + @Test + fun testConcurrentSnapshotWithCursor() { + val stream = stream() + val sharedState = + sharedState( + withSampling = true, + maxSampleSize = 4, + expectedThroughputBytesPerSecond = + 1L, // absurdly low value to create many partitions + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumnMaxValue(ts), + From(stream().name, stream().namespace), + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"max":"$cursorUpperBound"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + NoWhere, + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + NoWhere, + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedPartitions = 5 // adjust this as needed based on inputs + val expectedFetchSize = 681 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue = null).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List<DefaultJdbcSplittableSnapshotWithCursorPartition> = + concurrentPartitions(stream, factory, readers) + val streamState: DefaultJdbcStreamState = partitions.first().streamState + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + streamState.cursorUpperBound + ) + Assertions.assertEquals(expectedFetchSize, streamState.fetchSize) + Assertions.assertEquals(expectedPartitions, partitions.size) + Assertions.assertIterableEquals(listOf(id), partitions.first().checkpointColumns) + Assertions.assertNull(partitions.first().lowerBound) + for (i in 1..(expectedPartitions - 1)) { + Assertions.assertIterableEquals(partitions[i - 1].upperBound, partitions[i].lowerBound) + Assertions.assertIterableEquals(listOf(id), partitions[i].checkpointColumns) + } + Assertions.assertNull(partitions.last().upperBound) + } + + @Test + fun testConcurrentSnapshot() { + val stream = stream(withCursor =
false) + val sharedState = + sharedState( + withSampling = true, + maxSampleSize = 4, + expectedThroughputBytesPerSecond = + 1L, // absurdly low value to create many partitions + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedPartitions = 5 // adjust this as needed based on inputs + val expectedFetchSize = 681 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List<DefaultJdbcSplittableSnapshotPartition> = + concurrentPartitions(stream, factory, readers) + val streamState: DefaultJdbcStreamState = partitions.first().streamState + Assertions.assertNull(streamState.cursorUpperBound) + Assertions.assertEquals(expectedFetchSize, streamState.fetchSize) + Assertions.assertEquals(expectedPartitions, partitions.size) + Assertions.assertIterableEquals(listOf(id), partitions.first().checkpointColumns) + Assertions.assertIterableEquals(listOf(IntCodec.encode(22)), partitions.first().lowerBound) + for (i in 1..(expectedPartitions - 1)) { + Assertions.assertIterableEquals(partitions[i - 1].upperBound, partitions[i].lowerBound) + Assertions.assertIterableEquals(listOf(id), partitions[i].checkpointColumns) + } + Assertions.assertNull(partitions.last().upperBound) + } + + @Test + fun testConcurrentSnapshotWithoutSampling() { + val stream = stream(withCursor = false) + val sharedState = sharedState() + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcConcurrentPartitionsCreator(initialPartition, factory).runInTest() + val partitions: List<DefaultJdbcSplittableSnapshotPartition> = + concurrentPartitions(stream, factory, readers) + // No sampling means no splitting.
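+ // Without sampling, the creator emits a single unsplit partition covering the + // remaining key range: the lower bound comes from the checkpoint and the upper + // bound is left open.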
+ Assertions.assertEquals(1, partitions.size) + Assertions.assertIterableEquals( + stream.configuredPrimaryKey, + partitions.first().checkpointColumns, + ) + Assertions.assertEquals(listOf(IntCodec.encode(22)), partitions.first().lowerBound) + Assertions.assertNull(partitions.first().upperBound) + } + + @Test + fun testColdStartSequentialSnapshot() { + val stream = stream(withCursor = false) + val sharedState = sharedState() + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcSplittableSnapshotPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertNull(readerPartition.streamState.cursorUpperBound) + Assertions.assertNull(readerPartition.streamState.fetchSize) + Assertions.assertIterableEquals( + stream.configuredPrimaryKey, + readerPartition.checkpointColumns, + ) + Assertions.assertEquals(listOf(IntCodec.encode(22)), readerPartition.lowerBound) + Assertions.assertNull(readerPartition.upperBound) + } + + @Test + fun testColdStartSequentialSnapshotWithSampling() { + val stream = stream(withCursor = false) + val sharedState = + sharedState( + withSampling = true, + maxSampleSize = 4, + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 16, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + ), + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumns(id, ts, msg), + FromSample( + stream().name, + stream().namespace, + sampleRateInvPow2 = 8, + sampleSize = 4 + ), + Where(Greater(id, IntCodec.encode(22))), + OrderBy(id) + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"id":10000,"ts":"2024-08-01","msg":"foo"}""", + """{"id":20000,"ts":"2024-08-02","msg":"bar"}""", + """{"id":30000,"ts":"2024-08-03","msg":"baz"}""", + """{"id":40000,"ts":"2024-08-04","msg":"quux"}""", + ) + ), + ) + val expectedFetchSize = 674 // adjust this as needed based on inputs + val factory = sharedState.factory() + val initialPartition = factory.create(stream, opaqueStateValue(pk = 22)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcSplittableSnapshotPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertNull(readerPartition.streamState.cursorUpperBound) + Assertions.assertEquals(expectedFetchSize, readerPartition.streamState.fetchSize) + Assertions.assertIterableEquals(listOf(id), readerPartition.checkpointColumns) + Assertions.assertEquals(listOf(IntCodec.encode(22)), readerPartition.lowerBound) + Assertions.assertNull(readerPartition.upperBound) + } + + @Test + fun testColdStartCursorIncrementalSequential() { + val stream = stream() + val sharedState = + sharedState( + mockedQueries = + arrayOf( + TestFixtures.MockedQuery( + expectedQuerySpec = + SelectQuerySpec( + SelectColumnMaxValue(ts), + From(stream().name, stream().namespace), + ), + expectedParameters = SelectQuerier.Parameters(fetchSize = null), + """{"max":"$cursorUpperBound"}""", + ), + ) + ) + val factory = 
sharedState.factory() + val initialPartition = + factory.create(stream, opaqueStateValue(cursor = cursorCheckpoint)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcCursorIncrementalPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.streamState.cursorUpperBound, + ) + Assertions.assertNull(readerPartition.streamState.fetchSize) + Assertions.assertEquals(ts, readerPartition.cursor) + Assertions.assertEquals( + LocalDateCodec.encode(cursorCheckpoint), + readerPartition.cursorLowerBound, + ) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.cursorUpperBound, + ) + } + + @Test + fun testWarmStartCursorIncrementalSequentialWithSampling() { + val stream = stream() + val sharedState = + sharedState( + withSampling = true, + // The JdbcSequentialPartitionsCreator is not expected to query anything. + mockedQueries = arrayOf() + ) + val factory = sharedState.factory() + run { + // This warm start is particularly warm; the stream state has some transient state. + val streamState: DefaultJdbcStreamState = factory.streamState(stream) + streamState.fetchSize = 1234 + streamState.cursorUpperBound = LocalDateCodec.encode(cursorUpperBound) + } + val initialPartition = + factory.create(stream, opaqueStateValue(cursor = cursorCheckpoint)).asPartition() + factory.assertFailures() + val readers = JdbcSequentialPartitionsCreator(initialPartition, factory).runInTest() + val readerPartition: DefaultJdbcCursorIncrementalPartition = + sequentialPartition(stream, factory, readers) + Assertions.assertEquals(ts, readerPartition.cursor) + Assertions.assertEquals( + LocalDateCodec.encode(cursorCheckpoint), + readerPartition.cursorLowerBound, + ) + Assertions.assertEquals( + LocalDateCodec.encode(cursorUpperBound), + readerPartition.cursorUpperBound, + ) + } + + val cursorCheckpoint = LocalDate.parse("2024-08-02") + val cursorUpperBound = LocalDate.parse("2024-08-05") + + inline fun <reified T : DefaultJdbcPartition> concurrentPartitions( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + readers: List<PartitionReader> + ): List<T> { + Assertions.assertTrue(readers.isNotEmpty()) + val typedReaders = readers.filterIsInstance<JdbcNonResumablePartitionReader<*>>() + Assertions.assertIterableEquals(readers, typedReaders) + for (reader in typedReaders) { + Assertions.assertTrue(reader.partition is T) + Assertions.assertEquals(stream, reader.stream) + Assertions.assertEquals(factory.streamState(stream), reader.partition.streamState) + } + return typedReaders.map { it.partition as T } + } + + inline fun <reified T : DefaultJdbcPartition> sequentialPartition( + stream: Stream, + factory: DefaultJdbcPartitionFactory, + readers: List<PartitionReader> + ): T { + Assertions.assertTrue(readers.firstOrNull() is JdbcResumablePartitionReader<*>) + Assertions.assertNull(readers.getOrNull(1)) + val reader = readers.first() as JdbcResumablePartitionReader<*> + Assertions.assertTrue(reader.partition is T) + val partition = reader.partition as T + Assertions.assertEquals(stream, reader.stream) + Assertions.assertEquals(factory.streamState(stream), partition.streamState) + return partition + } + + fun DefaultJdbcPartition?.asPartition(): DefaultJdbcPartition { + Assertions.assertTrue(this is DefaultJdbcPartition) + return this as DefaultJdbcPartition + } + + fun JdbcPartitionsCreator<DefaultJdbcSharedState, DefaultJdbcStreamState, DefaultJdbcPartition> + .runInTest(): List<PartitionReader> { + val sharedState: DefaultJdbcSharedState = sharedState + // Acquire resources + Assertions.assertEquals( +
sharedState.configuration.maxConcurrency, + sharedState.semaphore.availablePermits, + ) + Assertions.assertEquals( + PartitionsCreator.TryAcquireResourcesStatus.READY_TO_RUN, + tryAcquireResources() + ) + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + sharedState.semaphore.availablePermits, + ) + // Run + val partitionReaders: List<PartitionReader> = runBlocking { run() } + // Release resources + Assertions.assertEquals( + sharedState.configuration.maxConcurrency - 1, + sharedState.semaphore.availablePermits, + ) + releaseResources() + Assertions.assertEquals( + sharedState.configuration.maxConcurrency, + sharedState.semaphore.availablePermits, + ) + // Return result + return partitionReaders + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt index 19b392bd39aac..c97e853bf6b9f 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt @@ -8,15 +8,65 @@ import com.fasterxml.jackson.databind.JsonNode import com.fasterxml.jackson.databind.node.ObjectNode import io.airbyte.cdk.TestClockFactory import io.airbyte.cdk.command.JdbcSourceConfiguration +import io.airbyte.cdk.command.OpaqueStateValue +import io.airbyte.cdk.discover.Field +import io.airbyte.cdk.jdbc.IntFieldType +import io.airbyte.cdk.jdbc.LocalDateFieldType +import io.airbyte.cdk.jdbc.StringFieldType +import io.airbyte.cdk.output.BufferingCatalogValidationFailureHandler import io.airbyte.cdk.output.BufferingOutputConsumer +import io.airbyte.cdk.output.CatalogValidationFailure import io.airbyte.cdk.ssh.SshConnectionOptions import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration import io.airbyte.cdk.util.Jsons +import io.airbyte.protocol.models.v0.SyncMode import java.time.Duration +import java.time.LocalDate import org.junit.jupiter.api.Assertions object TestFixtures { + val id = Field("id", IntFieldType) + val ts = Field("ts", LocalDateFieldType) + val msg = Field("msg", StringFieldType) + + fun stream( + withPK: Boolean = true, + withCursor: Boolean = true, + ) = + Stream( + name = "events", + namespace = "test", + fields = listOf(id, ts, msg), + configuredSyncMode = if (withCursor) SyncMode.INCREMENTAL else SyncMode.FULL_REFRESH, + configuredPrimaryKey = listOf(id).takeIf { withPK }, + configuredCursor = ts.takeIf { withCursor }, + ) + + fun opaqueStateValue( + pk: Int? = null, + cursor: LocalDate? = null, + ): OpaqueStateValue = + Jsons.readTree( + listOf( + """"primary_key":""" + if (pk == null) "{}" else """{"${id.id}":$pk }""", + """"cursors":""" + if (cursor == null) "{}" else """{"${ts.id}":"$cursor"} """, + ) + .joinToString(",", "{", "}") + ) + + fun record( + pk: Int? = null, + cursor: LocalDate?
= null, + ): ObjectNode = + Jsons.readTree( + listOfNotNull( + """ "${id.id}" : $pk """.takeIf { pk != null }, + """ "${ts.id}" : "$cursor" """.takeIf { cursor != null }, + ) + .joinToString(",", "{", "}") + ) as ObjectNode + fun sharedState( global: Boolean = false, checkpointTargetInterval: Duration = Duration.ofMinutes(1), @@ -49,6 +99,20 @@ object TestFixtures { maxMemoryBytesForTesting, ) + fun DefaultJdbcSharedState.factory() = + DefaultJdbcPartitionFactory( + this, + BufferingCatalogValidationFailureHandler(), + MockSelectQueryGenerator + ) + + fun DefaultJdbcPartitionFactory.assertFailures(vararg failures: CatalogValidationFailure) { + Assertions.assertIterableEquals( + failures.toList(), + (handler as BufferingCatalogValidationFailureHandler).get(), + ) + } + fun SelectQuery.assertQueryEquals(expected: SelectQuerySpec) { Assertions.assertEquals(expected.toString(), this.sql) } @@ -118,4 +182,9 @@ object TestFixtures { rows.map { Jsons.readTree(it) as ObjectNode }, ) } + + object MockSelectQueryGenerator : SelectQueryGenerator { + override fun generate(ast: SelectQuerySpec): SelectQuery = + SelectQuery(ast.toString(), listOf(), listOf()) + } } From bc3b932f100bd148710bf01630a55eb5bcaa7759 Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 11:28:53 -0400 Subject: [PATCH 10/11] bulk-cdk: refactor tests --- airbyte-cdk/bulk/core/base/build.gradle | 1 - airbyte-cdk/bulk/core/extract/build.gradle | 2 - .../CheckTest.kt} | 13 +- .../ConfigurationFactoryTest.kt} | 10 +- .../ConfigurationJsonObjectSupplierTest.kt | 2 +- .../cdk/command/SyncsTestFixtureTest.kt | 12 + .../DiscoverTest.kt} | 9 +- .../cdk/fakesource/FakeSourceConfiguration.kt | 14 +- .../FakeSourceConfigurationJsonObject.kt | 2 +- .../cdk/read/StateManagerGlobalStatesTest.kt | 10 +- .../cdk/read/StateManagerStreamStatesTest.kt | 2 +- .../SpecTest.kt} | 5 +- .../resources/fakesource/cdc-catalog.json | 15 +- .../resources/fakesource/cursor-catalog.json | 15 +- .../expected-schema.json | 0 .../resources/fakesource/metadata-valid.json | 25 --- .../extract/src/test/resources/metadata.yaml | 2 - .../src/test/resources/read/metadata.json | 25 --- .../io/airbyte/cdk/discover/IntFieldType.kt | 15 ++ .../cdk/discover/OffsetDateTimeFieldType.kt | 15 ++ .../airbyte/cdk/discover/StringFieldType.kt | 15 ++ .../cdk/discover/TestAirbyteStreamFactory.kt | 48 ++++ .../metadata-column-query-fails.json | 0 .../resources/discover}/metadata-empty.json | 0 .../resources/discover/metadata-valid.json | 25 +++ .../bulk/toolkits/extract-jdbc/build.gradle | 2 + .../cdk/discover/JdbcMetadataQuerierTest.kt | 14 +- .../io/airbyte/cdk/h2/H2TestFixtureTest.kt | 0 .../cdk/h2source/H2SourceIntegrationTest.kt} | 38 ++-- .../io/airbyte/cdk/jdbc/JdbcAccessorTest.kt | 0 .../cdk/jdbc/JdbcConnectionFactoryTest.kt | 16 +- .../airbyte/cdk/read/JdbcSelectQuerierTest.kt | 12 +- .../test/resources/h2source}/cdc-catalog.json | 15 +- .../resources/h2source}/cursor-catalog.json | 15 +- .../h2source}/expected-cdc-catalog.json | 0 .../h2source}/expected-cursor-catalog.json | 0 .../expected-messages-global-cold-start.json | 0 .../expected-messages-stream-cold-start.json | 0 .../expected-messages-stream-warm-start.json | 0 .../kotlin/io/airbyte/cdk/h2/H2TestFixture.kt | 0 .../io/airbyte/cdk/h2source/H2Source.kt} | 9 +- .../cdk/h2source/H2SourceConfiguration.kt | 61 +++++ .../H2SourceConfigurationJsonObject.kt | 162 +++++++++++++ .../cdk/h2source/H2SourceOperations.kt} | 8 +- .../resources/h2source/expected-spec.json | 212 ++++++++++++++++++ 
.../src/testFixtures/resources/metadata.yaml | 6 + 46 files changed, 681 insertions(+), 171 deletions(-) rename airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/{fakesource/FakeSourceCheckTest.kt => check/CheckTest.kt} (85%) rename airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/{fakesource/FakeSourceConfigurationTest.kt => command/ConfigurationFactoryTest.kt} (87%) create mode 100644 airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt rename airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/{fakesource/FakeSourceDiscoverTest.kt => discover/DiscoverTest.kt} (94%) rename airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/{fakesource/FakeSourceSpecTest.kt => spec/SpecTest.kt} (94%) rename airbyte-cdk/bulk/core/extract/src/test/resources/{command => fakesource}/expected-schema.json (100%) delete mode 100644 airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json delete mode 100644 airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json create mode 100644 airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt create mode 100644 airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt create mode 100644 airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt create mode 100644 airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt rename airbyte-cdk/bulk/core/extract/src/{test/resources/fakesource => testFixtures/resources/discover}/metadata-column-query-fails.json (100%) rename airbyte-cdk/bulk/core/extract/src/{test/resources/fakesource => testFixtures/resources/discover}/metadata-empty.json (100%) create mode 100644 airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json rename airbyte-cdk/bulk/{core/extract => toolkits/extract-jdbc}/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt (85%) rename airbyte-cdk/bulk/{core/base => toolkits/extract-jdbc}/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt (100%) rename airbyte-cdk/bulk/{core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt => toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt} (81%) rename airbyte-cdk/bulk/{core/extract => toolkits/extract-jdbc}/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt (100%) rename airbyte-cdk/bulk/{core/extract => toolkits/extract-jdbc}/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt (73%) rename airbyte-cdk/bulk/{core/extract => toolkits/extract-jdbc}/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt (86%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/read => toolkits/extract-jdbc/src/test/resources/h2source}/cdc-catalog.json (95%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/read => toolkits/extract-jdbc/src/test/resources/h2source}/cursor-catalog.json (95%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/fakesource => toolkits/extract-jdbc/src/test/resources/h2source}/expected-cdc-catalog.json (100%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/fakesource => toolkits/extract-jdbc/src/test/resources/h2source}/expected-cursor-catalog.json (100%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/fakesource => toolkits/extract-jdbc/src/test/resources/h2source}/expected-messages-global-cold-start.json (100%) 
rename airbyte-cdk/bulk/{core/extract/src/test/resources/fakesource => toolkits/extract-jdbc/src/test/resources/h2source}/expected-messages-stream-cold-start.json (100%) rename airbyte-cdk/bulk/{core/extract/src/test/resources/fakesource => toolkits/extract-jdbc/src/test/resources/h2source}/expected-messages-stream-warm-start.json (100%) rename airbyte-cdk/bulk/{core/base => toolkits/extract-jdbc}/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt (100%) rename airbyte-cdk/bulk/{core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt => toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt} (64%) create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt rename airbyte-cdk/bulk/{core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt => toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt} (96%) create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml diff --git a/airbyte-cdk/bulk/core/base/build.gradle b/airbyte-cdk/bulk/core/base/build.gradle index f60f0a3347626..1431f08c225fc 100644 --- a/airbyte-cdk/bulk/core/base/build.gradle +++ b/airbyte-cdk/bulk/core/base/build.gradle @@ -47,6 +47,5 @@ dependencies { } testFixturesApi 'io.micronaut.test:micronaut-test-core:4.5.0' testFixturesApi 'io.micronaut.test:micronaut-test-junit5:4.5.0' - testFixturesApi 'com.h2database:h2:2.2.224' testFixturesApi 'io.github.deblockt:json-diff:1.0.1' } diff --git a/airbyte-cdk/bulk/core/extract/build.gradle b/airbyte-cdk/bulk/core/extract/build.gradle index d0b76d9b15dba..bc631cc3e37b5 100644 --- a/airbyte-cdk/bulk/core/extract/build.gradle +++ b/airbyte-cdk/bulk/core/extract/build.gradle @@ -4,6 +4,4 @@ dependencies { implementation 'hu.webarticum:tree-printer:3.2.1' testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')) - - testImplementation project(':airbyte-cdk:bulk:toolkits:bulk-cdk-toolkit-extract-jdbc') } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt similarity index 85% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt index 388bc805ee6ff..f160a07453a55 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceCheckTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/check/CheckTest.kt @@ -1,8 +1,8 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.check import io.airbyte.cdk.Operation -import io.airbyte.cdk.check.CheckOperation +import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject import io.airbyte.cdk.output.BufferingOutputConsumer import io.airbyte.protocol.models.v0.AirbyteConnectionStatus import io.micronaut.context.annotation.Property @@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "check") -class FakeSourceCheckTest { +class CheckTest { @Inject lateinit var checkOperation: CheckOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer @@ -22,6 +22,7 @@ class FakeSourceCheckTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.port", value = "-1") @Property(name = "airbyte.connector.config.database", value = "testdb") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testConfigBadPort() { assertFailed(" must have a minimum value of 0".toRegex()) } @@ -29,7 +30,7 @@ class FakeSourceCheckTest { @Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testSuccess() { assertSucceeded() } @@ -37,7 +38,7 @@ class FakeSourceCheckTest { @Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-empty.json") + @Property(name = "metadata.resource", value = "discover/metadata-empty.json") fun testBadSchema() { assertFailed("Discovered zero tables".toRegex()) } @@ -45,7 +46,7 @@ class FakeSourceCheckTest { @Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") - @Property(name = "metadata.resource", value = "fakesource/metadata-column-query-fails.json") + @Property(name = "metadata.resource", value = "discover/metadata-column-query-fails.json") fun testBadTables() { assertFailed("Unable to query any of the [0-9]+ discovered table".toRegex()) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt similarity index 87% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt index 53f1a8e460287..e01a55f1bcb9b 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationFactoryTest.kt @@ -1,7 +1,8 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.command -import io.airbyte.cdk.command.SourceConfiguration +import io.airbyte.cdk.fakesource.FakeSourceConfiguration +import io.airbyte.cdk.fakesource.UserDefinedCursor import io.airbyte.cdk.ssh.SshConnectionOptions import io.airbyte.cdk.ssh.SshPasswordAuthTunnelMethod import io.micronaut.context.annotation.Property @@ -13,7 +14,7 @@ import org.junit.jupiter.api.Assertions import org.junit.jupiter.api.Test @MicronautTest(rebuildContext = true) -class FakeSourceConfigurationTest { +class ConfigurationFactoryTest { @Inject lateinit var actual: SourceConfiguration @Test @@ -39,10 +40,7 @@ class FakeSourceConfigurationTest { sshTunnel = SshPasswordAuthTunnelMethod("localhost", 22, "sshuser", "secret"), sshConnectionOptions = SshConnectionOptions(1_000.milliseconds, 2_000.milliseconds, Duration.ZERO), - jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:testdb", - schemas = setOf("PUBLIC", "TESTSCHEMA"), cursor = UserDefinedCursor, - resumablePreferred = true, maxConcurrency = 1, checkpointTargetInterval = java.time.Duration.ofDays(100L), ) diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt index c0972dd4b05b0..9cc3733ac0c5b 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/ConfigurationJsonObjectSupplierTest.kt @@ -21,7 +21,7 @@ class ConfigurationJsonObjectSupplierTest { @Test fun testSchema() { Assertions.assertEquals(FakeSourceConfigurationJsonObject::class.java, supplier.javaClass) - val expected: String = ResourceUtils.readResource("command/expected-schema.json") + val expected: String = ResourceUtils.readResource("fakesource/expected-schema.json") Assertions.assertEquals(Jsons.readTree(expected), supplier.jsonSchema) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt new file mode 100644 index 0000000000000..a12e2fc3d3f22 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/command/SyncsTestFixtureTest.kt @@ -0,0 +1,12 @@ +/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ +package io.airbyte.cdk.command + +import org.junit.jupiter.api.Test + +class SyncsTestFixtureTest { + + @Test + fun testSpec() { + SyncsTestFixture.testSpec("fakesource/expected-spec.json") + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt similarity index 94% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt index f82b98af7a1c7..8c7426fe9373d 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceDiscoverTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/DiscoverTest.kt @@ -1,8 +1,7 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.discover import io.airbyte.cdk.Operation -import io.airbyte.cdk.discover.DiscoverOperation import io.airbyte.cdk.output.BufferingOutputConsumer import io.airbyte.cdk.util.Jsons import io.airbyte.protocol.models.v0.AirbyteCatalog @@ -16,7 +15,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "discover") -class FakeSourceDiscoverTest { +class DiscoverTest { @Inject lateinit var discoverOperation: DiscoverOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer @@ -25,7 +24,7 @@ class FakeSourceDiscoverTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor", value = "user_defined") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testCursorBasedIncremental() { val events = AirbyteStream() @@ -54,7 +53,7 @@ class FakeSourceDiscoverTest { @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor", value = "cdc") - @Property(name = "metadata.resource", value = "fakesource/metadata-valid.json") + @Property(name = "metadata.resource", value = "discover/metadata-valid.json") fun testCdcIncremental() { val events = AirbyteStream() diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt index 8e2ce24022c4f..215850a945ce5 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfiguration.kt @@ -1,7 +1,6 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.fakesource -import io.airbyte.cdk.command.JdbcSourceConfiguration import io.airbyte.cdk.command.SourceConfiguration import io.airbyte.cdk.command.SourceConfigurationFactory import io.airbyte.cdk.ssh.SshConnectionOptions @@ -13,27 +12,23 @@ import io.micronaut.context.env.Environment import jakarta.inject.Singleton import java.time.Duration -/** [SourceConfiguration] implementation for [FakeSource]. */ +/** [SourceConfiguration] implementation for a fake source. */ data class FakeSourceConfiguration( override val realHost: String, override val realPort: Int, override val sshTunnel: SshTunnelMethodConfiguration, override val sshConnectionOptions: SshConnectionOptions, - override val jdbcUrlFmt: String, - override val schemas: Set, val cursor: CursorConfiguration, - val resumablePreferred: Boolean, override val maxConcurrency: Int, override val checkpointTargetInterval: Duration, -) : JdbcSourceConfiguration { +) : SourceConfiguration { override val global: Boolean = cursor is CdcCursor - override val jdbcProperties: Map = mapOf() override val resourceAcquisitionHeartbeat: Duration get() = Duration.ofMillis(10) } -/** [SourceConfigurationFactory] implementation for [FakeSource]. */ +/** [SourceConfigurationFactory] implementation for a fake source. 
*/ @Singleton @Requires(env = [Environment.TEST]) @Secondary @@ -49,10 +44,7 @@ class FakeSourceConfigurationFactory : realPort = pojo.port, sshTunnel = pojo.getTunnelMethodValue() ?: SshNoTunnelMethod, sshConnectionOptions = sshConnectionOptions, - jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:${pojo.database}", - schemas = pojo.schemas?.takeUnless { it.isEmpty() }?.toSet() ?: setOf("PUBLIC"), cursor = pojo.getCursorConfigurationValue() ?: UserDefinedCursor, - resumablePreferred = pojo.resumablePreferred != false, maxConcurrency = 1, checkpointTargetInterval = Duration.parse(pojo.timeout).takeIf { it.isPositive } ?: Duration.ofDays(100L), diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt index 6a799c0cae274..aa8e85d444103 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceConfigurationJsonObject.kt @@ -26,7 +26,7 @@ import io.micronaut.context.annotation.ConfigurationProperties import io.micronaut.context.annotation.Secondary import jakarta.inject.Singleton -/** [ConfigurationJsonObjectBase] implementation for [FakeSource]. */ +/** [ConfigurationJsonObjectBase] implementation for a fake source. */ @JsonSchemaTitle("Test Source Spec") @JsonPropertyOrder( value = diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt index c6e9f292f45e1..c66bc180bb169 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerGlobalStatesTest.kt @@ -20,7 +20,7 @@ import org.junit.jupiter.api.Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor.cursor_method", value = "cdc") -@Property(name = "metadata.resource", value = "read/metadata.json") +@Property(name = "metadata.resource", value = "discover/metadata-valid.json") class StateManagerGlobalStatesTest { @Inject lateinit var config: SourceConfiguration @@ -37,7 +37,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = @@ -55,7 +55,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property(name = "airbyte.connector.state.json", value = "[]") fun testColdStart() { val streams: Streams = prelude() @@ -98,7 +98,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = "airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = @@ -147,7 +147,7 @@ class StateManagerGlobalStatesTest { } @Test - @Property(name = 
"airbyte.connector.catalog.resource", value = "read/cdc-catalog.json") + @Property(name = "airbyte.connector.catalog.resource", value = "fakesource/cdc-catalog.json") @Property( name = "airbyte.connector.state.json", value = diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt index 4774fefdbb759..8225a9b55f2fe 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/StateManagerStreamStatesTest.kt @@ -22,7 +22,7 @@ import org.junit.jupiter.api.Test @Property(name = "airbyte.connector.config.host", value = "localhost") @Property(name = "airbyte.connector.config.database", value = "testdb") @Property(name = "airbyte.connector.config.cursor.cursor_method", value = "user_defined") -@Property(name = "metadata.resource", value = "read/metadata.json") +@Property(name = "metadata.resource", value = "discover/metadata-valid.json") class StateManagerStreamStatesTest { @Inject lateinit var config: SourceConfiguration diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt similarity index 94% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt rename to airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt index c725179edca93..1538cadcf420a 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceSpecTest.kt +++ b/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/spec/SpecTest.kt @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.spec import com.deblock.jsondiff.DiffGenerator import com.deblock.jsondiff.diff.JsonDiff @@ -11,7 +11,6 @@ import com.deblock.jsondiff.matcher.StrictPrimitivePartialMatcher import com.deblock.jsondiff.viewer.OnlyErrorDiffViewer import io.airbyte.cdk.Operation import io.airbyte.cdk.output.BufferingOutputConsumer -import io.airbyte.cdk.spec.SpecOperation import io.airbyte.cdk.util.Jsons import io.airbyte.cdk.util.ResourceUtils import io.micronaut.context.annotation.Property @@ -23,7 +22,7 @@ import org.junit.jupiter.api.Test @MicronautTest(environments = ["source"], rebuildContext = true) @Property(name = Operation.PROPERTY, value = "spec") @Property(name = "airbyte.connector.metadata.documentation-url", value = "https://docs.airbyte.com") -class FakeSourceSpecTest { +class SpecTest { @Inject lateinit var specOperation: SpecOperation @Inject lateinit var outputConsumer: BufferingOutputConsumer diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json index 20c8b3b07b22e..08eec4fcc91c1 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cdc-catalog.json @@ -6,17 +6,16 @@ "json_schema": { "type": "object", "properties": { + "MSG": { + "type": "string" + }, "ID": { - "type": "string", - "contentEncoding": "base64" + "type": "string" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" - }, - "MSG": { - "type": "string" } } }, @@ -38,12 +37,12 @@ "json_schema": { "type": "object", "properties": { + "V": { + "type": "string" + }, "K": { "type": "number", "airbyte_type": "integer" - }, - "V": { - "type": "string" } } }, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json index 3520f52b260b4..f1b4850c1fe1f 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/cursor-catalog.json @@ -6,17 +6,16 @@ "json_schema": { "type": "object", "properties": { + "MSG": { + "type": "string" + }, "ID": { - "type": "string", - "contentEncoding": "base64" + "type": "string" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" - }, - "MSG": { - "type": "string" } } }, @@ -37,12 +36,12 @@ "json_schema": { "type": "object", "properties": { + "V": { + "type": "string" + }, "K": { "type": "number", "airbyte_type": "integer" - }, - "V": { - "type": "string" } } }, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/command/expected-schema.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-schema.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/command/expected-schema.json rename to airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-schema.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json b/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json deleted file mode 100644 index 332020e65a5be..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-valid.json +++ /dev/null @@ -1,25 +0,0 @@ -[ - { - "name": "EVENTS", - "namespace": "PUBLIC", - "metadata": { 
- "columns": { - "ID": "io.airbyte.cdk.jdbc.StringFieldType", - "TS": "io.airbyte.cdk.jdbc.OffsetDateTimeFieldType", - "MSG": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["ID"]] - } - }, - { - "name": "KV", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "K": "io.airbyte.cdk.jdbc.IntFieldType", - "V": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["K"]] - } - } -] diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml b/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml index 3dfe5d2711421..aff5a4b3c71cc 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml +++ b/airbyte-cdk/bulk/core/extract/src/test/resources/metadata.yaml @@ -2,5 +2,3 @@ data: dockerRepository: "airbyte/fake-source" documentationUrl: "https://docs.airbyte.com" - jdbc: - mode: sequential diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json b/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json deleted file mode 100644 index 332020e65a5be..0000000000000 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/metadata.json +++ /dev/null @@ -1,25 +0,0 @@ -[ - { - "name": "EVENTS", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "ID": "io.airbyte.cdk.jdbc.StringFieldType", - "TS": "io.airbyte.cdk.jdbc.OffsetDateTimeFieldType", - "MSG": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["ID"]] - } - }, - { - "name": "KV", - "namespace": "PUBLIC", - "metadata": { - "columns": { - "K": "io.airbyte.cdk.jdbc.IntFieldType", - "V": "io.airbyte.cdk.jdbc.StringFieldType" - }, - "primaryKeys": [["K"]] - } - } -] diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt new file mode 100644 index 0000000000000..821082136bd26 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/IntFieldType.kt @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.IntCodec +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType + +data object IntFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.INTEGER + override val jsonEncoder: JsonEncoder<*> = IntCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt new file mode 100644 index 0000000000000..ecf3c2035395b --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/OffsetDateTimeFieldType.kt @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType +import io.airbyte.cdk.data.OffsetDateTimeCodec + +data object OffsetDateTimeFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.TIMESTAMP_WITH_TIMEZONE + override val jsonEncoder: JsonEncoder<*> = OffsetDateTimeCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt new file mode 100644 index 0000000000000..8f1d386cf2ef2 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/StringFieldType.kt @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import io.airbyte.cdk.data.AirbyteType +import io.airbyte.cdk.data.JsonEncoder +import io.airbyte.cdk.data.LeafAirbyteType +import io.airbyte.cdk.data.TextCodec + +data object StringFieldType : FieldType { + override val airbyteType: AirbyteType = LeafAirbyteType.STRING + override val jsonEncoder: JsonEncoder<*> = TextCodec +} diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt new file mode 100644 index 0000000000000..5b84e88c2faa9 --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/discover/TestAirbyteStreamFactory.kt @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.cdk.discover + +import com.fasterxml.jackson.databind.node.ObjectNode +import io.airbyte.protocol.models.v0.AirbyteStream +import io.airbyte.protocol.models.v0.SyncMode +import io.micronaut.context.annotation.Requires +import io.micronaut.context.env.Environment +import jakarta.inject.Singleton + +@Singleton +@Requires(env = [Environment.TEST]) +@Requires(notEnv = [Environment.CLI]) +class TestAirbyteStreamFactory : AirbyteStreamFactory { + + override fun createGlobal(discoveredStream: DiscoveredStream): AirbyteStream = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + (jsonSchema["properties"] as ObjectNode).apply { + for (metaField in CommonMetaField.entries) { + set(metaField.id, metaField.type.airbyteType.asJsonSchema()) + } + } + defaultCursorField = listOf(CommonMetaField.CDC_LSN.id) + sourceDefinedCursor = true + if (discoveredStream.primaryKeyColumnIDs.isNotEmpty()) { + sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs + isResumable = true + } else { + isResumable = false + } + } + + override fun createNonGlobal(discoveredStream: DiscoveredStream): AirbyteStream = + AirbyteStreamFactory.createAirbyteStream(discoveredStream).apply { + supportedSyncModes = listOf(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL) + sourceDefinedCursor = false + if (discoveredStream.primaryKeyColumnIDs.isNotEmpty()) { + sourceDefinedPrimaryKey = discoveredStream.primaryKeyColumnIDs + isResumable = true + } else { + isResumable = false + } + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-column-query-fails.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-column-query-fails.json similarity index 
100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-column-query-fails.json rename to airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-column-query-fails.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-empty.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-empty.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/metadata-empty.json rename to airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-empty.json diff --git a/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json new file mode 100644 index 0000000000000..2c1da810db3af --- /dev/null +++ b/airbyte-cdk/bulk/core/extract/src/testFixtures/resources/discover/metadata-valid.json @@ -0,0 +1,25 @@ +[ + { + "name": "EVENTS", + "namespace": "PUBLIC", + "metadata": { + "columns": { + "ID": "io.airbyte.cdk.discover.StringFieldType", + "TS": "io.airbyte.cdk.discover.OffsetDateTimeFieldType", + "MSG": "io.airbyte.cdk.discover.StringFieldType" + }, + "primaryKeys": [["ID"]] + } + }, + { + "name": "KV", + "namespace": "PUBLIC", + "metadata": { + "columns": { + "K": "io.airbyte.cdk.discover.IntFieldType", + "V": "io.airbyte.cdk.discover.StringFieldType" + }, + "primaryKeys": [["K"]] + } + } +] diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle index df08dd0c8b84e..ec56ca6ca6471 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/build.gradle @@ -4,4 +4,6 @@ dependencies { testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')) testFixturesApi testFixtures(project(':airbyte-cdk:bulk:core:bulk-cdk-core-extract')) + testFixturesImplementation 'com.h2database:h2:2.2.224' + testFixturesImplementation 'org.apache.commons:commons-lang3:3.14.0' } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt similarity index 85% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt index f8355036e9974..a2c56900e2ab7 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/discover/JdbcMetadataQuerierTest.kt @@ -1,11 +1,11 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.cdk.discover -import io.airbyte.cdk.fakesource.FakeSourceConfiguration -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject -import io.airbyte.cdk.fakesource.FakeSourceOperations import io.airbyte.cdk.h2.H2TestFixture +import io.airbyte.cdk.h2source.H2SourceConfiguration +import io.airbyte.cdk.h2source.H2SourceConfigurationFactory +import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject +import io.airbyte.cdk.h2source.H2SourceOperations import java.sql.JDBCType import org.junit.jupiter.api.Assertions import org.junit.jupiter.api.Test @@ -17,16 +17,16 @@ class JdbcMetadataQuerierTest { h2.execute("CREATE TABLE kv (k INT PRIMARY KEY, v VARCHAR(60))") } - val factory = JdbcMetadataQuerier.Factory(FakeSourceOperations(), FakeSourceOperations()) + val factory = JdbcMetadataQuerier.Factory(H2SourceOperations(), H2SourceOperations()) @Test fun test() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo) + val config: H2SourceConfiguration = H2SourceConfigurationFactory().make(configPojo) factory.session(config).use { mdq: MetadataQuerier -> Assertions.assertEquals(listOf("PUBLIC"), mdq.streamNamespaces()) Assertions.assertEquals(listOf("KV"), mdq.streamNames("PUBLIC")) diff --git a/airbyte-cdk/bulk/core/base/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt similarity index 100% rename from airbyte-cdk/bulk/core/base/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2/H2TestFixtureTest.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt similarity index 81% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt index 28fd695763fec..271723e628c2c 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceIntegrationTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
*/ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.h2source import io.airbyte.cdk.command.SyncsTestFixture import io.airbyte.cdk.h2.H2TestFixture @@ -10,16 +10,16 @@ import java.sql.Statement import org.junit.jupiter.api.Test import org.testcontainers.Testcontainers -class FakeSourceIntegrationTest { +class H2SourceIntegrationTest { @Test fun testSpec() { - SyncsTestFixture.testSpec("fakesource/expected-spec.json") + SyncsTestFixture.testSpec("h2source/expected-spec.json") } @Test fun testCheckFailBadConfig() { SyncsTestFixture.testCheck( - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = -1 database = "" }, @@ -31,7 +31,7 @@ class FakeSourceIntegrationTest { fun testCheckFailNoDatabase() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database + "_garbage" } @@ -43,7 +43,7 @@ class FakeSourceIntegrationTest { fun testCheckFailNoTables() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } @@ -56,7 +56,7 @@ class FakeSourceIntegrationTest { H2TestFixture().use { h2: H2TestFixture -> h2.createConnection().use(Companion::prelude) val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } @@ -71,7 +71,7 @@ class FakeSourceIntegrationTest { Testcontainers.exposeHostPorts(h2.port) SshBastionContainer(tunnelingToHostPort = h2.port).use { ssh: SshBastionContainer -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port @@ -90,11 +90,11 @@ class FakeSourceIntegrationTest { H2TestFixture().use { h2: H2TestFixture -> h2.createConnection().use(Companion::prelude) val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - SyncsTestFixture.testDiscover(configPojo, "fakesource/expected-cursor-catalog.json") + SyncsTestFixture.testDiscover(configPojo, "h2source/expected-cursor-catalog.json") } } @@ -102,7 +102,7 @@ class FakeSourceIntegrationTest { fun testReadGlobal() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database setCursorMethodValue(CdcCursor) @@ -112,10 +112,10 @@ class FakeSourceIntegrationTest { configPojo, h2::createConnection, Companion::prelude, - "fakesource/expected-cdc-catalog.json", - "fakesource/cdc-catalog.json", + "h2source/expected-cdc-catalog.json", + "h2source/cdc-catalog.json", SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-global-cold-start.json", + "h2source/expected-messages-global-cold-start.json", ), ) } @@ -125,7 +125,7 @@ class FakeSourceIntegrationTest { fun testReadStreams() { H2TestFixture().use { h2: H2TestFixture -> val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database resumablePreferred = true @@ -134,13 +134,13 @@ class FakeSourceIntegrationTest { configPojo, h2::createConnection, Companion::prelude, - "fakesource/expected-cursor-catalog.json", - 
"fakesource/cursor-catalog.json", + "h2source/expected-cursor-catalog.json", + "h2source/cursor-catalog.json", SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-stream-cold-start.json", + "h2source/expected-messages-stream-cold-start.json", ), SyncsTestFixture.AfterRead.Companion.fromExpectedMessages( - "fakesource/expected-messages-stream-warm-start.json", + "h2source/expected-messages-stream-warm-start.json", ), ) } diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcAccessorTest.kt diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt similarity index 73% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt index e7a9b765ea7eb..336e8cf284b05 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/jdbc/JdbcConnectionFactoryTest.kt @@ -1,9 +1,9 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ package io.airbyte.cdk.jdbc -import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory -import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject import io.airbyte.cdk.h2.H2TestFixture +import io.airbyte.cdk.h2source.H2SourceConfigurationFactory +import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject import io.airbyte.cdk.ssh.SshBastionContainer import io.airbyte.cdk.testcontainers.DOCKER_HOST_FROM_WITHIN_CONTAINER import org.junit.jupiter.api.Assertions @@ -22,37 +22,37 @@ class JdbcConnectionFactoryTest { @Test fun testVanilla() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { port = h2.port database = h2.database } - val factory = JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo)) + val factory = JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo)) Assertions.assertEquals("H2", factory.get().metaData.databaseProductName) } @Test fun testSshKeyAuth() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port database = h2.database setTunnelMethodValue(sshBastion.outerKeyAuthTunnelMethod) } - val factory = JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo)) + val factory = JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo)) Assertions.assertEquals("H2", factory.get().metaData.databaseProductName) } @Test fun testSshPasswordAuth() { val configPojo = - FakeSourceConfigurationJsonObject().apply { + H2SourceConfigurationJsonObject().apply { host = DOCKER_HOST_FROM_WITHIN_CONTAINER // required only because of container port = h2.port database = h2.database setTunnelMethodValue(sshBastion.outerPasswordAuthTunnelMethod) } - val factory = 
JdbcConnectionFactory(FakeSourceConfigurationFactory().make(configPojo))
+        val factory = JdbcConnectionFactory(H2SourceConfigurationFactory().make(configPojo))
         Assertions.assertEquals("H2", factory.get().metaData.databaseProductName)
     }
 }
diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt
similarity index 86%
rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt
rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt
index 869b2e35a41a9..c0136171b940f 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcSelectQuerierTest.kt
@@ -3,10 +3,10 @@
 package io.airbyte.cdk.read
 
 import com.fasterxml.jackson.databind.node.ObjectNode
 import io.airbyte.cdk.discover.Field
-import io.airbyte.cdk.fakesource.FakeSourceConfiguration
-import io.airbyte.cdk.fakesource.FakeSourceConfigurationFactory
-import io.airbyte.cdk.fakesource.FakeSourceConfigurationJsonObject
 import io.airbyte.cdk.h2.H2TestFixture
+import io.airbyte.cdk.h2source.H2SourceConfiguration
+import io.airbyte.cdk.h2source.H2SourceConfigurationFactory
+import io.airbyte.cdk.h2source.H2SourceConfigurationJsonObject
 import io.airbyte.cdk.jdbc.IntFieldType
 import io.airbyte.cdk.jdbc.JdbcConnectionFactory
 import io.airbyte.cdk.jdbc.StringFieldType
@@ -79,12 +79,12 @@ class JdbcSelectQuerierTest {
         q: SelectQuery,
         vararg expected: String,
     ) {
-        val configPojo: FakeSourceConfigurationJsonObject =
-            FakeSourceConfigurationJsonObject().apply {
+        val configPojo: H2SourceConfigurationJsonObject =
+            H2SourceConfigurationJsonObject().apply {
                 port = h2.port
                 database = h2.database
             }
-        val config: FakeSourceConfiguration = FakeSourceConfigurationFactory().make(configPojo)
+        val config: H2SourceConfiguration = H2SourceConfigurationFactory().make(configPojo)
         val querier: SelectQuerier = JdbcSelectQuerier(JdbcConnectionFactory(config))
         val actual: List<ObjectNode> = querier.executeQuery(q).use { it.asSequence().toList() }
         Assertions.assertIterableEquals(expected.toList().map(Jsons::readTree), actual)
diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json
similarity index 95%
rename from airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json
rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json
index 08eec4fcc91c1..20c8b3b07b22e 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cdc-catalog.json
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cdc-catalog.json
@@ -6,16 +6,17 @@
     "json_schema": {
       "type": "object",
       "properties": {
-        "MSG": {
-          "type": "string"
-        },
         "ID": {
-          "type": "string"
+          "type": "string",
+          "contentEncoding": "base64"
         },
         "TS": {
           "type": "string",
           "format": "date-time",
           "airbyte_type": "timestamp_with_timezone"
+        },
+        "MSG": {
+          "type": "string"
         }
       }
     },
@@ -37,12 +38,12 @@
     "json_schema": {
       "type": "object",
       "properties": {
-        "V": {
-          "type": "string"
-        },
         "K": {
           "type": "number",
           "airbyte_type": "integer"
+        },
+        "V": {
+          "type": "string"
         }
       }
     },
diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json
b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json similarity index 95% rename from airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json index 92d203e0fb205..8ea7c7ce9e03c 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/resources/read/cursor-catalog.json +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/cursor-catalog.json @@ -6,16 +6,17 @@ "json_schema": { "type": "object", "properties": { - "MSG": { - "type": "string" - }, "ID": { - "type": "string" + "type": "string", + "contentEncoding": "base64" }, "TS": { "type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone" + }, + "MSG": { + "type": "string" } } }, @@ -37,12 +38,12 @@ "json_schema": { "type": "object", "properties": { - "V": { - "type": "string" - }, "K": { "type": "number", "airbyte_type": "integer" + }, + "V": { + "type": "string" } } }, diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cdc-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cdc-catalog.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cdc-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cdc-catalog.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cursor-catalog.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-cursor-catalog.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-cursor-catalog.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-global-cold-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-global-cold-start.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-global-cold-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-global-cold-start.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-cold-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-cold-start.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-cold-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-cold-start.json diff --git a/airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json similarity index 100% rename from airbyte-cdk/bulk/core/extract/src/test/resources/fakesource/expected-messages-stream-warm-start.json rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json diff --git a/airbyte-cdk/bulk/core/base/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt similarity index 100% rename from 
airbyte-cdk/bulk/core/base/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt
rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2/H2TestFixture.kt
diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt
similarity index 64%
rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt
rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt
index fd5887caf2486..fac1b555a7927 100644
--- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSource.kt
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2Source.kt
@@ -1,10 +1,13 @@
-/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
-package io.airbyte.cdk.fakesource
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.h2source
 
 import io.airbyte.cdk.AirbyteSourceRunner
 
 /** A fake source database connector, vaguely compatible with the H2 database. */
-class FakeSource {
+class H2Source {
     fun main(args: Array<String>) {
         AirbyteSourceRunner.run(*args)
     }
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt
new file mode 100644
index 0000000000000..1fe026241887a
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfiguration.kt
@@ -0,0 +1,61 @@
+/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
+package io.airbyte.cdk.h2source
+
+import io.airbyte.cdk.command.JdbcSourceConfiguration
+import io.airbyte.cdk.command.SourceConfiguration
+import io.airbyte.cdk.command.SourceConfigurationFactory
+import io.airbyte.cdk.ssh.SshConnectionOptions
+import io.airbyte.cdk.ssh.SshNoTunnelMethod
+import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration
+import io.micronaut.context.annotation.Requires
+import io.micronaut.context.annotation.Secondary
+import io.micronaut.context.env.Environment
+import jakarta.inject.Singleton
+import java.time.Duration
+
+/** [SourceConfiguration] implementation for [H2Source]. */
+data class H2SourceConfiguration(
+    override val realHost: String,
+    override val realPort: Int,
+    override val sshTunnel: SshTunnelMethodConfiguration,
+    override val sshConnectionOptions: SshConnectionOptions,
+    override val jdbcUrlFmt: String,
+    override val schemas: Set<String>,
+    val cursor: CursorConfiguration,
+    val resumablePreferred: Boolean,
+    override val maxConcurrency: Int,
+    override val checkpointTargetInterval: Duration,
+) : JdbcSourceConfiguration {
+    override val global: Boolean = cursor is CdcCursor
+    override val jdbcProperties: Map<String, String> = mapOf()
+
+    override val resourceAcquisitionHeartbeat: Duration
+        get() = Duration.ofMillis(10)
+}
+
+/** [SourceConfigurationFactory] implementation for [H2Source]. */
+@Singleton
+@Requires(env = [Environment.TEST])
+@Secondary
+class H2SourceConfigurationFactory :
+    SourceConfigurationFactory<H2SourceConfigurationJsonObject, H2SourceConfiguration> {
+    override fun makeWithoutExceptionHandling(
+        pojo: H2SourceConfigurationJsonObject,
+    ): H2SourceConfiguration {
+        val sshConnectionOptions: SshConnectionOptions =
+            SshConnectionOptions.fromAdditionalProperties(pojo.getAdditionalProperties())
+        return H2SourceConfiguration(
+            realHost = pojo.host,
+            realPort = pojo.port,
+            sshTunnel = pojo.getTunnelMethodValue() ?: SshNoTunnelMethod,
+            sshConnectionOptions = sshConnectionOptions,
+            jdbcUrlFmt = "jdbc:h2:tcp://%s:%d/mem:${pojo.database}",
+            schemas = pojo.schemas?.takeUnless { it.isEmpty() }?.toSet() ?: setOf("PUBLIC"),
+            cursor = pojo.getCursorConfigurationValue() ?: UserDefinedCursor,
+            resumablePreferred = pojo.resumablePreferred != false,
+            maxConcurrency = 1,
+            checkpointTargetInterval = Duration.parse(pojo.timeout).takeIf { it.isPositive }
+                    ?: Duration.ofDays(100L),
+        )
+    }
+}
diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt
new file mode 100644
index 0000000000000..f5289e1fab7ae
--- /dev/null
+++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceConfigurationJsonObject.kt
@@ -0,0 +1,162 @@
+/* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */
+package io.airbyte.cdk.h2source
+
+import com.fasterxml.jackson.annotation.JsonAnyGetter
+import com.fasterxml.jackson.annotation.JsonAnySetter
+import com.fasterxml.jackson.annotation.JsonGetter
+import com.fasterxml.jackson.annotation.JsonIgnore
+import com.fasterxml.jackson.annotation.JsonProperty
+import com.fasterxml.jackson.annotation.JsonPropertyDescription
+import com.fasterxml.jackson.annotation.JsonPropertyOrder
+import com.fasterxml.jackson.annotation.JsonSetter
+import com.fasterxml.jackson.annotation.JsonSubTypes
+import com.fasterxml.jackson.annotation.JsonTypeInfo
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaArrayWithUniqueItems
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaDefault
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaDescription
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaInject
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings
+import io.airbyte.cdk.ConfigErrorException
+import io.airbyte.cdk.command.CONNECTOR_CONFIG_PREFIX
+import io.airbyte.cdk.command.ConfigurationJsonObjectBase
+import io.airbyte.cdk.ssh.MicronautPropertiesFriendlySshTunnelMethodConfigurationJsonObject
+import io.airbyte.cdk.ssh.SshTunnelMethodConfiguration
+import io.micronaut.context.annotation.ConfigurationBuilder
+import io.micronaut.context.annotation.ConfigurationProperties
+import io.micronaut.context.annotation.Secondary
+import jakarta.inject.Singleton
+
+/** [ConfigurationJsonObjectBase] implementation for [H2Source]. */
+@JsonSchemaTitle("Test Source Spec")
+@JsonPropertyOrder(
+    value =
+        [
+            "host",
+            "port",
+            "database",
+            "schemas",
+            "tunnel_method",
+            "cursor",
+        ],
+)
+@Singleton
+@Secondary
+@ConfigurationProperties(CONNECTOR_CONFIG_PREFIX)
+@SuppressFBWarnings(value = ["NP_NONNULL_RETURN_VIOLATION"], justification = "Micronaut DI")
+class H2SourceConfigurationJsonObject : ConfigurationJsonObjectBase() {
+    @JsonProperty("host")
+    @JsonSchemaTitle("Host")
+    @JsonSchemaInject(json = """{"order":1}""")
+    @JsonSchemaDefault("localhost")
+    @JsonPropertyDescription("Hostname of the database.")
+    var host: String = "localhost"
+
+    @JsonProperty("port")
+    @JsonSchemaTitle("Port")
+    @JsonSchemaInject(json = """{"order":2,"minimum": 0,"maximum": 65536}""")
+    @JsonSchemaDefault("9092")
+    @JsonPropertyDescription("Port of the database.")
+    var port: Int = 9092
+
+    @JsonProperty("database")
+    @JsonSchemaTitle("Database")
+    @JsonPropertyDescription("Name of the database.")
+    @JsonSchemaInject(json = """{"order":3}""")
+    lateinit var database: String
+
+    @JsonProperty("schemas")
+    @JsonSchemaTitle("Schemas")
+    @JsonSchemaArrayWithUniqueItems("schemas")
+    @JsonPropertyDescription("The list of schemas to sync from. Defaults to PUBLIC.")
+    @JsonSchemaInject(json = """{"order":4,"minItems":1,"uniqueItems":true}""")
+    var schemas: List<String>? = null
+
+    @JsonIgnore
+    @ConfigurationBuilder(configurationPrefix = "tunnel_method")
+    val tunnelMethod = MicronautPropertiesFriendlySshTunnelMethodConfigurationJsonObject()
+
+    @JsonIgnore var tunnelMethodJson: SshTunnelMethodConfiguration? = null
+
+    @JsonSetter("tunnel_method")
+    fun setTunnelMethodValue(value: SshTunnelMethodConfiguration?) {
+        tunnelMethodJson = value
+    }
+
+    @JsonGetter("tunnel_method")
+    @JsonSchemaTitle("SSH Tunnel Method")
+    @JsonPropertyDescription(
+        "Whether to initiate an SSH tunnel before connecting to the database," +
+            " and if so, which kind of authentication to use.",
+    )
+    @JsonSchemaInject(json = """{"order":5}""")
+    fun getTunnelMethodValue(): SshTunnelMethodConfiguration? =
+        tunnelMethodJson ?: tunnelMethod.asSshTunnelMethod()
+
+    @JsonIgnore
+    @ConfigurationBuilder(configurationPrefix = "cursor")
+    val cursor = MicronautPropertiesFriendlyCursorConfiguration()
+
+    @JsonIgnore var cursorJson: CursorConfiguration? = null
+
+    @JsonSetter("cursor")
+    fun setCursorMethodValue(value: CursorConfiguration?) {
+        cursorJson = value
+    }
+
+    @JsonGetter("cursor")
+    @JsonSchemaTitle("Update Method")
+    @JsonPropertyDescription("Configures how data is extracted from the database.")
+    @JsonSchemaInject(json = """{"order":6,"display_type":"radio"}""")
+    fun getCursorConfigurationValue(): CursorConfiguration? =
+        cursorJson ?: cursor.asCursorConfiguration()
+
+    @JsonProperty("resumable_preferred")
+    @JsonSchemaDefault("true")
+    @JsonSchemaInject(json = """{"order":7,"display_type":"check"}""")
+    var resumablePreferred: Boolean? = true
+
+    @JsonProperty("timeout")
+    @JsonSchemaDefault("PT0S")
+    @JsonSchemaInject(json = """{"order":8}""")
+    var timeout: String? = "PT0S"
= "PT0S" + + @JsonIgnore var additionalPropertiesMap = mutableMapOf() + + @JsonAnyGetter fun getAdditionalProperties(): Map = additionalPropertiesMap + + @JsonAnySetter + fun setAdditionalProperty( + name: String, + value: Any, + ) { + additionalPropertiesMap[name] = value + } +} + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "cursor_method") +@JsonSubTypes( + JsonSubTypes.Type(value = UserDefinedCursor::class, name = "user_defined"), + JsonSubTypes.Type(value = CdcCursor::class, name = "cdc"), +) +@JsonSchemaTitle("Update Method") +@JsonSchemaDescription("Configures how data is extracted from the database.") +sealed interface CursorConfiguration + +@JsonSchemaTitle("Scan Changes with User Defined Cursor") +data object UserDefinedCursor : CursorConfiguration + +@JsonSchemaTitle("Read Changes using Change Data Capture (CDC)") +data object CdcCursor : CursorConfiguration + +@ConfigurationProperties("$CONNECTOR_CONFIG_PREFIX.cursor") +class MicronautPropertiesFriendlyCursorConfiguration { + var cursorMethod: String = "user_defined" + + fun asCursorConfiguration(): CursorConfiguration = + when (cursorMethod) { + "user_defined" -> UserDefinedCursor + "cdc" -> CdcCursor + else -> throw ConfigErrorException("invalid value $cursorMethod") + } +} diff --git a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt similarity index 96% rename from airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt rename to airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt index 7c8af4fad02b4..b561dcef6ee1d 100644 --- a/airbyte-cdk/bulk/core/extract/src/test/kotlin/io/airbyte/cdk/fakesource/FakeSourceOperations.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/kotlin/io/airbyte/cdk/h2source/H2SourceOperations.kt @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Airbyte, Inc., all rights reserved. */ -package io.airbyte.cdk.fakesource +package io.airbyte.cdk.h2source import io.airbyte.cdk.discover.FieldType import io.airbyte.cdk.discover.JdbcMetadataQuerier @@ -62,11 +62,11 @@ import io.micronaut.context.env.Environment import jakarta.inject.Singleton import java.sql.JDBCType -/** Stateless connector-specific logic for [FakeSource]. */ +/** Stateless connector-specific logic for [H2Source]. 
*/ @Singleton @Requires(env = [Environment.TEST]) @Secondary -class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGenerator { +class H2SourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGenerator { override fun toFieldType(c: JdbcMetadataQuerier.ColumnMetadata): FieldType = when (c.type.jdbcType) { JDBCType.BIT, @@ -181,7 +181,7 @@ class FakeSourceOperations : JdbcMetadataQuerier.FieldTypeMapper, SelectQueryGen is Or -> disj.flatMap { it.bindings() } is WhereClauseLeafNode -> { val type = column.type as LosslessJdbcFieldType<*, *> - listOf(io.airbyte.cdk.read.SelectQuery.Binding(bindingValue, type)) + listOf(SelectQuery.Binding(bindingValue, type)) } } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json new file mode 100644 index 0000000000000..1656ebc276c36 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/h2source/expected-spec.json @@ -0,0 +1,212 @@ +{ + "documentationUrl": "https://docs.airbyte.com", + "connectionSpecification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Source Spec", + "type": "object", + "additionalProperties": true, + "properties": { + "host": { + "type": "string", + "default": "localhost", + "description": "Hostname of the database.", + "title": "Host", + "order": 1 + }, + "port": { + "type": "integer", + "default": 9092, + "description": "Port of the database.", + "title": "Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "database": { + "type": "string", + "description": "Name of the database.", + "title": "Database", + "order": 3 + }, + "schemas": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The list of schemas to sync from. 
Defaults to PUBLIC.", + "title": "Schemas", + "order": 4, + "minItems": 1, + "uniqueItems": true + }, + "tunnel_method": { + "oneOf": [ + { + "title": "No Tunnel", + "type": "object", + "additionalProperties": true, + "description": "No ssh tunnel needed to connect to database", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["NO_TUNNEL"], + "default": "NO_TUNNEL" + } + }, + "required": ["tunnel_method"] + }, + { + "title": "SSH Key Authentication", + "type": "object", + "additionalProperties": true, + "description": "Connect through a jump server tunnel host using username and ssh key", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["SSH_KEY_AUTH"], + "default": "SSH_KEY_AUTH" + }, + "tunnel_host": { + "type": "string", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "title": "SSH Tunnel Jump Server Host", + "order": 1 + }, + "tunnel_port": { + "type": "integer", + "default": 22, + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "title": "SSH Connection Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "tunnel_user": { + "type": "string", + "description": "OS-level username for logging into the jump server host", + "title": "SSH Login Username", + "order": 3 + }, + "ssh_key": { + "type": "string", + "description": "OS-level user account ssh key credentials in RSA PEM format ( created with ssh-keygen -t rsa -m PEM -f myuser_rsa )", + "title": "SSH Private Key", + "order": 4, + "multiline": true, + "airbyte_secret": true + } + }, + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "ssh_key" + ] + }, + { + "title": "Password Authentication", + "type": "object", + "additionalProperties": true, + "description": "Connect through a jump server tunnel host using username and password authentication", + "properties": { + "tunnel_method": { + "type": "string", + "enum": ["SSH_PASSWORD_AUTH"], + "default": "SSH_PASSWORD_AUTH" + }, + "tunnel_host": { + "type": "string", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "title": "SSH Tunnel Jump Server Host", + "order": 1 + }, + "tunnel_port": { + "type": "integer", + "default": 22, + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "title": "SSH Connection Port", + "order": 2, + "minimum": 0, + "maximum": 65536 + }, + "tunnel_user": { + "type": "string", + "description": "OS-level username for logging into the jump server host", + "title": "SSH Login Username", + "order": 3 + }, + "tunnel_user_password": { + "type": "string", + "description": "OS-level password for logging into the jump server host", + "title": "Password", + "order": 4, + "airbyte_secret": true + } + }, + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "tunnel_user_password" + ] + } + ], + "description": "Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.", + "title": "SSH Tunnel Method", + "order": 5, + "type": "object" + }, + "cursor": { + "oneOf": [ + { + "title": "Scan Changes with User Defined Cursor", + "type": "object", + "additionalProperties": true, + "description": "Configures how data is extracted from the database.", + "properties": { + "cursor_method": { + "type": "string", + "enum": ["user_defined"], + "default": "user_defined" + } + }, + "required": ["cursor_method"] + }, + { + "title": "Read Changes using Change Data 
Capture (CDC)", + "type": "object", + "additionalProperties": true, + "description": "Configures how data is extracted from the database.", + "properties": { + "cursor_method": { + "type": "string", + "enum": ["cdc"], + "default": "cdc" + } + }, + "required": ["cursor_method"] + } + ], + "description": "Configures how data is extracted from the database.", + "title": "Update Method", + "order": 6, + "display_type": "radio", + "type": "object" + }, + "resumable_preferred": { + "type": "boolean", + "default": true, + "order": 7, + "display_type": "check" + }, + "timeout": { + "type": "string", + "default": "PT0S", + "order": 8 + } + }, + "required": ["host", "port", "database"] + } +} diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml new file mode 100644 index 0000000000000..922eea22c849b --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml @@ -0,0 +1,6 @@ +--- +data: + dockerRepository: "airbyte/h2-source" + documentationUrl: "https://docs.airbyte.com" + jdbc: + mode: sequential From 89ebec48b4963886ef765a6028b986268ff12fae Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Tue, 20 Aug 2024 20:13:46 -0400 Subject: [PATCH 11/11] extract-jdbc: refactor constants --- .../cdk/read/DefaultJdbcSharedState.kt | 116 +++++++++--------- .../cdk/read/DefaultJdbcStreamState.kt | 2 +- .../cdk/read/JdbcPartitionsCreatorFactory.kt | 5 +- .../read/DefaultJdbcFetchSizeEstimatorTest.kt | 2 +- .../read/DefaultJdbcPartitionFactoryTest.kt | 14 +-- .../cdk/read/JdbcPartitionsCreatorTest.kt | 32 +++-- .../io/airbyte/cdk/read/TestFixtures.kt | 23 +--- .../testFixtures/resources/application.yml | 6 + .../src/testFixtures/resources/metadata.yaml | 2 - 9 files changed, 102 insertions(+), 100 deletions(-) create mode 100644 airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt index 86fb0f2cf8e69..0eb02dc9dc491 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcSharedState.kt @@ -5,9 +5,8 @@ package io.airbyte.cdk.read import io.airbyte.cdk.command.JdbcSourceConfiguration -import io.airbyte.cdk.command.MetadataYamlPropertySource.Companion.PROPERTY_PREFIX import io.airbyte.cdk.output.OutputConsumer -import io.micronaut.context.annotation.Value +import io.micronaut.context.annotation.ConfigurationProperties import jakarta.inject.Singleton import kotlinx.coroutines.sync.Semaphore @@ -17,53 +16,78 @@ class DefaultJdbcSharedState( override val configuration: JdbcSourceConfiguration, override val outputConsumer: OutputConsumer, override val selectQuerier: SelectQuerier, - @Value("\${$PROPERTY_PREFIX.jdbc.with-sampling:$WITH_SAMPLING}") - override val withSampling: Boolean, - @Value("\${$PROPERTY_PREFIX.jdbc.table-sample-size:$TABLE_SAMPLE_SIZE}") - override val maxSampleSize: Int, - /** How many bytes per second we can expect the database to send to the connector. */ - @Value("\${$PROPERTY_PREFIX.jdbc.throughput-bytes-per-second:$THROUGHPUT_BYTES_PER_SECOND}") - val expectedThroughputBytesPerSecond: Long, - /** Smallest possible fetchSize value. 
*/ - @Value("\${$PROPERTY_PREFIX.jdbc.min-fetch-size:$FETCH_SIZE_LOWER_BOUND}") - val minFetchSize: Int, - /** Default fetchSize value, in absence of any other estimate. */ - @Value("\${$PROPERTY_PREFIX.jdbc.default-fetch-size:$DEFAULT_FETCH_SIZE}") - val defaultFetchSize: Int, - /** Largest possible fetchSize value. */ - @Value("\${$PROPERTY_PREFIX.jdbc.max-fetch-size:$FETCH_SIZE_UPPER_BOUND}") - val maxFetchSize: Int, - /** How much of the JVM heap can we fill up with [java.sql.ResultSet] data. */ - @Value("\${$PROPERTY_PREFIX.jdbc.memory-capacity-ratio:$MEM_CAPACITY_RATIO}") - val memoryCapacityRatio: Double, - /** Estimated bytes used as overhead for each row in a [java.sql.ResultSet]. */ - @Value("\${$PROPERTY_PREFIX.jdbc.estimated-record-overhead-bytes:$RECORD_OVERHEAD_BYTES}") - val estimatedRecordOverheadBytes: Long, - /** Estimated bytes used as overhead for each column value in a [java.sql.ResultSet]. */ - @Value("\${$PROPERTY_PREFIX.jdbc.estimated-field-overhead-bytes:$FIELD_OVERHEAD_BYTES}") - val estimatedFieldOverheadBytes: Long, - /** Overrides the JVM heap capacity to provide determinism in tests. */ - val maxMemoryBytesForTesting: Long? = null + val constants: Constants, ) : JdbcSharedState { + @ConfigurationProperties(JDBC_PROPERTY_PREFIX) + data class Constants( + val withSampling: Boolean = WITH_SAMPLING, + val maxSampleSize: Int = TABLE_SAMPLE_SIZE, + /** How many bytes per second we can expect the database to send to the connector. */ + val expectedThroughputBytesPerSecond: Long = THROUGHPUT_BYTES_PER_SECOND, + /** Smallest possible fetchSize value. */ + val minFetchSize: Int = FETCH_SIZE_LOWER_BOUND, + /** Default fetchSize value, in absence of any other estimate. */ + val defaultFetchSize: Int = DEFAULT_FETCH_SIZE, + /** Largest possible fetchSize value. */ + val maxFetchSize: Int = FETCH_SIZE_UPPER_BOUND, + /** How much of the JVM heap can we fill up with [java.sql.ResultSet] data. */ + val memoryCapacityRatio: Double = MEM_CAPACITY_RATIO, + /** Estimated bytes used as overhead for each row in a [java.sql.ResultSet]. */ + val estimatedRecordOverheadBytes: Long = RECORD_OVERHEAD_BYTES, + /** Estimated bytes used as overhead for each column value in a [java.sql.ResultSet]. */ + val estimatedFieldOverheadBytes: Long = FIELD_OVERHEAD_BYTES, + /** Overrides the JVM heap capacity to provide determinism in tests. */ + val maxMemoryBytesForTesting: Long? = null + ) { + companion object { + + // Sampling defaults. + internal const val WITH_SAMPLING: Boolean = false + internal const val TABLE_SAMPLE_SIZE: Int = 1024 + internal const val THROUGHPUT_BYTES_PER_SECOND: Long = 10L shl 20 + + // fetchSize defaults + internal const val FETCH_SIZE_LOWER_BOUND: Int = 10 + internal const val DEFAULT_FETCH_SIZE: Int = 1_000 + internal const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000 + + // Memory estimate defaults. + internal const val RECORD_OVERHEAD_BYTES = 16L + internal const val FIELD_OVERHEAD_BYTES = 16L + // We're targeting use of 60% of the available memory in order to allow + // for some headroom for other garbage collection. 
+ internal const val MEM_CAPACITY_RATIO: Double = 0.6 + } + } + + override val withSampling: Boolean + get() = constants.withSampling + + override val maxSampleSize: Int + get() = constants.maxSampleSize + val maxPartitionThroughputBytesPerSecond: Long = - expectedThroughputBytesPerSecond / configuration.maxConcurrency + constants.expectedThroughputBytesPerSecond / configuration.maxConcurrency override val targetPartitionByteSize: Long = maxPartitionThroughputBytesPerSecond * configuration.checkpointTargetInterval.seconds override fun jdbcFetchSizeEstimator(): JdbcSharedState.JdbcFetchSizeEstimator = DefaultJdbcFetchSizeEstimator( - maxMemoryBytes = maxMemoryBytesForTesting ?: Runtime.getRuntime().maxMemory(), + maxMemoryBytes = constants.maxMemoryBytesForTesting ?: Runtime.getRuntime().maxMemory(), configuration.maxConcurrency, - minFetchSize, - defaultFetchSize, - maxFetchSize, - memoryCapacityRatio, + constants.minFetchSize, + constants.defaultFetchSize, + constants.maxFetchSize, + constants.memoryCapacityRatio, ) override fun rowByteSizeEstimator(): JdbcSharedState.RowByteSizeEstimator = - DefaultRowByteSizeEstimator(estimatedRecordOverheadBytes, estimatedFieldOverheadBytes) + DefaultRowByteSizeEstimator( + constants.estimatedRecordOverheadBytes, + constants.estimatedFieldOverheadBytes, + ) internal val semaphore = Semaphore(configuration.maxConcurrency) @@ -80,24 +104,4 @@ class DefaultJdbcSharedState( } else { null } - - companion object { - - // Sampling defaults. - internal const val WITH_SAMPLING: Boolean = false - internal const val TABLE_SAMPLE_SIZE: Int = 1024 - internal const val THROUGHPUT_BYTES_PER_SECOND: Long = 10L shl 20 - - // fetchSize defaults - internal const val FETCH_SIZE_LOWER_BOUND: Int = 10 - internal const val DEFAULT_FETCH_SIZE: Int = 1_000 - internal const val FETCH_SIZE_UPPER_BOUND: Int = 10_000_000 - - // Memory estimate defaults. - internal const val RECORD_OVERHEAD_BYTES = 16L - internal const val FIELD_OVERHEAD_BYTES = 16L - // We're targeting use of 60% of the available memory in order to allow - // for some headroom for other garbage collection. 
- internal const val MEM_CAPACITY_RATIO: Double = 0.6 - } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt index 88e4712b7189a..f4c16b87a7224 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/DefaultJdbcStreamState.kt @@ -26,7 +26,7 @@ class DefaultJdbcStreamState( } override val fetchSizeOrDefault: Int - get() = fetchSize ?: sharedState.defaultFetchSize + get() = fetchSize ?: sharedState.constants.defaultFetchSize override val limit: Long get() = fetchSizeOrDefault * transient.get().limitState.current diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt index 0d88ae2b81893..382dd38e822e5 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/main/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorFactory.kt @@ -4,7 +4,6 @@ package io.airbyte.cdk.read -import io.airbyte.cdk.command.MetadataYamlPropertySource.Companion.PROPERTY_PREFIX import io.airbyte.cdk.command.OpaqueStateValue import io.micronaut.context.annotation.Requires import jakarta.inject.Singleton @@ -69,4 +68,6 @@ class JdbcConcurrentPartitionsCreatorFactory< JdbcConcurrentPartitionsCreator(partition, partitionFactory) } -private const val MODE_PROPERTY = "$PROPERTY_PREFIX.jdbc.mode" +const val JDBC_PROPERTY_PREFIX = "airbyte.connector.extract.jdbc" + +private const val MODE_PROPERTY = "$JDBC_PROPERTY_PREFIX.mode" diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt index 59e7d98ec3997..70228817de8e8 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcFetchSizeEstimatorTest.kt @@ -32,6 +32,6 @@ class DefaultJdbcFetchSizeEstimatorTest { val sample = Sample(listOf(), Kind.EMPTY, 0L) val sharedState = sharedState(maxMemoryBytesForTesting = 700_000, maxConcurrency = 2) val estimator = sharedState.jdbcFetchSizeEstimator() - Assertions.assertEquals(sharedState.defaultFetchSize, estimator.apply(sample)) + Assertions.assertEquals(sharedState.constants.defaultFetchSize, estimator.apply(sample)) } } diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt index a311b1b0748c9..0f7e05158752b 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/DefaultJdbcPartitionFactoryTest.kt @@ -51,7 +51,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + 
DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), ) ) @@ -87,7 +87,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), ) ) @@ -132,7 +132,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), NoWhere, OrderBy(id), @@ -202,7 +202,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), NoWhere, OrderBy(id) @@ -308,7 +308,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), Where(Greater(id, IntCodec.encode(22))), OrderBy(id), @@ -365,7 +365,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), Where(Greater(id, IntCodec.encode(22))), OrderBy(id) @@ -446,7 +446,7 @@ class DefaultJdbcPartitionFactoryTest { stream.name, stream.namespace, sampleRateInvPow2 = 8, - DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, + DefaultJdbcSharedState.Constants.TABLE_SAMPLE_SIZE, ), Where( And( diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt index 7273d2cff04dd..225ddcc02bee4 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/JdbcPartitionsCreatorTest.kt @@ -24,12 +24,16 @@ class JdbcPartitionsCreatorTest { @Test fun testConcurrentSnapshotWithCursor() { val stream = stream() + val sharedState = sharedState( - withSampling = true, - maxSampleSize = 4, - expectedThroughputBytesPerSecond = - 1L, // absurdly low value to create many partitions + constants = + DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + // absurdly low value to create many partitions + expectedThroughputBytesPerSecond = 1L, + ), mockedQueries = arrayOf( TestFixtures.MockedQuery( @@ -107,10 +111,13 @@ class JdbcPartitionsCreatorTest { val stream = stream(withCursor = false) val sharedState = sharedState( - withSampling = true, - maxSampleSize = 4, - expectedThroughputBytesPerSecond = - 1L, // absurdly low value to create many partitions + constants = + DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + // absurdly low value to create many partitions + expectedThroughputBytesPerSecond = 1L, + ), mockedQueries = arrayOf( TestFixtures.MockedQuery( @@ -216,8 +223,11 @@ class JdbcPartitionsCreatorTest { val stream = stream(withCursor = false) val sharedState = sharedState( - withSampling = true, - maxSampleSize = 4, + constants = + DefaultJdbcSharedState.Constants( + withSampling = true, + maxSampleSize = 4, + ), mockedQueries = arrayOf( TestFixtures.MockedQuery( @@ -317,7 +327,7 @@ class JdbcPartitionsCreatorTest { val stream = stream() val sharedState = sharedState( - withSampling = true, + constants = DefaultJdbcSharedState.Constants(withSampling = true), // 
The JdbcSequentialPartitionsCreator is not expected to query anything. mockedQueries = arrayOf() ) diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt index c97e853bf6b9f..cbea4d9464807 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/read/TestFixtures.kt @@ -71,32 +71,15 @@ object TestFixtures { global: Boolean = false, checkpointTargetInterval: Duration = Duration.ofMinutes(1), maxConcurrency: Int = 10, - withSampling: Boolean = false, - maxSampleSize: Int = DefaultJdbcSharedState.TABLE_SAMPLE_SIZE, - expectedThroughputBytesPerSecond: Long = DefaultJdbcSharedState.THROUGHPUT_BYTES_PER_SECOND, - minFetchSize: Int = DefaultJdbcSharedState.FETCH_SIZE_LOWER_BOUND, - defaultFetchSize: Int = DefaultJdbcSharedState.DEFAULT_FETCH_SIZE, - maxFetchSize: Int = DefaultJdbcSharedState.FETCH_SIZE_UPPER_BOUND, - memoryCapacityRatio: Double = DefaultJdbcSharedState.MEM_CAPACITY_RATIO, - estimatedRecordOverheadBytes: Long = DefaultJdbcSharedState.RECORD_OVERHEAD_BYTES, - estimatedFieldOverheadBytes: Long = DefaultJdbcSharedState.FIELD_OVERHEAD_BYTES, - maxMemoryBytesForTesting: Long = 1_000_000, + maxMemoryBytesForTesting: Long = 1_000_000L, + constants: DefaultJdbcSharedState.Constants = DefaultJdbcSharedState.Constants(), vararg mockedQueries: MockedQuery, ) = DefaultJdbcSharedState( StubbedJdbcSourceConfiguration(global, checkpointTargetInterval, maxConcurrency), BufferingOutputConsumer(TestClockFactory().fixed()), MockSelectQuerier(ArrayDeque(mockedQueries.toList())), - withSampling, - maxSampleSize, - expectedThroughputBytesPerSecond, - minFetchSize, - defaultFetchSize, - maxFetchSize, - memoryCapacityRatio, - estimatedRecordOverheadBytes, - estimatedFieldOverheadBytes, - maxMemoryBytesForTesting, + constants.copy(maxMemoryBytesForTesting = maxMemoryBytesForTesting) ) fun DefaultJdbcSharedState.factory() = diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml new file mode 100644 index 0000000000000..52d72019ae547 --- /dev/null +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/application.yml @@ -0,0 +1,6 @@ +--- +airbyte: + connector: + extract: + jdbc: + mode: sequential diff --git a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml index 922eea22c849b..e136b2dbbb9ee 100644 --- a/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml +++ b/airbyte-cdk/bulk/toolkits/extract-jdbc/src/testFixtures/resources/metadata.yaml @@ -2,5 +2,3 @@ data: dockerRepository: "airbyte/h2-source" documentationUrl: "https://docs.airbyte.com" - jdbc: - mode: sequential
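
With this final patch, the JDBC tuning knobs are grouped in DefaultJdbcSharedState.Constants and bound through @ConfigurationProperties(JDBC_PROPERTY_PREFIX) rather than injected one by one with @Value, and the partitioning mode moves from metadata.yaml to application.yml. A connector or test fixture can therefore override any of the defaults declaratively. Below is a minimal application.yml sketch; it assumes Micronaut's usual kebab-case binding for the Constants properties, and the override values are illustrative only, not recommendations:

---
airbyte:
  connector:
    extract:
      jdbc:
        mode: sequential
        with-sampling: true
        max-sample-size: 1024
        default-fetch-size: 1000
        memory-capacity-ratio: 0.6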