apache · ismailsimsek · Jul 20, 2023 · Jul 21, 2023 · Jul 24, 2023 · Sep 29, 2023
diff --git a/.github/workflows/kafka-connect-ci.yml b/.github/workflows/kafka-connect-ci.yml
@@ -93,6 +93,7 @@ jobs:
     - run: |
         ./gradlew -DsparkVersions= -DhiveVersions= -DflinkVersions= -DkafkaVersions=3 \
           :iceberg-kafka-connect:iceberg-kafka-connect-events:check \
+          :iceberg-kafka-connect:iceberg-kafka-connect-transforms:check \
           :iceberg-kafka-connect:iceberg-kafka-connect:check \
           :iceberg-kafka-connect:iceberg-kafka-connect-runtime:check \
           -Pquick=true -x javadoc

diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
@@ -32,6 +32,7 @@ awaitility = "4.2.2"
 awssdk-bom = "2.29.43"
 azuresdk-bom = "1.2.30"
 awssdk-s3accessgrants = "2.3.0"
+bson-ver = "4.11.0"
 caffeine = "2.9.3"
 calcite = "1.10.0"
 datasketches = "6.1.1"
@@ -100,6 +101,7 @@ avro-avro = { module = "org.apache.avro:avro", version.ref = "avro" }
 awssdk-bom = { module = "software.amazon.awssdk:bom", version.ref = "awssdk-bom" }
 awssdk-s3accessgrants = { module = "software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin", version.ref = "awssdk-s3accessgrants" }
 azuresdk-bom = { module = "com.azure:azure-sdk-bom", version.ref = "azuresdk-bom" }
+bson = { module = "org.mongodb:bson", version.ref = "bson-ver"}
 caffeine = { module = "com.github.ben-manes.caffeine:caffeine", version.ref = "caffeine" }
 calcite-core = { module = "org.apache.calcite:calcite-core", version.ref = "calcite" }
 calcite-druid = { module = "org.apache.calcite:calcite-druid", version.ref = "calcite" }
@@ -159,6 +161,7 @@ jaxb-runtime = { module = "org.glassfish.jaxb:jaxb-runtime", version.ref = "jaxb
 kafka-clients = { module = "org.apache.kafka:kafka-clients", version.ref = "kafka" }
 kafka-connect-api = { module = "org.apache.kafka:connect-api", version.ref = "kafka" }
 kafka-connect-json = { module = "org.apache.kafka:connect-json", version.ref = "kafka" }
+kafka-connect-transforms = { module = "org.apache.kafka:connect-transforms", version.ref = "kafka" }
 microprofile-openapi-api = { module = "org.eclipse.microprofile.openapi:microprofile-openapi-api", version.ref = "microprofile-openapi-api" }
 nessie-client = { module = "org.projectnessie.nessie:nessie-client", version.ref = "nessie" }
 netty-buffer = { module = "io.netty:netty-buffer", version.ref = "netty-buffer" }

diff --git a/kafka-connect/build.gradle b/kafka-connect/build.gradle
@@ -92,6 +92,7 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') {
 
   dependencies {
     implementation project(':iceberg-kafka-connect:iceberg-kafka-connect')
+    implementation project(':iceberg-kafka-connect:iceberg-kafka-connect-transforms')
     implementation(libs.hadoop3.common) {
       exclude group: 'log4j'
       exclude group: 'org.slf4j'
@@ -242,3 +243,19 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') {
 
   assemble.dependsOn distZip, hiveDistZip
 }
+
+project(':iceberg-kafka-connect:iceberg-kafka-connect-transforms') { // SMT module for the Iceberg sink connector
+  dependencies {
+    implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') // 'shadow' configuration of the bundled Guava project
+    implementation libs.bson // org.mongodb:bson, declared in gradle/libs.versions.toml
+    implementation libs.slf4j.api
+    compileOnly libs.kafka.clients // compileOnly: compile classpath only; presumably supplied by the Kafka Connect runtime -- TODO confirm
+    compileOnly libs.kafka.connect.api
+    compileOnly libs.kafka.connect.json
+    compileOnly libs.kafka.connect.transforms // org.apache.kafka:connect-transforms, declared in gradle/libs.versions.toml
+  }
+  // Run this module's tests on the JUnit Platform.
+  test {
+    useJUnitPlatform()
+  }
+}
diff --git a/kafka-connect/kafka-connect-transforms/README.md b/kafka-connect/kafka-connect-transforms/README.md
@@ -0,0 +1,185 @@
+<!--
+  - Licensed to the Apache Software Foundation (ASF) under one
+  - or more contributor license agreements.  See the NOTICE file
+  - distributed with this work for additional information
+  - regarding copyright ownership.  The ASF licenses this file
+  - to you under the Apache License, Version 2.0 (the
+  - "License"); you may not use this file except in compliance
+  - with the License.  You may obtain a copy of the License at
+  -
+  -   http://www.apache.org/licenses/LICENSE-2.0
+  -
+  - Unless required by applicable law or agreed to in writing,
+  - software distributed under the License is distributed on an
+  - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  - KIND, either express or implied.  See the License for the
+  - specific language governing permissions and limitations
+  - under the License.
+  -->
+
+# SMTs for the Apache Iceberg Sink Connector
+
+This project contains some SMTs that could be useful when transforming Kafka data for use by
+the Iceberg sink connector.
+
+# CopyValue
+_(Experimental)_
+
+The `CopyValue` SMT copies a value from one field to a new field.
+
+## Configuration
+
+| Property         | Description       |
+|------------------|-------------------|
+| source.field     | Source field name |
+| target.field     | Target field name |
+
+## Example
+
+```
+"transforms": "copyId",
+"transforms.copyId.type": "org.apache.iceberg.connect.transforms.CopyValue",
+"transforms.copyId.source.field": "id",
+"transforms.copyId.target.field": "id_copy",
+```
+
+# DmsTransform
+_(Experimental)_
+
+The `DmsTransform` SMT transforms an AWS DMS formatted message for use by the sink's CDC feature.
+It will promote the `data` element fields to top level and add the following metadata fields:
+`_cdc.op`, `_cdc.ts`, and `_cdc.source`.
+
+## Configuration
+
+The SMT currently has no configuration.
+
+# DebeziumTransform
+_(Experimental)_
+
+The `DebeziumTransform` SMT transforms a Debezium formatted message for use by the sink's CDC feature.
+It will promote the `before` or `after` element fields to top level and add the following metadata fields:
+`_cdc.op`, `_cdc.ts`, `_cdc.offset`, `_cdc.source`, `_cdc.target`, and `_cdc.key`.
+
+## Configuration
+
+| Property            | Description                                                                       |
+|---------------------|-----------------------------------------------------------------------------------|
+| cdc.target.pattern  | Pattern to use for setting the CDC target field value, default is `{db}.{table}`  |
+
+# JsonToMapTransform
+_(Experimental)_
+
+The `JsonToMapTransform` SMT parses Strings as JSON object payloads to infer schemas.  The behavior of the
+iceberg-kafka-connect connector for schema-less data (e.g. the Map produced by the Kafka-supplied JsonConverter) is to
+convert Maps into Iceberg Structs.  This is fine when the JSON is well-structured, but when you have JSON objects with
+dynamically changing keys, it will lead to an explosion of columns in the Iceberg table due to schema evolution.
+
+This SMT is useful in situations where the JSON is not well-structured, in order to get data into Iceberg where
+it can be further processed by query engines into a more manageable form. It will convert nested objects to
+Maps and include Map type in the Schema.  The connector will respect the Schema and create Iceberg tables with Iceberg
+Map (String) columns for the JSON objects.
+
+Note:
+
+- You must use the `StringConverter` as the `value.converter` setting for your connector, not `JsonConverter`
+  - It expects JSON objects (`{...}`) in those strings.
+- Message keys, tombstones, and headers are not transformed and are passed along as-is by the SMT
+
+## Configuration
+
+| Property             | Description  (default value)             |
+|----------------------|------------------------------------------|
+| json.root | (false) Boolean value to start at root   |
+
+The `transforms.IDENTIFIER_HERE.json.root` is meant for the most inconsistent data.  It will construct a Struct with a single field
+called `payload` with a Schema of `Map<String, String>`.
+
+If `transforms.IDENTIFIER_HERE.json.root` is false (the default), it will construct a Struct with inferred schemas for primitive and
+array fields.  Nested objects become fields of type `Map<String, String>`.
+
+Keys with empty arrays and empty objects are filtered out from the final schema.  Arrays will be typed unless the
+json arrays have mixed types in which case they are converted to arrays of strings.
+
+Example json:
+
+```json
+{
+  "key": 1, 
+  "array": [1,"two",3],
+  "empty_obj": {},
+  "nested_obj": {"some_key": ["one", "two"]}
+}
+```
+
+Will become the following if `json.root` is true:
+
+```
+SinkRecord.schema: 
+  "payload" : (Optional) Map<String, String>
+
+SinkRecord.value (Struct):
+  "payload"  : Map(
+    "key" : "1",
+    "array" : "[1,"two",3]",
+    "empty_obj": "{}",
+    "nested_obj": "{"some_key":["one","two"]}"
+   )
+```
+
+Will become the following if `json.root` is false:
+
+```
+SinkRecord.schema:
+  "key": (Optional) Int32,
+  "array": (Optional) Array<String>,
+  "nested_obj": (Optional) Map<String, String>
+
+SinkRecord.value (Struct):
+  "key": 1,
+  "array": ["1", "two", "3"],
+  "nested_obj": Map("some_key" : "["one", "two"]")
+```
+
+# KafkaMetadataTransform
+_(Experimental)_
+
+The `KafkaMetadataTransform` SMT injects `topic`, `partition`, `offset`, and `timestamp`, which are properties of the Kafka message.
+
+## Configuration
+
+| Property       | Description (default value)                                                       |
+|----------------|-----------------------------------------------------------------------------------|
+| field_name     | (_kafka_metadata) prefix for fields                                               | 
+| nested         | (false) if true, nests data on a struct else adds to top level as prefixed fields |
+| external_field | (none) appends a constant `key,value` to the metadata (e.g. cluster name)         | 
+
+If `nested` is on:
+
+`_kafka_metadata.topic`, `_kafka_metadata.partition`, `_kafka_metadata.offset`, `_kafka_metadata.timestamp`
+
+If `nested` is off:
+`_kafka_metadata_topic`, `_kafka_metadata_partition`, `_kafka_metadata_offset`, `_kafka_metadata_timestamp`
+
+# MongoDebeziumTransform
+_(Experimental)_ 
+
+The `MongoDebeziumTransform` SMT transforms a Mongo Debezium formatted message with `before`/`after` BSON
+strings into `before`/`after` typed Structs that the `DebeziumTransform` SMT expects. 
+
+It does not (yet) support renaming columns when a MongoDB column name is not supported by your underlying
+catalog type.
+
+## Configuration
+
+| Property            | Description                                      |
+|---------------------|--------------------------------------------------|
+| array_handling_mode  | `array` or `document` to set array handling mode |
+
+The value `array` (the default) encodes arrays as the array datatype. It is the user’s responsibility to ensure that
+all elements of a given array instance are of the same type. This option is restrictive but offers
+easy processing of arrays by downstream clients.
+
+The value `document` converts the array into a struct of structs, similar to the way BSON serialization does.
+The main struct contains fields named _0, _1, _2 etc., where the name represents the index of the element in the array.
+Every element is then passed as the value for the given field.
diff --git a/...-connect-transforms/src/main/java/org/apache/iceberg/connect/transforms/CdcConstants.java b/...-connect-transforms/src/main/java/org/apache/iceberg/connect/transforms/CdcConstants.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.connect.transforms;
+
+public interface CdcConstants {
+  // String constants for CDC record metadata: operation codes and metadata field names.
+  String OP_INSERT = "I"; // code for an insert operation
+  String OP_UPDATE = "U"; // code for an update operation
+  String OP_DELETE = "D"; // code for a delete operation
+  // The module README lists `_cdc.op`, `_cdc.ts`, etc.; presumably COL_CDC is the parent -- TODO confirm
+  String COL_CDC = "_cdc";
+  String COL_OP = "op"; // presumably holds one of the OP_* codes above -- TODO confirm against the transforms
+  String COL_TS = "ts";
+  String COL_OFFSET = "offset";
+  String COL_SOURCE = "source";
+  String COL_TARGET = "target";
+  String COL_KEY = "key";
+}