fix: adding csv support for gcs to spanner template (#1014)

* fix: adding csv support for gcs to spanner template
* fix: readme file for GCS to Spanner template (csv file support)
GoogleCloudPlatform · Nov 3, 2024 · 3775cf3 · 3775cf3
1 parent f602c05
commit 3775cf3
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 7 deletions.
diff --git a/java/src/main/java/com/google/cloud/dataproc/templates/gcs/GCSToSpanner.java b/java/src/main/java/com/google/cloud/dataproc/templates/gcs/GCSToSpanner.java
@@ -54,10 +54,18 @@ public void runTemplate() {
     try (SparkSession spark = SparkSession.builder().appName("GCS to Spanner").getOrCreate()) {
       // Set log level
       spark.sparkContext().setLogLevel(config.getSparkLogLevel());
-
-      Dataset<Row> dataset =
-          spark.read().format(config.getInputFormat()).load(config.getInputLocation());
-
+      Dataset<Row> dataset;
+      if ("csv".equalsIgnoreCase(config.getInputFormat())) {
+        dataset =
+            spark
+                .read()
+                .format(config.getInputFormat())
+                .option("header", "true")
+                .option("inferSchema", "true")
+                .load(config.getInputLocation());
+      } else {
+        dataset = spark.read().format(config.getInputFormat()).load(config.getInputLocation());
+      }
       write(dataset);
     }
   }

diff --git a/java/src/main/java/com/google/cloud/dataproc/templates/gcs/GCSToSpannerConfig.java b/java/src/main/java/com/google/cloud/dataproc/templates/gcs/GCSToSpannerConfig.java
@@ -49,7 +49,7 @@ public class GCSToSpannerConfig {
 
   @JsonProperty(value = GCS_SPANNER_INPUT_FORMAT)
   @NotEmpty
-  @Pattern(regexp = "avro|parquet|orc")
+  @Pattern(regexp = "avro|parquet|orc|csv")
   private String inputFormat;
 
   @JsonProperty(value = PROJECT_ID_PROP)

diff --git a/java/src/main/java/com/google/cloud/dataproc/templates/gcs/README.md b/java/src/main/java/com/google/cloud/dataproc/templates/gcs/README.md
@@ -83,7 +83,7 @@ GCS_STAGING_LOCATION=<gcs-staging-bucket-folder> \
 bin/start.sh \
 -- --template GCSTOSPANNER \
 --templateProperty project.id=<gcp-project-id> \
---templateProperty gcs.spanner.input.format=<avro | parquet | orc> \
+--templateProperty gcs.spanner.input.format=<avro | parquet | orc | csv> \
 --templateProperty gcs.spanner.input.location=<gcs path> \
 --templateProperty gcs.spanner.output.instance=<spanner instance id> \
 --templateProperty gcs.spanner.output.database=<spanner database id> \
@@ -101,7 +101,7 @@ GCS_STAGING_LOCATION=<gcs-staging-bucket-folder> \
 bin/start.sh \
 -- --template GCSTOSPANNER \
 --templateProperty project.id=<gcp-project-id> \
---templateProperty gcs.spanner.input.format=<avro | parquet | orc> \
+--templateProperty gcs.spanner.input.format=<avro | parquet | orc | csv> \
 --templateProperty gcs.spanner.input.location=<gcs path> \
 --templateProperty gcs.spanner.output.instance=<spanner instance id> \
 --templateProperty gcs.spanner.output.database=<spanner database id> \
@@ -112,6 +112,9 @@ bin/start.sh \
 --templateProperty spanner.jdbc.dialect=postgresql
 ```
 
+Note: When running the GCS to Spanner template with CSV input, the CSV file must contain a header row, and the data types inferred by Spark must align with the data types of the Spanner table columns. Otherwise, the job will fail.
+For the other file formats (Parquet, ORC, Avro), this information is already carried by the file format itself.
+
 ## 4. Cloud Storage to JDBC
 
 ```