From 2a54ba36927afbc8e75f2e929b13e14940e09df2 Mon Sep 17 00:00:00 2001 From: pi-la Date: Tue, 12 Nov 2024 13:09:44 +0800 Subject: [PATCH 01/16] Update ArchiveCompressFormat.java --- .../connectors/seatunnel/file/config/ArchiveCompressFormat.java | 1 + 1 file changed, 1 insertion(+) diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/ArchiveCompressFormat.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/ArchiveCompressFormat.java index da30887a824..b90990d4348 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/ArchiveCompressFormat.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/ArchiveCompressFormat.java @@ -35,6 +35,7 @@ public enum ArchiveCompressFormat { ZIP(".zip"), TAR(".tar"), TAR_GZ(".tar.gz"), + GZ(".gz"), ; private final String archiveCompressCodec; From bff405a3a97298b8e2bdf5fd691b3d7917c2ebaf Mon Sep 17 00:00:00 2001 From: pi-la Date: Tue, 12 Nov 2024 13:11:19 +0800 Subject: [PATCH 02/16] Update AbstractReadStrategy.java --- .../seatunnel/file/source/reader/AbstractReadStrategy.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java index 00d90d84195..a1a99d32cac 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java @@ -238,6 +238,11 @@ protected void resolveArchiveCompressedInputStream( } } break; + case GZ: + GzipCompressorInputStream gzipIn = + new GzipCompressorInputStream(hadoopFileSystemProxy.getInputStream(path)); + readProcess(path, tableId, output, copyInputStream(gzipIn), partitionsMap, path); + break; case NONE: readProcess( path, From d827c581d783a75240b6c3c93ba8d9676337bbbc Mon Sep 17 00:00:00 2001 From: pi-la Date: Tue, 12 Nov 2024 13:18:33 +0800 Subject: [PATCH 03/16] Update LocalFile.md --- docs/en/connector-v2/source/LocalFile.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index 65f287f057b..69b2519cb45 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -322,6 +322,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] @@ -490,4 +491,6 @@ sink { - [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) - [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085)) - [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985)) +### 2.3.9-beta 2024-11-12 +- [Improve] Support parse field from file path ([8019](https://github.com/apache/seatunnel/issues/8019)) From dae992ca8de55c77fc0743fa7e734916a9a990b8 Mon Sep 17 00:00:00 2001 From: zhdech Date: Tue, 12 Nov 2024 18:09:08 +0800 Subject: [PATCH 04/16] solve https://github.com/apache/seatunnel/issues/8019 --- docs/en/connector-v2/source/CosFile.md | 1 + docs/en/connector-v2/source/FtpFile.md | 1 + docs/en/connector-v2/source/HdfsFile.md | 1 + docs/en/connector-v2/source/OssJindoFile.md | 1 + docs/en/connector-v2/source/S3File.md | 1 + 5 files changed, 5 insertions(+) diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md index 15b6de0c6f8..396725f0639 100644 --- a/docs/en/connector-v2/source/CosFile.md +++ b/docs/en/connector-v2/source/CosFile.md @@ -343,6 +343,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md index 6d114813769..3872ac0a909 100644 --- a/docs/en/connector-v2/source/FtpFile.md +++ b/docs/en/connector-v2/source/FtpFile.md @@ -328,6 +328,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md index 405dfff820f..7495ae3564d 100644 --- a/docs/en/connector-v2/source/HdfsFile.md +++ b/docs/en/connector-v2/source/HdfsFile.md @@ -144,6 +144,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md index 933439edc9f..de8c38ecadd 100644 --- a/docs/en/connector-v2/source/OssJindoFile.md +++ b/docs/en/connector-v2/source/OssJindoFile.md @@ -335,6 +335,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md index 4834b025bc3..e60351a41f9 100644 --- a/docs/en/connector-v2/source/S3File.md +++ b/docs/en/connector-v2/source/S3File.md @@ -299,6 +299,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,excel,xml | .gz | | NONE | all | .* | ### encoding [string] From 0550b6cddeef053154d8800a9a6f6c4c280afb19 Mon Sep 17 00:00:00 2001 From: zhdech Date: Thu, 14 Nov 2024 17:08:45 +0800 Subject: [PATCH 05/16] solve https://github.com/apache/seatunnel/issues/8019 --- .../e2e/connector/file/local/LocalFileIT.java | 63 +++++++++- .../excel/local_excel_gz_to_assert.conf | 119 ++++++++++++++++++ .../json/local_file_json_gz_to_assert.conf | 117 +++++++++++++++++ .../text/local_file_gz_text_to_assert.conf | 117 +++++++++++++++++ .../xml/local_file_gz_xml_to_assert.conf | 102 +++++++++++++++ 5 files changed, 514 insertions(+), 4 deletions(-) create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 0175f26f585..256424c6757 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -48,10 +48,7 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; +import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; @@ -149,6 +146,13 @@ public class LocalFileIT extends TestSuiteBase { "/seatunnel/read/tar_gz/txt/multifile/multiTarGz.tar.gz", container); + Path txtGz = + convertToGzFile( + Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.txt")), + "e2e-txt"); + ContainerUtil.copyFileIntoContainers( + txtGz, "/seatunnel/read/gz/txt/single/e2e-txt.gz", container); + Path jsonZip = convertToZipFile( Lists.newArrayList( @@ -168,6 +172,13 @@ public class LocalFileIT extends TestSuiteBase { "/seatunnel/read/zip/json/multifile/multiJson.zip", container); + Path jsonGz = + convertToGzFile( + Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.json")), + "e2e-json"); + ContainerUtil.copyFileIntoContainers( + jsonGz, "/seatunnel/read/gz/json/single/e2e-json.gz", container); + ContainerUtil.copyFileIntoContainers( "/text/e2e_gbk.txt", "/seatunnel/read/encoding/text/e2e_gbk.txt", @@ -193,6 +204,14 @@ public class LocalFileIT extends TestSuiteBase { ContainerUtil.copyFileIntoContainers( xmlZip, "/seatunnel/read/zip/xml/single/e2e-xml.zip", container); + Path xmlGz = + convertToGzFile( + Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.xml")), + "e2e-xml"); + ContainerUtil.copyFileIntoContainers( + xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml.gz", container); + + Path txtLzo = convertToLzoFile(ContainerUtil.getResourcesFile("/text/e2e.txt")); ContainerUtil.copyFileIntoContainers( txtLzo, "/seatunnel/read/lzo_text/e2e.txt", container); @@ -224,6 +243,13 @@ public class LocalFileIT extends TestSuiteBase { "/seatunnel/read/zip/excel/multifile/multiZip.zip", container); + Path xlsxGz = + convertToGzFile( + Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.xlsx")), + "e2e-xlsx"); + ContainerUtil.copyFileIntoContainers( + xlsxGz, "/seatunnel/read/gz/xlsx/single/e2e-xlsx.gz", container); + ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", "/seatunnel/read/orc/name=tyrantlucifer/hobby=coding/e2e.orc", @@ -313,6 +339,7 @@ public void testLocalFileReadAndWrite(TestContainer container) /** Compressed file test */ // test read single local text file with zip compression helper.execute("/text/local_file_zip_text_to_assert.conf"); + helper.execute("/text/local_file_gz_text_to_assert.conf"); // test read multi local text file with zip compression helper.execute("/text/local_file_multi_zip_text_to_assert.conf"); // test read single local text file with tar compression @@ -325,12 +352,15 @@ public void testLocalFileReadAndWrite(TestContainer container) helper.execute("/text/local_file_multi_tar_gz_text_to_assert.conf"); // test read single local json file with zip compression helper.execute("/json/local_file_json_zip_to_assert.conf"); + helper.execute("/json/local_file_json_gz_to_assert.conf"); // test read multi local json file with zip compression helper.execute("/json/local_file_json_multi_zip_to_assert.conf"); // test read single local xml file with zip compression helper.execute("/xml/local_file_zip_xml_to_assert.conf"); + helper.execute("/xml/local_file_gz_xml_to_assert.conf"); // test read single local excel file with zip compression helper.execute("/excel/local_excel_zip_to_assert.conf"); + helper.execute("/excel/local_excel_gz_to_assert.conf"); // test read multi local excel file with zip compression helper.execute("/excel/local_excel_multi_zip_to_assert.conf"); } @@ -551,4 +581,29 @@ public FileVisitResult visitFile( return tarGzFilePath; } + + public Path convertToGzFile(List files, String name) throws IOException { + if (files == null || files.isEmpty()) { + throw new IllegalArgumentException("File list is empty or invalid"); + } + + File firstFile = files.get(0); + Path gzFilePath = Paths.get(firstFile.getParent(), String.format("%s.gz", name)); + + try (FileInputStream fis = new FileInputStream(firstFile); + FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); + GZIPOutputStream gzos = new GZIPOutputStream(fos)) { + + byte[] buffer = new byte[2048]; + int length; + + while ((length = fis.read(buffer)) > 0) { + gzos.write(buffer, 0, length); + } + gzos.finish(); + } catch (IOException e) { + e.printStackTrace(); + } + return gzFilePath; + } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf new file mode 100644 index 00000000000..2814ab92453 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf @@ -0,0 +1,119 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + LocalFile { + path = "/seatunnel/read/gz/excel/single" + result_table_name = "fake" + file_format_type = excel + archive_compress_codec = "gz" + field_delimiter = ; + skip_header_row_number = 1 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + }, + { + rule_type = MIN_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf new file mode 100644 index 00000000000..d988f7d108a --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf @@ -0,0 +1,117 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + LocalFile { + path = "/seatunnel/read/gz/json/single" + file_format_type = "json" + archive_compress_codec = "gz" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + }, + { + rule_type = MIN_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf new file mode 100644 index 00000000000..2196026f71d --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf @@ -0,0 +1,117 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + LocalFile { + path = "/seatunnel/read/gz/txt/single" + file_format_type = "text" + archive_compress_codec = "gz" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + }, + { + rule_type = MIN_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf new file mode 100644 index 00000000000..e1c35da0df2 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf @@ -0,0 +1,102 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + LocalFile { + path = "/seatunnel/read/gz/xml/single" + file_format_type = "xml" + archive_compress_codec = "gz" + xml_row_tag = "RECORD" + xml_use_attr_format = true + schema = { + fields { + c_bytes = "tinyint" + c_short = "smallint" + c_int = "int" + c_bigint = "bigint" + c_string = "string" + c_double = "double" + c_float = "float" + c_decimal = "decimal(10, 2)" + c_boolean = "boolean" + c_map = "map" + c_array = "array" + c_date = "date" + c_datetime = "timestamp" + c_time = "time" + } + } + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 1 + }, + { + rule_type = MIN_ROW + rule_value = 1 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} From 98a69f0f35902452aaebf507824ffe896afdfeb9 Mon Sep 17 00:00:00 2001 From: zhdech Date: Thu, 14 Nov 2024 20:54:00 +0800 Subject: [PATCH 06/16] solve https://github.com/apache/seatunnel/issues/8019 --- .../e2e/connector/file/local/LocalFileIT.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 256424c6757..b961d9546ce 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -48,7 +48,6 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; @@ -174,7 +173,8 @@ public class LocalFileIT extends TestSuiteBase { Path jsonGz = convertToGzFile( - Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.json")), + Lists.newArrayList( + ContainerUtil.getResourcesFile("/text/e2e.json")), "e2e-json"); ContainerUtil.copyFileIntoContainers( jsonGz, "/seatunnel/read/gz/json/single/e2e-json.gz", container); @@ -211,7 +211,6 @@ public class LocalFileIT extends TestSuiteBase { ContainerUtil.copyFileIntoContainers( xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml.gz", container); - Path txtLzo = convertToLzoFile(ContainerUtil.getResourcesFile("/text/e2e.txt")); ContainerUtil.copyFileIntoContainers( txtLzo, "/seatunnel/read/lzo_text/e2e.txt", container); @@ -245,7 +244,8 @@ public class LocalFileIT extends TestSuiteBase { Path xlsxGz = convertToGzFile( - Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.xlsx")), + Lists.newArrayList( + ContainerUtil.getResourcesFile("/text/e2e.xlsx")), "e2e-xlsx"); ContainerUtil.copyFileIntoContainers( xlsxGz, "/seatunnel/read/gz/xlsx/single/e2e-xlsx.gz", container); @@ -591,8 +591,8 @@ public Path convertToGzFile(List files, String name) throws IOException { Path gzFilePath = Paths.get(firstFile.getParent(), String.format("%s.gz", name)); try (FileInputStream fis = new FileInputStream(firstFile); - FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); - GZIPOutputStream gzos = new GZIPOutputStream(fos)) { + FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); + GZIPOutputStream gzos = new GZIPOutputStream(fos)) { byte[] buffer = new byte[2048]; int length; From f7ea9f9f57d5e6738e8b936a5ba63ebeadb15b09 Mon Sep 17 00:00:00 2001 From: zhdech Date: Fri, 15 Nov 2024 10:31:24 +0800 Subject: [PATCH 07/16] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=8C=85=E5=BC=95?= =?UTF-8?q?=E5=85=A5=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../seatunnel/e2e/connector/file/local/LocalFileIT.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index b961d9546ce..921c37bcb67 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -48,6 +48,11 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; @@ -591,8 +596,8 @@ public Path convertToGzFile(List files, String name) throws IOException { Path gzFilePath = Paths.get(firstFile.getParent(), String.format("%s.gz", name)); try (FileInputStream fis = new FileInputStream(firstFile); - FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); - GZIPOutputStream gzos = new GZIPOutputStream(fos)) { + FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); + GZIPOutputStream gzos = new GZIPOutputStream(fos)) { byte[] buffer = new byte[2048]; int length; From c39f4b90527d5436d0a1ea13f86bb554e98e29e2 Mon Sep 17 00:00:00 2001 From: zhdech Date: Fri, 15 Nov 2024 10:52:13 +0800 Subject: [PATCH 08/16] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=8C=85=E5=BC=95?= =?UTF-8?q?=E5=85=A5=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../apache/seatunnel/e2e/connector/file/local/LocalFileIT.java | 1 + 1 file changed, 1 insertion(+) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 921c37bcb67..4fa6e1243c2 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -53,6 +53,7 @@ import java.io.OutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; From 07e83a90c35b282e1425dd8b5d37a5a13478d719 Mon Sep 17 00:00:00 2001 From: zhdech Date: Fri, 15 Nov 2024 11:01:56 +0800 Subject: [PATCH 09/16] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=A0=B7=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../e2e/connector/file/local/LocalFileIT.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 4fa6e1243c2..aedb9857eb8 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -48,12 +48,12 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.IOException; -import java.io.OutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; @@ -597,8 +597,8 @@ public Path convertToGzFile(List files, String name) throws IOException { Path gzFilePath = Paths.get(firstFile.getParent(), String.format("%s.gz", name)); try (FileInputStream fis = new FileInputStream(firstFile); - FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); - GZIPOutputStream gzos = new GZIPOutputStream(fos)) { + FileOutputStream fos = new FileOutputStream(gzFilePath.toFile()); + GZIPOutputStream gzos = new GZIPOutputStream(fos)) { byte[] buffer = new byte[2048]; int length; From 4f19768e3e3b0a7feab6fc1532b45145b8db2f79 Mon Sep 17 00:00:00 2001 From: zhdech Date: Fri, 15 Nov 2024 11:47:37 +0800 Subject: [PATCH 10/16] =?UTF-8?q?e2e=E4=BF=AE=E5=A4=8D=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=89=BE=E4=B8=8D=E5=88=B0=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../seatunnel/e2e/connector/file/local/LocalFileIT.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index aedb9857eb8..673161aff60 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -180,7 +180,7 @@ public class LocalFileIT extends TestSuiteBase { Path jsonGz = convertToGzFile( Lists.newArrayList( - ContainerUtil.getResourcesFile("/text/e2e.json")), + ContainerUtil.getResourcesFile("/json/e2e.json")), "e2e-json"); ContainerUtil.copyFileIntoContainers( jsonGz, "/seatunnel/read/gz/json/single/e2e-json.gz", container); @@ -212,7 +212,7 @@ public class LocalFileIT extends TestSuiteBase { Path xmlGz = convertToGzFile( - Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.xml")), + Lists.newArrayList(ContainerUtil.getResourcesFile("/xml/e2e.xml")), "e2e-xml"); ContainerUtil.copyFileIntoContainers( xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml.gz", container); @@ -251,10 +251,10 @@ public class LocalFileIT extends TestSuiteBase { Path xlsxGz = convertToGzFile( Lists.newArrayList( - ContainerUtil.getResourcesFile("/text/e2e.xlsx")), + ContainerUtil.getResourcesFile("/excel/e2e.xlsx")), "e2e-xlsx"); ContainerUtil.copyFileIntoContainers( - xlsxGz, "/seatunnel/read/gz/xlsx/single/e2e-xlsx.gz", container); + xlsxGz, "/seatunnel/read/gz/excel/single/e2e-xlsx.gz", container); ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", From a4e53d719b3c2907ef0cf00bc472042aad0c6d9c Mon Sep 17 00:00:00 2001 From: zhdech Date: Fri, 15 Nov 2024 11:47:37 +0800 Subject: [PATCH 11/16] =?UTF-8?q?e2e=E4=BF=AE=E5=A4=8D=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=89=BE=E4=B8=8D=E5=88=B0=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../seatunnel/e2e/connector/file/local/LocalFileIT.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index aedb9857eb8..673161aff60 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -180,7 +180,7 @@ public class LocalFileIT extends TestSuiteBase { Path jsonGz = convertToGzFile( Lists.newArrayList( - ContainerUtil.getResourcesFile("/text/e2e.json")), + ContainerUtil.getResourcesFile("/json/e2e.json")), "e2e-json"); ContainerUtil.copyFileIntoContainers( jsonGz, "/seatunnel/read/gz/json/single/e2e-json.gz", container); @@ -212,7 +212,7 @@ public class LocalFileIT extends TestSuiteBase { Path xmlGz = convertToGzFile( - Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.xml")), + Lists.newArrayList(ContainerUtil.getResourcesFile("/xml/e2e.xml")), "e2e-xml"); ContainerUtil.copyFileIntoContainers( xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml.gz", container); @@ -251,10 +251,10 @@ public class LocalFileIT extends TestSuiteBase { Path xlsxGz = convertToGzFile( Lists.newArrayList( - ContainerUtil.getResourcesFile("/text/e2e.xlsx")), + ContainerUtil.getResourcesFile("/excel/e2e.xlsx")), "e2e-xlsx"); ContainerUtil.copyFileIntoContainers( - xlsxGz, "/seatunnel/read/gz/xlsx/single/e2e-xlsx.gz", container); + xlsxGz, "/seatunnel/read/gz/excel/single/e2e-xlsx.gz", container); ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", From 03e7d38726f8f5df46c916f03bed1165a1989b08 Mon Sep 17 00:00:00 2001 From: zhdech Date: Sun, 17 Nov 2024 11:53:42 +0800 Subject: [PATCH 12/16] =?UTF-8?q?e2e=E4=BF=AE=E5=A4=8D=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/test/resources/excel/local_excel_gz_to_assert.conf | 2 +- .../src/test/resources/json/local_file_json_gz_to_assert.conf | 2 +- .../src/test/resources/text/local_file_gz_text_to_assert.conf | 2 +- .../src/test/resources/xml/local_file_gz_xml_to_assert.conf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf index 2814ab92453..ced1870de8e 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/excel/single" + path = "/seatunnel/read/gz/excel/single/e2e-xlsx.gz" result_table_name = "fake" file_format_type = excel archive_compress_codec = "gz" diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf index d988f7d108a..4842da5b93e 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/json/single" + path = "/seatunnel/read/gz/json/single/e2e-json.gz" file_format_type = "json" archive_compress_codec = "gz" schema = { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf index 2196026f71d..a3bcda8b143 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/txt/single" + path = "/seatunnel/read/gz/txt/single/e2e-txt.gz" file_format_type = "text" archive_compress_codec = "gz" schema = { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf index e1c35da0df2..f41c65d17b2 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/xml/single" + path = "/seatunnel/read/gz/xml/single/e2e-xml.gz" file_format_type = "xml" archive_compress_codec = "gz" xml_row_tag = "RECORD" From fbbb8dcf66d06788801681e2be039095856cace8 Mon Sep 17 00:00:00 2001 From: zhdech Date: Mon, 18 Nov 2024 09:07:38 +0800 Subject: [PATCH 13/16] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=AE=B9=E5=99=A8?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=BB=B6=E7=9A=84=E8=B7=AF=E5=BE=84=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../e2e/connector/file/local/LocalFileIT.java | 16 ++++++++-------- .../excel/local_excel_gz_to_assert.conf | 2 +- .../json/local_file_json_gz_to_assert.conf | 2 +- .../text/local_file_gz_text_to_assert.conf | 2 +- .../xml/local_file_gz_xml_to_assert.conf | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 673161aff60..0df565b608f 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -154,9 +154,9 @@ public class LocalFileIT extends TestSuiteBase { Path txtGz = convertToGzFile( Lists.newArrayList(ContainerUtil.getResourcesFile("/text/e2e.txt")), - "e2e-txt"); + "e2e-txt-gz"); ContainerUtil.copyFileIntoContainers( - txtGz, "/seatunnel/read/gz/txt/single/e2e-txt.gz", container); + txtGz, "/seatunnel/read/gz/txt/single/e2e-txt-gz.gz", container); Path jsonZip = convertToZipFile( @@ -181,9 +181,9 @@ public class LocalFileIT extends TestSuiteBase { convertToGzFile( Lists.newArrayList( ContainerUtil.getResourcesFile("/json/e2e.json")), - "e2e-json"); + "e2e-json-gz"); ContainerUtil.copyFileIntoContainers( - jsonGz, "/seatunnel/read/gz/json/single/e2e-json.gz", container); + jsonGz, "/seatunnel/read/gz/json/single/e2e-json-gz.gz", container); ContainerUtil.copyFileIntoContainers( "/text/e2e_gbk.txt", @@ -213,9 +213,9 @@ public class LocalFileIT extends TestSuiteBase { Path xmlGz = convertToGzFile( Lists.newArrayList(ContainerUtil.getResourcesFile("/xml/e2e.xml")), - "e2e-xml"); + "e2e-xml-gz"); ContainerUtil.copyFileIntoContainers( - xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml.gz", container); + xmlGz, "/seatunnel/read/gz/xml/single/e2e-xml-gz.gz", container); Path txtLzo = convertToLzoFile(ContainerUtil.getResourcesFile("/text/e2e.txt")); ContainerUtil.copyFileIntoContainers( @@ -252,9 +252,9 @@ public class LocalFileIT extends TestSuiteBase { convertToGzFile( Lists.newArrayList( ContainerUtil.getResourcesFile("/excel/e2e.xlsx")), - "e2e-xlsx"); + "e2e-xlsx-gz"); ContainerUtil.copyFileIntoContainers( - xlsxGz, "/seatunnel/read/gz/excel/single/e2e-xlsx.gz", container); + xlsxGz, "/seatunnel/read/gz/excel/single/e2e-xlsx-gz.gz", container); ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf index ced1870de8e..1cc1e615aa3 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/excel/single/e2e-xlsx.gz" + path = "/seatunnel/read/gz/excel/single/e2e-xlsx-gz.gz" result_table_name = "fake" file_format_type = excel archive_compress_codec = "gz" diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf index 4842da5b93e..0433aa5f5cb 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/local_file_json_gz_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/json/single/e2e-json.gz" + path = "/seatunnel/read/gz/json/single/e2e-json-gz.gz" file_format_type = "json" archive_compress_codec = "gz" schema = { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf index a3bcda8b143..d4f71e9901c 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_gz_text_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/txt/single/e2e-txt.gz" + path = "/seatunnel/read/gz/txt/single/e2e-txt-gz.gz" file_format_type = "text" archive_compress_codec = "gz" schema = { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf index f41c65d17b2..2a216a18ee5 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/xml/local_file_gz_xml_to_assert.conf @@ -29,7 +29,7 @@ env { source { LocalFile { - path = "/seatunnel/read/gz/xml/single/e2e-xml.gz" + path = "/seatunnel/read/gz/xml/single/e2e-xml-gz.gz" file_format_type = "xml" archive_compress_codec = "gz" xml_row_tag = "RECORD" From 37ae65ecbd641c0c7001a8259068d3c101dc5742 Mon Sep 17 00:00:00 2001 From: zhdech Date: Tue, 19 Nov 2024 13:09:21 +0800 Subject: [PATCH 14/16] Remove Excel --- .../e2e/connector/file/local/LocalFileIT.java | 8 -- .../excel/local_excel_gz_to_assert.conf | 119 ------------------ 2 files changed, 127 deletions(-) delete mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index 0df565b608f..f334571c26a 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -248,13 +248,6 @@ public class LocalFileIT extends TestSuiteBase { "/seatunnel/read/zip/excel/multifile/multiZip.zip", container); - Path xlsxGz = - convertToGzFile( - Lists.newArrayList( - ContainerUtil.getResourcesFile("/excel/e2e.xlsx")), - "e2e-xlsx-gz"); - ContainerUtil.copyFileIntoContainers( - xlsxGz, "/seatunnel/read/gz/excel/single/e2e-xlsx-gz.gz", container); ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", @@ -366,7 +359,6 @@ public void testLocalFileReadAndWrite(TestContainer container) helper.execute("/xml/local_file_gz_xml_to_assert.conf"); // test read single local excel file with zip compression helper.execute("/excel/local_excel_zip_to_assert.conf"); - helper.execute("/excel/local_excel_gz_to_assert.conf"); // test read multi local excel file with zip compression helper.execute("/excel/local_excel_multi_zip_to_assert.conf"); } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf deleted file mode 100644 index 1cc1e615aa3..00000000000 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_gz_to_assert.conf +++ /dev/null @@ -1,119 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -env { - parallelism = 1 - job.mode = "BATCH" - # You can set spark configuration here - spark.app.name = "SeaTunnel" - spark.executor.instances = 2 - spark.executor.cores = 1 - spark.executor.memory = "1g" - spark.master = local - job.mode = "BATCH" -} - -source { - LocalFile { - path = "/seatunnel/read/gz/excel/single/e2e-xlsx-gz.gz" - result_table_name = "fake" - file_format_type = excel - archive_compress_codec = "gz" - field_delimiter = ; - skip_header_row_number = 1 - schema = { - fields { - c_map = "map" - c_array = "array" - c_string = string - c_boolean = boolean - c_tinyint = tinyint - c_smallint = smallint - c_int = int - c_bigint = bigint - c_float = float - c_double = double - c_bytes = bytes - c_date = date - c_decimal = "decimal(38, 18)" - c_timestamp = timestamp - c_row = { - c_map = "map" - c_array = "array" - c_string = string - c_boolean = boolean - c_tinyint = tinyint - c_smallint = smallint - c_int = int - c_bigint = bigint - c_float = float - c_double = double - c_bytes = bytes - c_date = date - c_decimal = "decimal(38, 18)" - c_timestamp = timestamp - } - } - } - } -} - -sink { - Assert { - rules { - row_rules = [ - { - rule_type = MAX_ROW - rule_value = 5 - }, - { - rule_type = MIN_ROW - rule_value = 5 - } - ], - field_rules = [ - { - field_name = c_string - field_type = string - field_value = [ - { - rule_type = NOT_NULL - } - ] - }, - { - field_name = c_boolean - field_type = boolean - field_value = [ - { - rule_type = NOT_NULL - } - ] - }, - { - field_name = c_double - field_type = double - field_value = [ - { - rule_type = NOT_NULL - } - ] - } - ] - } - } -} From 480055b4922b637810ebc5ac9d6aaece3fe3db85 Mon Sep 17 00:00:00 2001 From: zhdech Date: Tue, 19 Nov 2024 18:20:24 +0800 Subject: [PATCH 15/16] format --- .../apache/seatunnel/e2e/connector/file/local/LocalFileIT.java | 1 - 1 file changed, 1 deletion(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index f334571c26a..ed055a3a303 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -248,7 +248,6 @@ public class LocalFileIT extends TestSuiteBase { "/seatunnel/read/zip/excel/multifile/multiZip.zip", container); - ContainerUtil.copyFileIntoContainers( "/orc/e2e.orc", "/seatunnel/read/orc/name=tyrantlucifer/hobby=coding/e2e.orc", From f4f87ed483d7b30e16dbb19e6d3c6d862ad8c0f9 Mon Sep 17 00:00:00 2001 From: zhdech Date: Wed, 20 Nov 2024 22:19:14 +0800 Subject: [PATCH 16/16] Remove support for Excel in gz compression --- docs/en/connector-v2/source/CosFile.md | 2 +- docs/en/connector-v2/source/FtpFile.md | 2 +- docs/en/connector-v2/source/HdfsFile.md | 2 +- docs/en/connector-v2/source/LocalFile.md | 2 +- docs/en/connector-v2/source/OssJindoFile.md | 2 +- docs/en/connector-v2/source/S3File.md | 2 +- docs/en/connector-v2/source/SftpFile.md | 13 +++++++------ 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md index 396725f0639..1cbda880139 100644 --- a/docs/en/connector-v2/source/CosFile.md +++ b/docs/en/connector-v2/source/CosFile.md @@ -343,7 +343,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md index 3872ac0a909..59f3852cb0d 100644 --- a/docs/en/connector-v2/source/FtpFile.md +++ b/docs/en/connector-v2/source/FtpFile.md @@ -328,7 +328,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md index 7495ae3564d..161b0e63183 100644 --- a/docs/en/connector-v2/source/HdfsFile.md +++ b/docs/en/connector-v2/source/HdfsFile.md @@ -144,7 +144,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index 69b2519cb45..8923a031607 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -322,7 +322,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md index de8c38ecadd..1db5d62a441 100644 --- a/docs/en/connector-v2/source/OssJindoFile.md +++ b/docs/en/connector-v2/source/OssJindoFile.md @@ -335,7 +335,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md index e60351a41f9..ba4b71cfe93 100644 --- a/docs/en/connector-v2/source/S3File.md +++ b/docs/en/connector-v2/source/S3File.md @@ -299,7 +299,7 @@ The compress codec of archive files and the details that supported as the follow | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | -| GZ | txt,json,excel,xml | .gz | +| GZ | txt,json,xml | .gz | | NONE | all | .* | ### encoding [string] diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md index 95c710110a0..8e80bedd4b3 100644 --- a/docs/en/connector-v2/source/SftpFile.md +++ b/docs/en/connector-v2/source/SftpFile.md @@ -235,11 +235,12 @@ The compress codec of files and the details that supported as the following show The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | -|------------------------|--------------------|-------------------------| -| ZIP | txt,json,excel,xml | .zip | -| TAR | txt,json,excel,xml | .tar | -| TAR_GZ | txt,json,excel,xml | .tar.gz | -| NONE | all | .* | +|--------------------|--------------------|---------------------| +| ZIP | txt,json,excel,xml | .zip | +| TAR | txt,json,excel,xml | .tar | +| TAR_GZ | txt,json,excel,xml | .tar.gz | +| GZ | txt,json,xml | .gz | +| NONE | all | .* | ### encoding [string] @@ -384,4 +385,4 @@ sink { Console { } } -``` \ No newline at end of file +```