From baa7cb89e40c55159414cc4d4c9c6ad5b2875d82 Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Tue, 22 Oct 2024 17:28:28 +0800
Subject: [PATCH 01/17] [Fix][Doc] Fix LocalFile doc (#7887)

Supplement and optimize the description of how the LocalFile connector filters files [(#7887)](https://github.com/apache/seatunnel/issues/7887)

---
 docs/en/connector-v2/source/LocalFile.md | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index 6d11b992e3a..533d7fa91bf 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -254,6 +254,12 @@ Specifies Whether to process data using the tag attribute format.

 Filter pattern, which used for filtering files.

+The filtering format is similar to wildcard matching file names in Linux.
+
+However, it should be noted that unlike Linux wildcard characters, when encountering file suffixes, the middle dot cannot be omitted.
+
+For example, `abc20241022.csv`, the normal Linux wildcard `abc*` is sufficient, but here we need to use `abc*.*` , Pay attention to a point in the middle.
+
 ### compress_codec [string]

 The compress codec of files and the details that supported as the following shown:

@@ -406,6 +412,29 @@ sink {

 ```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  LocalFile {
+    path = "/seatunnel/read/"
+    file_format_type = "csv"
+    skip_header_row_number = 1
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+sink {
+  Console {
+  }
+}
+```
+
 ## Changelog

### 2.2.0-beta 2022-09-26

From 427668128471aea7fb34dcca10666bd079fe79ea Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Fri, 25 Oct 2024 18:34:23 +0800
Subject: [PATCH 02/17] [Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. When the ClickHouse connector is set to multiple parallelism, the job finishes extracting data but cannot stop normally [(#7897)](https://github.com/apache/seatunnel/issues/7897)
2. Added E2E test cases for this issue [(#7897)](https://github.com/apache/seatunnel/issues/7897)
3. Local developers who want to observe **Job Progress Information** in a timely manner need to modify the following configuration; the corresponding entry in the config directory is invalid

```
seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
```
---
 .../clickhouse/source/ClickhouseSourceReader.java | 13 +++++++++++--
 .../source/ClickhouseSourceSplitEnumerator.java   |  2 ++
 .../src/main/resources/seatunnel.yaml             |  1 +
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java
index 591334d9722..b21519083b7 100644
--- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java
+++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java
@@ -17,6 +17,8 @@
 package org.apache.seatunnel.connectors.seatunnel.clickhouse.source;

+import lombok.extern.slf4j.Slf4j;
+import org.apache.seatunnel.api.source.Boundedness;
 import org.apache.seatunnel.api.source.Collector;
 import org.apache.seatunnel.api.source.SourceReader;
 import org.apache.seatunnel.api.table.type.SeaTunnelRow;
@@ -34,7 +36,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Random;
-
+@Slf4j
 public class ClickhouseSourceReader implements SourceReader {
 private final List servers;
@@ -43,6 +45,7 @@ public class ClickhouseSourceReader implements SourceReader
 request;
 private final String sql;
+ private volatile boolean noMoreSplit;
 private final List splits;
@@ -97,6 +100,12 @@
 record -> {
 }
 this.readerContext.signalNoMoreElement();
 this.splits.clear();
+ } else if (noMoreSplit
+ && splits.isEmpty()
+ && Boundedness.BOUNDED.equals(readerContext.getBoundedness())) {
+ log.info("Closed the bounded ClickHouse source");
+ this.readerContext.signalNoMoreElement();
+ this.splits.clear();
+ }
 }
@@ -111,7 +120,7 @@ public void addSplits(List splits) {
 }

 @Override
- public void handleNoMoreSplits() {}
+ public void handleNoMoreSplits() {noMoreSplit = true;}

 @Override
 public void notifyCheckpointComplete(long checkpointId) throws Exception {}

diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java
index c0eb4b6c706..21937db9a81 100644
--- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java
+++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java
@@ -77,6 +77,8 @@ public void registerReader(int subtaskId) {
 if (assigned < 0) {
 assigned = subtaskId;
 context.assignSplit(subtaskId, new ClickhouseSourceSplit());
+ } else {
+ context.signalNoMoreSplits(subtaskId);
 }
 }

diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
index c9bb71ecc07..8c5a136d1b0 100644
--- a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
+++ b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
@@ -20,6 +20,7 @@ seatunnel:
 backup-count: 1
 queue-type: blockingqueue
 print-execution-info-interval: 60
+ print-job-metrics-info-interval: 60
 slot-service:
 dynamic-slot: true
 checkpoint:

From 1b8066765f2328f4624fb4137b867566f4d2fa69 Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Fri, 25 Oct 2024 22:33:01 +0800
Subject: [PATCH 03/17] [Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. When the ClickHouse connector is set to multiple parallelism, the job finishes extracting data but cannot stop normally [(#7897)](https://github.com/apache/seatunnel/issues/7897)
2. Added E2E test cases for this issue [(#7897)](https://github.com/apache/seatunnel/issues/7897)
3. Local developers who want to observe **Job Progress Information** in a timely manner need to modify the following configuration; the corresponding entry in the config directory is invalid

```
seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
```
---
 .../source/ClickhouseSourceReader.java | 7 ++-
.../seatunnel/clickhouse/ClickhouseIT.java | 7 +++ .../test/resources/clickhouse_to_console.conf | 45 +++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java index b21519083b7..fd3c6300b5d 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java @@ -17,7 +17,6 @@ package org.apache.seatunnel.connectors.seatunnel.clickhouse.source; -import lombok.extern.slf4j.Slf4j; import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.Collector; import org.apache.seatunnel.api.source.SourceReader; @@ -30,12 +29,14 @@ import com.clickhouse.client.ClickHouseNode; import com.clickhouse.client.ClickHouseRequest; import com.clickhouse.client.ClickHouseResponse; +import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; + @Slf4j public class ClickhouseSourceReader implements SourceReader { @@ -120,7 +121,9 @@ public void addSplits(List splits) { } @Override - public void handleNoMoreSplits() {noMoreSplit = true;} + public void handleNoMoreSplits() { + noMoreSplit = true; + } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception {} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java index 66ee281740c..3e8b0a090fc 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java @@ -101,6 +101,13 @@ public void testClickhouse(TestContainer container) throws Exception { clearSinkTable(); } + @TestTemplate + public void testSourceParallelism(TestContainer container) throws Exception { + System.out.println("=========多并行度测试==========="); + Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); + System.out.println(execResult.getExitCode()); + } + @BeforeAll @Override public void startUp() throws Exception { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf new file mode 100644 index 00000000000..755131276ba --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or 
more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+######
+###### This config file is a demonstration of streaming processing in seatunnel config
+######
+
+env {
+  parallelism = 2
+  job.mode = "BATCH"
+}
+
+source {
+  # This is a example source plugin **only for test and demonstrate the feature source plugin**
+  Clickhouse {
+    host = "clickhouse:8123"
+    database = "default"
+    sql = "select * from source_table"
+    username = "default"
+    password = ""
+    result_table_name = "source_table"
+  }
+  # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+  # please go to https://seatunnel.apache.org/docs/connector-v2/source/ClickhouseSource
+}
+
+sink {
+  console {
+  }
+  # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+  # please go to https://seatunnel.apache.org/docs/connector-v2/sink
+}
\ No newline at end of file

From e64b8a67ddd8ae6730813c66cb2e929e2768d6eb Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Sat, 26 Oct 2024 12:02:28 +0800
Subject: [PATCH 04/17] [Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. When the ClickHouse connector is set to multiple parallelism, the job finishes extracting data but cannot stop normally [(#7897)](https://github.com/apache/seatunnel/issues/7897)
2. Added E2E test cases for this issue [(#7897)](https://github.com/apache/seatunnel/issues/7897)
3. Local developers who want to observe **Job Progress Information** in a timely manner need to modify the following configuration; the corresponding entry in the config directory is invalid

```
seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml
```
---
 .../connectors/seatunnel/clickhouse/ClickhouseIT.java | 4 ++--
 .../src/test/resources/clickhouse_to_console.conf     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java
index 3e8b0a090fc..e0a09dc56dc 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java
@@ -103,9 +103,9 @@ public void testClickhouse(TestContainer container) throws Exception {

 @TestTemplate
 public void testSourceParallelism(TestContainer container) throws Exception {
- System.out.println("=========多并行度测试===========");
+
LOG.info("=========多并行度测试==========="); Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); - System.out.println(execResult.getExitCode()); + Assertions.assertEquals(0, execResult.getExitCode()); } @BeforeAll diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf index 755131276ba..e996be8e4a2 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf @@ -19,7 +19,7 @@ ###### env { - parallelism = 2 + parallelism = 3 job.mode = "BATCH" } From 42e591925056841d1299effa0112d8134d5d3070 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Sat, 26 Oct 2024 14:21:46 +0800 Subject: [PATCH 05/17] [Fix][Doc] Fix LocalFile doc (#7887) Continue to optimize the document about filtering files and add some examples [(#7887)](https://github.com/apache/seatunnel/issues/7887) --- docs/en/connector-v2/source/LocalFile.md | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index 533d7fa91bf..077537f6887 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -256,10 +256,70 @@ Filter pattern, which used for filtering files. The filtering format is similar to wildcard matching file names in Linux. +| Wildcard | Meaning | Example | +|--------------|--------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| * | Match 0 or more characters | f*     Any file starting with f
b*.txt   Any file starting with b, any character in the middle, and ending with. txt | +| [] | Match a single character in parentheses | [abc]*   A file that starts with any one of the characters a, b, or c | +| ? | Match any single character | f?.txt   Any file starting with 'f' followed by a character and ending with '. txt' | +| [!] | Match any single character not in parentheses | [!abc]*   Any file that does not start with abc | +| [a-z] | Match any single character from a to z | [a-z]*   Any file starting with a to z | +| {a,b,c}/a..z | When separated by commas, it represents individual characters
When separated by two dots, represents continuous characters | {a,b,c}*   Files starting with any character from abc
{a..Z}*    Files starting with any character from a to z | + However, it should be noted that unlike Linux wildcard characters, when encountering file suffixes, the middle dot cannot be omitted. For example, `abc20241022.csv`, the normal Linux wildcard `abc*` is sufficient, but here we need to use `abc*.*` , Pay attention to a point in the middle. +File Structure Example: +``` +report.txt +notes.txt +input.csv +abch20241022.csv +abcw20241022.csv +abcx20241022.csv +abcq20241022.csv +abcg20241022.csv +abcv20241022.csv +abcb20241022.csv +old_data.csv +logo.png +script.sh +helpers.sh +``` +Matching Rules Example: + +**Example 1**: *Match all .txt files*,Regular Expression: +``` +*.txt +``` +The result of this example matching is: +``` +report.txt +notes.txt +``` +**Example 2**: *Match all Any file starting with abc*,Regular Expression: +``` +abc*.csv +``` +The result of this example matching is: +``` +abch20241022.csv +abcw20241022.csv +abcx20241022.csv +abcq20241022.csv +abcg20241022.csv +abcv20241022.csv +abcb20241022.csv +``` +**Example 3**: *Match all Any file starting with abc,And the fourth character is either x or g*, the Regular Expression: +``` +abc[x,g]*.csv +``` +The result of this example matching is: +``` +abcx20241022.csv +abcg20241022.csv +``` ### compress_codec [string] The compress codec of files and the details that supported as the following shown: From 2e9162d61e5e7074daee1e3a053a795fad8f2d40 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Sat, 26 Oct 2024 21:44:17 +0800 Subject: [PATCH 06/17] [Fix][Doc] Fix LocalFile doc (#7887) Change to English log [(#7887)](https://github.com/apache/seatunnel/issues/7887) --- .../seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java index e0a09dc56dc..b4093fd4f12 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java @@ -103,7 +103,7 @@ public void testClickhouse(TestContainer container) throws Exception { @TestTemplate public void testSourceParallelism(TestContainer container) throws Exception { - LOG.info("=========多并行度测试==========="); + LOG.info("=========Multi parallelism testing begins==========="); Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); Assertions.assertEquals(0, execResult.getExitCode()); } From e564e7fe63750639d9778a7bc6e6c64037c3f2bf Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Sun, 27 Oct 2024 00:08:14 +0800 Subject: [PATCH 07/17] Revert "[Fix][Doc] Fix LocalFile doc (#7887)" This reverts commit 2e9162d61e5e7074daee1e3a053a795fad8f2d40. 
--- .../seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java index b4093fd4f12..e0a09dc56dc 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java @@ -103,7 +103,7 @@ public void testClickhouse(TestContainer container) throws Exception { @TestTemplate public void testSourceParallelism(TestContainer container) throws Exception { - LOG.info("=========Multi parallelism testing begins==========="); + LOG.info("=========多并行度测试==========="); Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); Assertions.assertEquals(0, execResult.getExitCode()); } From c5bcdf7262ca7f51e812d75cffcdfb7132686a91 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Sun, 27 Oct 2024 00:08:31 +0800 Subject: [PATCH 08/17] Revert "[Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)" This reverts commit e64b8a67ddd8ae6730813c66cb2e929e2768d6eb. --- .../connectors/seatunnel/clickhouse/ClickhouseIT.java | 4 ++-- .../src/test/resources/clickhouse_to_console.conf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java index e0a09dc56dc..3e8b0a090fc 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java @@ -103,9 +103,9 @@ public void testClickhouse(TestContainer container) throws Exception { @TestTemplate public void testSourceParallelism(TestContainer container) throws Exception { - LOG.info("=========多并行度测试==========="); + System.out.println("=========多并行度测试==========="); Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); - Assertions.assertEquals(0, execResult.getExitCode()); + System.out.println(execResult.getExitCode()); } @BeforeAll diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf index e996be8e4a2..755131276ba 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf @@ -19,7 +19,7 @@ ###### env { - parallelism = 3 + parallelism = 2 job.mode = "BATCH" } From d02a01b5cc5233e2ee962553c758adf3a73ab381 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> 
Date: Sun, 27 Oct 2024 00:09:10 +0800 Subject: [PATCH 09/17] Revert "[Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)" This reverts commit 1b8066765f2328f4624fb4137b867566f4d2fa69. --- .../source/ClickhouseSourceReader.java | 7 +-- .../seatunnel/clickhouse/ClickhouseIT.java | 7 --- .../test/resources/clickhouse_to_console.conf | 45 ------------------- 3 files changed, 2 insertions(+), 57 deletions(-) delete mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java index fd3c6300b5d..b21519083b7 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java @@ -17,6 +17,7 @@ package org.apache.seatunnel.connectors.seatunnel.clickhouse.source; +import lombok.extern.slf4j.Slf4j; import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.Collector; import org.apache.seatunnel.api.source.SourceReader; @@ -29,14 +30,12 @@ import com.clickhouse.client.ClickHouseNode; import com.clickhouse.client.ClickHouseRequest; import com.clickhouse.client.ClickHouseResponse; -import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; - @Slf4j public class ClickhouseSourceReader implements SourceReader { @@ -121,9 +120,7 @@ public void addSplits(List splits) { } @Override - public void handleNoMoreSplits() { - noMoreSplit = true; - } + public void handleNoMoreSplits() {noMoreSplit = true;} @Override public void notifyCheckpointComplete(long checkpointId) throws Exception {} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java index 3e8b0a090fc..66ee281740c 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/ClickhouseIT.java @@ -101,13 +101,6 @@ public void testClickhouse(TestContainer container) throws Exception { clearSinkTable(); } - @TestTemplate - public void testSourceParallelism(TestContainer container) throws Exception { - System.out.println("=========多并行度测试==========="); - Container.ExecResult execResult = container.executeJob("/clickhouse_to_console.conf"); - System.out.println(execResult.getExitCode()); - } - @BeforeAll @Override public void startUp() throws Exception { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf deleted file mode 100644 
index 755131276ba..00000000000 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-clickhouse-e2e/src/test/resources/clickhouse_to_console.conf +++ /dev/null @@ -1,45 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -###### -###### This config file is a demonstration of streaming processing in seatunnel config -###### - -env { - parallelism = 2 - job.mode = "BATCH" -} - -source { - # This is a example source plugin **only for test and demonstrate the feature source plugin** - Clickhouse { - host = "clickhouse:8123" - database = "default" - sql = "select * from source_table" - username = "default" - password = "" - result_table_name = "source_table" - } - # If you would like to get more information about how to configure seatunnel and see full list of source plugins, - # please go to https://seatunnel.apache.org/docs/connector-v2/source/ClickhouseSource -} - -sink { - console { - } - # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, - # please go to https://seatunnel.apache.org/docs/connector-v2/sink -} \ No newline at end of file From 52ee377ddfc4421282e810f8bd4bda4ecbefd3ef Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Sun, 27 Oct 2024 00:09:16 +0800 Subject: [PATCH 10/17] Revert "[Fix][Connector-V2][ClickHouse] Fix ClickHouse Bug (#7897)" This reverts commit 427668128471aea7fb34dcca10666bd079fe79ea. 
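For reference, the shutdown handshake that this revert removes worked roughly as follows (a minimal sketch with illustrative names and simplified signatures, not the real SeaTunnel `SourceReader` API):

```java
import java.util.ArrayList;
import java.util.List;

// Sketch of the bounded-reader stop condition from the diffs above.
public class BoundedReaderSketch {
    // Set once the enumerator has no more splits to hand out.
    private volatile boolean noMoreSplit;
    private final List<String> splits = new ArrayList<>();

    // Enumerator callback: every reader must eventually hear this,
    // including readers that never received a split.
    public void handleNoMoreSplits() {
        noMoreSplit = true;
    }

    // Polling loop: a bounded reader may only signal "no more elements"
    // once it has drained its splits AND been told none are coming;
    // otherwise idle readers spin forever and the batch job never stops,
    // which is the symptom described in #7897.
    public void pollNext(Runnable signalNoMoreElement) {
        if (noMoreSplit && splits.isEmpty()) {
            signalNoMoreElement.run();
        }
    }
}
```

The enumerator side pairs this with a `signalNoMoreSplits(subtaskId)` call for every reader that registers but receives no split, which is what the `else` branch added to `registerReader` above did.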
--- .../clickhouse/source/ClickhouseSourceReader.java | 13 ++----------- .../source/ClickhouseSourceSplitEnumerator.java | 2 -- .../src/main/resources/seatunnel.yaml | 1 - 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java index b21519083b7..591334d9722 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java @@ -17,8 +17,6 @@ package org.apache.seatunnel.connectors.seatunnel.clickhouse.source; -import lombok.extern.slf4j.Slf4j; -import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.Collector; import org.apache.seatunnel.api.source.SourceReader; import org.apache.seatunnel.api.table.type.SeaTunnelRow; @@ -36,7 +34,7 @@ import java.util.Collections; import java.util.List; import java.util.Random; -@Slf4j + public class ClickhouseSourceReader implements SourceReader { private final List servers; @@ -45,7 +43,6 @@ public class ClickhouseSourceReader implements SourceReader request; private final String sql; - private volatile boolean noMoreSplit; private final List splits; @@ -100,12 +97,6 @@ record -> { } this.readerContext.signalNoMoreElement(); this.splits.clear(); - } else if (noMoreSplit - && splits.isEmpty() - && Boundedness.BOUNDED.equals(readerContext.getBoundedness())) { - log.info("Closed the bounded ClickHouse source"); - this.readerContext.signalNoMoreElement(); - this.splits.clear(); } } @@ -120,7 +111,7 @@ public void addSplits(List splits) { } @Override - public void handleNoMoreSplits() {noMoreSplit = true;} + public void handleNoMoreSplits() {} @Override public void notifyCheckpointComplete(long checkpointId) throws Exception {} diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java index 21937db9a81..c0eb4b6c706 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceSplitEnumerator.java @@ -77,8 +77,6 @@ public void registerReader(int subtaskId) { if (assigned < 0) { assigned = subtaskId; context.assignSplit(subtaskId, new ClickhouseSourceSplit()); - } else { - context.signalNoMoreSplits(subtaskId); } } diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml index 8c5a136d1b0..c9bb71ecc07 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml +++ b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml @@ -20,7 +20,6 @@ seatunnel: backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 - 
print-job-metrics-info-interval: 60
 slot-service:
 dynamic-slot: true
 checkpoint:

From 0062ba4ece2da3dafec3f465abdc2fc66699612b Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Mon, 28 Oct 2024 17:10:26 +0800
Subject: [PATCH 11/17] [Fix][Doc] Fix LocalFile Doc

Optimize the description of the regex-based file filtering

---
 docs/en/connector-v2/source/LocalFile.md | 72 +++++++++---------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index 077537f6887..f16cf1cf545 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -254,72 +254,54 @@ Specifies Whether to process data using the tag attribute format.

 Filter pattern, which used for filtering files.

-The filtering format is similar to wildcard matching file names in Linux.
-
-| Wildcard | Meaning | Example |
-|--------------|--------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|
-| * | Match 0 or more characters | f*     Any file starting with f
b*.txt   Any file starting with b, any character in the middle, and ending with. txt | -| [] | Match a single character in parentheses | [abc]*   A file that starts with any one of the characters a, b, or c | -| ? | Match any single character | f?.txt   Any file starting with 'f' followed by a character and ending with '. txt' | -| [!] | Match any single character not in parentheses | [!abc]*   Any file that does not start with abc | -| [a-z] | Match any single character from a to z | [a-z]*   Any file starting with a to z | -| {a,b,c}/a..z | When separated by commas, it represents individual characters
When separated by two dots, represents continuous characters | {a,b,c}*   Files starting with any character from abc
{a..Z}*    Files starting with any character from a to z | - -However, it should be noted that unlike Linux wildcard characters, when encountering file suffixes, the middle dot cannot be omitted. - -For example, `abc20241022.csv`, the normal Linux wildcard `abc*` is sufficient, but here we need to use `abc*.*` , Pay attention to a point in the middle. +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. learn it File Structure Example: ``` -report.txt -notes.txt -input.csv -abch20241022.csv -abcw20241022.csv -abcx20241022.csv -abcq20241022.csv -abcg20241022.csv -abcv20241022.csv -abcb20241022.csv -old_data.csv -logo.png -script.sh -helpers.sh +/data/seatunnel/20241001/report.txt +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +/data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` -*.txt +/data/seatunnel/202410\d*/.*.txt ``` The result of this example matching is: ``` -report.txt -notes.txt +/data/seatunnel/20241001/report.txt ``` -**Example 2**: *Match all Any file starting with abc*,Regular Expression: +**Example 2**: *Match all file starting with abc*,Regular Expression: ``` -abc*.csv +/data/seatunnel/202410\d*/abc.* ``` The result of this example matching is: ``` -abch20241022.csv -abcw20241022.csv -abcx20241022.csv -abcq20241022.csv -abcg20241022.csv -abcv20241022.csv -abcb20241022.csv +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv ``` -**Example 3**: *Match all Any file starting with abc,And the fourth character is either x or g*, the Regular Expression: +**Example 3**: *Match all file starting with abc,And the fourth character is either x or g*, the Regular Expression: ``` -abc[x,g]*.csv +/data/seatunnel/20241002/abc[x,g].* ``` The result of this example matching is: ``` -abcx20241022.csv -abcg20241022.csv +/data/seatunnel/20241002/abcg202410.csv +``` +**Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: +``` +/data/seatunnel/202410\d*/.*.csv ``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +``` + ### compress_codec [string] The compress codec of files and the details that supported as the following shown: @@ -482,7 +464,7 @@ env { source { LocalFile { - path = "/seatunnel/read/" + path = "/data/seatunnel/" file_format_type = "csv" skip_header_row_number = 1 // file example abcD2024.csv From eae7b144ec42488117c42d5e75a833c44633de74 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Tue, 29 Oct 2024 10:53:11 +0800 Subject: [PATCH 12/17] [Fix][DOC] LocalFile doc optimize Optimize document structure --- docs/en/connector-v2/source/LocalFile.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index f16cf1cf545..379f4ebffb2 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -254,9 +254,7 @@ Specifies Whether to process data using the tag attribute format. Filter pattern, which used for filtering files. -The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. 
learn it - -File Structure Example: +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv From d542c11270ab02517fdac0ed080df31a7271966e Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Tue, 29 Oct 2024 11:37:26 +0800 Subject: [PATCH 13/17] [Fix][DOC] LocalFile doc optimize Optimize document structure --- docs/en/connector-v2/source/LocalFile.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index 379f4ebffb2..047f219aa7a 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -255,6 +255,9 @@ Specifies Whether to process data using the tag attribute format. Filter pattern, which used for filtering files. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. +There are some examples. + +File Structure Example: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv From 867a840cfec4ac88373587d5ed8d7782361e2047 Mon Sep 17 00:00:00 2001 From: "YOMO.LEE" <75362129@qq.com> Date: Tue, 29 Oct 2024 15:42:01 +0800 Subject: [PATCH 14/17] [Fix][DOC] Additional explanation for the file_filter_pattern parameter Please provide a description of all connectors that support the file_filter_pattern parameter --- docs/en/connector-v2/source/CosFile.md | 124 ++++++++++++++++---- docs/en/connector-v2/source/HdfsFile.md | 79 ++++++++++++- docs/en/connector-v2/source/LocalFile.md | 55 ++++----- docs/en/connector-v2/source/OssFile.md | 76 ++++++++++++ docs/en/connector-v2/source/OssJindoFile.md | 124 ++++++++++++++++---- 5 files changed, 382 insertions(+), 76 deletions(-) diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md index 702439c3062..4b0675b9893 100644 --- a/docs/en/connector-v2/source/CosFile.md +++ b/docs/en/connector-v2/source/CosFile.md @@ -45,30 +45,30 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and ## Options -| name | type | required | default value | -|---------------------------|---------|----------|---------------------| -| path | string | yes | - | -| file_format_type | string | yes | - | -| bucket | string | yes | - | -| secret_id | string | yes | - | -| secret_key | string | yes | - | -| region | string | yes | - | -| read_columns | list | yes | - | -| delimiter/field_delimiter | string | no | \001 | -| parse_partition_from_path | boolean | no | true | -| skip_header_row_number | long | no | 0 | -| date_format | string | no | yyyy-MM-dd | -| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | -| time_format | string | no | HH:mm:ss | -| schema | config | no | - | -| sheet_name | string | no | - | -| xml_row_tag | string | no | - | -| xml_use_attr_format | boolean | no | - | -| file_filter_pattern | string | no | - | -| compress_codec | string | no | none | -| archive_compress_codec | string | no | none | -| encoding | string | no | UTF-8 | -| common-options | | no | - | +| name | type | required | default value | +|---------------------------|---------|----------|------------------------------------------------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| bucket | string | yes | - | +| secret_id | string | yes | - | +| secret_key | string | yes | - | +| region | string | 
yes | - | +| read_columns | list | yes | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| skip_header_row_number | long | no | 0 | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | `*.txt` means you only need read the files end with `.txt` | +| compress_codec | string | no | none | +| archive_compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | ### path [string] @@ -275,6 +275,55 @@ Specifies Whether to process data using the tag attribute format. Filter pattern, which used for filtering files. +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. +There are some examples. + +File Structure Example: +``` +/data/seatunnel/20241001/report.txt +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +/data/seatunnel/20241012/logo.png +``` +Matching Rules Example: + +**Example 1**: *Match all .txt files*,Regular Expression: +``` +/data/seatunnel/20241001/.*\.txt +``` +The result of this example matching is: +``` +/data/seatunnel/20241001/report.txt +``` +**Example 2**: *Match all file starting with abc*,Regular Expression: +``` +/data/seatunnel/20241002/abc.* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +``` +**Example 3**: *Match all file starting with abc,And the fourth character is either h or g*, the Regular Expression: +``` +/data/seatunnel/20241007/abc[h,g].* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +``` +**Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: +``` +/data/seatunnel/202410\d*/.*\.csv +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +``` + ### compress_codec [string] The compress codec of files and the details that supported as the following shown: @@ -372,6 +421,33 @@ sink { ``` +### Filter File + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + CosFile { + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + path = "/seatunnel/read/binary/" + file_format_type = "binary" + // file example abcD2024.csv + file_filter_pattern = "abc[DX]*.*" + } +} + +sink { + Console { + } +} +``` + ## Changelog ### next version diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md index 7413c0428b8..300d96f0f16 100644 --- a/docs/en/connector-v2/source/HdfsFile.md +++ b/docs/en/connector-v2/source/HdfsFile.md @@ -41,7 +41,7 @@ Read data from hdfs file system. 
## Source Options -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The source file path. | | file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | @@ -62,6 +62,7 @@ Read data from hdfs file system. | sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | | xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | +| file_filter_pattern | string | no | | `*.txt` means you only need read the files end with `.txt` | | compress_codec | string | no | none | The compress codec of files | | archive_compress_codec | string | no | none | | encoding | string | no | UTF-8 | | @@ -71,6 +72,59 @@ Read data from hdfs file system. **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. +There are some examples. 
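+Before the examples, note that such a pattern can be sanity-checked with plain `java.util.regex` before it goes into a job config. The snippet below is only an illustration (it is not connector code, and it assumes the pattern is matched against the full file path, as in the examples that follow):
+
+```java
+import java.util.regex.Pattern;
+
+public class FilterPatternCheck {
+    public static void main(String[] args) {
+        // "\." in the doc examples becomes "\\." inside a Java string literal.
+        Pattern pattern = Pattern.compile("/data/seatunnel/20241001/.*\\.txt");
+        System.out.println(pattern.matcher("/data/seatunnel/20241001/report.txt").matches()); // true
+        System.out.println(pattern.matcher("/data/seatunnel/20241012/logo.png").matches());   // false
+    }
+}
+```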
+ +File Structure Example: +``` +/data/seatunnel/20241001/report.txt +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +/data/seatunnel/20241012/logo.png +``` +Matching Rules Example: + +**Example 1**: *Match all .txt files*,Regular Expression: +``` +/data/seatunnel/20241001/.*\.txt +``` +The result of this example matching is: +``` +/data/seatunnel/20241001/report.txt +``` +**Example 2**: *Match all file starting with abc*,Regular Expression: +``` +/data/seatunnel/20241002/abc.* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +``` +**Example 3**: *Match all file starting with abc,And the fourth character is either h or g*, the Regular Expression: +``` +/data/seatunnel/20241007/abc[h,g].* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +``` +**Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: +``` +/data/seatunnel/202410\d*/.*\.csv +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +``` + ### compress_codec [string] The compress codec of files and the details that supported as the following shown: @@ -146,3 +200,26 @@ sink { } ``` +### Filter File + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + HdfsFile { + path = "/apps/hive/demo/student" + file_format_type = "json" + fs.defaultFS = "hdfs://namenode001" + // file example abcD2024.csv + file_filter_pattern = "abc[DX]*.*" + } +} + +sink { + Console { + } +} +``` diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index 047f219aa7a..fb0e8de9139 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -43,27 +43,27 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you ## Options -| name | type | required | default value | -|---------------------------|---------|----------|--------------------------------------| -| path | string | yes | - | -| file_format_type | string | yes | - | -| read_columns | list | no | - | -| delimiter/field_delimiter | string | no | \001 | -| parse_partition_from_path | boolean | no | true | -| date_format | string | no | yyyy-MM-dd | -| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | -| time_format | string | no | HH:mm:ss | -| skip_header_row_number | long | no | 0 | -| schema | config | no | - | -| sheet_name | string | no | - | -| xml_row_tag | string | no | - | -| xml_use_attr_format | boolean | no | - | -| file_filter_pattern | string | no | - | -| compress_codec | string | no | none | -| archive_compress_codec | string | no | none | -| encoding | string | no | UTF-8 | -| common-options | | no | - | -| tables_configs | list | no | used to define a multiple table task | +| name | type | required | default value | +|---------------------------|---------|----------|------------------------------------------------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| read_columns | list | no | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| 
skip_header_row_number | long | no | 0 | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | `*.txt` means you only need read the files end with `.txt` | +| compress_codec | string | no | none | +| archive_compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | +| tables_configs | list | no | used to define a multiple table task | ### path [string] @@ -269,7 +269,7 @@ Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` -/data/seatunnel/202410\d*/.*.txt +/data/seatunnel/20241001/.*\.txt ``` The result of this example matching is: ``` @@ -277,24 +277,24 @@ The result of this example matching is: ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` -/data/seatunnel/202410\d*/abc.* +/data/seatunnel/20241002/abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` -**Example 3**: *Match all file starting with abc,And the fourth character is either x or g*, the Regular Expression: +**Example 3**: *Match all file starting with abc,And the fourth character is either h or g*, the Regular Expression: ``` -/data/seatunnel/20241002/abc[x,g].* +/data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` -/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` -/data/seatunnel/202410\d*/.*.csv +/data/seatunnel/202410\d*/.*\.csv ``` The result of this example matching is: ``` @@ -472,6 +472,7 @@ source { file_filter_pattern = "abc[DX]*.*" } } + sink { Console { } diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md index d5326cb86a4..01bbde41c1e 100644 --- a/docs/en/connector-v2/source/OssFile.md +++ b/docs/en/connector-v2/source/OssFile.md @@ -233,6 +233,55 @@ The encoding of the file to read. This param will be parsed by `Charset.forName( Filter pattern, which used for filtering files. +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. +There are some examples. 
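+One detail worth checking in these patterns is the escaped dot: in a regular expression an unescaped `.` matches any character, so `.*.csv` is looser than it looks, while `.*\.csv` requires a literal dot before the suffix. A small illustration (plain Java, not connector code; the `\d{8}` date folder is just a compact alternative to the `202410\d*` style used in the examples):
+
+```java
+public class DotEscapeCheck {
+    public static void main(String[] args) {
+        // Strict form: "\\." pins the literal dot before the suffix.
+        System.out.println("/data/seatunnel/20241005/old_data.csv"
+                .matches("/data/seatunnel/\\d{8}/.*\\.csv")); // true
+        // Loose form: the unescaped dot means "any character", so a file
+        // name with no dot at all can still slip through.
+        System.out.println("/data/seatunnel/20241005/old_dataXcsv"
+                .matches("/data/seatunnel/\\d{8}/.*.csv"));   // true
+        System.out.println("/data/seatunnel/20241005/old_dataXcsv"
+                .matches("/data/seatunnel/\\d{8}/.*\\.csv")); // false
+    }
+}
+```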
+ +File Structure Example: +``` +/data/seatunnel/20241001/report.txt +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +/data/seatunnel/20241012/logo.png +``` +Matching Rules Example: + +**Example 1**: *Match all .txt files*,Regular Expression: +``` +/data/seatunnel/20241001/.*\.txt +``` +The result of this example matching is: +``` +/data/seatunnel/20241001/report.txt +``` +**Example 2**: *Match all file starting with abc*,Regular Expression: +``` +/data/seatunnel/20241002/abc.* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +``` +**Example 3**: *Match all file starting with abc,And the fourth character is either h or g*, the Regular Expression: +``` +/data/seatunnel/20241007/abc[h,g].* +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +``` +**Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: +``` +/data/seatunnel/202410\d*/.*\.csv +``` +The result of this example matching is: +``` +/data/seatunnel/20241007/abch202410.csv +/data/seatunnel/20241002/abcg202410.csv +/data/seatunnel/20241005/old_data.csv +``` + ### schema [config] Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). @@ -474,6 +523,33 @@ sink { } ``` +### Filter File + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + OssFile { + path = "/seatunnel/orc" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "orc" + // file example abcD2024.csv + file_filter_pattern = "abc[DX]*.*" + } +} + +sink { + Console { + } +} +``` + ## Changelog ### 2.2.0-beta 2022-09-26 diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md index d5bd6d14fa3..97e6981aea0 100644 --- a/docs/en/connector-v2/source/OssJindoFile.md +++ b/docs/en/connector-v2/source/OssJindoFile.md @@ -49,30 +49,30 @@ It only supports hadoop version **2.9.X+**. 
## Options -| name | type | required | default value | -|---------------------------|---------|----------|---------------------| -| path | string | yes | - | -| file_format_type | string | yes | - | -| bucket | string | yes | - | -| access_key | string | yes | - | -| access_secret | string | yes | - | -| endpoint | string | yes | - | -| read_columns | list | no | - | -| delimiter/field_delimiter | string | no | \001 | -| parse_partition_from_path | boolean | no | true | -| date_format | string | no | yyyy-MM-dd | -| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | -| time_format | string | no | HH:mm:ss | -| skip_header_row_number | long | no | 0 | -| schema | config | no | - | -| sheet_name | string | no | - | -| xml_row_tag | string | no | - | -| xml_use_attr_format | boolean | no | - | -| file_filter_pattern | string | no | - | -| compress_codec | string | no | none | -| archive_compress_codec | string | no | none | -| encoding | string | no | UTF-8 | -| common-options | | no | - | +| name | type | required | default value | +|---------------------------|---------|----------|------------------------------------------------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| bucket | string | yes | - | +| access_key | string | yes | - | +| access_secret | string | yes | - | +| endpoint | string | yes | - | +| read_columns | list | no | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| skip_header_row_number | long | no | 0 | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | `*.txt` means you only need read the files end with `.txt` | +| compress_codec | string | no | none | +| archive_compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | ### path [string] @@ -267,6 +267,55 @@ Reader the sheet of the workbook. Filter pattern, which used for filtering files. +The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. +There are some examples. 
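+Earlier revisions of these docs described `file_filter_pattern` in terms of Linux-style wildcards, so it may help to contrast glob and regex semantics; these docs now specify the regex form. A quick illustration with `java.nio.file.PathMatcher` (standalone code, not part of the connector):
+
+```java
+import java.nio.file.FileSystems;
+import java.nio.file.Path;
+import java.nio.file.PathMatcher;
+
+public class GlobVsRegex {
+    public static void main(String[] args) {
+        Path path = Path.of("/data/seatunnel/20241007/abch202410.csv");
+        // Glob: '*' stays inside one path segment and '.' is a literal.
+        PathMatcher glob = FileSystems.getDefault()
+                .getPathMatcher("glob:/data/seatunnel/*/abc*.csv");
+        // Regex: '.' is a metacharacter, so the dot must be escaped.
+        PathMatcher regex = FileSystems.getDefault()
+                .getPathMatcher("regex:/data/seatunnel/\\d+/abc.*\\.csv");
+        System.out.println(glob.matches(path));  // true
+        System.out.println(regex.matches(path)); // true
+    }
+}
+```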
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

The compress codec of files and the details that supported as the following shown:
@@ -364,6 +413,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  OssJindoFile {
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    path = "/seatunnel/read/binary/"
+    file_format_type = "binary"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### next version

From c1e8f09afed0dc662be2a3f2b4c70f51181c8b6c Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Tue, 29 Oct 2024 15:42:01 +0800
Subject: [PATCH 15/17] [Fix][DOC] Additional explanation for the file_filter_pattern parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added file_filter_pattern descriptions for the following file connectors:
CosFile (en), OssFile (en), OssJindoFile (en), HdfsFile (en)
---
 docs/en/connector-v2/source/CosFile.md      | 124 ++++++++++++++++----
 docs/en/connector-v2/source/HdfsFile.md     |  79 ++++++++++++-
 docs/en/connector-v2/source/LocalFile.md    |  55 ++++-----
 docs/en/connector-v2/source/OssFile.md      |  76 ++++++++++++
 docs/en/connector-v2/source/OssJindoFile.md | 124 ++++++++++++++++----
 5 files changed, 382 insertions(+), 76 deletions(-)

diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md
index 702439c3062..4b0675b9893 100644
--- a/docs/en/connector-v2/source/CosFile.md
+++ b/docs/en/connector-v2/source/CosFile.md
@@ -45,30 +45,30 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|---------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| secret_id | string | yes | - |
-| secret_key | string | yes | - |
-| region | string | yes | - |
-| read_columns | list | yes | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| skip_header_row_number | long | no | 0 |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
+| name | type | required | default value |
+|---------------------------|---------|----------|------------------------------------------------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| bucket | string | yes | - |
+| secret_id | string | yes | - |
+| secret_key | string | yes | - |
+| region | string | yes | - |
+| read_columns | list | yes | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| skip_header_row_number | long | no | 0 |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |

### path [string]

@@ -275,6 +275,55 @@ Specifies Whether to process data using the tag attribute format.

Filter pattern, which is used for filtering files.

+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

The compress codec of files and the details that supported as the following shown:
@@ -372,6 +421,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  CosFile {
+    bucket = "cosn://seatunnel-test-1259587829"
+    secret_id = "xxxxxxxxxxxxxxxxxxx"
+    secret_key = "xxxxxxxxxxxxxxxxxxx"
+    region = "ap-chengdu"
+    path = "/seatunnel/read/binary/"
+    file_format_type = "binary"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### next version

diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index 7413c0428b8..300d96f0f16 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -41,7 +41,7 @@ Read data from hdfs file system.

## Source Options

-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The source file path. |
| file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. |
@@ -62,6 +62,7 @@ Read data from hdfs file system.
| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. |
| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. |
| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. |
+| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |
| compress_codec | string | no | none | The compress codec of files |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 | |
@@ -71,6 +72,59 @@ Read data from hdfs file system.

**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.

+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

The compress codec of files and the details that supported as the following shown:
@@ -146,3 +200,26 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  HdfsFile {
+    path = "/apps/hive/demo/student"
+    file_format_type = "json"
+    fs.defaultFS = "hdfs://namenode001"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index 047f219aa7a..fb0e8de9139 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -43,27 +43,27 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|--------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| read_columns | list | no | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
-| tables_configs | list | no | used to define a multiple table task |
+| name | type | required | default value |
+|---------------------------|---------|----------|------------------------------------------------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| read_columns | list | no | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| skip_header_row_number | long | no | 0 |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |
+| tables_configs | list | no | used to define a multiple table task |

### path [string]

@@ -269,7 +269,7 @@ Matching Rules Example:

**Example 1**: *Match all .txt files*. Regular expression:
```
-/data/seatunnel/202410\d*/.*.txt
+/data/seatunnel/20241001/.*\.txt
```
This example matches:
```
/data/seatunnel/20241001/report.txt
```
**Example 2**: *Match all files starting with abc*. Regular expression:
```
-/data/seatunnel/202410\d*/abc.*
+/data/seatunnel/20241002/abc.*
```
This example matches:
```
/data/seatunnel/20241007/abch202410.csv
/data/seatunnel/20241002/abcg202410.csv
```
-**Example 3**: *Match all file starting with abc,And the fourth character is either x or g*, the Regular Expression:
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
```
-/data/seatunnel/20241002/abc[x,g].*
+/data/seatunnel/20241007/abc[hg].*
```
This example matches:
```
-/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241007/abch202410.csv
```
**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
```
-/data/seatunnel/202410\d*/.*.csv
+/data/seatunnel/202410\d*/.*\.csv
```
This example matches:
```
/data/seatunnel/20241007/abch202410.csv
/data/seatunnel/20241002/abcg202410.csv
/data/seatunnel/20241005/old_data.csv
```
@@ -472,6 +472,7 @@ source {
    file_filter_pattern = "abc[DX]*.*"
  }
}
+
sink {
  Console {
  }
diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md
index d5326cb86a4..01bbde41c1e 100644
--- a/docs/en/connector-v2/source/OssFile.md
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -233,6 +233,55 @@ The encoding of the file to read. This param will be parsed by `Charset.forName(

Filter pattern, which is used for filtering files.

+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### schema [config]

Only needs to be configured when file_format_type is text, json, excel, xml or csv (or another format whose schema cannot be read from the metadata).

@@ -474,6 +523,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  OssFile {
+    path = "/seatunnel/orc"
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    file_format_type = "orc"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### 2.2.0-beta 2022-09-26
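One subtlety in Example 3 above: inside a regex character class every listed character is literal, so a class written as `[h,g]` also matches a literal comma, while `[hg]` matches only `h` or `g`. The snippet below is a minimal demonstration of that `java.util.regex` behavior; the class name is hypothetical and nothing here is connector code.

```java
import java.util.regex.Pattern;

// Hypothetical demo (not part of SeaTunnel): character classes list their
// members directly, and every character inside [...] is taken literally.
public class CharClassDemo {
    public static void main(String[] args) {
        // [h,g] contains 'h', ',' and 'g', so a comma in fourth position matches:
        System.out.println(Pattern.matches("abc[h,g].*", "abc,202410.csv")); // true
        // [hg] admits only 'h' or 'g' as the fourth character:
        System.out.println(Pattern.matches("abc[hg].*", "abc,202410.csv"));  // false
        System.out.println(Pattern.matches("abc[hg].*", "abch202410.csv"));  // true
    }
}
```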
diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md
index d5bd6d14fa3..97e6981aea0 100644
--- a/docs/en/connector-v2/source/OssJindoFile.md
+++ b/docs/en/connector-v2/source/OssJindoFile.md
@@ -49,30 +49,30 @@ It only supports hadoop version **2.9.X+**.

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|---------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| access_key | string | yes | - |
-| access_secret | string | yes | - |
-| endpoint | string | yes | - |
-| read_columns | list | no | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
+| name | type | required | default value |
+|---------------------------|---------|----------|------------------------------------------------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| bucket | string | yes | - |
+| access_key | string | yes | - |
+| access_secret | string | yes | - |
+| endpoint | string | yes | - |
+| read_columns | list | no | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| skip_header_row_number | long | no | 0 |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |

### path [string]

@@ -267,6 +267,55 @@ Reader the sheet of the workbook.

Filter pattern, which is used for filtering files.

+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

The compress codec of files and the details that supported as the following shown:
@@ -364,6 +413,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  OssJindoFile {
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    path = "/seatunnel/read/binary/"
+    file_format_type = "binary"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### next version

From 9b1479712192f14f2f02c13e56edbc8b6aa0aa8d Mon Sep 17 00:00:00 2001
From: "YOMO.LEE" <75362129@qq.com>
Date: Tue, 29 Oct 2024 17:12:07 +0800
Subject: [PATCH 16/17] [Fix][DOC] Additional explanation for the file_filter_pattern parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added file_filter_pattern descriptions for the following file connectors:
FtpFile (en), SftpFile (en), S3File (en), HdfsFile (zh)
---
 docs/en/connector-v2/source/FtpFile.md  | 81 ++++++++++++++++++++++++
 docs/en/connector-v2/source/S3File.md   | 83 ++++++++++++++++++++++++-
 docs/en/connector-v2/source/SftpFile.md | 83 ++++++++++++++++++++++++-
 docs/zh/connector-v2/source/HdfsFile.md | 77 ++++++++++++++++++++++-
 4 files changed, 321 insertions(+), 3 deletions(-)

diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index ec02f77f9f7..3fe7e6b88f3 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -63,6 +63,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
| common-options | | no | - |
+| file_filter_pattern | string | no | |

### host [string]

The target ftp host is required
@@ -84,6 +85,59 @@ The target ftp password is required

The source file path.

+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### file_format_type [string]

File type, supported as the following file types:
@@ -400,6 +454,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  FtpFile {
+    host = "192.168.31.48"
+    port = 21
+    user = tyrantlucifer
+    password = tianchao
+    path = "/seatunnel/read/binary/"
+    file_format_type = "binary"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### 2.2.0-beta 2022-09-26

diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index d280d6dc7f2..2e23fc6a3a7 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -196,7 +196,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto

## Options

-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` |
@@ -221,11 +221,65 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
| compress_codec | string | no | none | |
| archive_compress_codec | string | no | none | |
| encoding | string | no | UTF-8 | |
| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
+| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |

### delimiter/field_delimiter [string]

**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.

+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

The compress codec of files and the details that supported as the following shown:
@@ -349,6 +403,33 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  S3File {
+    path = "/seatunnel/json"
+    bucket = "s3a://seatunnel-test"
+    fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+    fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+    file_format_type = "json"
+    read_columns = ["id", "name"]
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
+
## Changelog

### 2.3.0-beta 2022-10-20
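A caution on the `*.txt` shorthand that the option tables above use to describe `file_filter_pattern`: that is glob syntax, while the prose sections state the option follows standard regular expressions. If the pattern really is compiled as a standard regex — an assumption, not something these patches confirm — a bare `*.txt` does not even compile, and the regex spelling of that glob is `.*\.txt`. A small sketch with a hypothetical class name:

```java
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

// Hypothetical demo (not part of SeaTunnel): glob vs. regex spelling.
public class GlobVsRegex {
    public static void main(String[] args) {
        try {
            Pattern.compile("*.txt"); // glob spelling; '*' has nothing to repeat
        } catch (PatternSyntaxException e) {
            System.out.println("Not a valid regex: " + e.getMessage());
        }
        // The regex equivalent of the glob *.txt:
        System.out.println(Pattern.matches(".*\\.txt", "report.txt")); // true
        System.out.println(Pattern.matches(".*\\.txt", "logo.png"));   // false
    }
}
```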
diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md
index 3eadcd3a69e..ff9964cefef 100644
--- a/docs/en/connector-v2/source/SftpFile.md
+++ b/docs/en/connector-v2/source/SftpFile.md
@@ -71,7 +71,7 @@ The File does not have a specific type list, and we can indicate which SeaTunnel

## Source Options

-| Name | Type | Required | default value | Description |
+| Name | Type | Required | default value | Description |
|---------------------------|---------|----------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| host | String | Yes | - | The target sftp host is required |
| port | Int | Yes | - | The target sftp port is required |
@@ -95,6 +95,60 @@ The File does not have a specific type list, and we can indicate which SeaTunnel
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
+| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*. Regular expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+This example matches:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*. Regular expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is h or g*. Regular expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match files ending with .csv in third-level folders starting with 202410*. Regular expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+This example matches:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```

### file_format_type [string]

@@ -305,3 +359,30 @@ SftpFile {
}

```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  SftpFile {
+    host = "sftp"
+    port = 22
+    user = seatunnel
+    password = pass
+    path = "tmp/seatunnel/read/json"
+    file_format_type = "json"
+    result_table_name = "sftp"
+    // file example abcD2024.csv
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
\ No newline at end of file
diff --git a/docs/zh/connector-v2/source/HdfsFile.md b/docs/zh/connector-v2/source/HdfsFile.md
index 0f983a80bcf..24ca67bfca4 100644
--- a/docs/zh/connector-v2/source/HdfsFile.md
+++ b/docs/zh/connector-v2/source/HdfsFile.md
@@ -39,7 +39,7 @@

## 源选项

-| 名称 | 类型 | 是否必须 | 默认值 | 描述 |
+| 名称 | 类型 | 是否必须 | 默认值 | 描述 |
|---------------------------|---------|------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | 是 | - | 源文件路径。 |
| file_format_type | string | 是 | - | 我们支持以下文件类型:`text` `json` `csv` `orc` `parquet` `excel`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 |
@@ -55,6 +55,7 @@
| kerberos_principal | string | 否 | - | kerberos 的 principal。 |
| kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径。 |
| skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于 txt 和 csv。例如,设置如下:`skip_header_row_number = 2`。然后 Seatunnel 将跳过源文件中的前两行。 |
+| file_filter_pattern | string | 否 | - | `*.txt` 表示只读取以 .txt 为后缀名的文件。 |
| schema | config | 否 | - | 上游数据的模式字段。 |
| sheet_name | string | 否 | - | 读取工作簿的表格,仅在文件格式为 excel 时使用。 |
| compress_codec | string | 否 | none | 文件的压缩编解码器。 |
@@ -64,6 +65,58 @@

**delimiter** 参数在版本 2.3.5 后将被弃用,请改用 **field_delimiter**。

+### file_filter_pattern [string]
+
+这个过滤规则遵循标准正则表达式。关于详情,请参考 https://en.wikipedia.org/wiki/Regular_expression
+
+这里是一些例子。
+
+文件清单:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+匹配规则:
+
+**例子 1**: *匹配所有以 .txt 为后缀名的文件*,匹配正则为:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+匹配的结果是:
+```
+/data/seatunnel/20241001/report.txt
+```
+**例子 2**: *匹配所有文件名以 abc 开头的文件*,匹配正则为:
+```
+/data/seatunnel/20241002/abc.*
+```
+匹配的结果是:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**例子 3**: *匹配所有文件名以 abc 开头、且第四个字母是 h 或 g 的文件*,匹配正则为:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+匹配的结果是:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**例子 4**: *匹配第三级目录以 202410 开头且文件后缀名是 .csv 的文件*,匹配正则为:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+匹配的结果是:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]

文件的压缩编解码器及支持的详细信息如下所示:
@@ -125,3 +178,25 @@ sink {
}
```

+### Filter File
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  HdfsFile {
+    path = "/apps/hive/demo/student"
+    file_format_type = "json"
+    fs.defaultFS = "hdfs://namenode001"
+    file_filter_pattern = "abc[DX]*.*"
+  }
+}
+
+sink {
+  Console {
+  }
+}
+```
\ No newline at end of file

From 9dc6fe768c3e2726d1029c9368585141b72092d9 Mon Sep 17 00:00:00 2001
From: Jia Fan
Date: Tue, 29 Oct 2024 18:09:34 +0800
Subject: [PATCH 17/17] update

---
 docs/en/connector-v2/source/CosFile.md      | 48 ++++++++++-----------
 docs/en/connector-v2/source/FtpFile.md      |  1 -
 docs/en/connector-v2/source/HdfsFile.md     |  4 +-
 docs/en/connector-v2/source/LocalFile.md    | 42 +++++++++---------
 docs/en/connector-v2/source/OssFile.md      |  4 +-
 docs/en/connector-v2/source/OssJindoFile.md | 48 ++++++++++-----------
 docs/en/connector-v2/source/S3File.md       |  4 +-
 docs/en/connector-v2/source/SftpFile.md     |  1 -
 docs/zh/connector-v2/source/HdfsFile.md     |  6 ++-
 9 files changed, 79 insertions(+), 76 deletions(-)

diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md
index 4b0675b9893..15b6de0c6f8 100644
--- a/docs/en/connector-v2/source/CosFile.md
+++ b/docs/en/connector-v2/source/CosFile.md
@@ -45,30 +45,30 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|------------------------------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| secret_id | string | yes | - |
-| secret_key | string | yes | - |
-| region | string | yes | - |
-| read_columns | list | yes | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| skip_header_row_number | long | no | 0 |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
+| name | type | required | default value |
+|---------------------------|---------|----------|---------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| bucket | string | yes | - |
+| secret_id | string | yes | - |
+| secret_key | string | yes | - |
+| region | string | yes | - |
+| read_columns | list | yes | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| skip_header_row_number | long | no | 0 |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |

### path [string]

diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index 3fe7e6b88f3..6d114813769 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -63,7 +63,6 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
| common-options | | no | - |
-| file_filter_pattern | string | no | |

### host [string]

diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index 300d96f0f16..405dfff820f 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -41,7 +41,7 @@ Read data from hdfs file system.

## Source Options

-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The source file path. |
| file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. |
@@ -62,7 +62,7 @@ Read data from hdfs file system.
| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. |
| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. |
| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. |
-| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |
+| file_filter_pattern | string | no | | Filter pattern, which is used for filtering files. |
| compress_codec | string | no | none | The compress codec of files |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 | |

diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index fb0e8de9139..65f287f057b 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -43,27 +43,27 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|------------------------------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| read_columns | list | no | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
-| tables_configs | list | no | used to define a multiple table task |
+| name | type | required | default value |
+|---------------------------|---------|----------|--------------------------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| read_columns | list | no | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| skip_header_row_number | long | no | 0 |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |
+| tables_configs | list | no | used to define a multiple table task |

### path [string]

diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md
index 01bbde41c1e..36d998f054c 100644
--- a/docs/en/connector-v2/source/OssFile.md
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -190,7 +190,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto

## Options

-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|---------------------------|---------|----------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The Oss path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` |
@@ -211,7 +211,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. |
| compress_codec | string | no | none | Which compress codec the files used. |
| encoding | string | no | UTF-8 |
-| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |
+| file_filter_pattern | string | no | | Filter pattern, which is used for filtering files. |
| common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |

### compress_codec [string]

diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md
index 97e6981aea0..933439edc9f 100644
--- a/docs/en/connector-v2/source/OssJindoFile.md
+++ b/docs/en/connector-v2/source/OssJindoFile.md
@@ -49,30 +49,30 @@ It only supports hadoop version **2.9.X+**.

## Options

-| name | type | required | default value |
-|---------------------------|---------|----------|------------------------------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| access_key | string | yes | - |
-| access_secret | string | yes | - |
-| endpoint | string | yes | - |
-| read_columns | list | no | - |
-| delimiter/field_delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| sheet_name | string | no | - |
-| xml_row_tag | string | no | - |
-| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | `*.txt` means read only the files ending with `.txt` |
-| compress_codec | string | no | none |
-| archive_compress_codec | string | no | none |
-| encoding | string | no | UTF-8 |
-| common-options | | no | - |
+| name | type | required | default value |
+|---------------------------|---------|----------|---------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| bucket | string | yes | - |
+| access_key | string | yes | - |
+| access_secret | string | yes | - |
+| endpoint | string | yes | - |
+| read_columns | list | no | - |
+| delimiter/field_delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| skip_header_row_number | long | no | 0 |
+| schema | config | no | - |
+| sheet_name | string | no | - |
+| xml_row_tag | string | no | - |
+| xml_use_attr_format | boolean | no | - |
+| file_filter_pattern | string | no | |
+| compress_codec | string | no | none |
+| archive_compress_codec | string | no | none |
+| encoding | string | no | UTF-8 |
+| common-options | | no | - |

### path [string]

diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index 2e23fc6a3a7..4834b025bc3 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -196,7 +196,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto

## Options

-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` |
@@ -220,8 +220,8 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
| compress_codec | string | no | none | |
| archive_compress_codec | string | no | none | |
| encoding | string | no | UTF-8 | |
+| file_filter_pattern | string | no | | Filter pattern, which is used for filtering files. |
| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
-| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |

### delimiter/field_delimiter [string]

diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md
index ff9964cefef..95c710110a0 100644
--- a/docs/en/connector-v2/source/SftpFile.md
+++ b/docs/en/connector-v2/source/SftpFile.md
@@ -95,7 +95,6 @@ The File does not have a specific type list, and we can indicate which SeaTunnel
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
-| file_filter_pattern | string | no | | `*.txt` means read only the files ending with `.txt` |

### file_filter_pattern [string]

Filter pattern, which is used for filtering files.

diff --git a/docs/zh/connector-v2/source/HdfsFile.md b/docs/zh/connector-v2/source/HdfsFile.md
index 24ca67bfca4..9cd254ef808 100644
--- a/docs/zh/connector-v2/source/HdfsFile.md
+++ b/docs/zh/connector-v2/source/HdfsFile.md
@@ -39,7 +39,7 @@

## 源选项

-| 名称 | 类型 | 是否必须 | 默认值 | 描述 |
+| 名称 | 类型 | 是否必须 | 默认值 | 描述 |
|---------------------------|---------|------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | 是 | - | 源文件路径。 |
| file_format_type | string | 是 | - | 我们支持以下文件类型:`text` `json` `csv` `orc` `parquet` `excel`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 |
@@ -55,7 +55,7 @@
| kerberos_principal | string | 否 | - | kerberos 的 principal。 |
| kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径。 |
| skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于 txt 和 csv。例如,设置如下:`skip_header_row_number = 2`。然后 Seatunnel 将跳过源文件中的前两行。 |
-| file_filter_pattern | string | 否 | - | `*.txt` 表示只读取以 .txt 为后缀名的文件。 |
+| file_filter_pattern | string | 否 | - | 过滤模式,用于过滤文件。 |
| schema | config | 否 | - | 上游数据的模式字段。 |
| sheet_name | string | 否 | - | 读取工作簿的表格,仅在文件格式为 excel 时使用。 |
| compress_codec | string | 否 | none | 文件的压缩编解码器。 |
@@ -67,6 +67,8 @@

### file_filter_pattern [string]

+过滤模式,用于过滤文件。
+
这个过滤规则遵循标准正则表达式。关于详情,请参考 https://en.wikipedia.org/wiki/Regular_expression

这里是一些例子。