From ea2423a6c66d271fd7426727373264acfd98c17c Mon Sep 17 00:00:00 2001 From: Mahdi Khashan <58775404+mahdikhashan@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:26:57 +0100 Subject: [PATCH] Updating pipeline: dataset_processing Adding dataset: workspace Updating dataset: preprocessed_dataset_sink --- dataset/preprocessed_dataset_sink.json | 3 + dataset/workspace.json | 17 ++++ pipeline/dataset_processing.json | 121 ++++++++++++++++++++++++- 3 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 dataset/workspace.json diff --git a/dataset/preprocessed_dataset_sink.json b/dataset/preprocessed_dataset_sink.json index f08ed34..4e2579a 100644 --- a/dataset/preprocessed_dataset_sink.json +++ b/dataset/preprocessed_dataset_sink.json @@ -11,6 +11,9 @@ "location": { "type": "AzureBlobStorageLocation", "container": "sink" + }, + "compression": { + "type": "TarGZip" } } } diff --git a/dataset/workspace.json b/dataset/workspace.json new file mode 100644 index 0000000..4e6553c --- /dev/null +++ b/dataset/workspace.json @@ -0,0 +1,17 @@ +{ + "name": "workspace", + "properties": { + "linkedServiceName": { + "referenceName": "Dataset", + "type": "LinkedServiceReference" + }, + "annotations": [], + "type": "Binary", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "workspace" + } + } + } +} \ No newline at end of file diff --git a/pipeline/dataset_processing.json b/pipeline/dataset_processing.json index 7d9aed1..123925b 100644 --- a/pipeline/dataset_processing.json +++ b/pipeline/dataset_processing.json @@ -29,7 +29,8 @@ "sink": { "type": "BinarySink", "storeSettings": { - "type": "AzureBlobStorageWriteSettings" + "type": "AzureBlobStorageWriteSettings", + "copyBehavior": "FlattenHierarchy" } }, "enableStaging": false @@ -40,6 +41,119 @@ "type": "DatasetReference" } ], + "outputs": [ + { + "referenceName": "workspace", + "type": "DatasetReference" + } + ] + }, + { + "name": "preprocess_dataset", + "type": "AzureFunctionActivity", + "dependsOn": [ + { + "activity": "unzip", + "dependencyConditions": [ + "Succeeded" + ] + } + ], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [] + }, + { + "name": "unzip", + "type": "AzureFunctionActivity", + "dependsOn": [ + { + "activity": "duplicate_data", + "dependencyConditions": [ + "Succeeded" + ] + } + ], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [] + }, + { + "name": "compress", + "type": "AzureFunctionActivity", + "dependsOn": [ + { + "activity": "preprocess_dataset", + "dependencyConditions": [ + "Succeeded" + ] + } + ], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [] + }, + { + "name": "Copy data1", + "type": "Copy", + "dependsOn": [ + { + "activity": "compress", + "dependencyConditions": [ + "Succeeded" + ] + } + ], + "policy": { + "timeout": "0.12:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "source": { + "type": "BinarySource", + "storeSettings": { + "type": "AzureBlobStorageReadSettings", + "recursive": true, + "deleteFilesAfterCompletion": false + }, + "formatSettings": { + "type": "BinaryReadSettings" + } + }, + "sink": { + "type": "BinarySink", + "storeSettings": { + "type": "AzureBlobStorageWriteSettings", + "copyBehavior": "FlattenHierarchy" + } + }, + "enableStaging": false + }, + "inputs": [ + { + "referenceName": "workspace", + "type": "DatasetReference" + } + ], "outputs": [ { "referenceName": "preprocessed_dataset_sink", @@ -48,6 +162,11 @@ ] } ], + "variables": { + "file": { + "type": "Array" + } + }, "annotations": [] } } \ No newline at end of file