From 79e7e08f8c3e62fbd6afb1d807aa4fdf9a9d4dc5 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Mon, 20 Jan 2025 09:47:47 +0200
Subject: [PATCH 01/17] Obtain the Ray cluster run ID from the user for KFP v2.

Signed-off-by: Revital Sur
---
 kfp/doc/simple_transform_pipeline.md          | 25 +++++++++++++++----
 .../src/runtime_utils/kfp_utils.py            |  7 +++---
 .../templates/simple_pipeline.py              | 25 ++++++++++---------
 .../code2parquet/kfp_ray/code2parquet_wf.py   | 24 +++++++++---------
 .../code_quality/kfp_ray/code_quality_wf.py   | 24 +++++++++---------
 .../kfp_ray/header_cleanser_wf.py             | 14 ++---------
 .../kfp_ray/license_select_wf.py              | 24 +++++++++---------
 transforms/code/malware/kfp_ray/malware_wf.py | 24 +++++++++---------
 .../kfp_ray/proglang_select_wf.py             | 24 +++++++++---------
 .../kfp_ray/repo_level_order_wf.py            | 24 +++++++++---------
 .../kfp_ray/doc_chunk_multiple_wf.py          | 24 +++++++++---------
 .../doc_chunk/kfp_ray/doc_chunk_wf.py         | 24 +++++++++---------
 .../kfp_ray/doc_quality_multiple_wf.py        | 24 +++++++++---------
 .../doc_quality/kfp_ray/doc_quality_wf.py     | 25 ++++++++++---------
 .../html2parquet/kfp_ray/html2parquet_wf.py   | 24 +++++++++---------
 .../lang_id/kfp_ray/lang_id_multiple_wf.py    | 24 +++++++++---------
 .../language/lang_id/kfp_ray/lang_id_wf.py    | 24 +++++++++---------
 .../kfp_ray/pdf2parquet_multiple_wf.py        | 24 +++++++++---------
 .../pdf2parquet/kfp_ray/pdf2parquet_wf.py     | 24 +++++++++---------
 .../pii_redactor/kfp_ray/pii_redactor_wf.py   | 24 +++++++++---------
 .../kfp_ray/text_encoder_multiple_wf.py       | 24 +++++++++---------
 .../text_encoder/kfp_ray/text_encoder_wf.py   | 24 +++++++++---------
 .../universal/doc_id/kfp_ray/doc_id_wf.py     | 24 +++++++++---------
 .../universal/ededup/kfp_ray/ededup_wf.py     | 18 +++++++++----
 .../universal/fdedup/kfp_ray/fdedup_wf.py     | 18 +++++++++----
 .../universal/filter/kfp_ray/filter_wf.py     | 24 +++++++++---------
 transforms/universal/hap/kfp_ray/hap_wf.py    | 24 +++++++++---------
 .../noop/kfp_ray/noop_multiple_wf.py          | 24 +++++++++---------
 transforms/universal/noop/kfp_ray/noop_wf.py  | 24 +++++++++---------
 .../universal/profiler/kfp_ray/profiler_wf.py | 12 +++++++++
 .../universal/resize/kfp_ray/resize_wf.py     | 24 +++++++++---------
 .../tokenization/kfp_ray/tokenization_wf.py   | 12 +++++++++
 32 files changed, 378 insertions(+), 330 deletions(-)

diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md
index 10341c24b..e49eef625 100644
--- a/kfp/doc/simple_transform_pipeline.md
+++ b/kfp/doc/simple_transform_pipeline.md
@@ -57,11 +57,18 @@ Ray cluster. For each step we have to define a component that will execute them:
 ```python
     # components
     base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"
-    # compute execution parameters. Here different transforms might need different implementations. As
-    # a result, instead of creating a component we are creating it in place here.
-    compute_exec_params_op = comp.func_to_container_op(
-        func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image
-    )
+    # KFPv1 and KFPv2 use different methods to create a component from a function. KFPv1 uses the
+    # `create_component_from_func` function, which is deprecated in KFPv2 and exposed there under a
+    # different import path. KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist
+    # in KFPv1. Therefore, here we use this if/else statement and explicitly call the decorator.
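+    # (For reference: the KFPv1 factory is importable as `kfp.components.create_component_from_func`,
+    # and the KFPv2 decorator as `kfp.dsl.component`.)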
+    if os.getenv("KFPv2", "0") == "1":
+        compute_exec_params_op = dsl.component_decorator.component(
+            func=compute_exec_params_func, base_image=base_kfp_image
+        )
+    else:
+        compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
     # create Ray cluster
     create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml")
     # execute job
@@ -148,6 +153,16 @@ Now, when all components and input parameters are defined, we can implement pipe
 component execution and parameters submitted to every component.
 
 ```python
+    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot
+    # create a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being the user is requested to insert
+    # a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
index 7fa76453f..3a281e48a 100644
--- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
+++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
@@ -81,9 +81,10 @@ def runtime_name(ray_name: str = "", run_id: str = "") -> str:
         # the return value plus namespace name will be the name of the Ray Route,
         # which length is restricted to 64 characters,
         # therefore we restrict the return name by 15 character.
-        if run_id != "":
-            return f"{ray_name[:9]}-{run_id[:5]}"
-        return ray_name[:15]
+        if run_id == "":
+            logger.error("Run ID must be provided")
+            sys.exit(1)
+        return f"{ray_name[:9]}-{run_id[:5]}"
 
     @staticmethod
     def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str:
diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
index ce7657a5c..2022e8359 100644
--- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
+++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
@@ -73,23 +73,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -111,9 +99,11 @@ def {{ pipeline_name }}(
     ray_name: str = "{{ pipeline_name }}-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
 {%- if image_pull_secret != "" %}
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
 {%- else %}
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
 {%- endif %}
@@ -142,6 +132,7 @@ def {{ pipeline_name }}(
     """
     Pipeline to execute {{ pipeline_name }} transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the unique id of the Ray cluster; used only in KFP v2
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -177,6 +168,16 @@ def {{ pipeline_name }}(
 {%- endfor %}
     :return: None
     """
+    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot
+    # create a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being the user is requested to insert
+    # a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
index e506ab5b3..8afde87d4 100644
--- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
+++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
@@ -77,23 +77,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187.
Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster @@ -113,6 +101,7 @@ def compute_exec_params_func( ) def code2parquet( ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -139,6 +128,7 @@ def code2parquet( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -178,6 +168,16 @@ def code2parquet( (here we are assuming that select language info is in S3, but potentially in the different bucket) :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index f37fb5870..ba2d8e53f 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -74,23 +74,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. 
- import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster @@ -112,6 +100,7 @@ def compute_exec_params_func( def code_quality( # Ray cluster ray_name: str = "code_quality-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -136,6 +125,7 @@ def code_quality( """ Pipeline to execute Code Quality transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -171,6 +161,16 @@ def code_quality( :param cq_hf_token - Huggingface auth token to download and use the tokenizer :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 6fdf1862a..107795463 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -82,23 +82,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster @@ -120,6 +108,7 @@ def compute_exec_params_func( def header_cleanser( # Ray cluster ray_name: str = "header_cleanser-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -148,6 +137,7 @@ def header_cleanser( """ Pipeline to execute Header Cleanser transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7c10b1c34..b92cb6498 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -106,6 +94,7 @@ def compute_exec_params_func( ) def license_select( ray_name: str = "license_select-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -135,6 +124,7 @@ def license_select( """ Pipeline to execute License Select transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -166,6 +156,16 @@ def license_select( :param lc_licenses_file - path to license list json file :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 30525e870..ad1bf4aaf 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -70,23 +70,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -104,6 +92,7 @@ def compute_exec_params_func( ) def malware( ray_name: str = "malware-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -126,6 +115,7 @@ def malware( """ Pipeline to execute malware transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -160,6 +150,16 @@ def malware( :param malware_output_column - output column name :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
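+    # For example, the script that compiles or submits this pipeline can pass a value such as
+    # `uuid.uuid4().hex` (the scheme previously used here) as the `ray_id_KFPv2` parameter.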
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index f1b271d3c..3ba7d8926 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -106,6 +94,7 @@ def compute_exec_params_func( ) def lang_select( ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -129,6 +118,7 @@ def lang_select( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -165,6 +155,16 @@ def lang_select( (here we are assuming that select language info is in S3, but potentially in the different bucket) :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 47388f394..38099a192 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -87,23 +87,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -123,6 +111,7 @@ def compute_exec_params_func( def repo_level_order( # Ray cluster ray_name: str = "repo_level_order-kfp-ray", + ray_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { "replicas": 2, @@ -159,6 +148,7 @@ def repo_level_order( """ Pipeline to execute repo_level_order transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -201,6 +191,16 @@ def repo_level_order( :param repo_lvl_combine_rows - # If specified, output rows per repo are combined to form a single repo :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 62161be6d..5518f0ba1 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -73,23 +73,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -108,6 +96,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -139,6 +128,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,6 +165,16 @@ def doc_chunk( :param doc_chunk_dl_min_chunk_len - minimum chunk size :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 618c11d68..e671177a9 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -73,23 +73,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -109,6 +97,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -133,6 +122,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -169,6 +159,16 @@ def doc_chunk( :param doc_chunk_dl_min_chunk_len - minimum chunk size :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 4a2d9de1d..2830ce32c 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -72,23 +72,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image, "image_pull_policy": "Always"}, ray_worker_options: dict = { "replicas": 2, @@ -137,6 +126,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,6 +162,16 @@ def doc_quality( :param docq_bad_word_filepath - a path to bad word file :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index e26efe832..c4d6c7d43 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -72,23 +72,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", ray_head_options: dict = { "cpu": 1, "memory": 4, @@ -143,6 +132,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -178,6 +168,17 @@ def doc_quality( :param docq_bad_word_filepath - a path to bad word file :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER + # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index b6f5dff19..b75064e79 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def html2parquet( # Ray cluster ray_name: str = "html2parquet-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -137,6 +126,7 @@ def html2parquet( """ Pipeline to execute html2parquet transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -171,6 +161,16 @@ def html2parquet( :param html2parquet_output_format - # Output format for the contents column. :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 941d32627..480f1a738 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -77,23 +77,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -112,6 +100,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -145,6 +134,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -183,6 +173,16 @@ def lang_id( :param lang_id_output_score_column_name - name of the output column to hold score of prediction :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index fa4debbe3..b16243762 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -78,23 +78,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -113,6 +101,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -146,6 +135,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -184,6 +174,16 @@ def lang_id( :param lang_id_output_score_column_name - name of the output column to hold score of prediction :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 91d40567e..f1796ee9f 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -75,23 +75,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -110,6 +98,7 @@ def compute_exec_params_func( def pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -142,6 +131,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -179,6 +169,16 @@ def pdf2parquet( :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 4dab7d4af..a6f308ea7 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -77,23 +77,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -113,6 +101,7 @@ def compute_exec_params_func( def pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -146,6 +135,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -183,6 +173,16 @@ def pdf2parquet( :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index b05aecd69..fb70f789a 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -66,23 +66,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -102,6 +90,7 @@ def compute_exec_params_func( def pii_redactor( # Ray cluster ray_name: str = "pii-redactor-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -130,6 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -163,6 +153,16 @@ def pii_redactor( :param pii_redactor_contents - column that has pii data and needs to be transformed by pii redactor transform :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index 2005ee163..f746f4aef 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -106,6 +94,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -129,6 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -164,6 +154,16 @@ def text_encoder( :param text_encoder_output_embeddings_column_name - name of the output column :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index aa63e23f8..5e7421490 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -130,6 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -165,6 +155,16 @@ def text_encoder( :param text_encoder_output_embeddings_column_name - name of the output column :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index c5d4cac6d..985139c92 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -80,23 +80,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -115,6 +103,7 @@ def compute_exec_params_func( def doc_id( # Ray cluster ray_name: str = "doc_id-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -185,6 +175,16 @@ def doc_id( :param doc_id_start_id - starting id :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
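The reason `run_id` has to arrive as a pipeline input, rather than being minted inside a component, is the `ExitHandler` restriction referenced in the comment above: per the linked issue, the exit task cannot consume values produced by other tasks at runtime, while pipeline inputs are fine. A toy sketch with stub components (names and base image are illustrative, not code from this repo):

```python
# Toy sketch of the ExitHandler constraint; not code from this repo.
from kfp import dsl


@dsl.component(base_image="python:3.10")
def cleanup(ray_name: str, run_id: str):
    print(f"cleaning up Ray cluster for {ray_name} / {run_id}")


@dsl.component(base_image="python:3.10")
def work():
    print("transform job runs here")


@dsl.pipeline(name="exit-handler-sketch")
def sketch(ray_name: str = "demo", ray_id_KFPv2: str = ""):
    # A pipeline input can feed the exit task; a task output could not.
    clean_up_task = cleanup(ray_name=ray_name, run_id=ray_id_KFPv2)
    with dsl.ExitHandler(clean_up_task):
        work()
```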
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 17c85b630..62db57fea 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -49,11 +49,7 @@ compute_exec_params_op = dsl.component_decorator.component( func=ededup_compute_execution_params, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex + else: compute_exec_params_op = comp.create_component_from_func( func=ededup_compute_execution_params, base_image=base_kfp_image @@ -78,6 +74,7 @@ def ededup( # Ray cluster ray_name: str = "ededup-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -111,6 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -148,6 +146,16 @@ def ededup( :param ededup_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 51ead9c79..bf45ac197 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -64,11 +64,7 @@ compute_data_cleaning_exec_params_op = dsl.component_decorator.component( func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex + else: compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) compute_signature_calc_exec_params_op = comp.create_component_from_func( @@ -114,6 +110,7 @@ def fuzzydedup( # folders used # Ray cluster ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { "cpu": 8, @@ -164,6 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -208,6 +206,16 @@ def fuzzydedup( :param fdedup_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index a18d2796d..26ae44489 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -72,23 +72,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def filtering( # Ray cluster ray_name: str = "filter-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -130,6 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -165,6 +155,16 @@ def filtering( :param filter_columns_to_drop - list of columns to drop after filtering :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 64c80fe37..46d1dba1a 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -79,23 +79,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -115,6 +103,7 @@ def compute_exec_params_func( def hap( # Ray cluster ray_name: str = "hap-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -187,6 +177,16 @@ def hap( :param batch_size - # batch size :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 9ed874f3d..dd535db5c 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -67,23 +67,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -102,6 +90,7 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -123,6 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -156,6 +146,16 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 5a1ce393a..0392e9ab5 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -67,23 +67,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -104,6 +92,7 @@ def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed + ray_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -125,6 +114,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -158,6 +148,16 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 7a157c146..6300f62f8 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -78,6 +78,7 @@ def profiler( # Ray cluster ray_name: str = "profiler-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -102,6 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -137,6 +139,16 @@ def profiler( :param profiler_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 6a1403f18..89007c8be 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -76,23 +76,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -111,6 +99,7 @@ def compute_exec_params_func( def resize( # Ray cluster ray_name: str = "resize-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -137,6 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,6 +162,16 @@ def resize( :param resize_size_type - size type - disk/memory :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
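Once compiled under KFPv2, the new input surfaces in the package under `root.inputDefinitions.parameters.ray_id_KFPv2` — the exact path the test tooling rewrites below, first with `yq` and later via `_set_run_id`. A small inspection sketch; the package file name is illustrative:

```python
# Sketch: read ray_id_KFPv2's default from a compiled KFPv2 package.
import yaml

with open("resize_wf.yaml") as f:  # illustrative package name
    for doc in yaml.safe_load_all(f):
        if isinstance(doc, dict) and "root" in doc:
            params = doc["root"]["inputDefinitions"]["parameters"]
            print(params["ray_id_KFPv2"].get("defaultValue", "<unset>"))
```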
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 82fc55ae2..15958665b 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -116,6 +116,7 @@ def compute_exec_params_func( def tokenization( # Ray cluster ray_name: str = "tkn-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -187,6 +189,16 @@ def tokenization( :param tkn_chunk_size - Specify >0 value to tokenize each row/text in chunks of characters (rounded in words) :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params From 070a9d8c8435052059a9b717d40916b820139d20 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 20 Jan 2025 06:00:16 -0600 Subject: [PATCH 02/17] Set default value for run id. Signed-off-by: Revital Sur --- transforms/.make.workflows | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transforms/.make.workflows b/transforms/.make.workflows index a1e5accce..16c6f38fb 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -46,6 +46,9 @@ FORCE: ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) kind-load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) kind-load-image +endif +ifeq ($(KFPv2), 1) + yq -i '.root.inputDefinitions.parameters.ray_id_KFPv2.defaultValue = "123"' ${CURDIR}/${PIPELINE_FILE} endif . 
${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} -e ${KFP_ENDPOINT} From 31db4cc150efd5fb931e70c1b9d58f53f16ab183 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 20 Jan 2025 22:46:54 -0600 Subject: [PATCH 03/17] Add _set_run_id function. Signed-off-by: Revital Sur --- .../pipeline_utils/pipelines_tests_utils.py | 21 +++++++++++++++++++ transforms/.make.workflows | 3 --- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 6b23067f9..05f7d8c04 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -52,6 +52,26 @@ def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/ logger.info(f"Pipeline {pipeline_name} successfully completed") return pipeline_name +def _set_run_id(pipeline_package_path: str): + """ + Assign a dummy run ID value for testing purposes. By default, this value + is empty and is set by the user during runtime. + + :param pipeline_package_path: Local path to the pipeline package. + """ + import yaml + + try: + stream = open(pipeline_package_path, "r") + docs = list(yaml.load_all(stream, yaml.FullLoader)) + for doc in docs: + if "root" in doc: + doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = "123" + with open(pipeline_package_path, "w") as outfile: + yaml.dump_all(docs, outfile) + except Exception as e: + logger.error(f"Failed to update run id value, exception {e}") + sys.exit(1) if __name__ == "__main__": import argparse @@ -74,6 +94,7 @@ def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/ if pipeline is None: sys.exit(1) case "sanity-test": + _set_run_id(args.pipeline_package_path) run = run_test( endpoint=args.endpoint, pipeline_package_path=args.pipeline_package_path, diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 16c6f38fb..a1e5accce 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -46,9 +46,6 @@ FORCE: ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) kind-load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) kind-load-image -endif -ifeq ($(KFPv2), 1) - yq -i '.root.inputDefinitions.parameters.ray_id_KFPv2.defaultValue = "123"' ${CURDIR}/${PIPELINE_FILE} endif . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} -e ${KFP_ENDPOINT} From 5a8c36fb19eda482d8a235ed7c5a8e36e87bb13a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 21 Jan 2025 02:39:15 -0600 Subject: [PATCH 04/17] Minor fix. 
Signed-off-by: Revital Sur --- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 107795463..a3b02c7c8 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -177,6 +177,16 @@ def header_cleanser( :param skip_timeout - Hold value true or false to skip removing copyright/header or not when scaning timeout. :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) From 76620d4496f75d74ec9cf6a8db9665ca278a3d2e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 10:19:28 +0200 Subject: [PATCH 05/17] Add missing --chmod=775 --chown=ray:root in dockerfiles. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 2 +- transforms/code/code2parquet/ray/Dockerfile | 4 ++-- transforms/code/code_quality/ray/Dockerfile | 8 ++++---- transforms/code/header_cleanser/ray/Dockerfile | 2 +- transforms/code/license_select/ray/Dockerfile | 4 ++-- transforms/code/malware/ray/Dockerfile | 4 ++-- transforms/universal/noop/ray/Dockerfile | 8 ++++---- transforms/universal/profiler/ray/Dockerfile | 4 ++-- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 6f6bf323d..5b51b0d2a 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3 # remove credentials-containing file RUN rm requirements.txt # components -COPY ./src /pipelines/component/src +COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src # Set environment ENV KFP_v2=$KFP_v2 diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index 1309416ea..cf363def4 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -28,10 +28,10 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/code2parquet_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . 
# copy some of the samples in -COPY src/code2parquet_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index 54630e9d9..2127cfd81 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -33,14 +33,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/code_quality_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py . # copy some of the samples in -COPY ./src/code_quality_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 1b21cef43..b5fc809aa 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -31,7 +31,7 @@ User ray # copy source data COPY ./src/header_cleanser_transform_ray.py . -COPY src/header_cleanser_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 6c8301c85..184747ff5 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -27,8 +27,8 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY src/license_select_transform_ray.py . -COPY src/license_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 24f43d053..f06c2005c 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -56,10 +56,10 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/malware_transform_ray.py ./ +COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in -COPY src/malware_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ COPY test/ test/ COPY test-data/ test-data/ diff --git a/transforms/universal/noop/ray/Dockerfile b/transforms/universal/noop/ray/Dockerfile index 796a9559f..bfca6fab4 100644 --- a/transforms/universal/noop/ray/Dockerfile +++ b/transforms/universal/noop/ray/Dockerfile @@ -29,14 +29,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/noop_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/noop_transform_ray.py . 
# copy some of the samples in -COPY ./src/noop_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/noop_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/profiler/ray/Dockerfile b/transforms/universal/profiler/ray/Dockerfile index 131229d1f..9fdfa4594 100644 --- a/transforms/universal/profiler/ray/Dockerfile +++ b/transforms/universal/profiler/ray/Dockerfile @@ -30,10 +30,10 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/profiler_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/profiler_transform_ray.py . # copy some of the samples in -COPY src/profiler_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/profiler_local_ray.py local/ # copy test COPY test/ test/ From 84d48a06f03e6e68352cd308d60865b90f60cb49 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 10:22:08 +0200 Subject: [PATCH 06/17] Minor fix. Signed-off-by: Revital Sur --- transforms/code/malware/ray/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index f06c2005c..1c231275f 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -61,8 +61,8 @@ COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ ENV PYTHONPATH /home/ray From 3cec0ab2881e65f9b4ddbfdec440064a975540fd Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 12:25:20 +0200 Subject: [PATCH 07/17] Address review comments. 
Signed-off-by: Revital Sur --- kfp/doc/simple_transform_pipeline.md | 11 ++++++----- .../pipeline_utils/pipelines_tests_utils.py | 3 ++- .../single-pipeline/templates/simple_pipeline.py | 2 +- .../code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code/code_quality/kfp_ray/code_quality_wf.py | 2 +- .../header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- .../proglang_select/kfp_ray/proglang_select_wf.py | 2 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../language/pii_redactor/kfp_ray/pii_redactor_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/hap/kfp_ray/hap_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- .../universal/tokenization/kfp_ray/tokenization_wf.py | 2 +- 30 files changed, 36 insertions(+), 34 deletions(-) diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index e49eef625..ccb4d16a7 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -112,6 +112,7 @@ The input parameters section defines all the parameters required for the pipelin The parameters used here are as follows: * ray_name: name of the Ray cluster +* ray_id_KFPv2: Ray cluster unique ID used only in KFP v2 * ray_head_options: head node options, containing the following: * cpu - number of cpus * memory - memory @@ -156,7 +157,7 @@ component execution and parameters submitted to every component. # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") @@ -164,7 +165,7 @@ component execution and parameters submitted to every component. 
else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -177,7 +178,7 @@ component execution and parameters submitted to every component. # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -188,7 +189,7 @@ component execution and parameters submitted to every component. # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params={ @@ -198,7 +199,7 @@ component execution and parameters submitted to every component. "num_workers": compute_exec_params.output, "worker_options": actor_options, "pipeline_id": pipeline_id, - "job_id": dsl.RUN_ID_PLACEHOLDER, + "job_id": run_id, "code_location": code_location, "noop_sleep_sec": noop_sleep_sec, }, diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 05f7d8c04..41a392e26 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -60,13 +60,14 @@ def _set_run_id(pipeline_package_path: str): :param pipeline_package_path: Local path to the pipeline package. """ import yaml + import uuid try: stream = open(pipeline_package_path, "r") docs = list(yaml.load_all(stream, yaml.FullLoader)) for doc in docs: if "root" in doc: - doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = "123" + doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex with open(pipeline_package_path, "w") as outfile: yaml.dump_all(docs, outfile) except Exception as e: diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index 2022e8359..4191f5d46 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -171,7 +171,7 @@ def {{ pipeline_name }}( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
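With the `uuid` change above, the sanity-test flow becomes: inject a fresh default id into the compiled package, then submit it. A hedged end-to-end sketch using the utilities from this series; the package name and endpoint are illustrative:

```python
# Sketch of the sanity-test flow; names mirror pipelines_tests_utils.
from workflow_support.pipeline_utils.pipelines_tests_utils import (
    _set_run_id,
    run_test,
)

package = "noop_wf.yaml"  # illustrative compiled package
_set_run_id(package)  # sets ray_id_KFPv2.defaultValue to uuid.uuid4().hex
run_test(pipeline_package_path=package, endpoint="http://localhost:8080/")
```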
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 8afde87d4..c78afc4a9 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -171,7 +171,7 @@ def code2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index ba2d8e53f..466d15f63 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -164,7 +164,7 @@ def code_quality( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index a3b02c7c8..fc02f04ea 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -180,7 +180,7 @@ def header_cleanser( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index b92cb6498..25bcc29f6 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -159,7 +159,7 @@ def license_select( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index ad1bf4aaf..ef5a290ab 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -153,7 +153,7 @@ def malware( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 3ba7d8926..b4ad50016 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -158,7 +158,7 @@ def lang_select( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38099a192..b882cf82b 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -194,7 +194,7 @@ def repo_level_order( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 5518f0ba1..4b3349389 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -168,7 +168,7 @@ def doc_chunk( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index e671177a9..835514793 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -162,7 +162,7 @@ def doc_chunk( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index b75064e79..ce70c27b1 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -164,7 +164,7 @@ def html2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 480f1a738..00c7e490d 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -176,7 +176,7 @@ def lang_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index b16243762..66dade14c 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -177,7 +177,7 @@ def lang_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index f1796ee9f..b5e61e67f 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -172,7 +172,7 @@ def pdf2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index a6f308ea7..3dce876aa 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -176,7 +176,7 @@ def pdf2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index fb70f789a..133a595d4 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -156,7 +156,7 @@ def pii_redactor( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index f746f4aef..fde0c8996 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -157,7 +157,7 @@ def text_encoder( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 5e7421490..223f0434c 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -158,7 +158,7 @@ def text_encoder( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 985139c92..9e0a98af5 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -178,7 +178,7 @@ def doc_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 62db57fea..c11cfc050 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -149,7 +149,7 @@ def ededup( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index bf45ac197..b2713c851 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -209,7 +209,7 @@ def fuzzydedup( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 26ae44489..59cb52cc2 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -158,7 +158,7 @@ def filtering( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
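# The comment above says the unique string must now be created at run creation
# time; for test runs, a unique default can instead be stamped into the compiled
# package. A minimal sketch, assuming a KFP v2 package whose root
# inputDefinitions declare the id parameter (it mirrors the support-lib change
# earlier in this series; the helper name is illustrative):
import uuid

import yaml


def set_default_run_id(pipeline_package_path: str, param: str = "ray_id_KFPv2") -> None:
    # Read all YAML documents in the compiled pipeline package.
    with open(pipeline_package_path, "r") as stream:
        docs = list(yaml.load_all(stream, yaml.FullLoader))
    # Give the run-id parameter a fresh default so each test run is unique.
    for doc in docs:
        if "root" in doc:
            doc["root"]["inputDefinitions"]["parameters"][param]["defaultValue"] = uuid.uuid4().hex
    # Write the package back in place.
    with open(pipeline_package_path, "w") as outfile:
        yaml.dump_all(docs, outfile)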
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py
index 46d1dba1a..088034935 100644
--- a/transforms/universal/hap/kfp_ray/hap_wf.py
+++ b/transforms/universal/hap/kfp_ray/hap_wf.py
@@ -180,7 +180,7 @@ def hap(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
index dd535db5c..9b59ebbae 100644
--- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
+++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
@@ -149,7 +149,7 @@ def noop(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py
index 0392e9ab5..8fbcde300 100644
--- a/transforms/universal/noop/kfp_ray/noop_wf.py
+++ b/transforms/universal/noop/kfp_ray/noop_wf.py
@@ -151,7 +151,7 @@ def noop(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py
index 6300f62f8..e39fe8c88 100644
--- a/transforms/universal/profiler/kfp_ray/profiler_wf.py
+++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py
@@ -142,7 +142,7 @@ def profiler(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py
index 89007c8be..64b27c231 100644
--- a/transforms/universal/resize/kfp_ray/resize_wf.py
+++ b/transforms/universal/resize/kfp_ray/resize_wf.py
@@ -165,7 +165,7 @@ def resize(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")
diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
index 15958665b..c9e2c5f49 100644
--- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
+++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
@@ -192,7 +192,7 @@ def tokenization(
     # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
     # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
     # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert
-    # a unique string created at compilation time.
+    # a unique string created at run creation time.
     if os.getenv("KFPv2", "0") == "1":
         print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
               "same version of the same pipeline !!!")

From b420d1dfe66eab1ed0241ddec5ed7ea02616c73e Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Wed, 22 Jan 2025 12:49:41 +0200
Subject: [PATCH 08/17] Minor fix: rename ray_id_KFPv2 to ray_run_id_KFPv2.
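A hedged sketch of what the rename means when creating a run: the caller now
passes `ray_run_id_KFPv2` as a pipeline argument. The endpoint, package path,
and client call below are illustrative placeholders, not values from this repo:

```python
import uuid

from kfp import Client

client = Client(host="http://localhost:8080")  # placeholder KFP endpoint
client.create_run_from_pipeline_package(
    "noop_wf.yaml",  # placeholder compiled pipeline package
    arguments={"ray_run_id_KFPv2": uuid.uuid4().hex},  # unique id per run
)
```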
Signed-off-by: Revital Sur
---
 kfp/doc/simple_transform_pipeline.md                      | 4 ++--
 .../pipeline_utils/pipelines_tests_utils.py               | 2 +-
 .../single-pipeline/templates/simple_pipeline.py          | 8 ++++----
 transforms/code/code2parquet/kfp_ray/code2parquet_wf.py   | 6 +++---
 transforms/code/code_quality/kfp_ray/code_quality_wf.py   | 6 +++---
 .../code/header_cleanser/kfp_ray/header_cleanser_wf.py    | 6 +++---
 .../code/license_select/kfp_ray/license_select_wf.py      | 6 +++---
 transforms/code/malware/kfp_ray/malware_wf.py             | 6 +++---
 .../code/proglang_select/kfp_ray/proglang_select_wf.py    | 6 +++---
 .../repo_level_ordering/kfp_ray/repo_level_order_wf.py    | 6 +++---
 .../language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py   | 6 +++---
 transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py     | 6 +++---
 .../doc_quality/kfp_ray/doc_quality_multiple_wf.py        | 6 +++---
 transforms/language/doc_quality/kfp_ray/doc_quality_wf.py | 6 +++---
 .../language/html2parquet/kfp_ray/html2parquet_wf.py      | 6 +++---
 .../language/lang_id/kfp_ray/lang_id_multiple_wf.py       | 6 +++---
 transforms/language/lang_id/kfp_ray/lang_id_wf.py         | 6 +++---
 .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py        | 6 +++---
 transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 6 +++---
 .../language/pii_redactor/kfp_ray/pii_redactor_wf.py      | 6 +++---
 .../text_encoder/kfp_ray/text_encoder_multiple_wf.py      | 6 +++---
 .../language/text_encoder/kfp_ray/text_encoder_wf.py      | 6 +++---
 transforms/universal/doc_id/kfp_ray/doc_id_wf.py          | 6 +++---
 transforms/universal/ededup/kfp_ray/ededup_wf.py          | 6 +++---
 transforms/universal/fdedup/kfp_ray/fdedup_wf.py          | 6 +++---
 transforms/universal/filter/kfp_ray/filter_wf.py          | 6 +++---
 transforms/universal/hap/kfp_ray/hap_wf.py                | 6 +++---
 transforms/universal/noop/kfp_ray/noop_multiple_wf.py     | 6 +++---
 transforms/universal/noop/kfp_ray/noop_wf.py              | 6 +++---
 transforms/universal/profiler/kfp_ray/profiler_wf.py      | 6 +++---
 transforms/universal/resize/kfp_ray/resize_wf.py          | 6 +++---
 .../universal/tokenization/kfp_ray/tokenization_wf.py     | 6 +++---
 32 files changed, 94 insertions(+), 94 deletions(-)

diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md
index ccb4d16a7..00cd9e204 100644
--- a/kfp/doc/simple_transform_pipeline.md
+++ b/kfp/doc/simple_transform_pipeline.md
@@ -112,7 +112,7 @@ The input parameters section defines all the parameters required for the pipelin
 The parameters used here are as follows:
 * ray_name: name of the Ray cluster
-* ray_id_KFPv2: Ray cluster unique ID used only in KFP v2
+* ray_run_id_KFPv2: Ray cluster unique ID used only in KFP v2
 * ray_head_options: head node options, containing the following:
   * cpu - number of cpus
   * memory - memory
@@ -161,7 +161,7 @@ component execution and parameters submitted to every component.
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 41a392e26..00530406f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -67,7 +67,7 @@ def _set_run_id(pipeline_package_path: str): docs = list(yaml.load_all(stream, yaml.FullLoader)) for doc in docs: if "root" in doc: - doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex + doc["root"]["inputDefinitions"]["parameters"]["ray_run_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex with open(pipeline_package_path, "w") as outfile: yaml.dump_all(docs, outfile) except Exception as e: diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index 4191f5d46..6a682a0f2 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -99,11 +99,11 @@ def {{ pipeline_name }}( ray_name: str = "{{ pipeline_name }}-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed {%- if image_pull_secret != "" %} - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, {%- else %} - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, {%- endif %} @@ -132,7 +132,7 @@ def {{ pipeline_name }}( """ Pipeline to execute {{ pipeline_name }} transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def {{ pipeline_name }}( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index c78afc4a9..7dd7111ce 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( ) def code2parquet( 
ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -128,7 +128,7 @@ def code2parquet( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def code2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 466d15f63..58a571e9b 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -100,7 +100,7 @@ def compute_exec_params_func( def code_quality( # Ray cluster ray_name: str = "code_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -125,7 +125,7 @@ def code_quality( """ Pipeline to execute Code Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -168,7 +168,7 @@ def code_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index fc02f04ea..b6d15934b 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -108,7 +108,7 @@ def compute_exec_params_func( def header_cleanser( # Ray cluster ray_name: str = "header_cleanser-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, 
"image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -137,7 +137,7 @@ def header_cleanser( """ Pipeline to execute Header Cleanser transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -184,7 +184,7 @@ def header_cleanser( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 25bcc29f6..55a176a0d 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( ) def license_select( ray_name: str = "license_select-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -124,7 +124,7 @@ def license_select( """ Pipeline to execute License Select transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -163,7 +163,7 @@ def license_select( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index ef5a290ab..9c18c5e30 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -92,7 +92,7 @@ def compute_exec_params_func( ) def malware( ray_name: str = "malware-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -115,7 +115,7 @@ def malware( """ Pipeline to execute malware transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, 
containing the following: cpu - number of cpus memory - memory @@ -157,7 +157,7 @@ def malware( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index b4ad50016..31bad3798 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( ) def lang_select( ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -118,7 +118,7 @@ def lang_select( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -162,7 +162,7 @@ def lang_select( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index b882cf82b..8fa169209 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -111,7 +111,7 @@ def compute_exec_params_func( def repo_level_order( # Ray cluster ray_name: str = "repo_level_order-kfp-ray", - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { "replicas": 2, @@ -148,7 +148,7 @@ def repo_level_order( """ Pipeline to execute repo_level_order transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -198,7 +198,7 @@ def repo_level_order( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 
4b3349389..7442aabe4 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -96,7 +96,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -128,7 +128,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,7 +172,7 @@ def doc_chunk( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 835514793..975902797 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -97,7 +97,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -122,7 +122,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -166,7 +166,7 @@ def doc_chunk( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 2830ce32c..d5a8abc9d 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": 
task_image, "image_pull_policy": "Always"}, ray_worker_options: dict = { "replicas": 2, @@ -126,7 +126,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -169,7 +169,7 @@ def doc_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index c4d6c7d43..6ba23c515 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = { "cpu": 1, "memory": 4, @@ -132,7 +132,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def doc_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index ce70c27b1..2207363d8 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def html2parquet( # Ray cluster ray_name: str = "html2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -126,7 +126,7 @@ def html2parquet( """ Pipeline to execute html2parquet transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -168,7 +168,7 @@ def html2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = 
ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 00c7e490d..680e27300 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -100,7 +100,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -134,7 +134,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -180,7 +180,7 @@ def lang_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 66dade14c..a7cfe4509 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -135,7 +135,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -181,7 +181,7 @@ def lang_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index b5e61e67f..3895489f1 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -98,7 +98,7 @@ def compute_exec_params_func( def pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster - 
ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -131,7 +131,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -176,7 +176,7 @@ def pdf2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 3dce876aa..13a39a1b8 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( def pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -135,7 +135,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -180,7 +180,7 @@ def pdf2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index 133a595d4..50ee4c653 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -90,7 +90,7 @@ def compute_exec_params_func( def pii_redactor( # Ray cluster ray_name: str = "pii-redactor-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -119,7 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + 
:param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -160,7 +160,7 @@ def pii_redactor( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index fde0c8996..06c7a3253 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -118,7 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -161,7 +161,7 @@ def text_encoder( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 223f0434c..98011cb15 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -119,7 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -162,7 +162,7 @@ def text_encoder( if os.getenv("KFPv2", "0") == "1": print("WARNING: 
the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 9e0a98af5..03cd29b0b 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -103,7 +103,7 @@ def compute_exec_params_func( def doc_id( # Ray cluster ray_name: str = "doc_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -138,7 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -182,7 +182,7 @@ def doc_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index c11cfc050..dba40490e 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -74,7 +74,7 @@ def ededup( # Ray cluster ray_name: str = "ededup-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -108,7 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -153,7 +153,7 @@ def ededup( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index b2713c851..33782b07b 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -110,7 +110,7 @@ def fuzzydedup( # folders used # Ray cluster ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only 
in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { "cpu": 8, @@ -161,7 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -213,7 +213,7 @@ def fuzzydedup( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 59cb52cc2..6b2f87f97 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def filtering( # Ray cluster ray_name: str = "filter-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -119,7 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -162,7 +162,7 @@ def filtering( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 088034935..01f943bbc 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -103,7 +103,7 @@ def compute_exec_params_func( def hap( # Ray cluster ray_name: str = "hap-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -138,7 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus 
memory - memory @@ -184,7 +184,7 @@ def hap( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 9b59ebbae..4e25fd17e 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -90,7 +90,7 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -112,7 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -153,7 +153,7 @@ def noop( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 8fbcde300..e4057632d 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -92,7 +92,7 @@ def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -114,7 +114,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -155,7 +155,7 @@ def noop( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index e39fe8c88..f5dcefd35 100644 --- 
a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -78,7 +78,7 @@ def profiler( # Ray cluster ray_name: str = "profiler-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -103,7 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -146,7 +146,7 @@ def profiler( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 64b27c231..af917f1f6 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -99,7 +99,7 @@ def compute_exec_params_func( def resize( # Ray cluster ray_name: str = "resize-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -126,7 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -169,7 +169,7 @@ def resize( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c9e2c5f49..bbcb4a6a1 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -116,7 +116,7 @@ def compute_exec_params_func( def tokenization( # Ray cluster ray_name: str = "tkn-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray 
workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -150,7 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -196,7 +196,7 @@ def tokenization( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task From c6af2880a69e1f87c8f14c32aac5d0a991b720da Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 09:25:14 +0200 Subject: [PATCH 09/17] Address review comments. Signed-off-by: Revital Sur --- kfp/doc/simple_transform_pipeline.md | 25 ++++++++++++++----- .../example/pipeline_definitions.yaml | 2 +- .../templates/simple_pipeline.py | 5 ++-- .../code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code_quality/kfp_ray/code_quality_wf.py | 2 +- .../kfp_ray/header_cleanser_wf.py | 2 +- .../kfp_ray/license_select_wf.py | 2 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- .../kfp_ray/proglang_select_wf.py | 2 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../kfp_ray/doc_quality_multiple_wf.py | 2 +- .../doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../pii_redactor/kfp_ray/pii_redactor_wf.py | 2 +- .../kfp_ray/text_encoder_multiple_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- .../universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/hap/kfp_ray/hap_wf.py | 2 +- .../noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 7 ++++-- .../universal/profiler/kfp_ray/profiler_wf.py | 2 +- .../universal/resize/kfp_ray/resize_wf.py | 2 +- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- 32 files changed, 55 insertions(+), 40 deletions(-) diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 00cd9e204..633c82059 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -41,7 +41,9 @@ Note: the project and the explanation below are based on [KFPv1](https://www.kub import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl +import os from kfp_support.workflow_support.runtime_utils import ( + DEFAULT_KFP_COMPONENT_SPEC_PATH, ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, @@ -56,7 +58,8 @@ Ray cluster. 
For each step we have to define a component that will execute them:
 ```python
     # components
-    base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"
+    base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
+    component_spec_path = os.getenv("KFP_COMPONENT_SPEC_PATH", DEFAULT_KFP_COMPONENT_SPEC_PATH)
     # KFPv1 and KFPv2 use different methods to create a component from a function. KFPv1 uses the
     # `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path.
     # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
     # this if/else statement and explicitly call the decorator.
@@ -68,11 +71,11 @@ Ray cluster. For each step we have to define a component that will execute them:
     else:
         compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
     # create Ray cluster
-    create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml")
+    create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
     # execute job
-    execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml")
+    execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml")
     # clean up Ray
-    cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml")
+    cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml")
     # Task name is part of the pipeline name, the ray cluster name and the job name in DMF.
     TASK_NAME: str = "noop"
 ```
@@ -89,6 +92,7 @@ The input parameters section defines all the parameters required for the pipelin
 ```python
     # Ray cluster
     ray_name: str = "noop-kfp-ray", # name of Ray cluster
+    ray_run_id_KFPv2: str = "",
     ray_head_options: str = '{"cpu": 1, "memory": 4, \
         "image": "' + task_image + '" }',
     ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, \
         "image": "' + task_image + '" }',
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
@@ -99,6 +103,7 @@ The input parameters section defines all the parameters required for the pipelin
     data_s3_access_secret: str = "s3-secret",
     data_max_files: int = -1,
     data_num_samples: int = -1,
+    data_checkpointing: bool = False,
     # orchestrator
     actor_options: str = "{'num_cpus': 0.8}",
     pipeline_id: str = "pipeline_id",
@@ -171,8 +176,16 @@ component execution and parameters submitted to every component.
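Because the KFPv2 branch takes the cluster id from the `ray_run_id_KFPv2` input instead of `dsl.RUN_ID_PLACEHOLDER`, the caller has to mint that unique string before compiling or submitting the run; this is also what the WARNING about simultaneous runs of the same pipeline guards against. A minimal sketch of one way to produce such an id is shown below; the `make_compile_time_run_id` helper is an illustrative assumption, not part of this patch series.

```python
# Hypothetical helper (not part of these patches): mint a unique Ray run id at
# compilation/submission time for KFPv2 pipelines, since KFPv2 cannot create
# one at runtime and still pass it to the ExitHandler clean-up task.
import uuid


def make_compile_time_run_id() -> str:
    # A short random hex string is one simple choice for a unique-enough id.
    return uuid.uuid4().hex[:8]


if __name__ == "__main__":
    # The resulting value would be supplied as the ray_run_id_KFPv2 pipeline
    # input, e.g. noop(ray_run_id_KFPv2=make_compile_time_run_id()).
    print(make_compile_time_run_id())
```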
     with dsl.ExitHandler(clean_up_task):
         # compute execution params
         compute_exec_params = compute_exec_params_op(
-            worker_options=ray_worker_options,
-            actor_options=actor_options,
+            worker_options=ray_worker_options,
+            actor_options=runtime_actor_options,
+            data_s3_config=data_s3_config,
+            data_max_files=data_max_files,
+            data_num_samples=data_num_samples,
+            data_checkpointing=data_checkpointing,
+            runtime_pipeline_id=runtime_pipeline_id,
+            runtime_job_id=run_id,
+            runtime_code_location=runtime_code_location,
+            noop_sleep_sec=noop_sleep_sec,
         )
         ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
         # start Ray cluster
diff --git a/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml b/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml
index d703d36ca..4f8088b19 100644
--- a/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml
+++ b/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml
@@ -1,7 +1,7 @@
 pipeline_parameters:
   name: "noop"
   description: "Pipeline for noop task"
-  script_name: "noop_transform.py"
+  script_name: "-m dpk_noop.ray.runtime"
   prefix: ""
   multi_s3: False
   compute_func_name: ""
diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
index 6a682a0f2..8d8207fc8 100644
--- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
+++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
@@ -97,13 +97,12 @@ def {{ pipeline_name }}(
     # Ray cluster
     ray_name: str = "{{ pipeline_name }}-kfp-ray", # name of Ray cluster
+    ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
 {%- if image_pull_secret != "" %}
-    ray_run_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
 {%- else %}
-    ray_run_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
 {%- endif %}
@@ -132,7 +131,7 @@ def {{ pipeline_name }}(
     """
     Pipeline to execute {{ pipeline_name }} transform
     :param ray_name: name of the Ray cluster
-    :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2
+    :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2.
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
index 7dd7111ce..c5cba0230 100644
--- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
+++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
@@ -128,7 +128,7 @@ def code2parquet(
     """
     Pipeline to execute NOOP transform
     :param ray_name: name of the Ray cluster
-    :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2
+    :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2.
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 58a571e9b..6aa4dc82c 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -125,7 +125,7 @@ def code_quality( """ Pipeline to execute Code Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index b6d15934b..0f64bd4b0 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -137,7 +137,7 @@ def header_cleanser( """ Pipeline to execute Header Cleanser transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 55a176a0d..f29f1c839 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -124,7 +124,7 @@ def license_select( """ Pipeline to execute License Select transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 9c18c5e30..77f5b56b6 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -115,7 +115,7 @@ def malware( """ Pipeline to execute malware transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 31bad3798..5a6d1d20c 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -118,7 +118,7 @@ def lang_select( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 8fa169209..4e753ecef 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -148,7 +148,7 @@ def repo_level_order( """ Pipeline to execute repo_level_order transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7442aabe4..f0408e285 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -128,7 +128,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 975902797..f6670ebef 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -122,7 +122,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index d5a8abc9d..d85430259 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -126,7 +126,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index 6ba23c515..bd3a35894 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -132,7 +132,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 2207363d8..855eac46d 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -126,7 +126,7 @@ def html2parquet( """ Pipeline to execute html2parquet transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 680e27300..1faaa0d2e 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -134,7 +134,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index a7cfe4509..a9e39edac 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -135,7 +135,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 3895489f1..8b8797c3d 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -131,7 +131,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 13a39a1b8..e9ee06238 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -135,7 +135,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index 50ee4c653..a3fd5e1ef 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -119,7 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index 06c7a3253..a515b7c26 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -118,7 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 98011cb15..0b51e7f87 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -119,7 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 03cd29b0b..0b9ccd42d 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -138,7 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index dba40490e..941678d60 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -108,7 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 33782b07b..4f8e067d9 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -161,7 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 6b2f87f97..167b862bc 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -119,7 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 01f943bbc..37e377abd 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -138,7 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 5b3e2e41f..ae614a2b2 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -112,7 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 748ceb52b..1b65ad85d 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -42,6 +42,7 @@ def compute_exec_params_func( data_s3_config: str, data_max_files: int, data_num_samples: int, + data_checkpointing: bool, runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, @@ -53,6 +54,7 @@ def compute_exec_params_func( "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, @@ -91,8 +93,8 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed ray_run_id_KFPv2: str = "", + # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -114,7 +116,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,6 +174,7 @@ def noop( data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 53c9ab590..52d2a4dc1 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -103,7 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 071d69bdc..bb958688a 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -126,7 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index bbcb4a6a1..5e3c23dde 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -150,7 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory From d7fa55d8110b093a52f16dbf1ed6dad9ce467813 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 14:00:38 +0200 Subject: [PATCH 10/17] Address review comments. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 2 +- tools/ingest2parquet/Dockerfile | 2 +- transforms/Dockerfile.ray.template | 2 +- transforms/code/code2parquet/ray/Dockerfile | 2 +- transforms/code/code_profiler/Dockerfile.ray | 2 +- transforms/code/code_quality/ray/Dockerfile | 2 +- transforms/code/header_cleanser/ray/Dockerfile | 2 +- transforms/code/license_select/ray/Dockerfile | 2 +- transforms/code/malware/ray/Dockerfile | 2 +- transforms/code/proglang_select/ray/Dockerfile | 2 +- transforms/code/repo_level_ordering/ray/Dockerfile | 2 +- transforms/language/doc_chunk/Dockerfile.ray | 2 +- transforms/language/doc_quality/Dockerfile.ray | 2 +- transforms/language/html2parquet/Dockerfile.ray | 2 +- transforms/language/lang_id/Dockerfile.ray | 2 +- transforms/language/pdf2parquet/Dockerfile.ray | 2 +- transforms/language/pii_redactor/Dockerfile.ray | 2 +- transforms/language/text_encoder/Dockerfile.ray | 2 +- transforms/universal/doc_id/Dockerfile.ray | 2 +- transforms/universal/ededup/Dockerfile.ray | 2 +- transforms/universal/fdedup/Dockerfile.ray | 2 +- transforms/universal/filter/Dockerfile.ray | 2 +- transforms/universal/hap/Dockerfile.ray | 2 +- transforms/universal/profiler/Dockerfile.ray | 2 +- transforms/universal/resize/Dockerfile.ray | 2 +- transforms/universal/tokenization/Dockerfile.ray | 2 +- 26 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 5b51b0d2a..dd3670c52 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install libraries diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile index a4319c105..02bdebb84 100644 --- a/tools/ingest2parquet/Dockerfile +++ b/tools/ingest2parquet/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template index b8e52425b..30b1da959 100644 --- 
a/transforms/Dockerfile.ray.template +++ b/transforms/Dockerfile.ray.template @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index cf363def4..1f683ed04 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray index c308c284c..440b7f977 100644 --- a/transforms/code/code_profiler/Dockerfile.ray +++ b/transforms/code/code_profiler/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index 2127cfd81..e06ee8c7a 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index b5fc809aa..21bd02b2b 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/rayproject/ray:2.24.0-py310 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 184747ff5..49ada2fda 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 1c231275f..284c9fac4 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} AS base # see 
https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile index f13ea3444..f7ef64f17 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile index 79806dd73..6b308fdbf 100644 --- a/transforms/code/repo_level_ordering/ray/Dockerfile +++ b/transforms/code/repo_level_ordering/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray index 63f2981c3..f0514943b 100644 --- a/transforms/language/doc_chunk/Dockerfile.ray +++ b/transforms/language/doc_chunk/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray index 6b4ce18a1..fe0fe5b3b 100644 --- a/transforms/language/doc_quality/Dockerfile.ray +++ b/transforms/language/doc_quality/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray index f246116f4..43ddfa97d 100644 --- a/transforms/language/html2parquet/Dockerfile.ray +++ b/transforms/language/html2parquet/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray index ce81c320f..8e2de45ba 100644 --- a/transforms/language/lang_id/Dockerfile.ray +++ b/transforms/language/lang_id/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && 
chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray index e295a9e7f..3a11b7ee6 100644 --- a/transforms/language/pdf2parquet/Dockerfile.ray +++ b/transforms/language/pdf2parquet/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray index a95ce7cbe..93b6bf420 100644 --- a/transforms/language/pii_redactor/Dockerfile.ray +++ b/transforms/language/pii_redactor/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray index 4b6bee791..9a6fa04d8 100644 --- a/transforms/language/text_encoder/Dockerfile.ray +++ b/transforms/language/text_encoder/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index b8e52425b..30b1da959 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray index 2584f7979..10faaf297 100644 --- a/transforms/universal/ededup/Dockerfile.ray +++ b/transforms/universal/ededup/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray index da1c668f1..8d36e6a35 100644 --- a/transforms/universal/fdedup/Dockerfile.ray +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/filter/Dockerfile.ray 
b/transforms/universal/filter/Dockerfile.ray index b8e52425b..30b1da959 100644 --- a/transforms/universal/filter/Dockerfile.ray +++ b/transforms/universal/filter/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray index b8e52425b..30b1da959 100644 --- a/transforms/universal/hap/Dockerfile.ray +++ b/transforms/universal/hap/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray index b8e52425b..30b1da959 100644 --- a/transforms/universal/profiler/Dockerfile.ray +++ b/transforms/universal/profiler/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray index b8e52425b..30b1da959 100644 --- a/transforms/universal/resize/Dockerfile.ray +++ b/transforms/universal/resize/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray index 50e6ff7a3..0469e7d9b 100644 --- a/transforms/universal/tokenization/Dockerfile.ray +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip From 1fdb7fa6a7089cff8c935cad197d0c64571fe4c9 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 14:18:49 +0200 Subject: [PATCH 11/17] Minor fix. 
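Add the missing Dockerfile.ray for the noop transform. The file mirrors
transforms/Dockerfile.ray.template, including the
"chown ray:root /home/ray && chmod -R g=u /home/ray" step used by the
other transforms for OpenShift arbitrary-UID support, so the noop Ray
image now builds the same way as the rest.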
Signed-off-by: Revital Sur --- transforms/universal/noop/Dockerfile.ray | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 transforms/universal/noop/Dockerfile.ray diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray new file mode 100644 index 000000000..30b1da959 --- /dev/null +++ b/transforms/universal/noop/Dockerfile.ray @@ -0,0 +1,33 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 +FROM ${BASE_IMAGE} + +# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images +USER root +RUN chown ray:root /home/ray && chmod -R g=u /home/ray +USER ray + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] + + +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT From 535048176057f7ede9358431225fa5a8971aa0a0 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 16:49:21 +0200 Subject: [PATCH 12/17] Address review comments. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 10 +++++----- tools/ingest2parquet/Dockerfile | 10 +++++----- transforms/Dockerfile.ray.template | 8 ++++---- transforms/code/code2parquet/ray/Dockerfile | 14 +++++++------- transforms/code/code_profiler/Dockerfile.ray | 8 ++++---- transforms/code/code_quality/ray/Dockerfile | 18 +++++++++--------- transforms/code/header_cleanser/ray/Dockerfile | 12 ++++++------ transforms/code/license_select/ray/Dockerfile | 16 ++++++++-------- transforms/code/malware/ray/Dockerfile | 18 +++++++++--------- transforms/code/proglang_select/ray/Dockerfile | 10 +++++----- .../code/repo_level_ordering/ray/Dockerfile | 10 +++++----- transforms/language/doc_chunk/Dockerfile.ray | 8 ++++---- transforms/language/doc_quality/Dockerfile.ray | 8 ++++---- .../language/html2parquet/Dockerfile.ray | 8 ++++---- transforms/language/lang_id/Dockerfile.ray | 8 ++++---- transforms/language/pdf2parquet/Dockerfile.ray | 8 ++++---- .../language/pii_redactor/Dockerfile.ray | 4 ++-- .../language/text_encoder/Dockerfile.ray | 8 ++++---- transforms/universal/doc_id/Dockerfile.ray | 8 ++++---- transforms/universal/ededup/Dockerfile.ray | 10 +++++----- transforms/universal/fdedup/Dockerfile.ray | 8 ++++---- transforms/universal/filter/Dockerfile.ray | 8 ++++---- transforms/universal/hap/Dockerfile.ray | 8 ++++---- transforms/universal/noop/Dockerfile.ray | 8 ++++---- transforms/universal/profiler/Dockerfile.ray | 8 ++++---- transforms/universal/resize/Dockerfile.ray | 8 ++++---- .../universal/tokenization/Dockerfile.ray | 8 ++++---- 27 files changed, 130 insertions(+), 130 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index dd3670c52..f33c415f6 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ 
-4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray # install libraries @@ -15,13 +15,13 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/ +COPY --chmod=g=u --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/ RUN cd shared_workflow_support_lib && pip install --no-cache-dir -e . -COPY --chmod=775 --chown=ray:root workflow_support_lib workflow_support_lib/ +COPY --chmod=g=u --chown=ray:root workflow_support_lib workflow_support_lib/ RUN cd workflow_support_lib && pip install --no-cache-dir -e . # overwriting the installation of old versions of pydantic @@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3 # remove credentials-containing file RUN rm requirements.txt # components -COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src +COPY --chmod=g=u --chown=ray:root ./src /pipelines/component/src # Set environment ENV KFP_v2=$KFP_v2 diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile index 02bdebb84..6809535d2 100644 --- a/tools/ingest2parquet/Dockerfile +++ b/tools/ingest2parquet/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray # install pytest @@ -13,7 +13,7 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] COPY requirements.txt requirements.txt @@ -21,9 +21,9 @@ RUN pip install --no-cache-dir -r requirements.txt RUN rm requirements.txt # copy source -COPY --chmod=775 --chown=ray:root ./src . +COPY --chmod=g=u --chown=ray:root ./src . 
# copy test -COPY --chmod=775 --chown=ray:root test/ test/ -COPY --chmod=775 --chown=ray:root test-data/ test-data/ +COPY --chmod=g=u --chown=ray:root test/ test/ +COPY --chmod=g=u --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template index 30b1da959..07a22fac7 100644 --- a/transforms/Dockerfile.ray.template +++ b/transforms/Dockerfile.ray.template @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index 1f683ed04..f3e091c62 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -16,22 +16,22 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . +COPY --chmod=g=u --chown=ray:root src/code2parquet_transform_ray.py . 
# copy some of the samples in -COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ +COPY --chmod=g=u --chown=ray:root src/code2parquet_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray index 440b7f977..0d501f547 100644 --- a/transforms/code/code_profiler/Dockerfile.ray +++ b/transforms/code/code_profiler/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root dpk_code_profiler/ dpk_code_profiler/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_code_profiler/ dpk_code_profiler/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index e06ee8c7a..f34572b27 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -19,28 +19,28 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . #COPY requirements.txt requirements.txt #RUN pip install --no-cache-dir -r requirements.txt -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py . +COPY --chmod=g=u --chown=ray:root ./src/code_quality_transform_ray.py . 
# copy some of the samples in -COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/ +COPY --chmod=g=u --chown=ray:root ./src/code_quality_local_ray.py local/ # copy test -COPY --chmod=775 --chown=ray:root test/ test/ -COPY --chmod=775 --chown=ray:root test-data/ test-data/ +COPY --chmod=g=u --chown=ray:root test/ test/ +COPY --chmod=g=u --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 21bd02b2b..465b8c7a9 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/rayproject/ray:2.24.0-py310 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray # install pytest @@ -12,14 +12,14 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform RUN cd python-transform && pip install --no-cache-dir -e . -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # Install system dependencies, including libgomp1 @@ -31,7 +31,7 @@ User ray # copy source data COPY ./src/header_cleanser_transform_ray.py . -COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ +COPY --chmod=g=u --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 49ada2fda..c4604d275 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,20 +15,20 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . 
-COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=775 --chown=ray:root README.md README.md +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . -COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ +COPY --chmod=g=u --chown=ray:root src/license_select_transform_ray.py . +COPY --chmod=g=u --chown=ray:root src/license_select_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 284c9fac4..2d2dd5e10 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} AS base # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -45,24 +45,24 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ +COPY --chmod=g=u --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in -COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ +COPY --chmod=g=u --chown=ray:root src/malware_local_ray.py local/ -COPY --chmod=775 --chown=ray:root test/ test/ -COPY --chmod=775 --chown=ray:root test-data/ test-data/ +COPY --chmod=g=u --chown=ray:root test/ test/ +COPY --chmod=g=u --chown=ray:root test-data/ test-data/ ENV PYTHONPATH /home/ray diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile index f7ef64f17..65ff9b15e 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,17 +15,17 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ +COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . #COPY requirements.txt requirements.txt #RUN pip install --no-cache-dir -r requirements.txt -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile index 6b308fdbf..69bd33f09 100644 --- a/transforms/code/repo_level_ordering/ray/Dockerfile +++ b/transforms/code/repo_level_ordering/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=775 --chown=ray:root README.md README.md +COPY --chmod=g=u --chown=ray:root src/ src/ +COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=g=u --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray index f0514943b..d2b4d464c 100644 --- a/transforms/language/doc_chunk/Dockerfile.ray +++ b/transforms/language/doc_chunk/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray # install pytest @@ -14,12 +14,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray index fe0fe5b3b..f1f6858b2 100644 --- a/transforms/language/doc_quality/Dockerfile.ray +++ b/transforms/language/doc_quality/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root dpk_doc_quality/ dpk_doc_quality/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_doc_quality/ dpk_doc_quality/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray index 43ddfa97d..9ed6c8e50 100644 --- a/transforms/language/html2parquet/Dockerfile.ray +++ b/transforms/language/html2parquet/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root dpk_html2parquet/ dpk_html2parquet/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_html2parquet/ dpk_html2parquet/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray index 8e2de45ba..91b05d6ee 100644 --- a/transforms/language/lang_id/Dockerfile.ray +++ b/transforms/language/lang_id/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -21,12 +21,12 @@ USER ray # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_lang_id/ dpk_lang_id/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_lang_id/ dpk_lang_id/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # clean up apt diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray index 3a11b7ee6..f3b03f596 100644 --- a/transforms/language/pdf2parquet/Dockerfile.ray +++ b/transforms/language/pdf2parquet/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -23,13 +23,13 @@ RUN \ # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray index 93b6bf420..40d6f8e6a 100644 --- a/transforms/language/pii_redactor/Dockerfile.ray +++ b/transforms/language/pii_redactor/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,7 +15,7 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray index 9a6fa04d8..638a02e9f 100644 --- a/transforms/language/text_encoder/Dockerfile.ray +++ b/transforms/language/text_encoder/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray # install pytest @@ -13,11 +13,11 @@ ARG PIP_INSTALL_EXTRA_ARGS ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray index 10faaf297..01d60d3b9 100644 --- a/transforms/universal/ededup/Dockerfile.ray +++ b/transforms/universal/ededup/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,14 +15,14 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] # Install ray project source -COPY --chmod=775 --chown=ray:root dpk_ededup/ dpk_ededup/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt -COPY --chmod=775 --chown=ray:root README.md README.md +COPY --chmod=g=u --chown=ray:root dpk_ededup/ dpk_ededup/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root README.md README.md RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray index 8d36e6a35..bae6cd9ef 100644 --- a/transforms/universal/fdedup/Dockerfile.ray +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root dpk_fdedup/ dpk_fdedup/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_fdedup/ dpk_fdedup/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/universal/filter/Dockerfile.ray b/transforms/universal/filter/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/filter/Dockerfile.ray +++ b/transforms/universal/filter/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/hap/Dockerfile.ray +++ b/transforms/universal/hap/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/noop/Dockerfile.ray +++ b/transforms/universal/noop/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/profiler/Dockerfile.ray +++ b/transforms/universal/profiler/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray index 30b1da959..07a22fac7 100644 --- a/transforms/universal/resize/Dockerfile.ray +++ b/transforms/universal/resize/Dockerfile.ray @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray index 0469e7d9b..2988d8938 100644 --- a/transforms/universal/tokenization/Dockerfile.ray +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod -R g=u /home/ray +RUN chown ray:root /home/ray && chmod g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root dpk_tokenization/ dpk_tokenization/ -COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=g=u --chown=ray:root dpk_tokenization/ dpk_tokenization/ +COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment From 6253bacd32f59249f6485b03a20fe0b05b53f7c4 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 17:36:32 +0200 Subject: [PATCH 13/17] A fix. 
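Revert the COPY --chmod values from the symbolic g=u back to octal 775.
Unlike the shell chmod invoked in RUN steps, the COPY/ADD --chmod flag
expects an octal mode string (at least with the BuildKit versions in use
here), so --chmod=g=u is rejected at build time as an invalid chmod
parameter. The RUN-level "chmod g=u /home/ray" from the previous commit
is kept, since plain chmod does accept symbolic modes. A minimal sketch
of the distinction (paths illustrative only):

    # COPY --chmod needs an octal mode; a symbolic value fails the build
    COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
    # the shell chmod in a RUN step accepts symbolic modes
    RUN chown ray:root /home/ray && chmod g=u /home/ray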
Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 8 ++++---- tools/ingest2parquet/Dockerfile | 8 ++++---- transforms/Dockerfile.ray.template | 6 +++--- transforms/code/code2parquet/ray/Dockerfile | 12 ++++++------ transforms/code/code_profiler/Dockerfile.ray | 6 +++--- transforms/code/code_quality/ray/Dockerfile | 16 ++++++++-------- transforms/code/header_cleanser/ray/Dockerfile | 10 +++++----- transforms/code/license_select/ray/Dockerfile | 14 +++++++------- transforms/code/malware/ray/Dockerfile | 16 ++++++++-------- transforms/code/proglang_select/ray/Dockerfile | 8 ++++---- .../code/repo_level_ordering/ray/Dockerfile | 8 ++++---- transforms/language/doc_chunk/Dockerfile.ray | 6 +++--- transforms/language/doc_quality/Dockerfile.ray | 6 +++--- transforms/language/html2parquet/Dockerfile.ray | 6 +++--- transforms/language/lang_id/Dockerfile.ray | 6 +++--- transforms/language/pdf2parquet/Dockerfile.ray | 6 +++--- transforms/language/pii_redactor/Dockerfile.ray | 2 +- transforms/language/text_encoder/Dockerfile.ray | 6 +++--- transforms/universal/doc_id/Dockerfile.ray | 6 +++--- transforms/universal/ededup/Dockerfile.ray | 8 ++++---- transforms/universal/fdedup/Dockerfile.ray | 6 +++--- transforms/universal/filter/Dockerfile.ray | 6 +++--- transforms/universal/hap/Dockerfile.ray | 6 +++--- transforms/universal/noop/Dockerfile.ray | 6 +++--- transforms/universal/profiler/Dockerfile.ray | 6 +++--- transforms/universal/resize/Dockerfile.ray | 6 +++--- transforms/universal/tokenization/Dockerfile.ray | 6 +++--- 27 files changed, 103 insertions(+), 103 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index f33c415f6..5adbd533a 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -15,13 +15,13 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/ +COPY --chmod=775 --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/ RUN cd shared_workflow_support_lib && pip install --no-cache-dir -e . -COPY --chmod=g=u --chown=ray:root workflow_support_lib workflow_support_lib/ +COPY --chmod=775 --chown=ray:root workflow_support_lib workflow_support_lib/ RUN cd workflow_support_lib && pip install --no-cache-dir -e . # overwriting the installation of old versions of pydantic @@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3 # remove credentials-containing file RUN rm requirements.txt # components -COPY --chmod=g=u --chown=ray:root ./src /pipelines/component/src +COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src # Set environment ENV KFP_v2=$KFP_v2 diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile index 6809535d2..c37739a6a 100644 --- a/tools/ingest2parquet/Dockerfile +++ b/tools/ingest2parquet/Dockerfile @@ -13,7 +13,7 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] COPY requirements.txt requirements.txt @@ -21,9 +21,9 @@ RUN pip install --no-cache-dir -r requirements.txt RUN rm requirements.txt # copy source -COPY --chmod=g=u --chown=ray:root ./src . +COPY --chmod=775 --chown=ray:root ./src . # copy test -COPY --chmod=g=u --chown=ray:root test/ test/ -COPY --chmod=g=u --chown=ray:root test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template index 07a22fac7..837a3ffda 100644 --- a/transforms/Dockerfile.ray.template +++ b/transforms/Dockerfile.ray.template @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index f3e091c62..74e6577ed 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -16,22 +16,22 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ +COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=g=u --chown=ray:root src/code2parquet_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . 
# copy some of the samples in -COPY --chmod=g=u --chown=ray:root src/code2parquet_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray index 0d501f547..9a63ee9fb 100644 --- a/transforms/code/code_profiler/Dockerfile.ray +++ b/transforms/code/code_profiler/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_code_profiler/ dpk_code_profiler/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_code_profiler/ dpk_code_profiler/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index f34572b27..9906a50b7 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -19,28 +19,28 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ +COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . #COPY requirements.txt requirements.txt #RUN pip install --no-cache-dir -r requirements.txt -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=g=u --chown=ray:root ./src/code_quality_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py . # copy some of the samples in -COPY --chmod=g=u --chown=ray:root ./src/code_quality_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/ # copy test -COPY --chmod=g=u --chown=ray:root test/ test/ -COPY --chmod=g=u --chown=ray:root test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 465b8c7a9..056f64c0a 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -12,14 +12,14 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform +COPY --chmod=775 --chown=ray:root python-transform/ python-transform RUN cd python-transform && pip install --no-cache-dir -e . -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # Install system dependencies, including libgomp1 @@ -31,7 +31,7 @@ User ray # copy source data COPY ./src/header_cleanser_transform_ray.py . -COPY --chmod=g=u --chown=ray:root src/header_cleanser_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index c4604d275..8d1c457e5 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -15,20 +15,20 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ +COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=g=u --chown=ray:root README.md README.md +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY --chmod=g=u --chown=ray:root src/license_select_transform_ray.py . -COPY --chmod=g=u --chown=ray:root src/license_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 2d2dd5e10..56825ffa6 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -45,24 +45,24 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ +COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . 
-COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY --chmod=g=u --chown=ray:root src/malware_transform_ray.py ./ +COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in -COPY --chmod=g=u --chown=ray:root src/malware_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ -COPY --chmod=g=u --chown=ray:root test/ test/ -COPY --chmod=g=u --chown=ray:root test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ ENV PYTHONPATH /home/ray diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile index 65ff9b15e..f7ccd1ca2 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/ray/Dockerfile @@ -15,17 +15,17 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/ +COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . #COPY requirements.txt requirements.txt #RUN pip install --no-cache-dir -r requirements.txt -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile index 69bd33f09..528439722 100644 --- a/transforms/code/repo_level_ordering/ray/Dockerfile +++ b/transforms/code/repo_level_ordering/ray/Dockerfile @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root src/ src/ -COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=g=u --chown=ray:root README.md README.md +COPY --chmod=775 --chown=ray:root src/ src/ +COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml +COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . 
# copy source data diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray index d2b4d464c..3a541de1b 100644 --- a/transforms/language/doc_chunk/Dockerfile.ray +++ b/transforms/language/doc_chunk/Dockerfile.ray @@ -14,12 +14,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray index f1f6858b2..7ba61b544 100644 --- a/transforms/language/doc_quality/Dockerfile.ray +++ b/transforms/language/doc_quality/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_doc_quality/ dpk_doc_quality/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_doc_quality/ dpk_doc_quality/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray index 9ed6c8e50..742cea06a 100644 --- a/transforms/language/html2parquet/Dockerfile.ray +++ b/transforms/language/html2parquet/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_html2parquet/ dpk_html2parquet/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_html2parquet/ dpk_html2parquet/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray index 91b05d6ee..93df29982 100644 --- a/transforms/language/lang_id/Dockerfile.ray +++ b/transforms/language/lang_id/Dockerfile.ray @@ -21,12 +21,12 @@ USER ray # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_lang_id/ dpk_lang_id/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_lang_id/ dpk_lang_id/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # clean up apt diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray index f3b03f596..4dc62538e 100644 --- a/transforms/language/pdf2parquet/Dockerfile.ray +++ b/transforms/language/pdf2parquet/Dockerfile.ray @@ -23,13 +23,13 @@ RUN \ # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray index 40d6f8e6a..791cfd2a9 100644 --- a/transforms/language/pii_redactor/Dockerfile.ray +++ b/transforms/language/pii_redactor/Dockerfile.ray @@ -15,7 +15,7 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray index 638a02e9f..ba0913bad 100644 --- a/transforms/language/text_encoder/Dockerfile.ray +++ b/transforms/language/text_encoder/Dockerfile.ray @@ -13,11 +13,11 @@ ARG PIP_INSTALL_EXTRA_ARGS ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray index 01d60d3b9..bb1ffae5d 100644 --- a/transforms/universal/ededup/Dockerfile.ray +++ b/transforms/universal/ededup/Dockerfile.ray @@ -15,14 +15,14 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] # Install ray project source -COPY --chmod=g=u --chown=ray:root dpk_ededup/ dpk_ededup/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt -COPY --chmod=g=u --chown=ray:root README.md README.md +COPY --chmod=775 --chown=ray:root dpk_ededup/ dpk_ededup/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray index bae6cd9ef..09ebced24 100644 --- a/transforms/universal/fdedup/Dockerfile.ray +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_fdedup/ dpk_fdedup/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_fdedup/ dpk_fdedup/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/universal/filter/Dockerfile.ray b/transforms/universal/filter/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/filter/Dockerfile.ray +++ b/transforms/universal/filter/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/hap/Dockerfile.ray +++ b/transforms/universal/hap/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/noop/Dockerfile.ray +++ b/transforms/universal/noop/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/profiler/Dockerfile.ray +++ b/transforms/universal/profiler/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray index 07a22fac7..837a3ffda 100644 --- a/transforms/universal/resize/Dockerfile.ray +++ b/transforms/universal/resize/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray index 2988d8938..26d4a24d6 100644 --- a/transforms/universal/tokenization/Dockerfile.ray +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_tokenization/ dpk_tokenization/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_tokenization/ dpk_tokenization/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment From 40c2889f2d9ae7ddc6c9d034c8a235ab1300b693 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 18:52:26 +0200 Subject: [PATCH 14/17] More changes. Signed-off-by: Revital Sur --- transforms/code/code2parquet/ray/Dockerfile | 4 ++-- transforms/code/header_cleanser/ray/Dockerfile | 6 +++--- transforms/code/license_select/ray/Dockerfile | 4 ++-- transforms/code/proglang_select/ray/Dockerfile | 8 ++++---- transforms/code/repo_level_ordering/ray/Dockerfile | 10 +++++----- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index 74e6577ed..6681f09d5 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -34,8 +34,8 @@ COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 056f64c0a..0bef909d8 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -30,12 +30,12 @@ RUN sudo apt-get update && sudo apt-get install -y \ User ray # copy source data -COPY ./src/header_cleanser_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/header_cleanser_transform_ray.py . COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 8d1c457e5..d7b3be5f8 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -31,8 +31,8 @@ COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Put these at the end since they seem to upset the docker cache. ARG BUILD_DATE diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile index f7ccd1ca2..7f457ed4e 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/ray/Dockerfile @@ -29,14 +29,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image -COPY ./src/proglang_select_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/proglang_select_transform_ray.py . # copy some of the samples in -COPY ./src/proglang_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/proglang_select_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile index 528439722..a533a281e 100644 --- a/transforms/code/repo_level_ordering/ray/Dockerfile +++ b/transforms/code/repo_level_ordering/ray/Dockerfile @@ -24,13 +24,13 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY ./src/repo_level_order_transform_ray.py . -COPY ./src/repo_level_order_local_ray.py local/ -COPY ./src/repo_level_order_s3_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_s3_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray:/home/ray/src From c5117e54c52e324f29182c07fcbe3a613768e09a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 27 Jan 2025 06:11:40 +0200 Subject: [PATCH 15/17] Fix super pipeline kfp v2. Signed-off-by: Revital Sur --- examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md | 1 + .../superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py | 2 ++ transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md index f68c1aaf7..2a16be57f 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md @@ -21,6 +21,7 @@ Another useful feature of the KFP v2 is the `Json` editor for the `dict` type in - It creates just one run that includes all the nested transfroms and their sub-tasks. - No need for additional component as `executeSubWorkflowComponent.yaml`. All the implementation in the same pipeline file. - In superpipelines of KFP v1 there exists an option to override the common parameters with specific values for each one of the transforms. This option is missing in the KFP v2 superpipelines. +- In kfp V2 pipelines the user is requested to insert a unique string for the ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because in KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
### How to compile the superpipeline
```
diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py
index 434d84ab0..7c82ab79a 100644
--- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py
+++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py
@@ -62,6 +62,7 @@ def super_pipeline(
     p2_skip: bool = False,
     p2_noop_sleep_sec: int = 10,
     p2_ray_name: str = "noop-kfp-ray",
+    p2_ray_run_id_KFPv2: str = "",
     p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image},
     p2_ray_worker_options: dict = {
         "replicas": 2,
@@ -75,6 +76,7 @@ def super_pipeline(
     # Document ID step parameters
     p3_name: str = "doc_id",
     p3_ray_name: str = "docid-kfp-ray",
+    p3_ray_run_id_KFPv2: str = "",
     p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image},
     p3_ray_worker_options: dict = {
         "replicas": 2,
diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py
index 0b9ccd42d..2542a876c 100644
--- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py
+++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py
@@ -27,7 +27,7 @@
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_doc_id.ray.transform"
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
 # path to kfp component specifications files
 component_spec_path = os.getenv(

From 5bba22cf78f4767add841383478558396ea59348 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Mon, 27 Jan 2025 06:53:53 -0600
Subject: [PATCH 16/17] Address review comments.

Signed-off-by: Revital Sur
---
 examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md    | 2 +-
 .../ray/kfp_v2/superpipeline_noop_docId_v2_wf.py              | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md
index 2a16be57f..69d315efe 100644
--- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md
+++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md
@@ -21,7 +21,7 @@ Another useful feature of the KFP v2 is the `Json` editor for the `dict` type in
 - It creates just one run that includes all the nested transfroms and their sub-tasks.
 - No need for additional component as `executeSubWorkflowComponent.yaml`. All the implementation in the same pipeline file.
 - In superpipelines of KFP v1 there exists an option to override the common parameters with specific values for each one of the transforms. This option is missing in the KFP v2 superpipelines.
-- In kfp V2 pipelines the user is requested to insert a unique string for the ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because in KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0.
+- In KFP v2 pipelines the user is requested to insert a unique string for the Ray cluster, created at run creation time (called `ray_run_id_KFPv2`). This is because in KFP v2 `dsl.RUN_ID_PLACEHOLDER` is deprecated and cannot be used since SDK 2.5.0, and a unique string cannot be generated at run time; see https://github.com/kubeflow/pipelines/issues/10187.
### How to compile the superpipeline ``` diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py index 7c82ab79a..5d3846540 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py @@ -57,12 +57,12 @@ def super_pipeline( p1_pipeline_data_max_files: int = -1, p1_pipeline_data_num_samples: int = -1, p1_pipeline_data_checkpointing: bool = False, + p1_pipeline_ray_run_id_KFPv2: str = "", # noop step parameters p2_name: str = "noop", p2_skip: bool = False, p2_noop_sleep_sec: int = 10, p2_ray_name: str = "noop-kfp-ray", - p2_ray_run_id_KFPv2: str = "", p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image}, p2_ray_worker_options: dict = { "replicas": 2, @@ -76,7 +76,6 @@ def super_pipeline( # Document ID step parameters p3_name: str = "doc_id", p3_ray_name: str = "docid-kfp-ray", - p3_ray_run_id_KFPv2: str = "", p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, p3_ray_worker_options: dict = { "replicas": 2, From a306cbfd21973ed26c713cd0e51a86f74d0dbe02 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 27 Jan 2025 14:02:39 -0500 Subject: [PATCH 17/17] fix dependency issue breaking test-src Signed-off-by: Maroun Touma --- transforms/universal/fdedup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index b28fac859..42af99d8b 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -6,4 +6,4 @@ disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 sentencepiece>=0.2.0 -mmh3>=4.1.0 +mmh3>=4.1.0, <=5.0.1
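---

As a usage sketch (not part of the patches themselves): one way a caller can supply the unique Ray run ID that these changes require is to generate it at submission time and pass it as a pipeline argument. The endpoint URL, package file name, and the `ray_name` argument below are illustrative placeholders; only the `ray_run_id_KFPv2` parameter name is taken from the README change above.

```python
# Minimal sketch, assuming a compiled KFP v2 pipeline package and a reachable
# KFP endpoint. Because dsl.RUN_ID_PLACEHOLDER is deprecated in SDK >= 2.5.0
# and a unique string cannot be generated inside the pipeline at run time,
# the caller creates one here, at run-creation time.
import uuid

from kfp import Client

# A short random token; it only needs to be unique across simultaneous runs
# of the same pipeline version, so the derived Ray cluster names do not clash.
ray_run_id = uuid.uuid4().hex[:8]

client = Client(host="http://localhost:8080")  # placeholder KFP endpoint

client.create_run_from_pipeline_package(
    "noop_wf.yaml",  # placeholder: a compiled pipeline from this series
    arguments={
        "ray_name": "noop-kfp-ray",
        "ray_run_id_KFPv2": ray_run_id,  # the user-supplied unique string
    },
)
```

Passing a fresh token on every submission keeps the generated Ray cluster name unique, which is what the `ray_run_id_KFPv2` and `p1_pipeline_ray_run_id_KFPv2` parameters added in these patches rely on.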