Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pull request source branch to cut release 0.2.1 #622

Merged
merged 11 commits into from
Sep 25, 2024
2 changes: 1 addition & 1 deletion .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2
DPK_MICRO_VERSION=1
# The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
# It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
DPK_VERSION_SUFFIX=.dev3
DPK_VERSION_SUFFIX=

DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)

Expand Down
2 changes: 1 addition & 1 deletion data-processing-lib/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Library"
Expand Down
4 changes: 2 additions & 2 deletions data-processing-lib/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_ray"
version = "0.2.1.dev3"
version = "0.2.1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10"
description = "Data Preparation Toolkit Library for Ray"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit>=0.2.1.dev3",
"data-prep-toolkit>=0.2.1",
"ray[default]==2.24.0",
# These two are to fix security issues identified by quay.io
"fastapi>=0.110.2",
Expand Down
4 changes: 2 additions & 2 deletions data-processing-lib/spark/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_spark"
version = "0.2.1.dev3"
version = "0.2.1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10"
description = "Data Preparation Toolkit Library for Spark"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"pyspark>=3.5.2",
"psutil>=6.0.0"
]
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/createRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/deleteRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeRayJobComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeSubWorkflowComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ outputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists, and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v1"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -13,7 +13,7 @@ authors = [
]
dependencies = [
"kfp==1.8.22",
"data-prep-toolkit-kfp-shared==0.2.1.dev3",
"data-prep-toolkit-kfp-shared==0.2.1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v2"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"kfp==2.8.0",
"kfp-kubernetes==1.2.0",
"data-prep-toolkit-kfp-shared==0.2.1.dev3",
"data-prep-toolkit-kfp-shared==0.2.1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_shared"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10,<3.12"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"requests",
"kubernetes",
"data-prep-toolkit-ray==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1",
]

[build-system]
Expand Down
16 changes: 8 additions & 8 deletions kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
run_fuzzy_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_tokenization_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")

code_to_parquet_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest"
proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest"
code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"
malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest"
tokenizer_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest"
code_to_parquet_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:0.2.1"
proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.2.1"
code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.2.1"
malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.2.1"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.2.1"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.2.1"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.2.1"
tokenizer_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.2.1"


# Pipeline to invoke execution on remote resource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
run_exact_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_fuzzy_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")

doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.2.1"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.2.1"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.2.1"

# Pipeline to invoke execution on remote resource
@dsl.pipeline(
Expand Down
36 changes: 36 additions & 0 deletions release-notes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
# Data Prep Kit Release notes

## Release 0.2.1 - 9/24/2024

### General
1. Bug fixes across the repo
1. Added AI Alliance RAG demo, tutorials and notebooks and tips for running on google colab
1. Added new transforms and single package for transforms published to pypi
1. improved CI/CD with targeted workflow triggered on specific changes to specific modules
1. New enhancements for cutting a release


### data-prep-toolkit libraries (python, ray, spark)

1. Restructure the repository to distinguish/separate runtime libraries
1. Split data-processing-lib/ray into python and ray
1. Spark runtime
1. updated pyarrow version
1. define required transform() method as abstract to AbstractTableTransform
1. Enables configuration of makefile to use src or pypi for data-prep-kit library dependencies
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of the bullets start with upper case, while others with lower.



### KFP Workloads

1. Update kfp image version
1. Enable kfp in GH action for testing randomly selected workflow and prevent kfp test for transforms that do not support it
1. Auto generate kfp pipelines
1. Combine the common KFP support code in a shared library
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it is not a new feature.
but the following is:

  1. Add a configurable timeout before destroying the deployed Ray cluster.

1. Update K8s cluster deployment and remove creation of clusterrolebinding in kubeflow installation


### Transforms

1. Added 7 new transdforms including: language identification, profiler, repo level ordering, doc quality, pdf2parquet, HTML2Parquet and PII Transform
1. Added ededup python implementation and incremental ededup
1. Added fuzzy floating point comparison


## Release 0.2.0 - 6/27/2024

### General
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
# the name of the job script
EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py"

task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:0.2.1"


# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code2parquet/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_python"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "code2parquet Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,7 +10,7 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"parameterized",
"pandas",
]
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code2parquet/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_ray"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "code2parquet Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,8 +10,8 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit-ray==0.2.1.dev3",
"dpk-code2parquet-transform-python==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1",
"dpk-code2parquet-transform-python==0.2.1",
"parameterized",
"pandas",
]
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code_quality/kfp_ray/code_quality_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py"
PREFIX: str = ""

task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.2.1"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/code_quality/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_quality_transform_python"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "Code Quality Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,7 +9,7 @@ authors = [
{ name = "Shivdeep Singh", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"bs4==0.0.2",
"transformers==4.38.2",
]
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code_quality/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_quality_transform_ray"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "Code Quality Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,8 +9,8 @@ authors = [
{ name = "Shivdeep Singh", email = "[email protected]" },
]
dependencies = [
"dpk-code-quality-transform-python==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1.dev3",
"dpk-code-quality-transform-python==0.2.1",
"data-prep-toolkit-ray==0.2.1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
EXEC_SCRIPT_NAME: str = "header_cleanser_transform_ray.py"
PREFIX: str = ""

task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:0.2.1"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
4 changes: 2 additions & 2 deletions transforms/code/header_cleanser/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_header_cleanser_transform_python"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "License and Copyright Removal Transform for Python"
license = {text = "Apache-2.0"}
Expand All @@ -9,7 +9,7 @@ authors = [
{ name = "Yash kalathiya", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev3",
"data-prep-toolkit==0.2.1",
"scancode-toolkit==32.1.0",
]

Expand Down
6 changes: 3 additions & 3 deletions transforms/code/header_cleanser/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_header_cleanser_transform_ray"
version = "0.2.1.dev3"
version = "0.2.1"
requires-python = ">=3.10"
description = "License and copyright removal Transform for Ray"
license = {text = "Apache-2.0"}
Expand All @@ -9,8 +9,8 @@ authors = [
{ name = "Yash kalathiya", email = "[email protected]" },
]
dependencies = [
"dpk-header-cleanser-transform-python==0.2.1.dev3",
"data-prep-toolkit-ray==0.2.1.dev3",
"dpk-header-cleanser-transform-python==0.2.1",
"data-prep-toolkit-ray==0.2.1",
"scancode-toolkit==32.1.0",
]

Expand Down
4 changes: 2 additions & 2 deletions transforms/code/malware/kfp_ray/malware_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
# the name of the job script
EXEC_SCRIPT_NAME: str = "malware_transform_ray.py"

task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest"
task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.2.1"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.1"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
Loading
Loading