From c8c0d04e2e6d3293bb2784d2a97b76ad09d435d5 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 14 Jan 2025 06:16:52 -0500 Subject: [PATCH 1/4] prepare for 1.0 release Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index faf72cfe6..452bb4cb7 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "1.0.0a3" +version = "1.0.0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" From a3e248a842349bf627f96a313a44da3807bfb053 Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Fri, 24 Jan 2025 11:06:13 -0800 Subject: [PATCH 2/4] Update release-notes.md --- release-notes.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/release-notes.md b/release-notes.md index 1020b3214..84a9fbf92 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,17 +1,17 @@ # Data Prep Kit Release notes -## Release 1.0.0 - 1/14/2025 +## Release 1.0.0 - 1/24/2025 ## General -1. Refactor all language transforms and implement simplified APIs for refactored transforms -1. Added notebooks examples for each of the transforms -1. Streamlined documentation and added tutorial -1. Other minor enhancements and bug fixes +1. Refactored all language transforms and implemented simplified APIs for the refactored transforms +1. Added notebook examples for each of the transforms +1. Streamlined documentation and added tutorial for developers who want to build new transforms +1. Other minor enhancements and bug fixes were done for transforms, workflow pipelines, and CI/CD makefiles ### Transforms -1. Added new similarity transform +1. Added new similarity transform (for detecting confidentiality, copyright, and/or plagiarism in documents) ## Release 0.2.3 - 12/15/2024 From f9e2e7280296a13c9c703704727ef53f97c68db9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 24 Jan 2025 15:50:26 -0500 Subject: [PATCH 3/4] final testing with 1.0 before release to pypi Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 2 +- transforms/transforms-1.0-lang-ray.ipynb | 22 +++------------------- transforms/transforms-1.0-lang.ipynb | 6 +++--- 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index af19f97cb..396ee62d9 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "1.0.0a6" +version = "1.0.0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/transforms-1.0-lang-ray.ipynb b/transforms/transforms-1.0-lang-ray.ipynb index 1f19f8bb3..e903e562a 100644 --- a/transforms/transforms-1.0-lang-ray.ipynb +++ b/transforms/transforms-1.0-lang-ray.ipynb @@ -21,7 +21,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install 'data-prep-toolkit-transforms[ray, language]==1.0.0a4'\n", + "!pip install 'data-prep-toolkit-transforms[ray,language]'\n", "import pyarrow.parquet as pq\n", "import pandas as pd" ] @@ -335,8 +335,8 @@ "outputs": [], "source": [ "##### **** To explote the output from eDedup, run the code below\n", - "table = pq.read_table('files-ededup/arxiv_org_2408.09869v5.pdf_application.parquet')\n", - "table.to_pandas()" + "#table = pq.read_table('files-ededup/arxiv_org_2408.09869v5.pdf_application.parquet')\n", + "#table.to_pandas()" ] }, { @@ -595,22 +595,6 @@ "#import glob\n", "#glob.glob(\"files-fdedup/*\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36c243b7-5097-4a3c-bd4e-45c3b8273a90", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52d85768-7a15-46bc-8c46-6782dba53d69", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/transforms/transforms-1.0-lang.ipynb b/transforms/transforms-1.0-lang.ipynb index fa86b7609..d9bb0868c 100644 --- a/transforms/transforms-1.0-lang.ipynb +++ b/transforms/transforms-1.0-lang.ipynb @@ -20,7 +20,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install 'data-prep-toolkit-transforms[language]==1.0.0a1'\n", + "!pip install 'data-prep-toolkit-transforms[language]'\n", "import pyarrow.parquet as pq\n", "import pandas as pd" ] @@ -330,8 +330,8 @@ "outputs": [], "source": [ "##### **** To explote the output from eDedup, run the code below\n", - "table = pq.read_table('files-ededup/arxiv_org_2408.09869v5.pdf_application.parquet')\n", - "table.to_pandas()" + "#table = pq.read_table('files-ededup/arxiv_org_2408.09869v5.pdf_application.parquet')\n", + "#table.to_pandas()" ] }, { From 8e927bdef573fddc9857968f98f4e92447f19283 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 24 Jan 2025 16:23:02 -0500 Subject: [PATCH 4/4] setup for next release Signed-off-by: Maroun Touma --- .make.versions | 2 +- transforms/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.make.versions b/.make.versions index 4cde94aa9..cd1f7505c 100644 --- a/.make.versions +++ b/.make.versions @@ -63,4 +63,4 @@ endif # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=1.0.0a0 +TRANSFORMS_PKG_VERSION=1.0.1.dev0 diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 396ee62d9..c80b93af7 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "1.0.0" +version = "1.0.1.dev0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray"