From c294f85791486d947d8a93ebcf0125450701b43c Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 24 Jan 2025 10:29:53 -0500 Subject: [PATCH 1/5] refactored profiler Signed-off-by: Maroun Touma --- transforms/universal/profiler/.dockerignore | 10 ++ .../{python/Dockerfile => Dockerfile.python} | 20 +-- .../{ray/Dockerfile => Dockerfile.ray} | 24 +-- .../universal/profiler/Dockerfile.spark | 34 +++++ transforms/universal/profiler/Makefile | 86 ++--------- transforms/universal/profiler/README.md | 100 ++++++++++++- .../profiler/dpk_profiler/__init__.py | 2 + .../src => dpk_profiler}/base_tokenizer.py | 0 .../local.py} | 6 +- .../local_python.py} | 4 +- .../profiler/dpk_profiler/ray/__init__.py | 0 .../ray/local.py} | 2 +- .../ray/runtime.py} | 39 ++++- .../ray/s3.py} | 2 +- .../runtime.py} | 31 +++- .../profiler/dpk_profiler/spark/__init__.py | 0 .../spark/local.py} | 4 +- .../spark/runtime.py} | 2 +- .../transform_base.py} | 2 +- .../universal/profiler/kfp_ray/Makefile | 43 ++---- .../universal/profiler/kfp_ray/profiler_wf.py | 2 +- .../universal/profiler/profiler-python.ipynb | 137 ++++++++++++++++++ .../universal/profiler/profiler-ray.ipynb | 137 ++++++++++++++++++ .../universal/profiler/python/.dockerignore | 1 - transforms/universal/profiler/python/Makefile | 64 -------- .../universal/profiler/python/README.md | 82 ----------- .../universal/profiler/python/pyproject.toml | 45 ------ .../universal/profiler/ray/.dockerignore | 1 - transforms/universal/profiler/ray/.gitignore | 38 ----- transforms/universal/profiler/ray/Makefile | 68 --------- transforms/universal/profiler/ray/README.md | 72 --------- .../universal/profiler/ray/pyproject.toml | 46 ------ .../profiler/{python => }/requirements.txt | 1 - .../universal/profiler/spark/.dockerignore | 1 - .../universal/profiler/spark/.gitignore | 39 ----- .../universal/profiler/spark/Dockerfile | 44 ------ transforms/universal/profiler/spark/Makefile | 58 -------- transforms/universal/profiler/spark/README.md | 60 -------- .../universal/profiler/spark/pyproject.toml | 45 ------ .../98aafbca-62b0-4b3f-a81a-d9f13439d074.csv | 0 .../test-data/expected/metadata.json | 0 .../test-data/input/sample1.parquet | Bin .../{python => }/test/test_profiler.py | 6 +- .../{python => }/test/test_profiler_python.py | 4 +- .../{ray => }/test/test_profiler_ray.py | 4 +- .../{spark => }/test/test_profiler_spark.py | 6 +- .../universal/profiler/transform.config | 20 --- 47 files changed, 541 insertions(+), 851 deletions(-) create mode 100644 transforms/universal/profiler/.dockerignore rename transforms/universal/profiler/{python/Dockerfile => Dockerfile.python} (61%) rename transforms/universal/profiler/{ray/Dockerfile => Dockerfile.ray} (58%) create mode 100644 transforms/universal/profiler/Dockerfile.spark create mode 100644 transforms/universal/profiler/dpk_profiler/__init__.py rename transforms/universal/profiler/{python/src => dpk_profiler}/base_tokenizer.py (100%) rename transforms/universal/profiler/{python/src/profiler_local.py => dpk_profiler/local.py} (91%) rename transforms/universal/profiler/{python/src/profiler_local_python.py => dpk_profiler/local_python.py} (92%) create mode 100644 transforms/universal/profiler/dpk_profiler/ray/__init__.py rename transforms/universal/profiler/{ray/src/profiler_local_ray.py => dpk_profiler/ray/local.py} (96%) rename transforms/universal/profiler/{ray/src/profiler_transform_ray.py => dpk_profiler/ray/runtime.py} (84%) rename transforms/universal/profiler/{ray/src/profiler_s3_ray.py => dpk_profiler/ray/s3.py} (96%) 
rename transforms/universal/profiler/{python/src/profiler_transform_python.py => dpk_profiler/runtime.py} (83%) create mode 100644 transforms/universal/profiler/dpk_profiler/spark/__init__.py rename transforms/universal/profiler/{spark/src/profiler_local_spark.py => dpk_profiler/spark/local.py} (92%) rename transforms/universal/profiler/{spark/src/profiler_transform_spark.py => dpk_profiler/spark/runtime.py} (97%) rename transforms/universal/profiler/{python/src/profiler_transform_base.py => dpk_profiler/transform_base.py} (99%) create mode 100644 transforms/universal/profiler/profiler-python.ipynb create mode 100644 transforms/universal/profiler/profiler-ray.ipynb delete mode 100644 transforms/universal/profiler/python/.dockerignore delete mode 100644 transforms/universal/profiler/python/Makefile delete mode 100644 transforms/universal/profiler/python/README.md delete mode 100644 transforms/universal/profiler/python/pyproject.toml delete mode 100644 transforms/universal/profiler/ray/.dockerignore delete mode 100644 transforms/universal/profiler/ray/.gitignore delete mode 100644 transforms/universal/profiler/ray/Makefile delete mode 100644 transforms/universal/profiler/ray/README.md delete mode 100644 transforms/universal/profiler/ray/pyproject.toml rename transforms/universal/profiler/{python => }/requirements.txt (52%) delete mode 100644 transforms/universal/profiler/spark/.dockerignore delete mode 100644 transforms/universal/profiler/spark/.gitignore delete mode 100644 transforms/universal/profiler/spark/Dockerfile delete mode 100644 transforms/universal/profiler/spark/Makefile delete mode 100644 transforms/universal/profiler/spark/README.md delete mode 100644 transforms/universal/profiler/spark/pyproject.toml rename transforms/universal/profiler/{python => }/test-data/expected/98aafbca-62b0-4b3f-a81a-d9f13439d074.csv (100%) rename transforms/universal/profiler/{python => }/test-data/expected/metadata.json (100%) rename transforms/universal/profiler/{python => }/test-data/input/sample1.parquet (100%) rename transforms/universal/profiler/{python => }/test/test_profiler.py (91%) rename transforms/universal/profiler/{python => }/test/test_profiler_python.py (94%) rename transforms/universal/profiler/{ray => }/test/test_profiler_ray.py (96%) rename transforms/universal/profiler/{spark => }/test/test_profiler_spark.py (92%) delete mode 100644 transforms/universal/profiler/transform.config diff --git a/transforms/universal/profiler/.dockerignore b/transforms/universal/profiler/.dockerignore new file mode 100644 index 0000000000..9dc92578d2 --- /dev/null +++ b/transforms/universal/profiler/.dockerignore @@ -0,0 +1,10 @@ +venv/ +test-data/output +output/* + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + diff --git a/transforms/universal/profiler/python/Dockerfile b/transforms/universal/profiler/Dockerfile.python similarity index 61% rename from transforms/universal/profiler/python/Dockerfile rename to transforms/universal/profiler/Dockerfile.python index 31c628cca3..9f38097b72 100644 --- a/transforms/universal/profiler/python/Dockerfile +++ b/transforms/universal/profiler/Dockerfile.python @@ -10,26 +10,18 @@ RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chown=dpk:root data-processing-dist data-processing-dist +COPY --chown=dpk:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root README.md README.md -COPY --chown=dpk:root requirements.txt requirements.txt +# END OF STEPS destined for a data-prep-kit base image -RUN pip install --no-cache-dir -e . - -# copy source data -COPY ./src/profiler_transform_python.py . -COPY ./src/profiler_local.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chown=dpk:users requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/universal/profiler/ray/Dockerfile b/transforms/universal/profiler/Dockerfile.ray similarity index 58% rename from transforms/universal/profiler/ray/Dockerfile rename to transforms/universal/profiler/Dockerfile.ray index 131229d1f2..b8e52425b0 100644 --- a/transforms/universal/profiler/ray/Dockerfile +++ b/transforms/universal/profiler/Dockerfile.ray @@ -1,5 +1,4 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images @@ -11,33 +10,18 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest - ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -# Install ray project source -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=775 --chown=ray:root README.md README.md -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY src/profiler_transform_ray.py . - -# copy some of the samples in -COPY src/profiler_local_ray.py local/ -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/profiler/Dockerfile.spark b/transforms/universal/profiler/Dockerfile.spark new file mode 100644 index 0000000000..1af783438e --- /dev/null +++ b/transforms/universal/profiler/Dockerfile.spark @@ -0,0 +1,34 @@ +FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest + +USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir +ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=spark:users data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+
+# Install project source
+
+## Copy the python version of the transform
+COPY --chown=spark:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chown=spark:users requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+
+USER spark
+
+# Set environment
+ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${PYTHONPATH}
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/universal/profiler/Makefile b/transforms/universal/profiler/Makefile
index bca6f7e85d..d370cb65ce 100644
--- a/transforms/universal/profiler/Makefile
+++ b/transforms/universal/profiler/Makefile
@@ -1,79 +1,21 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
+include $(REPOROOT)/transforms/.make.cicd.targets
 
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+# Until we make runtime.py the standard supported by Makefile infra
+TRANSFORM_PYTHON_SRC="-m dpk_$(TRANSFORM_NAME).runtime"
+TRANSFORM_RAY_SRC="-m dpk_$(TRANSFORM_NAME).ray.runtime"
+TRANSFORM_SPARK_SRC="-m dpk_$(TRANSFORM_NAME).spark.runtime"
 
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform, which is matched against expected
+# files and used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
 
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
+
+################################################################################
 
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-set-versions:
-	@# Help: Recursively $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then                 \
-		$(MAKE) -C kfp_ray workflow-venv;   \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then                 \
-		$(MAKE) -C kfp_ray workflow-test;   \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then                 \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then                 \
-		$(MAKE) -C kfp_ray workflow-build;  \
-	fi
diff --git a/transforms/universal/profiler/README.md b/transforms/universal/profiler/README.md
index 25e9768c89..fc134330c1 100644
--- a/transforms/universal/profiler/README.md
+++ b/transforms/universal/profiler/README.md
@@ -1,19 +1,107 @@
 # Profiler Transform
+
+## Summary
+
 Profiler implements a word count. A typical implementation of word count is done using map reduce:
 * it is O(N^2) complexity
 * it shuffles with lots of data movement
 
 The implementation here uses "streaming" aggregation, based on a central cache:
 
-* At the heart of the implementation is a cache of partial word counts, implemented as a set of Ray actors and containing 
-  word counts processed so far.
+* At the heart of the implementation is a cache of partial word counts, implemented as a set of Ray actors and containing
+word counts processed so far.
 * Individual data processors are responsible for:
-  * Reading data from data plane
-  * tokenizing documents (we use pluggable tokenizer)
-  * Coordinating with distributed cache to collect overall word counts
+  * reading data from the data plane
+  * tokenizing documents (we use a pluggable tokenizer)
+  * coordinating with the distributed cache to collect overall word counts
 
-The complication of mapping this model to transform model is the fact that implementation requires an aggregators cache, 
+The complication of mapping this model to the transform model is that the implementation requires an aggregator cache
 that the transform model knows nothing about. The solution here is to use the transform runtime to create the cache
 and pass it as a parameter to transforms.
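+To make the scheme concrete, here is a minimal, framework-free sketch of the idea
+(plain Python in place of Ray actors; all names are illustrative, not the transform's actual API):
+
+```python
+from collections import Counter
+
+
+class Aggregator:
+    """Illustrative stand-in for an aggregator actor owning one shard of the counts."""
+
+    def __init__(self):
+        self.counts = Counter()
+
+    def add(self, words: list[str]) -> None:
+        self.counts.update(words)
+
+
+# A fixed pool of aggregators; in the real transform these are Ray actors
+# created by the transform runtime and handed to the transforms.
+NUM_AGGREGATORS = 4
+aggregators = [Aggregator() for _ in range(NUM_AGGREGATORS)]
+
+
+def process_document(text: str) -> None:
+    # Tokenize (the real implementation uses a pluggable tokenizer), then route
+    # each word to a shard by hash, so all occurrences of a word land together.
+    batches: dict[int, list[str]] = {}
+    for word in text.lower().split():
+        batches.setdefault(hash(word) % NUM_AGGREGATORS, []).append(word)
+    for shard, batch in batches.items():
+        aggregators[shard].add(batch)
+
+
+process_document("the quick brown fox jumps over the lazy dog")
+process_document("the fox")
+print(sum((a.counts for a in aggregators), Counter()).most_common(3))
+```
+
+Because each word is always routed to the same aggregator, the global counts are simply the
+union of the per-shard counters, with no shuffle phase.
+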
+## Transform runtime
+
+The [transform runtime](dpk_profiler/runtime.py) is responsible for creating cache actors and sending their
+handles to the transforms themselves.
+Additionally, it writes the computed word counts to data storage (as .csv files) and enhances the statistics with information about cache size and utilization.
+
+## Configuration and command line Options
+
+The set of dictionary keys holding [ProfilerTransform](dpk_profiler/runtime.py)
+configuration values is as follows:
+
+* _doc_column_ - specifies the name of the column containing documents
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Python launcher (i.e., TransformLauncher),
+the following command line arguments are available in addition to
+[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md).
+
+```shell
+  --profiler_doc_column PROFILER_DOC_COLUMN
+                        key for accessing data
+```
+
+These correspond to the configuration keys described above.
+
+### Running the samples
+To run the samples, run the following command from the transform folder transforms/universal/profiler.
+
+For example,
+```shell
+make venv && source venv/bin/activate
+python -m dpk_profiler.local
+```
+
+
+## Transform Ray runtime
+
+The [transform runtime](dpk_profiler/ray/runtime.py) is responsible for creating cache actors and sending their
+handles to the transforms themselves.
+Additionally, it writes the computed word counts to data storage (as .csv files) and enhances the statistics with information about cache size and utilization.
+
+## Configuration and command line Options
+
+In addition to the configuration parameters defined above,
+the Ray version adds the following parameters:
+
+* _aggregator_cpu_ - specifies the number of CPUs per aggregator actor
+* _num_aggregators_ - specifies the number of aggregator actors
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Ray launcher (i.e., RayTransformLauncher),
+the following command line arguments are available in addition to
+[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md).
+
+```shell
+  --profiler_aggregator_cpu PROFILER_AGGREGATOR_CPU
+                        number of CPUs per aggregator
+  --profiler_num_aggregators PROFILER_NUM_AGGREGATORS
+                        number of aggregator actors to use
+  --profiler_doc_column PROFILER_DOC_COLUMN
+                        key for accessing data
+```
+
+These correspond to the configuration keys described above.
+
+### Running the samples
+To run the samples, run the following command from the transform folder transforms/universal/profiler.
+
+For example,
+```shell
+make venv && source venv/bin/activate
+python -m dpk_profiler.ray.local
+```
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
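+
+### Invoking the transform from Python
+
+As an alternative to the command line, the transform can be driven through the `Profiler`
+wrapper classes added in this refactoring. A minimal sketch, mirroring the bundled
+notebooks (the folder paths are illustrative):
+
+```python
+from dpk_profiler.runtime import Profiler
+
+# input_folder/output_folder are folded into data_local_config by the wrapper.
+Profiler(input_folder="test-data/input",
+         output_folder="output",
+         profiler_doc_column="contents").transform()
+```
+
+For the Ray runtime, import `Profiler` from `dpk_profiler.ray.runtime` instead and add the
+Ray-specific parameters (e.g. `run_locally=True`, `profiler_num_aggregators=1`).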
+ diff --git a/transforms/universal/profiler/dpk_profiler/__init__.py b/transforms/universal/profiler/dpk_profiler/__init__.py new file mode 100644 index 0000000000..260ad69ed2 --- /dev/null +++ b/transforms/universal/profiler/dpk_profiler/__init__.py @@ -0,0 +1,2 @@ +from .transform_base import * +from .runtime import * diff --git a/transforms/universal/profiler/python/src/base_tokenizer.py b/transforms/universal/profiler/dpk_profiler/base_tokenizer.py similarity index 100% rename from transforms/universal/profiler/python/src/base_tokenizer.py rename to transforms/universal/profiler/dpk_profiler/base_tokenizer.py diff --git a/transforms/universal/profiler/python/src/profiler_local.py b/transforms/universal/profiler/dpk_profiler/local.py similarity index 91% rename from transforms/universal/profiler/python/src/profiler_local.py rename to transforms/universal/profiler/dpk_profiler/local.py index cf9e0711fc..0b2fed48c4 100644 --- a/transforms/universal/profiler/python/src/profiler_local.py +++ b/transforms/universal/profiler/dpk_profiler/local.py @@ -13,9 +13,9 @@ import os from data_processing.data_access import DataAccessLocal, DataAccessFactory -from profiler_transform_base import DataAggregator -from profiler_transform_python import ProfilerTransform -from profiler_transform_base import doc_column_name_key +from dpk_profiler.transform_base import DataAggregator +from dpk_profiler.runtime import ProfilerTransform +from dpk_profiler.transform_base import doc_column_name_key # create parameters diff --git a/transforms/universal/profiler/python/src/profiler_local_python.py b/transforms/universal/profiler/dpk_profiler/local_python.py similarity index 92% rename from transforms/universal/profiler/python/src/profiler_local_python.py rename to transforms/universal/profiler/dpk_profiler/local_python.py index 18f0326798..3dea648751 100644 --- a/transforms/universal/profiler/python/src/profiler_local_python.py +++ b/transforms/universal/profiler/dpk_profiler/local_python.py @@ -15,8 +15,8 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration -from profiler_transform_base import doc_column_name_cli_param +from dpk_profiler.runtime import ProfilerPythonTransformRuntimeConfiguration +from dpk_profiler.transform_base import doc_column_name_cli_param # create launcher diff --git a/transforms/universal/profiler/dpk_profiler/ray/__init__.py b/transforms/universal/profiler/dpk_profiler/ray/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/universal/profiler/ray/src/profiler_local_ray.py b/transforms/universal/profiler/dpk_profiler/ray/local.py similarity index 96% rename from transforms/universal/profiler/ray/src/profiler_local_ray.py rename to transforms/universal/profiler/dpk_profiler/ray/local.py index b88692c497..6dae2895c0 100644 --- a/transforms/universal/profiler/ray/src/profiler_local_ray.py +++ b/transforms/universal/profiler/dpk_profiler/ray/local.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from profiler_transform_ray import ProfilerRayTransformRuntimeConfiguration +from dpk_profiler.ray.runtime import ProfilerRayTransformRuntimeConfiguration # create launcher diff --git a/transforms/universal/profiler/ray/src/profiler_transform_ray.py b/transforms/universal/profiler/dpk_profiler/ray/runtime.py similarity index 84% rename from 
transforms/universal/profiler/ray/src/profiler_transform_ray.py
rename to transforms/universal/profiler/dpk_profiler/ray/runtime.py
index 98944fae92..81dc10cfe3 100644
--- a/transforms/universal/profiler/ray/src/profiler_transform_ray.py
+++ b/transforms/universal/profiler/dpk_profiler/ray/runtime.py
@@ -10,12 +10,13 @@
 # limitations under the License.
 ################################################################################
+import sys
 from argparse import ArgumentParser, Namespace
 from typing import Any
 
 import ray
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.utils import CLIArgumentProvider, TransformUtils, UnrecoverableException
+from data_processing.utils import ParamsUtils, CLIArgumentProvider, TransformUtils, UnrecoverableException
 from data_processing_ray.runtime.ray import (
     DefaultRayTransformRuntime,
     RayTransformLauncher,
@@ -25,7 +26,7 @@
     RayTransformRuntimeConfiguration,
 )
 from ray.actor import ActorHandle
-from profiler_transform_base import (
+from dpk_profiler.transform_base import (
     DataAggregator,
     ProfilerTransformBase,
     ProfilerTransformConfigurationBase,
@@ -204,6 +205,40 @@ def __init__(self):
         super().__init__(transform_config=ProfilerRayTransformConfiguration(), runtime_class=ProfilerRuntime)
 
 
+# Class used by the notebooks and python scripts to run the transform
+class Profiler:
+    def __init__(self, **kwargs):
+        self.params = dict(kwargs)
+        # if input_folder and output_folder are specified, then assume they represent data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:
+            pass
+        try:
+            worker_options = {k: self.params[k] for k in ("num_cpus", "memory")}
+            self.params["runtime_worker_options"] = ParamsUtils.convert_to_ast(worker_options)
+            del self.params["num_cpus"]
+            del self.params["memory"]
+        except KeyError:
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        # create launcher
+        launcher = RayTransformLauncher(ProfilerRayTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = RayTransformLauncher(ProfilerRayTransformRuntimeConfiguration())
     launcher.launch()
diff --git a/transforms/universal/profiler/ray/src/profiler_s3_ray.py b/transforms/universal/profiler/dpk_profiler/ray/s3.py
similarity index 96%
rename from transforms/universal/profiler/ray/src/profiler_s3_ray.py
rename to transforms/universal/profiler/dpk_profiler/ray/s3.py
index 57648cad87..90606b3c6a 100644
--- a/transforms/universal/profiler/ray/src/profiler_s3_ray.py
+++ b/transforms/universal/profiler/dpk_profiler/ray/s3.py
@@ -14,7 +14,7 @@
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
 
-from profiler_transform_ray import ProfilerRayTransformRuntimeConfiguration
+from dpk_profiler.ray.runtime import ProfilerRayTransformRuntimeConfiguration
 
 
 # create launcher
diff --git a/transforms/universal/profiler/python/src/profiler_transform_python.py b/transforms/universal/profiler/dpk_profiler/runtime.py
similarity index 83%
rename from transforms/universal/profiler/python/src/profiler_transform_python.py
rename to transforms/universal/profiler/dpk_profiler/runtime.py
index bdd5aaee73..72eedde6fd 100644
--- a/transforms/universal/profiler/python/src/profiler_transform_python.py
+++ b/transforms/universal/profiler/dpk_profiler/runtime.py
@@ -9,18 +9,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ################################################################################
+import sys
 from argparse import Namespace
 from typing import Any
 
+
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime.pure_python import (
     DefaultPythonTransformRuntime,
     PythonTransformLauncher,
     PythonTransformRuntimeConfiguration,
 )
-from data_processing.utils import UnrecoverableException
+from data_processing.utils import ParamsUtils, UnrecoverableException
 from data_processing.transform import TransformStatistics
-from profiler_transform_base import (
+from dpk_profiler.transform_base import (
     ProfilerTransformBase,
     ProfilerTransformConfigurationBase,
     DataAggregator,
@@ -120,6 +122,31 @@ def __init__(self):
     )
 
 
+class Profiler:
+    def __init__(self, **kwargs):
+        self.params = dict(kwargs)
+        # if input_folder and output_folder are specified, then assume they represent data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        # create launcher
+        launcher = PythonTransformLauncher(ProfilerPythonTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = PythonTransformLauncher(ProfilerPythonTransformRuntimeConfiguration())
     launcher.launch()
diff --git a/transforms/universal/profiler/dpk_profiler/spark/__init__.py b/transforms/universal/profiler/dpk_profiler/spark/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/transforms/universal/profiler/spark/src/profiler_local_spark.py b/transforms/universal/profiler/dpk_profiler/spark/local.py
similarity index 92%
rename from transforms/universal/profiler/spark/src/profiler_local_spark.py
rename to transforms/universal/profiler/dpk_profiler/spark/local.py
index dad113a607..8750a1dc84 100644
--- a/transforms/universal/profiler/spark/src/profiler_local_spark.py
+++ b/transforms/universal/profiler/dpk_profiler/spark/local.py
@@ -15,8 +15,8 @@
 from data_processing.utils import ParamsUtils
 from data_processing_spark.runtime.spark import SparkTransformLauncher
 
-from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
-from profiler_transform_base import doc_column_name_cli_param
+from dpk_profiler.spark.runtime import ProfilerSparkTransformRuntimeConfiguration
+from dpk_profiler.transform_base import doc_column_name_cli_param
 
 
 # create parameters
diff --git a/transforms/universal/profiler/spark/src/profiler_transform_spark.py b/transforms/universal/profiler/dpk_profiler/spark/runtime.py
similarity index 97%
rename from transforms/universal/profiler/spark/src/profiler_transform_spark.py
rename to transforms/universal/profiler/dpk_profiler/spark/runtime.py
index 2466c17854..4d8573b208 100644
--- a/transforms/universal/profiler/spark/src/profiler_transform_spark.py
+++ b/transforms/universal/profiler/dpk_profiler/spark/runtime.py
@@ -17,7 +17,7 @@
 from data_processing.transform import TransformStatistics
 from data_processing_spark.runtime.spark import SparkTransformLauncher
 from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
-from profiler_transform_base import ProfilerTransformBase, ProfilerTransformConfigurationBase, DataAggregator
+from dpk_profiler.transform_base import ProfilerTransformBase, ProfilerTransformConfigurationBase, DataAggregator
 
 
 class ProfilerSparkTransform(ProfilerTransformBase):
     """
diff --git a/transforms/universal/profiler/python/src/profiler_transform_base.py b/transforms/universal/profiler/dpk_profiler/transform_base.py
similarity index 99%
rename from transforms/universal/profiler/python/src/profiler_transform_base.py
rename to transforms/universal/profiler/dpk_profiler/transform_base.py
index c28e57bfaf..08267e7a94 100644
--- a/transforms/universal/profiler/python/src/profiler_transform_base.py
+++ b/transforms/universal/profiler/dpk_profiler/transform_base.py
@@ -19,6 +19,7 @@
 import pyarrow as pa
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import GB, CLIArgumentProvider, TransformUtils, UnrecoverableException
+from dpk_profiler.base_tokenizer import tokenize
 
 
 short_name = "profiler"
@@ -112,7 +113,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         :param file_name: name of the file to process
         :return: resulting table, statistics
         """
-        from base_tokenizer import tokenize
 
         # make sure that the doc column exists
         TransformUtils.validate_columns(table=table, required=[self.doc_column])
diff --git a/transforms/universal/profiler/kfp_ray/Makefile b/transforms/universal/profiler/kfp_ray/Makefile
index e4f6b860b2..7244ce1427 100644
--- a/transforms/universal/profiler/kfp_ray/Makefile
+++ b/transforms/universal/profiler/kfp_ray/Makefile
@@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../
 WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
 include $(REPOROOT)/transforms/.make.workflows
 
-# Include the common configuration for this transform
-include ../transform.config
+SRC_DIR=${CURDIR}/../
+# Use the docker image that is built for the ray runtime
+TRANSFORM_RUNTIME=ray
+## override settings in .make.defaults as they assume the old structure with ray being the current folder
+DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME)
+DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
 
-SRC_DIR=${CURDIR}/../ray/
+# Only build the image with -f Dockerfile.ray
+BUILD_SPECIFIC_RUNTIME=ray
 
 PYTHON_WF := $(shell find ./ -name '*_wf.py')
 YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF})
@@ -15,29 +20,8 @@ workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE}
 .PHONY: clean
 clean:
 	@# Help: Clean up the virtual environment.
-	rm -rf ${REPOROOT}/transforms/venv
+	rm -rf ${REPOROOT}/transforms/venv
 
-venv::
-
-build::
-
-setup::
-
-test::
-
-test-src::
-
-test-image::
-
-publish::
-
-image::
-
-kind-load-image::
-
-docker-load-image::
-
-docker-save-image::
 
 .PHONY: workflow-build
 workflow-build: workflow-venv
@@ -45,11 +29,14 @@ workflow-build: workflow-venv
 
 .PHONY: workflow-test
 workflow-test: workflow-build
-	$(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=profiler_wf.yaml
+	$(MAKE) TRANSFORM_SRC=${SRC_DIR} \
+		TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \
+		TRANSFORM_NAME=$(TRANSFORM_NAME) \
+		BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \
+		PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline
 
 .PHONY: workflow-upload
 workflow-upload: workflow-build
 	@for file in $(YAML_WF); do \
 		$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
-	done
-
+	done
\ No newline at end of file
diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py
index 7a157c1461..b405d3f0c7 100644
--- a/transforms/universal/profiler/kfp_ray/profiler_wf.py
+++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py
@@ -26,7 +26,7 @@
 task_image = "quay.io/dataprep1/data-prep-kit/profiler-ray:latest"
 
 # the name of the job script
-EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py"
+EXEC_SCRIPT_NAME: str = "-m dpk_profiler.ray.runtime"
 
 # components
 base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
diff --git a/transforms/universal/profiler/profiler-python.ipynb b/transforms/universal/profiler/profiler-python.ipynb
new file mode 100644
index 0000000000..96b1f41056
--- /dev/null
+++ b/transforms/universal/profiler/profiler-python.ipynb
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
+   "metadata": {},
+   "source": [
+    "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
+    "```\n",
+    "make venv \n",
+    "source venv/bin/activate \n",
+    "pip install jupyterlab\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "## This is here as a reference only\n",
+    "# Users and application developers must use the right tag for the latest from pypi\n",
+    "%pip install data-prep-toolkit\n",
+    "%pip install data-prep-toolkit-transforms[profiler]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
+   "source": [
+    "##### **** Configure the transform parameters. The set of dictionary keys holding profiler configuration values is as follows: \n",
+    "* _doc_column_ - specifies the name of the column containing documents\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
+   "metadata": {},
+   "source": [
+    "##### ***** Import required classes and modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dpk_profiler.runtime import Profiler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
+   "metadata": {},
+   "source": [
+    "##### ***** Setup runtime parameters and use the python runtime to invoke the transform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "Profiler(input_folder= \"test-data/input\",\n",
+    "        output_folder= \"output\",\n",
+    "        profiler_doc_column= \"contents\").transform()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+   "metadata": {},
+   "source": [
+    "##### **** The specified folder will include the transformed parquet files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "glob.glob(\"output/*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/transforms/universal/profiler/profiler-ray.ipynb b/transforms/universal/profiler/profiler-ray.ipynb
new file mode 100644
index 0000000000..fd85b8d8e4
--- /dev/null
+++ b/transforms/universal/profiler/profiler-ray.ipynb
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
+   "metadata": {},
+   "source": [
+    "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
+    "```\n",
+    "make venv \n",
+    "source venv/bin/activate \n",
+    "pip install jupyterlab\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "## This is here as a reference only\n",
+    "# Users and application developers must use the right tag for the latest from pypi\n",
+    "%pip install 'data-prep-toolkit[ray]'\n",
+    "%pip install 'data-prep-toolkit-transforms[profiler]'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
+   "source": [
+    "##### **** Configure the transform parameters. The set of dictionary keys holding profiler configuration values is as follows: \n",
+    "* --profiler_aggregator_cpu PROFILER_AGGREGATOR_CPU\n",
+    "        number of CPUs per aggregator\n",
+    "* --profiler_num_aggregators PROFILER_NUM_AGGREGATORS\n",
+    "        number of aggregator actors to use\n",
+    "* --profiler_doc_column PROFILER_DOC_COLUMN\n",
+    "        key for accessing data\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
+   "metadata": {},
+   "source": [
+    "##### ***** Import required classes and modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dpk_profiler.ray.runtime import Profiler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
+   "metadata": {},
+   "source": [
+    "##### ***** Setup runtime parameters for this transform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "Profiler(input_folder= \"test-data/input\",\n",
+    "        output_folder= \"output\",\n",
+    "        run_locally= True,\n",
+    "        profiler_num_aggregators=1,\n",
+    "        profiler_doc_column= \"contents\").transform()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+   "metadata": {},
+   "source": [
+    "##### **** The specified folder will include the transformed parquet files."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/profiler/python/.dockerignore b/transforms/universal/profiler/python/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/universal/profiler/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/profiler/python/Makefile b/transforms/universal/profiler/python/Makefile deleted file mode 100644 index 9832501843..0000000000 --- a/transforms/universal/profiler/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(PROFILER_PYTHON_VERSION) TOML_VERSION=$(PROFILER_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/profiler/python/README.md b/transforms/universal/profiler/python/README.md deleted file mode 100644 index 264595eaad..0000000000 --- a/transforms/universal/profiler/python/README.md +++ /dev/null @@ -1,82 +0,0 @@ -# Profiler - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary - -Profiler implement a word count. Typical implementation of the word count is done using map reduce. -* It’s O(N2) complexity -* shuffling with lots of data movement - -Implementation here is using “streaming” aggregation, based on central cache: - -* At the heart of the implementation is a cache of partial word counts, implemented as a set of Ray actors and containing -word counts processed so far. -* Individual data processors are responsible for: - * Reading data from data plane - * tokenizing documents (we use pluggable tokenizer) - * Coordinating with distributed cache to collect overall word counts - -The complication of mapping this model to transform model is the fact that implementation requires an aggregators cache, -that transform mode knows nothing about. The solution here is to use transform runtime to create cache -and pass it as a parameter to transforms. - -## Transform runtime - -[Transform runtime](src/profiler_transform_ray.py) is responsible for creation cache actors and sending their -handles to the transforms themselves -Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization - -## Configuration and command line Options - -The set of dictionary keys holding [ProfilerTransform](src/profiler_transform_python.py) -configuration for values are as follows: - -* _doc_column_ - specifies name of the column containing documents - -## Running - -### Launched Command Line Options -When running the transform with the Python launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md). - -```shell - --profiler_doc_column PROFILER_DOC_COLUMN - key for accessing data - ``` - -These correspond to the configuration keys described above. 
- -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/ededup_transform_ray.py using command line args -* `run-local-sample` - runs src/ededup_local_ray.py -* `run-s3-sample` - runs src/ededup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml deleted file mode 100644 index 50d4880a83..0000000000 --- a/transforms/universal/profiler/python/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_profiler_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "profiler Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/profiler/ray/.dockerignore b/transforms/universal/profiler/ray/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/universal/profiler/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/profiler/ray/.gitignore b/transforms/universal/profiler/ray/.gitignore deleted file mode 100644 index 3ea7fd4abb..0000000000 --- a/transforms/universal/profiler/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git 
a/transforms/universal/profiler/ray/Makefile b/transforms/universal/profiler/ray/Makefile deleted file mode 100644 index 12d75c4c3e..0000000000 --- a/transforms/universal/profiler/ray/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - - -# TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(PROFILER_PYTHON_VERSION) TOML_VERSION=$(PROFILER_RAY_VERSION) .transforms.set-versions - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --profiler_num_aggregators 2" \ - .transforms.run-src-file diff --git a/transforms/universal/profiler/ray/README.md b/transforms/universal/profiler/ray/README.md deleted file mode 100644 index 1166d54533..0000000000 --- a/transforms/universal/profiler/ray/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# Profiler - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary - -This project wraps the [profiler transform](../python) with a Ray runtime. - -## Transform runtime - -[Transform runtime](src/profiler_transform_ray.py) is responsible for creation cache actors and sending their -handles to the transforms themselves -Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization - -## Configuration and command line Options - -In addition to the configuration parameters, defined [here](../python/README.md) -Ray version adds the following parameters: - -* _aggregator_cpu_ - specifies an amount of CPUs per aggregator actor -* _num_aggregators_ - specifies number of aggregator actors - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. 
TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md). - -```shell - --profiler_aggregator_cpu PROFILER_AGGREGATOR_CPU - number of CPUs per aggrigator - --profiler_num_aggregators PROFILER_NUM_AGGREGATORS - number of agregator actors to use - --profiler_doc_column PROFILER_DOC_COLUMN - key for accessing data - ``` - -These correspond to the configuration keys described above. - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/ededup_transform_ray.py using command line args -* `run-local-sample` - runs src/ededup_local_ray.py -* `run-s3-sample` - runs src/ededup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml deleted file mode 100644 index a7138f3654..0000000000 --- a/transforms/universal/profiler/ray/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_profiler_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "profiler Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[ray]>=0.2.4.dev0", - "dpk_profiler_transform_python==0.2.4.dev0", - "tqdm==4.66.3", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/requirements.txt similarity index 52% rename from transforms/universal/profiler/python/requirements.txt rename to transforms/universal/profiler/requirements.txt index c9c874ffe7..82406fe178 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/requirements.txt @@ -1,5 +1,4 @@ -data-prep-toolkit>=0.2.3 mmh3==4.1.0 xxhash==3.4.1 
diff --git a/transforms/universal/profiler/spark/.dockerignore b/transforms/universal/profiler/spark/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/universal/profiler/spark/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/profiler/spark/.gitignore b/transforms/universal/profiler/spark/.gitignore deleted file mode 100644 index 34baad1325..0000000000 --- a/transforms/universal/profiler/spark/.gitignore +++ /dev/null @@ -1,39 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ -data-processing-spark/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml diff --git a/transforms/universal/profiler/spark/Dockerfile b/transforms/universal/profiler/spark/Dockerfile deleted file mode 100644 index fa9946555d..0000000000 --- a/transforms/universal/profiler/spark/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest -FROM ${BASE_IMAGE} - -USER root -# install pytest -RUN pip install --no-cache-dir pytest - -WORKDIR ${SPARK_HOME}/work-dir -ARG DPK_WHEEL_FILE_NAME - -# Copy and install data processing libraries -# These are expected to be placed in the docker context before this is run (see the make image). -COPY --chown=dpk:root data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] - -COPY --chown=spark:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -COPY --chown=root:root src/ src/ -COPY --chown=root:root pyproject.toml pyproject.toml -RUN pip install --no-cache-dir -e . - -# copy in the main() entry point to the image -COPY ./src/profiler_transform_spark.py . - -# Copy in some samples -COPY ./src/profiler_local_spark.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ - -USER spark - -# Set environment -ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${PYTHONPATH} - -# Put these at the end since they seem to upset the docker cache. -ARG BUILD_DATE -ARG GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT - - diff --git a/transforms/universal/profiler/spark/Makefile b/transforms/universal/profiler/spark/Makefile deleted file mode 100644 index 39b16cac6e..0000000000 --- a/transforms/universal/profiler/spark/Makefile +++ /dev/null @@ -1,58 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms
-
-# Include the common configuration for this transform
-include ../transform.config
-
-venv:: .transforms.spark-venv
-
-test:: .transforms.spark-test
-
-clean:: .transforms.clean
-
-image:: .transforms.spark-image
-
-test-src:: .transforms.test-src
-
-setup:: .transforms.setup
-
-build:: build-dist image
-
-publish: publish-image
-
-publish-image:: .transforms.publish-image-spark
-
-# set the version of python transform that this depends on.
-set-versions:
-	$(MAKE) TRANSFORM_PYTHON_VERSION=${PROFILER_PYTHON_VERSION} TOML_VERSION=$(PROFILER_SPARK_VERSION) .transforms.set-versions
-
-build-dist:: .defaults.build-dist
-
-publish-dist:: .defaults.publish-dist
-
-test-image:: .transforms.spark-test-image
-
-run-cli-sample: .transforms.run-cli-spark-sample
-
-run-local-sample: .transforms.run-local-sample
-
-minio-start: .minio-start
-
-kind-load-image:: .transforms.kind-load-image
-
-docker-load-image: .defaults.docker-load-image
-
-docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/profiler/spark/README.md b/transforms/universal/profiler/spark/README.md
deleted file mode 100644
index a5fbb6fcff..0000000000
--- a/transforms/universal/profiler/spark/README.md
+++ /dev/null
@@ -1,60 +0,0 @@
-# Profiler Transform
-
-Per the set of
-[transform project conventions](../../../README.md#transform-project-conventions)
-the following runtimes are available:
-
-* [ray](../ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [kfp_ray](../kfp_ray/README.md) - enables running the ray docker image for
-the transformer in a kubernetes cluster using a generated `yaml` file.
-
-## Summary
-
-This project wraps the [profiler transform](../python) with a Spark runtime.
-
-## Transform runtime
-
-[Transform runtime](src/profiler_transform_spark.py) is responsible for creating cache actors and sending their
-handles to the transforms themselves.
-Additionally, it writes the created word counts to the data storage (as .csv files) and enhances the statistics with information about cache size and utilization.
-
-## Configuration and command line Options
-
-The Spark version uses the same configuration parameters as the [Python one](../python/README.md).
-
-### Launched Command Line Options
-When running the transform with the Spark launcher (i.e., TransformLauncher),
-in addition to the command line arguments provided by the [launcher](../../../../data-processing-lib/doc/launcher-options.md),
-the same arguments are available as for the [Python one](../python/README.md).
-
-### Running the samples
-To run the samples, use the following `make` targets:
-
-* `run-cli-sample` - runs src/profiler_transform_spark.py using command line args
-* `run-local-sample` - runs src/profiler_local_spark.py
-* `run-s3-sample` - runs the S3-based sample
-  * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html)
-    and [here](https://min.io/docs/minio/linux/index.html)),
-    and invocation of `make minio-start` to load data into local minio for S3 access.
-
-These targets will activate the virtual environment and set up any configuration needed.
-Use the `-n` option of `make` to see the details of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-to see the results of the transform.
-
-### Transforming data using the transform image
-
-To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
-substituting the name of this transform image and runtime as appropriate.
-
diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml
deleted file mode 100644
index ef783d41e5..0000000000
--- a/transforms/universal/profiler/spark/pyproject.toml
+++ /dev/null
@@ -1,45 +0,0 @@
-[project]
-name = "dpk_profiler_transform_spark"
-version = "0.2.4.dev0"
-requires-python = ">=3.10,<3.13"
-description = "Profiler Spark Transform"
-license = {text = "Apache-2.0"}
-readme = {file = "README.md", content-type = "text/markdown"}
-authors = [
-    { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
-]
-dependencies = [
-    "dpk-profiler-transform-python==0.2.4.dev0",
-    "data-prep-toolkit[spark]>=0.2.4.dev0",
-]
-
-[build-system]
-requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
-build-backend = "setuptools.build_meta"
-
-[project.optional-dependencies]
-dev = [
-    "twine",
-    "pytest>=7.3.2",
-    "pytest-dotenv>=0.5.2",
-    "pytest-env>=1.0.0",
-    "pre-commit>=3.3.2",
-    "pytest-cov>=4.1.0",
-    "pytest-mock>=3.10.0",
-    "moto==5.0.5",
-    "markupsafe==2.0.1",
-]
-
-[options]
-package_dir = ["src","test"]
-
-[options.packages.find]
-where = ["src/"]
-
-[tool.pytest.ini_options]
-# Currently we use low coverage since we have to run tests separately (see makefile)
-#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
-markers = ["unit: unit tests", "integration: integration tests"]
-
-[tool.coverage.run]
-include = ["src/*"]
diff --git a/transforms/universal/profiler/python/test-data/expected/98aafbca-62b0-4b3f-a81a-d9f13439d074.csv b/transforms/universal/profiler/test-data/expected/98aafbca-62b0-4b3f-a81a-d9f13439d074.csv
similarity index 100%
rename from transforms/universal/profiler/python/test-data/expected/98aafbca-62b0-4b3f-a81a-d9f13439d074.csv
rename to transforms/universal/profiler/test-data/expected/98aafbca-62b0-4b3f-a81a-d9f13439d074.csv
diff --git a/transforms/universal/profiler/python/test-data/expected/metadata.json b/transforms/universal/profiler/test-data/expected/metadata.json
similarity index 100%
rename from transforms/universal/profiler/python/test-data/expected/metadata.json
rename to transforms/universal/profiler/test-data/expected/metadata.json
diff --git a/transforms/universal/profiler/python/test-data/input/sample1.parquet b/transforms/universal/profiler/test-data/input/sample1.parquet
similarity index 100%
rename from transforms/universal/profiler/python/test-data/input/sample1.parquet
rename to transforms/universal/profiler/test-data/input/sample1.parquet
diff --git a/transforms/universal/profiler/python/test/test_profiler.py b/transforms/universal/profiler/test/test_profiler.py
similarity index 91%
rename from transforms/universal/profiler/python/test/test_profiler.py
rename to transforms/universal/profiler/test/test_profiler.py
index a174f68b97..4696816a5c 100644
--- a/transforms/universal/profiler/python/test/test_profiler.py
+++ b/transforms/universal/profiler/test/test_profiler.py
@@ -15,9 +15,9 @@
 from data_processing.test_support import get_tables_in_folder
 from data_processing.test_support.transform import AbstractTableTransformTest
-from profiler_transform_base import DataAggregator
-from profiler_transform_python import ProfilerTransform
-from profiler_transform_base import doc_column_name_key
+from dpk_profiler.transform_base import DataAggregator
+from dpk_profiler.runtime import ProfilerTransform
+from dpk_profiler.transform_base import doc_column_name_key
 from data_processing.data_access import DataAccessFactory
diff --git a/transforms/universal/profiler/python/test/test_profiler_python.py b/transforms/universal/profiler/test/test_profiler_python.py
similarity index 94%
rename from transforms/universal/profiler/python/test/test_profiler_python.py
rename to transforms/universal/profiler/test/test_profiler_python.py
index 2fb8df22f1..497a378118 100644
--- a/transforms/universal/profiler/python/test/test_profiler_python.py
+++ b/transforms/universal/profiler/test/test_profiler_python.py
@@ -16,8 +16,8 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration
-from profiler_transform_base import doc_column_name_cli_param
+from dpk_profiler.runtime import ProfilerPythonTransformRuntimeConfiguration
+from dpk_profiler.transform_base import doc_column_name_cli_param
 
 
 class TestPythonProfilerTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/profiler/ray/test/test_profiler_ray.py b/transforms/universal/profiler/test/test_profiler_ray.py
similarity index 96%
rename from transforms/universal/profiler/ray/test/test_profiler_ray.py
rename to transforms/universal/profiler/test/test_profiler_ray.py
index 04c75a6c96..945b262a36 100644
--- a/transforms/universal/profiler/ray/test/test_profiler_ray.py
+++ b/transforms/universal/profiler/test/test_profiler_ray.py
@@ -17,7 +17,7 @@
     AbstractTransformLauncherTest,
 )
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from profiler_transform_ray import ProfilerRayTransformRuntimeConfiguration
+from dpk_profiler.ray.runtime import ProfilerRayTransformRuntimeConfiguration
@@ -27,7 +27,7 @@ class TestRayAggregatorTransform(AbstractTransformLauncherTest):
     """
 
     def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data"))
         config = {
             "run_locally": True,
             # When running in ray, our Runtime's get_transform_config() method will load the domains using
diff --git a/transforms/universal/profiler/spark/test/test_profiler_spark.py b/transforms/universal/profiler/test/test_profiler_spark.py
similarity index 92%
rename from transforms/universal/profiler/spark/test/test_profiler_spark.py
rename to transforms/universal/profiler/test/test_profiler_spark.py
index 99e64a8dfc..6cb9341952 100644
--- a/transforms/universal/profiler/spark/test/test_profiler_spark.py
+++ b/transforms/universal/profiler/test/test_profiler_spark.py
@@ -16,8 +16,8 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
-from profiler_transform_base import doc_column_name_cli_param
+from dpk_profiler.spark.runtime import ProfilerSparkTransformRuntimeConfiguration
+from dpk_profiler.transform_base import doc_column_name_cli_param
 
 
 class TestPythonProfilerTransform(AbstractTransformLauncherTest):
@@ -27,7 +27,7 @@ class TestPythonProfilerTransform(AbstractTransformLauncherTest):
     """
 
     def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data"))
         launcher = SparkTransformLauncher(ProfilerSparkTransformRuntimeConfiguration())
         config = {doc_column_name_cli_param: "contents"}
         return [(launcher, config, basedir + "/input", basedir + "/expected")]
diff --git a/transforms/universal/profiler/transform.config b/transforms/universal/profiler/transform.config
deleted file mode 100644
index c86cd64156..0000000000
--- a/transforms/universal/profiler/transform.config
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# This is intended to be included across the Makefiles provided within
-# a given transform's directory tree, so must use compatible syntax.
-#
-################################################################################
-# This defines the name of the transform and is used to match against
-# expected files and is used to define the transform's image name.
-TRANSFORM_NAME=profiler
-
-################################################################################
-# This defines the transforms' version number as would be used
-# when publishing the wheel. In general, only the micro version
-# number should be advanced relative to the DPK_VERSION.
-#
-# If you change the version numbers, be sure to run "make set-versions" to
-# update version numbers across the transform (e.g., pyproject.toml).
-PROFILER_PYTHON_VERSION=$(DPK_VERSION)
-PROFILER_RAY_VERSION=$(PROFILER_PYTHON_VERSION)
-PROFILER_SPARK_VERSION=$(PROFILER_PYTHON_VERSION)
-

From ad7936b07712bcf738fe5761a1e17407d198030d Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 24 Jan 2025 10:48:53 -0500
Subject: [PATCH 2/5] update pyproject.toml

Signed-off-by: Maroun Touma

---
 scripts/k8s-setup/populate_minio.sh |  8 ++++----
 transforms/README-list.md           |  5 +++++
 transforms/pyproject.toml           | 16 ++++++++++------
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/scripts/k8s-setup/populate_minio.sh b/scripts/k8s-setup/populate_minio.sh
index 8c5bd2268a..3f29379235 100755
--- a/scripts/k8s-setup/populate_minio.sh
+++ b/scripts/k8s-setup/populate_minio.sh
@@ -40,9 +40,9 @@ mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/test-data/input/t
 mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/test-data/input/ kfp/test/doc_id/input
 mc cp --recursive ${REPOROOT}/transforms/universal/ededup/test-data/input/ kfp/test/ededup/input
 mc cp --recursive ${REPOROOT}/transforms/universal/fdedup/ray/test-data/input/ kfp/test/fdedup/input
-mc cp --recursive ${REPOROOT}/transforms/universal/filter/ray/test-data/input/ kfp/test/filter/input
-mc cp --recursive ${REPOROOT}/transforms/universal/noop/ray/test-data/input/ kfp/test/noop/input
+mc cp --recursive ${REPOROOT}/transforms/universal/filter/test-data/input/ kfp/test/filter/input
+mc cp --recursive ${REPOROOT}/transforms/universal/noop/test-data/input/ kfp/test/noop/input
 mc cp --recursive ${REPOROOT}/transforms/universal/tokenization/test-data/ds01/input/ kfp/test/tokenization/ds01/input
-mc cp --recursive ${REPOROOT}/transforms/universal/profiler/ray/test-data/input/ kfp/test/profiler/input
-mc cp --recursive ${REPOROOT}/transforms/universal/resize/ray/test-data/input/ kfp/test/resize/input
+mc cp --recursive ${REPOROOT}/transforms/universal/profiler/test-data/input/ kfp/test/profiler/input
+mc cp --recursive ${REPOROOT}/transforms/universal/resize/test-data/input/ kfp/test/resize/input
 mc cp --recursive ${REPOROOT}/transforms/universal/hap/test-data/input/ kfp/test/hap/input
diff --git a/transforms/README-list.md b/transforms/README-list.md
index 7e4d70ee28..052ab474d4 100644
--- a/transforms/README-list.md
+++ b/transforms/README-list.md
@@ -7,6 +7,8 @@ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/READM
 `python -m pip install data-prep-toolkit-transforms[all]`
 or
 `python -m pip install data-prep-toolkit-transforms[ray, all]`
+or
+`python -m pip install data-prep-toolkit-transforms[language]`
 
 installing the python transforms will also install `data-prep-toolkit`
 
@@ -41,6 +43,9 @@ Note: This list includes the transforms that were part of the release starting w
 
 ## Release notes:
 
+### 1.0.0.a6
+    Added Profiler
+    Added Resize
 ### 1.0.0.a5
     Added Pii Redactor
     Relax fasttext requirement >= 0.9.2
diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml
index 4e3dde1f32..af19f97cb4 100644
--- a/transforms/pyproject.toml
+++ b/transforms/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_transforms"
-version = "1.0.0a5"
+version = "1.0.0a6"
 requires-python = ">=3.10,<3.13"
 keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Transforms using Ray"
@@ -33,8 +33,8 @@ all = { file = [
 
 "language/pii_redactor/requirements.txt",
 
-"universal/profiler/python/requirements.txt",
-"universal/resize/python/requirements.txt",
+"universal/profiler/requirements.txt",
+"universal/resize/requirements.txt",
 
 "language/lang_id/requirements.txt",
 "language/doc_quality/requirements.txt",
@@ -78,7 +78,9 @@ language = { file = [
 "universal/fdedup/requirements.txt",
 "universal/hap/requirements.txt",
 "universal/tokenization/requirements.txt",
-"universal/web2parquet/requirements.txt"
+"universal/web2parquet/requirements.txt",
+"universal/profiler/requirements.txt",
+"universal/resize/requirements.txt"
 ]}
 
 # pyproject.toml must be in a parent and cannot be in sibling
@@ -90,8 +92,8 @@ license_select = { file = ["code/license_select/python/requirements.txt"]}
 code_quality = { file = ["code/code_quality/python/requirements.txt"]}
 code2parquet = {file = ["code/code2parquet/python/requirements.txt"]}
 
-profiler = { file = ["universal/profiler/python/requirements.txt"]}
-resize = { file = ["universal/resize/python/requirements.txt"]}
+profiler = { file = ["universal/profiler/requirements.txt"]}
+resize = { file = ["universal/resize/requirements.txt"]}
 
 ######## Named transforms
 doc_chunk = { file = ["language/doc_chunk/requirements.txt"]}
@@ -138,6 +140,8 @@ dpk_tokenization = "universal/tokenization/dpk_tokenization"
 dpk_similarity = "language/similarity/dpk_similarity"
 dpk_filter = "universal/filter/dpk_filter"
 dpk_code_profiler = "code/code_profiler/dpk_code_profiler"
+dpk_profiler = "universal/profiler/dpk_profiler"
+dpk_resize = "universal/resize/dpk_resize"
 
 
 #[tool.setuptools.package-data]

From f9b19fa6dc2e63db1b0774c42138f4d7440255bd Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Fri, 24 Jan 2025 08:37:05 -0800
Subject: [PATCH 3/5] Various fixes to the README file

Signed-off-by: SHAHROKH DAIJAVAD

---
 transforms/universal/profiler/README.md | 26 +++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/transforms/universal/profiler/README.md b/transforms/universal/profiler/README.md
index fc134330c1..b7c75cba96 100644
--- a/transforms/universal/profiler/README.md
+++ b/transforms/universal/profiler/README.md
@@ -28,7 +28,7 @@ Additionally it writes created word counts to the data storage (as .csv files) a
 
 ## Configuration and command line Options
 
-The set of dictionary keys holding [ProfilerTransform](src/profiler_transform_python.py)
+The set of dictionary keys holding [ProfilerTransform](dpk_profiler/transform_base.py)
 configuration for values are as follows:
 
 * _doc_column_ - specifies name of the column containing documents
 
 ## Running
 
 ### Launched Command Line Options
-When running the transform with the Python launcher (i.e. TransformLauncher),
+When running the transform with the Python launcher (i.e., TransformLauncher),
 the following command line arguments are available in addition to
-[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md).
+[the options provided by the launcher](../../../data-processing-lib/doc/launcher-options.md).
 
 ```shell
   --profiler_doc_column PROFILER_DOC_COLUMN
         key for accessing data
 ```
 
 These correspond to the configuration keys described above.
 
-### Running the samples
 ### Running the samples
 To run the samples, run the following command from the transform folder transform/universal/profiler
 
 For example,
 ```
 make venv && source venv/bin/activate
 python -m dpk_profiler.local
 ```
+### Code example
+[notebook](profiler-python.ipynb)
 
 ## Transform ray runtime
 
 [Transform runtime](dpk_profiler/ray/runtime.py) is responsible for creation cache actors and sending their
-handles to the transforms themselves
-Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization
+handles to the transforms themselves.
+Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization.
 
 ## Configuration and command line Options
 
-In addition to the configuration parameters, defined above,
+In addition to the configuration parameters defined above,
 Ray version adds the following parameters:
 * _aggregator_cpu_ - specifies an amount of CPUs per aggregator actor
 * _num_aggregators_ - specifies number of aggregator actors
 
-## Running
-
 ### Launched Command Line Options
-When running the transform with the Ray launcher (i.e. TransformLauncher),
+When running the transform with the Ray launcher (i.e., TransformLauncher),
 the following command line arguments are available in addition to
-[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md).
+[the options provided by the launcher](../../../data-processing-lib/doc/launcher-options.md):
 
 ```shell
   --profiler_aggregator_cpu PROFILER_AGGREGATOR_CPU
@@ -98,10 +97,13 @@ For example,
 make venv && source venv/bin/activate
 python -m dpk_profiler.ray.local
 ```
+### Code example (Ray runtime)
+
+[notebook](profiler-ray.ipynb)
 
 ### Transforming data using the transform image
 
 To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
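The Python-runtime instructions above condense into a short driver script. Below is a minimal sketch, not part of the patch, assuming DPK's standard `PythonTransformLauncher` and `ParamsUtils` helpers and illustrative local paths.

```python
# Illustrative sketch only: launches the profiler with the Python runtime
# documented above. Folder names and column value are assumptions.
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_profiler.runtime import ProfilerPythonTransformRuntimeConfiguration

local_conf = {
    "input_folder": "test-data/input",  # assumed sample data layout
    "output_folder": "output",
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "profiler_doc_column": "contents",  # column holding the documents
}

# The launcher reads its configuration from the command line; the computed
# word counts land in the output folder as .csv files.
sys.argv = ParamsUtils.dict_to_req(d=params)
PythonTransformLauncher(ProfilerPythonTransformRuntimeConfiguration()).launch()
```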
From 82bfc42dd7f2a6e937caf7570e65a8e2cadfa991 Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Fri, 24 Jan 2025 09:12:22 -0800
Subject: [PATCH 4/5] Fixed a couple more typos in README

Signed-off-by: SHAHROKH DAIJAVAD

---
 transforms/universal/profiler/README.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/transforms/universal/profiler/README.md b/transforms/universal/profiler/README.md
index b7c75cba96..5c55eda14b 100644
--- a/transforms/universal/profiler/README.md
+++ b/transforms/universal/profiler/README.md
@@ -22,9 +22,9 @@ and pass it as a parameter to transforms.
 
 ## Transform runtime
 
-[Transform runtime](dpk_profiler/runtime.py) is responsible for creation cache actors and sending their
-handles to the transforms themselves
-Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization
+[Transform runtime](dpk_profiler/runtime.py) is responsible for creating cache actors and sending their
+handles to the transforms themselves.
+Additionally, it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization
 
 ## Configuration and command line Options
 
@@ -61,9 +61,8 @@ python -m dpk_profiler.local
 
 ## Transform ray runtime
 
-[Transform runtime](dpk_profiler/ray/runtime.py) is responsible for creation cache actors and sending their
+[Transform ray runtime](dpk_profiler/ray/runtime.py) is responsible for creating cache actors and sending their
 handles to the transforms themselves.
-Additionally it writes created word counts to the data storage (as .csv files) and enhances statistics information with the information about cache size and utilization.

From c90157e5c0d9d454a97a1c65def7cb030cd6e49c Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 24 Jan 2025 12:43:32 -0500
Subject: [PATCH 5/5] fix command script

Signed-off-by: Maroun Touma

---
 transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py
index b405d3f0c7..395bb610a8 100644
--- a/transforms/universal/profiler/kfp_ray/profiler_wf.py
+++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py
@@ -26,7 +26,7 @@
 task_image = "quay.io/dataprep1/data-prep-kit/profiler-ray:latest"
 
 # the name of the job script
-EXEC_SCRIPT_NAME: str = "-m dpk_profiler.ray.runtime.py"
+EXEC_SCRIPT_NAME: str = "-m dpk_profiler.ray.runtime"
 
 # components
 base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
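The one-line change in PATCH 5/5 matters because the workflow hands `EXEC_SCRIPT_NAME` to python's `-m` flag, which resolves dotted module paths rather than file names, so the stray `.py` suffix would make Python look for a nonexistent `py` submodule. A small illustrative check, an assumption for demonstration rather than code from the patch:

```python
# Illustrative only: `python -m` takes a dotted module path, not a file name,
# so the old value "-m dpk_profiler.ray.runtime.py" would fail to resolve.
EXEC_SCRIPT_NAME: str = "-m dpk_profiler.ray.runtime"  # fixed value from the patch

module = EXEC_SCRIPT_NAME.split()[-1]
assert not module.endswith(".py"), "a .py suffix breaks python -m resolution"

# Running the module directly (requires dpk_profiler and a Ray environment):
#   python -m dpk_profiler.ray.runtime --help
```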