From 3c9b9997b8830a04f065f20ced1ed937c35affc6 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 14:12:21 -0500 Subject: [PATCH 01/28] refactored code as its own module Signed-off-by: Maroun Touma --- transforms/.make.cicd.targets | 89 ++++++++++-------- .../{python/Dockerfile => Dockerfile.python} | 16 +--- .../doc_id/{ray/Dockerfile => Dockerfile.ray} | 25 +---- .../{spark/Dockerfile => Dockerfile.spark} | 15 +-- transforms/universal/doc_id/Makefile | 87 +++-------------- transforms/universal/doc_id/README.md | 77 +++++++++++++-- .../doc_id_local.py => dpk_doc_id/local.py} | 27 +++--- .../local_python.py} | 14 +-- .../ray/local.py} | 14 +-- .../doc_id_s3_ray.py => dpk_doc_id/ray/s3.py} | 14 +-- .../ray/transform.py} | 22 ++--- .../spark/local.py} | 2 +- .../spark/transform.py} | 18 ++-- .../transform.py} | 19 ++-- .../transform_python.py} | 13 ++- transforms/universal/doc_id/kfp_ray/Makefile | 33 ++++--- .../universal/doc_id/kfp_ray/doc_id_wf.py | 21 +++-- .../universal/doc_id/python/.dockerignore | 1 - transforms/universal/doc_id/python/Makefile | 64 ------------- transforms/universal/doc_id/python/README.md | 49 ---------- .../universal/doc_id/python/pyproject.toml | 46 --------- .../python/test-data/expected/metadata.json | 48 ---------- transforms/universal/doc_id/ray/.dockerignore | 1 - transforms/universal/doc_id/ray/.gitignore | 38 -------- transforms/universal/doc_id/ray/Makefile | 68 ------------- transforms/universal/doc_id/ray/README.md | 31 ------ .../universal/doc_id/ray/pyproject.toml | 46 --------- .../ray/test-data/expected/metadata.json | 60 ------------ .../ray/test-data/expected/sample1.parquet | Bin 36668 -> 0 bytes .../ray/test-data/input/sample1.parquet | Bin 36132 -> 0 bytes .../doc_id/{python => }/requirements.txt | 0 .../universal/doc_id/spark/.dockerignore | 1 - transforms/universal/doc_id/spark/.gitignore | 39 -------- transforms/universal/doc_id/spark/Makefile | 57 ----------- transforms/universal/doc_id/spark/README.md | 59 ------------ .../universal/doc_id/spark/pyproject.toml | 45 --------- .../spark/test-data/expected/sample1.parquet | Bin 36668 -> 0 bytes .../spark/test-data/input/sample1.parquet | Bin 36132 -> 0 bytes .../test-data/expected/metadata.json | 26 ++--- .../test-data/expected/sample1.parquet | Bin .../test-data/input/sample1.parquet | Bin .../doc_id/{python => }/test/test_doc_id.py | 16 ++-- .../{python => }/test/test_doc_id_python.py | 26 ++--- .../doc_id/{ray => }/test/test_doc_id_ray.py | 13 +-- .../{spark => }/test/test_doc_id_spark.py | 2 +- transforms/universal/doc_id/transform.config | 20 ---- 46 files changed, 300 insertions(+), 962 deletions(-) rename transforms/universal/doc_id/{python/Dockerfile => Dockerfile.python} (66%) rename transforms/universal/doc_id/{ray/Dockerfile => Dockerfile.ray} (51%) rename transforms/universal/doc_id/{spark/Dockerfile => Dockerfile.spark} (69%) rename transforms/universal/doc_id/{python/src/doc_id_local.py => dpk_doc_id/local.py} (75%) rename transforms/universal/doc_id/{python/src/doc_id_local_python.py => dpk_doc_id/local_python.py} (84%) rename transforms/universal/doc_id/{ray/src/doc_id_local_ray.py => dpk_doc_id/ray/local.py} (85%) rename transforms/universal/doc_id/{ray/src/doc_id_s3_ray.py => dpk_doc_id/ray/s3.py} (85%) rename transforms/universal/doc_id/{ray/src/doc_id_transform_ray.py => dpk_doc_id/ray/transform.py} (87%) rename transforms/universal/doc_id/{spark/src/doc_id_local_spark.py => dpk_doc_id/spark/local.py} (98%) rename 
transforms/universal/doc_id/{spark/src/doc_id_transform_spark.py => dpk_doc_id/spark/transform.py} (94%) rename transforms/universal/doc_id/{python/src/doc_id_transform_base.py => dpk_doc_id/transform.py} (95%) rename transforms/universal/doc_id/{python/src/doc_id_transform_python.py => dpk_doc_id/transform_python.py} (95%) delete mode 100644 transforms/universal/doc_id/python/.dockerignore delete mode 100644 transforms/universal/doc_id/python/Makefile delete mode 100644 transforms/universal/doc_id/python/README.md delete mode 100644 transforms/universal/doc_id/python/pyproject.toml delete mode 100644 transforms/universal/doc_id/python/test-data/expected/metadata.json delete mode 100644 transforms/universal/doc_id/ray/.dockerignore delete mode 100644 transforms/universal/doc_id/ray/.gitignore delete mode 100644 transforms/universal/doc_id/ray/Makefile delete mode 100644 transforms/universal/doc_id/ray/README.md delete mode 100644 transforms/universal/doc_id/ray/pyproject.toml delete mode 100644 transforms/universal/doc_id/ray/test-data/expected/metadata.json delete mode 100644 transforms/universal/doc_id/ray/test-data/expected/sample1.parquet delete mode 100644 transforms/universal/doc_id/ray/test-data/input/sample1.parquet rename transforms/universal/doc_id/{python => }/requirements.txt (100%) delete mode 100644 transforms/universal/doc_id/spark/.dockerignore delete mode 100644 transforms/universal/doc_id/spark/.gitignore delete mode 100644 transforms/universal/doc_id/spark/Makefile delete mode 100644 transforms/universal/doc_id/spark/README.md delete mode 100644 transforms/universal/doc_id/spark/pyproject.toml delete mode 100644 transforms/universal/doc_id/spark/test-data/expected/sample1.parquet delete mode 100644 transforms/universal/doc_id/spark/test-data/input/sample1.parquet rename transforms/universal/doc_id/{spark => }/test-data/expected/metadata.json (60%) rename transforms/universal/doc_id/{python => }/test-data/expected/sample1.parquet (100%) rename transforms/universal/doc_id/{python => }/test-data/input/sample1.parquet (100%) rename transforms/universal/doc_id/{python => }/test/test_doc_id.py (84%) rename transforms/universal/doc_id/{python => }/test/test_doc_id_python.py (71%) rename transforms/universal/doc_id/{ray => }/test/test_doc_id_ray.py (83%) rename transforms/universal/doc_id/{spark => }/test/test_doc_id_spark.py (97%) delete mode 100644 transforms/universal/doc_id/transform.config diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets index e392e8f36..23475f57f 100644 --- a/transforms/.make.cicd.targets +++ b/transforms/.make.cicd.targets @@ -51,63 +51,78 @@ publish: test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean +test-image-python: + $(MAKE) BUILD_SPECIFIC_RUNTIME=python test-image + +test-image-ray: + $(MAKE) BUILD_SPECIFIC_RUNTIME=ray test-image + +test-image-spark: + $(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image + test-image:: .default.build-lib-wheel - @if [ -e Dockerfile.python ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.python \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ + if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + test-image-sequence ; \ + fi ;\ fi - @if [ -e Dockerfile.ray ]; 
then \ - $(MAKE) DOCKER_FILE=Dockerfile.ray \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ - BASE_IMAGE=$(RAY_BASE_IMAGE) \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \ + if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + test-image-sequence ; \ + fi ;\ fi - @if [ -e Dockerfile.spark ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.spark \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \ + if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + test-image-sequence ; \ + fi ;\ fi -rm -rf data-processing-dist image-python: - @if [ -e Dockerfile.python ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.python \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=python image image-ray: - @if [ -e Dockerfile.ray ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.ray \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ - BASE_IMAGE=$(RAY_BASE_IMAGE) \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=ray image image-spark: - @if [ -e Dockerfile.spark ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.spark \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=spark image image:: .default.build-lib-wheel ## Build all possible images unless a specific runtime is specified @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ - $(MAKE) image-python ; \ + if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + .defaults.lib-whl-image ; \ + fi ; \ fi @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \ - $(MAKE) image-ray ; \ + if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ + fi ; \ fi @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \ - $(MAKE) image-spark ; \ + if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ + fi ; \ fi -rm -rf data-processing-dist diff --git a/transforms/universal/doc_id/python/Dockerfile b/transforms/universal/doc_id/Dockerfile.python similarity index 66% rename from transforms/universal/doc_id/python/Dockerfile rename to transforms/universal/doc_id/Dockerfile.python index bbedf1eb7..fc634a043 100644 --- a/transforms/universal/doc_id/python/Dockerfile +++ b/transforms/universal/doc_id/Dockerfile.python @@ -2,9 +2,6 @@ FROM docker.io/python:3.10.14-slim-bullseye RUN pip install --upgrade --no-cache-dir pip -# install pytest -RUN pip install --no-cache-dir pytest - # Create a user and use it to run the transform RUN useradd -ms /bin/bash dpk USER dpk @@ -16,19 
+13,10 @@ ARG DPK_WHEEL_FILE_NAME
 COPY --chown=dpk:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
 
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
-COPY --chown=dpk:root README.md README.md
+COPY --chown=dpk:root dpk_doc_id/ dpk_doc_id/
 COPY --chown=dpk:root requirements.txt requirements.txt
-RUN pip install --no-cache-dir -e .
-
-# copy source data
-COPY ./src/doc_id_transform_python.py .
-COPY ./src/doc_id_local.py local/
+RUN pip install --no-cache-dir -r requirements.txt
 
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/dpk
diff --git a/transforms/universal/doc_id/ray/Dockerfile b/transforms/universal/doc_id/Dockerfile.ray
similarity index 51%
rename from transforms/universal/doc_id/ray/Dockerfile
rename to transforms/universal/doc_id/Dockerfile.ray
index f33aedefa..f5bf58cae 100644
--- a/transforms/universal/doc_id/ray/Dockerfile
+++ b/transforms/universal/doc_id/Dockerfile.ray
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
 
 FROM ${BASE_IMAGE}
 
-RUN pip install --upgrade --no-cache-dir pip
+RUN pip install --upgrade --no-cache-dir pip 
 
 # install pytest
 RUN pip install --no-cache-dir pytest
@@ -14,24 +14,9 @@ COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the transform
-COPY --chown=ray:users python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-# Install ray project source
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-COPY --chown=ray:users README.md README.md
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_ray.py .
-
-# copy some of the samples in
-COPY src/doc_id_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/ray
@@ -40,4 +25,4 @@ ENV PYTHONPATH /home/ray
 ARG BUILD_DATE
 ARG GIT_COMMIT
 LABEL build-date=$BUILD_DATE
-LABEL git-commit=$GIT_COMMIT
+LABEL git-commit=$GIT_COMMIT
\ No newline at end of file
diff --git a/transforms/universal/doc_id/spark/Dockerfile b/transforms/universal/doc_id/Dockerfile.spark
similarity index 69%
rename from transforms/universal/doc_id/spark/Dockerfile
rename to transforms/universal/doc_id/Dockerfile.spark
index 3d39ed250..e8df6c522 100644
--- a/transforms/universal/doc_id/spark/Dockerfile
+++ b/transforms/universal/doc_id/Dockerfile.spark
@@ -15,19 +15,12 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
 
 # Install project source
 
-COPY --chown=spark:root src/ src/
-COPY --chown=spark:root pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_spark.py .
+## Copy the python version of the transform
+COPY --chown=spark:root dpk_doc_id/ dpk_doc_id/
+COPY --chown=spark:root requirements.txt requirements.txt
+RUN pip install -r requirements.txt
 
-# copy some of the samples in
-COPY src/doc_id_local_spark.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
 
 USER spark
 
diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile
index be26d3bf4..bf0d39543 100644
--- a/transforms/universal/doc_id/Makefile
+++ b/transforms/universal/doc_id/Makefile
@@ -1,79 +1,22 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
+include $(REPOROOT)/transforms/.make.cicd.targets
 
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
 
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+################################################################################
 
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-set-versions::
-	@# Help: Recursively $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-venv; \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-test; \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-build; \
-	fi
+run-cli-sample:
+	$(MAKE) RUN_FILE="-m dpk_$(TRANSFORM_NAME).ray.transform" \
+		RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
+		--doc_id_int True " \
+		.transforms.run-src-file
diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index c5c785353..675995623 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -1,19 +1,10 @@
-# Doc ID Transform
+# Document ID Python Annotator
 
 The Document ID transform adds document identification (unique integers and content hashes),
 which can later be used in de-duplication operations, per the set of
 [transform project conventions](../../README.md#transform-project-conventions).
-* [pythom](python/README.md) - enables the running of the base python transformation
-in a Python runtime
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [spark](spark/README.md) - enables the running of a spark-based transformation
-in a Spark runtime.
-* [kfp](kfp_ray/README.md) - enables running the ray docker image
-in a kubernetes cluster using a generated `yaml` file.
-
 ## Summary
 
 This transform annotates documents with document "ids".
@@ -31,3 +22,69 @@ Document IDs are generally useful for tracking annotations to specific documents
 [fuzzy deduping](../fdedup) relies on integer IDs to be present. If your dataset does not
 have document ID column(s), you can use this transform to create ones.
+
+## Configuration and command line Options
+
+The set of dictionary keys defined in the [DocIDTransform](dpk_doc_id/transform.py)
+configuration are as follows:
+
+* _doc_column_ - specifies the name of the column containing the document (required for ID generation)
+* _hash_column_ - specifies the name of the column created to hold the string document id; if None, the id is not generated
+* _int_id_column_ - specifies the name of the column created to hold the integer document id; if None, the id is not generated
+* _start_id_ - the id from which the ID generator starts
+
+At least one of _hash_column_ or _int_id_column_ must be specified.
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Ray launcher (i.e. RayTransformLauncher),
+the following command line arguments are available in addition to
+[the options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md).
+```
+  --doc_id_doc_column DOC_ID_DOC_COLUMN
+                        doc column name
+  --doc_id_hash_column DOC_ID_HASH_COLUMN
+                        Compute document hash and place in the given named column
+  --doc_id_int_column DOC_ID_INT_COLUMN
+                        Compute unique integer id and place in the given named column
+  --doc_id_start_id DOC_ID_START_ID
+                        starting integer id
+```
+These correspond to the configuration keys described above.
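+
+For reference, here is a minimal sketch of launching the transform in the pure Python runtime with these parameters. It is condensed from [local_python.py](dpk_doc_id/local_python.py) in this package; the input and output folders are illustrative:
+
+```python
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from dpk_doc_id.transform_python import DocIDPythonTransformRuntimeConfiguration
+
+# illustrative folders -- point these at your data
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+params = {
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "doc_id_doc_column": "contents",
+    "doc_id_hash_column": "hash_column",
+    "doc_id_int_column": "int_id_column",
+    "doc_id_start_id": 0,
+}
+# simulate the command line and launch the transform
+sys.argv = ParamsUtils.dict_to_req(d=params)
+launcher = PythonTransformLauncher(runtime_config=DocIDPythonTransformRuntimeConfiguration())
+launcher.launch()
+```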
+
+
+### Running as a Spark-based application
+```
+(venv) cma:src$ python doc_id_local.py
+18:32:13 INFO - data factory data_ is using local data access: input_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input output_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/output at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:185"
+18:32:13 INFO - data factory data_ max_files -1, n_sample -1 at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:201"
+18:32:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:214"
+18:32:13 INFO - pipeline id pipeline_id at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:80"
+18:32:13 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'} at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:83"
+18:32:13 INFO - spark execution config : {'spark_local_config_filepath': '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/config/spark_profile_local.yml', 'spark_kube_config_filepath': 'config/spark_profile_kube.yml'} at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_execution_config.py:42"
+24/05/26 18:32:14 WARN Utils: Your hostname, li-7aed0a4c-2d51-11b2-a85c-dfad31db696b.ibm.com resolves to a loopback address: 127.0.0.1; using 192.168.1.223 instead (on interface wlp0s20f3)
+24/05/26 18:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/05/26 18:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+18:32:17 INFO - files = ['/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_1.parquet', '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_2.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_launcher.py:184"
+24/05/26 18:32:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+```
+
+### Doc ID Statistics
+The metadata generated by the Spark `doc_id` transform contains the following statistics:
+ * `total_docs_count`, `total_columns_count`: total number of documents (rows) and columns in the input table, before the `doc_id` transform ran
+ * `docs_after_doc_id`, `columns_after_doc_id`: total number of documents (rows) and columns in the output table, after the `doc_id` transform ran
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
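+
+### Launching the Ray runtime from Python
+
+A minimal sketch of launching the Ray runtime from Python, along the lines of [ray/local.py](dpk_doc_id/ray/local.py) in this package; the folder paths and worker settings below are illustrative:
+
+```python
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration
+
+# illustrative folders -- point these at your data
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+worker_options = {"num_cpus": 0.8}
+params = {
+    "run_locally": True,  # start a local Ray cluster
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "runtime_num_workers": 2,
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "doc_id_doc_column": "contents",
+    "doc_id_hash_column": "hash_column",
+    "doc_id_int_column": "int_id_column",
+    "doc_id_start_id": 0,
+}
+# simulate the command line and launch the transform
+sys.argv = ParamsUtils.dict_to_req(d=params)
+launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
+launcher.launch()
+```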
+ diff --git a/transforms/universal/doc_id/python/src/doc_id_local.py b/transforms/universal/doc_id/dpk_doc_id/local.py similarity index 75% rename from transforms/universal/doc_id/python/src/doc_id_local.py rename to transforms/universal/doc_id/dpk_doc_id/local.py index 9d525dfa2..dc688cd1c 100644 --- a/transforms/universal/doc_id/python/src/doc_id_local.py +++ b/transforms/universal/doc_id/dpk_doc_id/local.py @@ -13,13 +13,15 @@ import os from data_processing.data_access import DataAccessLocal -from doc_id_transform_python import DocIDTransform -from doc_id_transform_base import (IDGenerator, - doc_column_name_key, - hash_column_name_key, - int_column_name_key, - id_generator_key, - ) +from dpk_doc_id.transform import ( + IDGenerator, + doc_column_name_key, + hash_column_name_key, + id_generator_key, + int_column_name_key, +) +from dpk_doc_id.transform_python import DocIDTransform + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) @@ -29,11 +31,12 @@ "output_folder": output_folder, } -doc_id_params = {doc_column_name_key: "contents", - hash_column_name_key: "hash_column", - int_column_name_key: "int_id_column", - id_generator_key: IDGenerator(5), - } +doc_id_params = { + doc_column_name_key: "contents", + hash_column_name_key: "hash_column", + int_column_name_key: "int_id_column", + id_generator_key: IDGenerator(5), +} doc_column_name_key = "doc_column" hash_column_name_key = "hash_column" int_column_name_key = "int_column" diff --git a/transforms/universal/doc_id/python/src/doc_id_local_python.py b/transforms/universal/doc_id/dpk_doc_id/local_python.py similarity index 84% rename from transforms/universal/doc_id/python/src/doc_id_local_python.py rename to transforms/universal/doc_id/dpk_doc_id/local_python.py index 1a234b79b..68a2def42 100644 --- a/transforms/universal/doc_id/python/src/doc_id_local_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/local_python.py @@ -15,12 +15,14 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) +from dpk_doc_id.transform_python import DocIDPythonTransformRuntimeConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) diff --git a/transforms/universal/doc_id/ray/src/doc_id_local_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/local.py similarity index 85% rename from transforms/universal/doc_id/ray/src/doc_id_local_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/local.py index 9847da611..2a6e36113 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_local_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/local.py @@ -15,12 +15,14 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from 
dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) diff --git a/transforms/universal/doc_id/ray/src/doc_id_s3_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/s3.py similarity index 85% rename from transforms/universal/doc_id/ray/src/doc_id_s3_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/s3.py index d6f5a63f7..4123a04a6 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_s3_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/s3.py @@ -14,12 +14,14 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) + # create parameters s3_cred = { diff --git a/transforms/universal/doc_id/ray/src/doc_id_transform_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py similarity index 87% rename from transforms/universal/doc_id/ray/src/doc_id_transform_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/transform.py index 19742c866..4ff20b9f5 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_transform_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py @@ -22,13 +22,14 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from dpk_doc_id.transform import ( + DocIDTransformBase, + DocIDTransformConfigurationBase, + IDGenerator, + id_generator_key, + start_id_key, +) from ray.actor import ActorHandle -from doc_id_transform_base import (IDGenerator, - DocIDTransformBase, - DocIDTransformConfigurationBase, - start_id_key, - id_generator_key, - ) class DocIDRayTransform(DocIDTransformBase): @@ -44,9 +45,7 @@ def __init__(self, config: dict[str, Any]): super().__init__(config) self.id_generator = config.get(id_generator_key, None) if self.id_generator is None and self.int_column is not None: - raise UnrecoverableException( - "There is no id generating actor defined." 
- ) + raise UnrecoverableException("There is no id generating actor defined.") def _get_starting_id(self, n_rows: int) -> int: """ @@ -105,10 +104,7 @@ def __init__(self): class DocIDRayTransformRuntimeConfiguration(RayTransformRuntimeConfiguration): def __init__(self): - super().__init__( - transform_config=DocIDRayTransformConfiguration(), - runtime_class=DocIDRayRuntime - ) + super().__init__(transform_config=DocIDRayTransformConfiguration(), runtime_class=DocIDRayRuntime) if __name__ == "__main__": diff --git a/transforms/universal/doc_id/spark/src/doc_id_local_spark.py b/transforms/universal/doc_id/dpk_doc_id/spark/local.py similarity index 98% rename from transforms/universal/doc_id/spark/src/doc_id_local_spark.py rename to transforms/universal/doc_id/dpk_doc_id/spark/local.py index c9a167783..d6f821aa0 100644 --- a/transforms/universal/doc_id/spark/src/doc_id_local_spark.py +++ b/transforms/universal/doc_id/dpk_doc_id/spark/local.py @@ -14,7 +14,7 @@ from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from doc_id_transform_spark import ( +from dpk_doc_id.spark.transform import ( DocIDSparkTransformConfiguration, doc_column_name_cli_param, hash_column_name_cli_param, diff --git a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py similarity index 94% rename from transforms/universal/doc_id/spark/src/doc_id_transform_spark.py rename to transforms/universal/doc_id/dpk_doc_id/spark/transform.py index beeb77ce5..4af3429b3 100644 --- a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py +++ b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py @@ -14,12 +14,18 @@ from typing import Any import pyarrow as pa -from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import ( + AbstractTableTransform, + TransformConfiguration, + TransformStatistics, +) from data_processing.utils import CLIArgumentProvider, TransformUtils -from data_processing_spark.runtime.spark import SparkTransformLauncher -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) short_name = "doc_id" @@ -137,7 +143,6 @@ def apply_input_params(self, args: Namespace) -> bool: class DocIDSparkTransformRuntime(DefaultSparkTransformRuntime): - def __init__(self, params: dict[str, Any]): """ Create/config this runtime. @@ -146,7 +151,7 @@ def __init__(self, params: dict[str, Any]): super().__init__(params) def get_transform_config( - self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. @@ -161,7 +166,6 @@ def get_transform_config( return self.params | {"partition_index": partition} - class DocIDSparkTransformConfiguration(SparkTransformRuntimeConfiguration): """ Implements the SparkTransformConfiguration for NOOP as required by the PythonTransformLauncher. 
diff --git a/transforms/universal/doc_id/python/src/doc_id_transform_base.py b/transforms/universal/doc_id/dpk_doc_id/transform.py similarity index 95% rename from transforms/universal/doc_id/python/src/doc_id_transform_base.py rename to transforms/universal/doc_id/dpk_doc_id/transform.py index 132a3d964..08d316f48 100644 --- a/transforms/universal/doc_id/python/src/doc_id_transform_base.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform.py @@ -14,17 +14,20 @@ from typing import Any import pyarrow as pa - from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, TransformUtils, UnrecoverableException +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, +) -class IDGenerator(): +class IDGenerator: """ A class maintaining unique integer ids """ - def __init__(self, start: int=0): + def __init__(self, start: int = 0): """ Initialization :param start: starting id number @@ -127,6 +130,7 @@ def __init__(self, transform_class: type[AbstractTableTransform]): transform_class=transform_class, ) from data_processing.utils import get_logger + self.logger = get_logger(__name__) def add_input_params(self, parser: ArgumentParser) -> None: @@ -137,10 +141,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) """ parser.add_argument( - f"--{doc_column_name_cli_param}", - type=str, - default=doc_column_name_default, - help="doc column name" + f"--{doc_column_name_cli_param}", type=str, default=doc_column_name_default, help="doc column name" ) parser.add_argument( f"--{hash_column_name_cli_param}", @@ -174,4 +175,4 @@ def apply_input_params(self, args: Namespace) -> bool: self.params = self.params | captured self.logger.info(f"Doc id parameters are : {self.params}") - return True \ No newline at end of file + return True diff --git a/transforms/universal/doc_id/python/src/doc_id_transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py similarity index 95% rename from transforms/universal/doc_id/python/src/doc_id_transform_python.py rename to transforms/universal/doc_id/dpk_doc_id/transform_python.py index cbc63592c..4dd5b4c6f 100644 --- a/transforms/universal/doc_id/python/src/doc_id_transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -14,18 +14,18 @@ from typing import Any from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics from data_processing.runtime.pure_python import ( DefaultPythonTransformRuntime, + PythonTransformLauncher, PythonTransformRuntimeConfiguration, - PythonTransformLauncher ) -from doc_id_transform_base import ( - IDGenerator, +from data_processing.transform import TransformStatistics +from dpk_doc_id.transform import ( DocIDTransformBase, DocIDTransformConfigurationBase, + IDGenerator, + id_generator_key, start_id_key, - id_generator_key ) @@ -52,7 +52,6 @@ def _get_starting_id(self, n_rows: int) -> int: class DocIDTransformConfiguration(DocIDTransformConfigurationBase): - def __init__(self): super().__init__(transform_class=DocIDTransform) @@ -81,7 +80,7 @@ def __init__(self, params: dict[str, Any]): self.id_generator = None def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: """ Get the 
dictionary of configuration that will be provided to the transform's initializer. diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile index f170326e2..be5a2144f 100644 --- a/transforms/universal/doc_id/kfp_ray/Makefile +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -2,10 +2,20 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows + # Include the common configuration for this transform -include ../transform.config +#include ../transform.config + +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -21,8 +31,6 @@ venv:: build:: -setup:: - test:: test-src:: @@ -33,11 +41,7 @@ publish:: image:: -kind-load-image:: - -docker-load-image:: - -docker-save-image:: +load-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +49,15 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=doc_id_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload -workflow-upload: workflow-build +workflow-upload: @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..dbdb269e9 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -20,7 +20,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest" # the name of the job script -EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_doc_id.ray.transform" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -109,7 +109,14 @@ def doc_id( ray_name: str = "doc_id-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/doc_id/input/', 'output_folder': 'test/doc_id/output/'}", @@ -120,9 +127,9 @@ def doc_id( data_data_sets: str = "", data_files_to_use: str = "['.parquet']", # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: 
dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # doc id parameters doc_id_doc_column: str = "contents", doc_id_hash_column: str = "hash_column", @@ -171,7 +178,9 @@ def doc_id( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -191,7 +200,7 @@ def doc_id( doc_id_doc_column=doc_id_doc_column, doc_id_hash_column=doc_id_hash_column, doc_id_int_column=doc_id_int_column, - doc_id_start_id=doc_id_start_id + doc_id_start_id=doc_id_start_id, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster diff --git a/transforms/universal/doc_id/python/.dockerignore b/transforms/universal/doc_id/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/doc_id/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/doc_id/python/Makefile b/transforms/universal/doc_id/python/Makefile deleted file mode 100644 index 26da1fc8f..000000000 --- a/transforms/universal/doc_id/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DOC_ID_PYTHON_VERSION) TOML_VERSION=$(DOC_ID_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/doc_id/python/README.md b/transforms/universal/doc_id/python/README.md deleted file mode 100644 index dbb02093c..000000000 --- a/transforms/universal/doc_id/python/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Document ID Python Annotator - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build -``` - -## Configuration and command line Options - -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_ray.py) -configuration for values are as follows: - -* _doc_column_ - specifies name of the column containing the document (required for ID generation) -* _hash_column_ - specifies name of the column created to hold the string document id, if None, id is not generated -* _int_id_column_ - specifies name of the column created to hold the integer document id, if None, id is not generated -* _start_id_ - an id from which ID generator starts () - -At least one of _hash_column_ or _int_id_column_ must be specified. - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). -``` - --doc_id_doc_column DOC_ID_DOC_COLUMN - doc column name - --doc_id_hash_column DOC_ID_HASH_COLUMN - Compute document hash and place in the given named column - --doc_id_int_column DOC_ID_INT_COLUMN - Compute unique integer id and place in the given named column - --doc_id_start_id DOC_ID_START_ID - starting integer id -``` -These correspond to the configuration keys described above. - - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml deleted file mode 100644 index 1a962662d..000000000 --- a/transforms/universal/doc_id/python/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_doc_id_transform_python" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "ededup Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/python/test-data/expected/metadata.json b/transforms/universal/doc_id/python/test-data/expected/metadata.json deleted file mode 100644 index 83a938628..000000000 --- a/transforms/universal/doc_id/python/test-data/expected/metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "doc_id", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-08-17 09:57:40", - "end_time": "2024-08-17 09:57:41", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "doc_column": "contents", - "hash_column": "hash_column", - "int_column": "int_id_column", - "start_id": 5, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 36132, - "result_files": 1, - "result_size": 36668, - "processing_time": 0.044, - "source_doc_count": 5, - "result_doc_count": 5, - "final id": 10 - }, - "source": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/python/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/.dockerignore b/transforms/universal/doc_id/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/doc_id/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/doc_id/ray/.gitignore b/transforms/universal/doc_id/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/doc_id/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# 
Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/Makefile b/transforms/universal/doc_id/ray/Makefile deleted file mode 100644 index 79787406b..000000000 --- a/transforms/universal/doc_id/ray/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DOC_ID_PYTHON_VERSION) TOML_VERSION=$(DOC_ID_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --doc_id_int True " \ - .transforms.run-src-file - diff --git a/transforms/universal/doc_id/ray/README.md b/transforms/universal/doc_id/ray/README.md deleted file mode 100644 index c9cb0d15c..000000000 --- a/transforms/universal/doc_id/ray/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Document ID Annotator - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build -``` - -## Driver options - -## Configuration and command line Options - -See [Python documentation](../python/README.md) - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. 
TransformLauncher), -the following [command line arguments](../python/README.md) are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml deleted file mode 100644 index 372f39762..000000000 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_doc_id_transform_ray" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "docid Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "dpk_doc_id_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]>=0.2.3.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/ray/test-data/expected/metadata.json b/transforms/universal/doc_id/ray/test-data/expected/metadata.json deleted file mode 100644 index 1072ffd27..000000000 --- a/transforms/universal/doc_id/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "doc_id", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-08-17 21:12:06", - "end_time": "2024-08-17 21:12:07", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "doc_column": "contents", - "hash_column": "hash_column", - "int_column": "int_id_column", - "start_id": 5, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "number of workers": 2, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 12, - "gpus": 0, - "memory": 14.759533692151308, - "object_store": 2.0, - "execution time, min": 0.00696413516998291 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 36132, - "result_files": 1, - "result_size": 36668, - "processing_time": 0.0373997688293457, - "source_doc_count": 5, - "result_doc_count": 5, - "final id": 10 - }, - "source": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/ray/test-data/input", - "type": "path" - }, - "target": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/ray/output", - 
"type": "path" - } -} \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/test-data/expected/sample1.parquet b/transforms/universal/doc_id/ray/test-data/expected/sample1.parquet deleted file mode 100644 index e90ec7cba580c97f650012f2bce74b2c1b43506c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36668 zcmeHw2YeI9^04HNF@XRwGF2G3P;9H)5|3iJ$u_oewM&JT z7AK6$2nh5Kd`1s^K}S@G-`~L$6aofCV|dWOpGM$|Mtj?XJnU^Bq`!TTq#-W%LXLXA z_8_NuGRAr&0m;?b^O^uldc_XfY-uJ%fq!+Ei&8-JdsZ*=|Q9hX!Wo;2@!^nSql$=6%$;f>8(%2QbS z2L+vupB97)$7VnE@d(`CkXPu@qet;3TD|$1p&MIFJ+N%hl=+BkdNsctE`+8G#I7>o zw2qM~PK1aXfvGq`b%a>RjfhaFl>)Jlr{HqcC?Df-l?nwaM#Vf#B@l2>aYVQ}oX-=8 zRU!pPh;jwYNIoV|g)76ABC$%v4F}&MRfHG~S1K^RO3YD;R0^I_ED}Vhl_*y%67vNd zF{b2+1$;rcfUo4M2zB&NBh=Bua{(Qh7toRsVAIC>DudFZ!}R9<8Wk?77v6!D-pG&j zkeR+M{~@EGRQ0WN#%g*J!^5rxAAagVYwslogEsNIve^o)1q(H5v{+~nTEZ6g*C=%T zHF`5vq(`-Ej?$oG7jwfnY)sEqp=OlbpCc9u!U{|Vy%%lc&|mjwsg?dyIbd=%X0&&Z ztK6~oKf!~ebwLJc0fqfJ)(?T1`3CzYbfI1Xp$l|>8nUseT^0NYRlxONgm>V0Z{$fM zWKlz?2XBzFH#%xc-I?RbcmZ^WWI+zE^Q6uDcTkHVn;Q82Hg0N%y>Yxv?qV;??TId<9U)yzpB z?oXL8y#Ctm54}G1*t`i&UT?1kAq_Rg(a}J6>^`ea*IjZB_Q6=;1HOvDV z=4?5V^37(W38-*MNl6$`l`yD3KsY9-tmVZzc0?Gjr`cfCD7%|13INohdR3?nE!61q zLQR+!Q<^o!Sg0}|)#qW%t}pG)Y*e#(93I!!pWa+#4Aq;AD2KxpV1&*%+mBXurSUdv z%vucBn2A1t!@QAg&5=1ws4@SXQBdl@j$Vv8t;sqt;Qaf-ZZtX8sBzcGk>RbH#icck zrC;C$M8+;^RDb!!p6H~YK5dr;cludcxEhl-xqj-@fBWB_J-nXhl8T}EVm+Pez+*cXkTYc5soh&VH?{=pq-ozDKw`%lb_<$CnosBVW$ zY5TVQa@+UWyea*1{b+O1omD#*4)gh3$Jp>iEA4Dl&1rTbYv<0eG04*RIXgG~cqO-J z&HOZOtNjL5i2CaEAd~6Vl8tf2WxJ~fbp6u5>-If8gh5wZ1cf|RcfO`N@nD(hv*hFF zo9yd6+j4g2a-W~~g!d53;ws}lRt%Wx@o+Eg*y3qloZLQkC>q%J*sfhspEmyK_lh$4 zDE5MUvtRc^Yr6f0#fA>gUXgM7-(H_DuLvqG+&;18u*9OPvvwSrdng*4^jv@B7e&H+ zzp8`lw(E|4dwY`kQR(cfUo804;79LnE*SVn{*&Do<>oPGI|p_M_dl_rW$?tQl1`zk zZ}nU(oVjFj6Ymv`PPWcU$eFO5cVuDDtcbaRKXgBWT%K%By?DxC_^Jjoh{& z^XzurojsfGeW=_L_jPRfbi*Om;^B!?SM5FiNXK}rpS*ec zw&SO_^%!tRoot-bXl`_rsz4Ry^W}~s7tS}#{`_CVSAI07vhu5C>vM0c$X>Rlo3twT zZ10vIeC+?h?|02(JeqAfvpjkEVb7>7vB~QnezEGv;%PH>`zkJU7#%k7^L@9|`~2F$ zz?ijj=={QC{i^z0)_5fz4*l1jaKAr>#7tiPUG~83+8GB!&efmMw|K&T=Tv{wWBi^; zBR;Hb{3DYc!DE~ZUAA@JnXKSJQ4=Pdt}4Bh&@*Jqo=-B5Zs^yw>WROkLGO{v?snce z>cMf}OTimH|Foe%*lghhF?Qh?v&+>-6O`44Epg^Gw-w)geYf-6kZvCt$2a)EykcCl z#LNfF$JcAw>f_F3Z8miuK0CeX+WURl-0>Pa_UfwC=}C{qeYG^E4Lhbff60HwedRm3 zb6S%v?C6sn=Ty%bUccRj@w@MpmiD=ODO7lN_ns>6%+H_F4Vi{$X}ey=1IY^?BU?my ze17SA^`w3SruEr>aPk*-8!)fz+IYV~%Yds*5*wGUJaT@(6AvbHp=8QlRkM>%FAb3S z`Fp;*x2Ap5G`I44-xHJf8t06;(JzBCb`v7IiDH^(Uy2)hN;Z4c;8m>oWkK4D_}-SmOOzs*0K z_*Lu1k;Y9i8Hp<9grEsQn)K9myXnTq_vUw)Gk?tVPi8zWzke$?N%SDc>x;0?<#!66 zw~YAjWc8>;oXUB_Op65VIuBSnuXIGO#QDz)GMjXeimLkWZuPj`+|%vuH6C6ye33zY zdeToHq&Dd+xYD{uVbcrE?{}YcV*DwvTE1Uct_oS#&THwiXFu=!B)4ai{@wF$K5Jb$ zsa#jtHn?=aCa%A#c-m*{`%21Yo@1NV8KfOUDlRkhI-utpJu`+1y&F|l9+VqRg$kJT-iyf^#iy6otci`8P ztM*59ydq4H3`T}-_xM_?6>K*3_$~DMHDy9CR{tIeSuwufb;-DP^-e~L@w{;Dr7`8* zV^=h|SbS=0?l=36hh=UvHqP6fer-@@b#UzX;S(!NPj=7P?w9kQ)O9xp{oI{yI2}-b?e2#Wv$8htI=lS6 zpQerN6x@Gr&NpgZhfW`~>oIU_kgrcxPW0fhr#j!Gw=KWsv2)d|3z0{bdH6TJue?#( zSGM)L8B4Bpy0gN!P4mb@o!kFQYUG3q(oRf1;yrw7v?VE~a8~m@mVqBk`7&ney|R_( zJ1<{*p(^*wg{Lz+h&$cb9CB|&+vz>_e50B%&Z4-i$LA3bq?PI!+7^;k^^p1L{vt zH*9X)y!m&Y6$@J(ZoB+r{}$4lLw}R5zR|CnURZ319IIu)iFHecIDc=|6X&iu=uV?a+y`?CCTi>Jihe1hKhsC z=Rb|i9NO=Pdpn1#e^UjCd(PwziQ3p{!q3wWTq=}(_q1N3 zc;S1^)|7497P+dR<+&wr2kz_?UG_-HNR>dhB$kyPn;09N85yx&nK2y;1zr{CD#f4&WmzWBPZ4G^8uTWN6=T5rR2#Hf zLkV!gy0T*RCQA{9mzJ|LQ{z}BYbP3$BbqQ!lbR$n57p?ySm~G+P@@8^)P^FRbu<|m z#+lUq6WhNE?vvdZ4lUx&wz+!eCZx|~TEdsn?;Z9^%7S4-MR!U}iD)TBm8FwUE 
zC@X_-C0Wt9E6K{R7>x!n-ZH?>5)^OGvU-_-gBYvCP^1D~vQWJNu(-&|Rg*ynn5o8z zFl%&xcZ4vCFe3)X23U`kVoD2OI+RRwQtxOeKf7xS`!qrEL9iO^=t8W51e^v`M#oE1 zqz=__X_n+;dIv&*e{4lyr8vxub9*Mfr*Xr6WFW)7T2 zM^(v06 zL2mmb2I#agt+Ct@^?;<_!j52n7_qSf|S8FXMxz-`e=dSEpjxh6xV%iwSvx^wEBau6V_ zl>~^aiJ*cwI1D7e$6%3SUH$}NzG7R8k*Py9X67~742W`x3K#6Hj&F7u9fzVUUp2i>B9s2t5VJ$ zzy9na?v)Y8e_K)geDduj?XUk7p4#V;^=6s8yCK7{cLUjferZt7)fkXHw6jctiXiOUDA+Vv4K)9wvuQi}5 zutu?M(LUlE$k7T5GnRp{OChH3E|EmZVtGj^QOQXu3H+pl6j@f9ESet|BkUc|WB#M4 zA#}n~S@&ReWn~cK10hZo*1s4lG6A;WDze%)aHB7>YXCC-L#Vd@sDb^%*i1Qr)QnPx z>BIkmPGbX!n3P2raCbsgI+g-8<+DPStPhN3=6v9<+LVv1bRSS?Jo|5U!Th@osP$%} z-mW(dNqVDKYw~c78Ri+d&=XmxMJ5(PjrqG+|2~F8E&x)eCZ4ndu%x z&5YwEF3f;?4qTI7&;#drATx`RwHC+>_g%jXD?5NCvcSu*1^+@n?6w z=LN_f^sXOe>KBf^NQjT||Ald46jCUiEWhy6v)F{KIWq?rcFZWLM;pSOCf4?t`9r2c z_RKK!L5kN4(Yk;CS8TX@fBwVo(S;RXx1iJ7S=L;8pt?GKSXcG+_{pF*Je~F66>Yq?*SUDNqqJGF+t+C@>Y58=(}aL55PzQ;Sug1GO4tIe8dY z9WLewg{U$@pjIpSA_0fbQwl{&E+$ZNMWS%vxLUL0PF~xN!FHi}*t3uzmvC(mvaJjX z>z_b8rRd*fl4|l&uaZexUrD#i-qTW~#b>HDk#zJEIBdRaMc>ta1TO;Ynwvm7XZrtft$GdPe32%K{l*3$Rv^(-=KBFVcKXZ@U-L|y6 zoWjC;l$SHXKVtqnx{lk%l5_y@zyflRRvzHn((_~J$igwW8W7!3&csD@Mlm^^$I6^c zr)7FD0aBRX4CU}bc@ddl4GIG}cR5&`xS?DT4}S5&y=al)=VyGm_~d4o@OQSHOzZjt z>%;c=e|RTM2qE=shY%xmvUHzMBn{O^r370CE5~nv^Cce0nelXF&p3RjuY!}3Ehic> zs_T*YGT@encN5z2E2R;Uc^kzW#-hwhl<7medW~&rf8*Hq(x832q>_(I_7$W3 zmo~2ouOARLdtgA(nscMk(&IhzX^L6w7T4+NA~%V z)h!#h?;T+L=u)S3vhfdn z=k3@trPsrt%R>=y^r5{Ojhl~&`?TGi#^rZ>R~Rpa2Kv;S?N|QOK_k;+{pPjlz4$w? zo?N&!dhSr8|L2beUO(0ToB}<3=w5E?kEZVMEBp4ni@_bmJ$qaj)Zj_Xg!lb^OZIRm~XbP<92<%sm0UC7HfXy4l1i$=d*q*XVBuG+CE8N`s&u_9yCAkH6}0%d@-s+|;wn zvO(ulnsZo@!xk*MM8DUtz47|O=pQ=B{90EG2)r@3GdEak9ysM@p?G~;zZSyNKTsVn+J^tgf7nff8?P-Ur72Nib z?YR>SeLUGmcdpua=40=ZM>>se*1BwyZfNlSN1TTg)978i^j*BPcH^bg?&R68NJh54 z3IiKkjS&WB@OAZ6P~f#D$cGHoswL$*yqpe0H>4A1IkV_Al?RNR z+nop&C)~DIcJ@!^@FRm#>&5r^p2D!&cCyevv3G#`BKS_$=Abx%+;}J_){-Vq15B;9 zo+YbsRQ}@;cob}+*U*v5)%f&zHRP|rBB9lTvkxP}>cWK+F5I87$X4Ipft^Jzed8cE zESkNE#3FyJ)Tk-KocMEb10C7FzJ@=4A2TI?LKr<9Vl9EIXfS?HGg=f{jq)SRWU_#C zumxnMDO~WzuC3yTxcRUqoh7yMZijiNFU&!PZPi~HiYByl$YkxeDgEllMUNn5m= zj_lY~qvXJ1Q6P93Zc5oVXM{K?c>@iN22+L>5ZP2~^q7i5x9imanf9O5Lz}jTG&G+g zERXODX43+xF9u@%!B?_p2lQ z;~)vC4}Mm~j&{*OI`RZiAI| zJw6{rF+Lw$Dv4l5Mu?SSfkGh=D%2QX7{L+nK$w?{3d2z>LZ}WG3WRD7Sjl;+a1iws zsKX;v9DW2Eu22iqA~ni&cg*|uYLfOfAdbTHH|3)|+~|XwA|t$!X-7dG`Vg*#k^g|9 zQ0hIn!r?GM;p*lMrX0Bc0j^*H1$DaO;f({B)-UR*x#T2?Ilkl^XC*yw2_4ySf{qM1 zfwRT2H%bCueU5BCO`aoIsksQX&yn9x(UD!J+|QAh7%q6{fPyp7&_EDwXy6ZO?p*nW z%pI5^H(^#!!D_nhpP?fw&*0o~?U9m0Oh#u15eA?%8oZ20+~}d8)sKKNYkpBvyX!oO zS|8FWi)r-0t-zBz2fW>LII%Syk-DTL_phwze=#p+T~!?_JFcn@AnFlTULuV!0D7za zmEdH97A&Xm0-`lFw*Mj>xpV;^r_HPXo2Xxb6XH;w@GoZwDJm13pa4;GV)``_CmQ0M zAkxkb%s79Ajy$?dW(Ea{mtm)5#=Ax7br+@ScS!Q9pAQZ@fKw(~i_RL|u`WrwZqbp6 zH*negW0xe#>&V_TAK*Cp;SMiQUz)q?$j~8%AEqB%#o#9pTt{{-1pvzH$P4ZRCKC6N z8vrv2E`O5QXuXg;>>kLx{6PozbzOUQgO5t-gPwH~(wIIbWj>}u8L8l!at9B`OVHLL z(v@KvBLLJPUrp^mOKJ&5gO}IbQzRE!@1c=PY7Xx@6cXni$%Xc#g5*;2nLDD~ACOgZ z=*>zE^=T)f0UU?(xfU{vbYg>|81A^p#UMLQeuN2A;u%s#BoJu~_(V?YSxF_5YeXAJ zoGlAQE+WTF$>omWATop$1tAwZMG6QI&O~BwZl#sjFcTqJouk_;E~fk;BSL1~@M?WEk0q z5#?}-IL=F2pnp)ypl2QH=2Zeno!sUBAGnDA|KkN}WZwVz`@hJ$TORMO9lfP%N3K2+ z^7=P3kT-wz0eN#55?=jM0rKwPc`e^Ipt!k#{9ibwjl9K!(8yajcZ~d>JvNK{51jf$ zUgP8@@|veFk=i&hiPZI3MC9GM91wPWci;Q%-S?(&yx!Vz7v$}nB0=i%BnR?3r!kP% zI8lMTwF3-D?Vm3|YV{BR@~Vk^CMJK0pujNBjcMPp3Bpw?;#lKH&C~3NX@7YRLe|SF%6b4)L8hEiCr@*2!mel|ZiUM$47MANUGYV4% zp!%R-Ob@Dq_W;1QS($O6qTmV=F^*IO+uxfT#)UmO(BlGiq6}p6!Ue8$)fAJ+f=sIj z0o2)sPwM(*Xd$c?k%HrK#LH)0u{r^r(EdFsxD1!*+&x-~p}ru+5GR~!@GcTc 
z+du-m%sLQfWE@`U;YxwqYEI<5=0qZ)fiH@9DIG+G+NHq$q6Jsvj<7d@7oH1nhnIlW zGF}x0KD?}U@qiNzu83@fwTsDi+1eEw3E+g0G*|Qv3qJ|)vPopKySN|Wva~w^YaKOy zt(t1IH2^1+uRBNa+s&>Bt+$?Gc`f6!U-Y9u>Ike|%nrAtTyeXAYZ-gZ1tbd84zSwA z>v#*u6}KzAcJUJjh+Hv~Ax=0+6ZMs5uV)fcb;o*tg#sOXkv~pGEh8k{eEi0SD_$3{6Z+TG z9NU!$SG*)Z?P9bZ*mFf_FL%QCx)QY=3UtLt0yv?3O{QA!CQu;6k0QeITE<7ZDL{eM z3HkPd}vsQS03h*dV5qK}Lp$HG&Xf=u=w5SZmZj5DUQIVkp@`2D{ zP|AgOg*d?=mO}8J1muNr!}!4!G-iM#J-v5MSL-2Y=8$+c{>ein5=k2P2S2?e5`iQt zPZB4QC=7Zad+5}BWUV=`TugJ-Q zbu6l!Qd3g2M5E-V=PUJTpiEL?doET+Cxj=&WSJ786U$Z7>=M{daxPZ|;WD1sq{`u1 za(PzV^4uDn7FiBg3+pQM*(OCa#3wh(b4m+yc@%i1ae(%?@?3tpP$`8pNi?b)p%L_7 zD9=cMxB-pa5-eNH!E&SUeiL+J4XVp7P(|Z#vphFFAN0YIWdM5PO&WQQut32}H!5^W zyua-1bZJs%o<{9NyRr<_k23@Q5p+p3**QX-et8TSYqX%$$}ea7-PF@Z0UiTBa>=x6 zEvgt9$HsTAA~#A%9xIR!C#4`w19_^*$>zv%((_f)*c#cWGfMJ+9BX)*pv|@@ba7^R zP8yJn1aBwTX|)Pzx*9KsJcv%rR^VgGE`;*Lan}WHy7XchKO4{{o_|G#ATsMy-thH*`%0?1?@d z%BkF)EWtdr0NY?u#V6x<>q?Fas4x8Nd^yiLUJ5yyfZX(|oH)IWr*$Pqs6T3zRLfD= zWI;ijOP#vnn;hEYI5EJ_%|~+t?(BySt_!)m^kPL$oKc~{?QC7~59ofW);2Go{db)U zU_DU)ox|4|xSmnX2c6uc;KkW(tLuEwDWu|jyKVkUbF@&wS6N^?FgNk}pUanPmHHH; zg6Ec(f7+(bWG@x-6}r-5u*Rt2yajrxcQ>zrousZy#p^d>dkCM#YP+FLqnJlvjZfEN zgq?J!3vZG>KJYVuFZCz%aZu@YD2#?qmk$ zH5b}Cp<8*q8QTKz3OUesviTWqY+ucqZ1soT>C~H|kq_7UL@RGx{Ca#}zyuSv?l<}b7>~xpFPsYt^w&rN!c-jLJWwF*m;i%}sn8^ciJ3d_RNx=%l>^ zh3^6a5O8mS`-j%Ol3P3TM%FFxny*|{u~O&Wr$>3&LKRP3CVz=fFNc0aZW=Gex(BD& zqdWOue`Rm0;~JD8h5_RMZq`#X8lO_2;Db0o-P*kprA}M`=4rV+C*8i5Q`kRcd}6VT zmtC&pWfuY&o9lR8$PoA4tj^>Ch?&t&STbYtq_vKf-b~?c8%2wch5s0k=IjT$`?&?mKl=Y}&`daaA zH}(Vf$+AIQhlo!|v)%6h+}3?la?S>^&gewo<7NZB6PLjlD2QpB(40iTQ?Qr7^Ak}* z@ViXGSK|C7%~ud_O(;#y5R@bpNHj1WPKx)~=RNGBAi)Cal_%&qFz+If#M|QWAdLc& zF(8AI77zZ4qQG;sL@AM^klNY_XPN|FKov={)7h0n3@Ii(~T;)7`%45o8p z;WKyw&mx!xN``5jXuK;Ke2&4>Jh8BA9;hhk1(QOMdLoGgpCzU)YBn2uM@jPGBc3F( z=7;b{NecXec?KAhugMVsoRX+=A)76NU^qmHM4D-Pj*SD@2~_w`_y|yee^CYy3pd#N ziAjL?3sd#re{T=KX<2wzV07>S{|o=bK$sNTooqEJwxpFyVg#^#$)B}{o!HugJypJn zR|?wVA48X7Fbb#kQXFK*kL!cfW!L?9zg8>-dTBLZ(2*6Ca=>K)!@6l*GgO zVhO|t0g_U@9#9ccF+f$DI|Ts70%|%!?tQzpOYE!a(O^jM-e_Om=CbN*kt2x)XVte;+I3xQ(*fp zN}|0R{y>`{QRL)X{nRuGk8PcA(URl>f?gYc@Tp1CAIPt$B6+Tsuiz&ZDsga>uBLrz zO?&HngVw-7{{*nT>)sNS%2LbO?6LwIzX|UXl@kPGl?yd-jLodB{uyc*%)F>2630aSIIpWuPuKO6+ve&I_cJU0Mrm^G&i$WBQGs?IJa z_~}Mp;n<_>J_w#85duF>DUrkWW#u;h*!0zUD8fEpqM&{`w@=rSax4HIt%UNy`@{7& z8E2~VdV^0TsXb2w8UmHcgMI<{#y`=LM92?Fv%C3&>xW}5EH6d`@t4B!;P|cbb>Tlg ze?UGQoCU!1y6`!=0Pv7qV$&~o{Ktnv%5UPtW`jA$E(LS;rSw9fN7d-F)!sx)Akqxj zopPu z>l&^Pe>&&JxSw^&IpanCBDvica{9qBwc5sNr4F|`DIN)V(;FX7O P(O4sCfbGGsLm1YBV|b4i z>*whfa$n%Z9jW-uBqItp!j4ciUd`%PN8#-yz^!Hn1;l~SnPd8h+^cj4Q%-8_ns zZ{qz1O`i?h-rHyDju#4CUPzA+<{`?^VeB%#jT>Lb$&ZrSvjNd}6llQ=9T@!Tg+CZ} zo@o1WXPvgLI&BKd?$MA>ZrdHbC0nKT%!$~=WwAq$%xNMZe zk;?E^myJ$QREDz%G{vVl5MGJ#5G0LxU`<+NMvcDu`Fr!r}$i>K_g1BAHOV5;ky z9XoVvqqLchZIm(~tm}ZJ#~s@qz{wkW`VHMRHQt#n^3}XhsGQ8EMOnYoX}oni`B)eE z>+SvT6ZBQ*$P>8X%Xq-hA?nf9-XsuNwu}r9r*c(cu$0XSeO=+f1#W5Bb~2mFpUf62 z!a{+|4Vyc6ALEPp;YDi5JS-H3(_Doc#t)1^*-5kXaoMaxbLC;NM5k?)F zDku?h3owL^3N+I>92=OUBP0#LyD;)bLtO#1vI2v2HzEOG%WS1mDudcAEiDa!V1|&ifFot38mo)>VaE{tkxH?YqQz*n-1!XQ7(6=sAhFl;f5-CQYUhG{b`_sa|AGB+cX2MhDU z%3z_umq)PGJ26eA)alkxAJMt3g~~!{uuv{>W;10KRBT9KGqE&{tUFo_d#XltBW}%> ziqu)jR&s)qKN+Yifn@$$iEe+E=eh!Q3^ zA}Y(6@Q5-dqSaEVSQdztVwG6p3KgPKBzWD2-uV4uK_Hfj3u#oq6^fNvVSy+Xaw|l+ zm=}m*hJX^HlFbW4#5DBfxdO2ulmxCKq@j_pEb!F|G=N5)K*+eBD>JNHYVuG29Mq-3NN+j7x=qLf&+4;(q@aXpiLlM?XxTlHO7)wN_qcL`( zhw)MrOZbD6Ou+9YMnjPaCZ_Mh=)OK8%tV+8HpWKu{g|b%U&91&)7}VsBF;t)mtodE zq8B>F*-@gU<4utY3GMA%rw&rY&L z62JHW5el;rCg>$%II;IA;fWqM>m8VL$bsEF>WB6~WH{sx#-Nuc6!8*2zW&oR{6Ndt 
zahkxxvoNt-EYyy(o*3&rN<>4RZ@eQvi4o0K|ilIH92(xC8IiJGy-lmSM&^NBW~<@t8N12=@JF zJazDaZi=SZiC=j^d=Zvl;3pU&JRWzHV0tvc(80$EHWnqk?5ICD@ErFcjn{j5j{T~J z_3EP-t&f^PHJ~*S4#cAbt=oZLrUicK9^T7Nd05tq@9iG*#E$l8_f7T2!(JxVH^q^A z!xOP!e1iEWO`IKg+(mRVfk5cg$RRHgjfEnNk3CB0hnUg`*j#J&$M|j82=H?;faxC} zha%f4E0)xKs+WoQ*vR1}ZDNe+e172Xy4{TD_$~BaikNT{qo=U{TEVD+zrOC0TY{%% zTyGu=1}4ygP5Fty1mOuyObq;SFXQpB;e#L6U5r_2{WgQ|GB*p-BRmmcHE=S<_DpCg zGdnba=}&1vLIb}I1mY(>fjAKbp8G>Vg7L)slZFwDx_Sh=26Azex+~fXmnEP;N z&O`x*0E)fe@D+@`|MS>g?q_iw)o)=5Z#*3EXYF|G;9vGe33keV>Pj7n*-nRW-LS^) zA2|8S2<6cFgFfBobOV?Z5BEkx@rXxP{Ni?Qa`0&#c?{+is3{wXgd#-9<%NK}6XMjd?8_1)}OK;7;C zfN3nW?vjHqYnb3QFS##)O^f~#6ZH4^LHa=qD4TeA&$HU2-Z3UR=7GUHk&yngm`&%8 zXLSBM#=DOa?)cM32CfB`McG*A>w6=i0O9rzTr=Q~e|^kh_QRUOS5`^_0rQe^GtpB5 z{eWiA5iQ12uW20AO*E#z-C#A*EHTP>qCGERp{H)&^K)HyY&y&uu0Q-$t;3S|`3r}? z-#tyZ!$q;+QjX33kyR>H5lh{1r|=stg1XZg{M1+3TSqdu)@k*-qd2}eup0LZzlRZrKZ`kSb9mJ7 zpY}#!A@Pj4?*S0Da&QQ{B93dEj;D6uNB+Wj5wHE?Z!zc72YW7NcijJEPyfMLm@%GE zcp8ZJ2W&TDu04kCZciv4#uK2lnClj>g^`}e|KAJWOzW=HxM=;|omUP#bH(GB#$`FN z3;%@8&}lvpW1`2=gpUXH;ci$?8DAGzZcIa&4_=Q`J+sEY>*@_hLcYkZZ+2sYIsyOhyZ)vNi~i$h zcm35)ETO*^qjqiAX^$K^g8%w!UED7Lm;GH`ogc;@)>0Pp;X5?`ps}=j*8@7{fA1{_ zm27&*ct8urD7x#|PBPxL>)&?ok9UCyrJnvWYt{N=)7+n!K;;vm$Z`E61J8EHU|rZl z>>hAyqUJp(cK^HQit5fIc|eI<5DL3!eFgmG5h7H`Wx*$s{a*nGcO)8S%;sIc*nMXh zv$D&j99Q5f*!>aipq)DZkd$Mk#jdQG;V`_xCv74VXX;sU{?ii_Ys zMOVrdAy068=kZt&{H`G)UIuS#K`57s1@O(h#m?BVOfi=$E`gsuLa>E$bwTJfnpTHn z;ZdTjV>Q{beFoDG5$vvq^__?9pT-VwGhDVXL_~z#u5EivM7EGAF64FJ(MAQKXR`0zjJkcg+O4UUNDTp=e+ zTBp#`AgxnGBYPx=QFQ!7A2BIHt8GXmKQmh(#O`gmC30GwU*ec891}}xJd;?SEdyf| zV1R+h7ga&2&Q9dv*4QZt8#qcxqav<~Wp%zbv=kZKI`|MFY8vvrMNLz1-t;s4EPZ#$ ztv{FEJK7PHk-v|edxZxNx(PcB(TP_&`;s(~L*KB;0MB099!Q(bq2|5VBeD8kOv@KR zEQLy0+>ZUvxbaR4`Rq7(?*Q74UC>0B*cUW$6E#tFTitEwmBVw}_oCR7pcYW;5DF&v zfof%d>Y0-YU4}(srV0@hQI{*mYst5+CZD?+X|mi|75;)@QQp+MUhZ3OPrnl?6&qz8 zdK4Ebu+=J-T(?$f>T7FjV2p;C>n~DuigXZQI$4jF+|yyauY>#{M?UvaMAW+*UX^6R za6o2x?`7S9f->@{63y&Z)|(ohptDjSuL*VZcV?=&+{tt?59_5UlRw9ekKyDSY4Xq0 zh=7K1nnqPc8`2*Ezz!t@P#R>HVUHr_+69F>Ok(U3!^mIH^c%Wrp=&KNrD&HSA`vqt z$tPyW(=+|{E=(27=?=q#07!}$e+qx&4{w8^a~GmLwhr2|3t zL?Gz5z{YJleq!2V8THyiW3-BZZo}V4_?g~U*fc5Vmr8X7HOP#IFnd}!xhO1@L0+Pk zJi5d9*E`5Z=gHS+k*dD0D0-jqS9K0$40G}fvOGHsGAQ#40yrvz={%9*%Cp2EPh3)3 zsmy{;Uz-lfX>f^(jt`&Z5T>*3(4;BvEY~$CyLXTu&S&WFJC))y4iG(0rz|vNqRkn>M$wS#PT48D#m3pJ7K@pJ+_hcc z9JJj=p)`4hwm4Iq6ZgLH341ziO$lj=a`4s+B+xT-#+8O4G8st1(*l*TyUdXL<{j2d zhPODaW(&>RoIE91dCKXuQwR@-Sr9m@#lfXBoXwR=3uc-FJgi*CVd0z>+Gb6et#&SL zccI+9(`5!hWHOW)Quad1$+&DD_dA?b&bcRZzJ-&BLxnV(;r6#=q?#U-)C3{QBobGT#d@RhGQC+wf4g%#+u5 zd|>kHccFmx2RFYCX|jEe&B(8RR4u=r92y!D2JYz<<1F}F`#y3wbM2X;;3uwT_p@{@ zKQIx%ip*zFe&8eLAU~jmKsPNV2$-*l`GK%%KzrT%07_R>AxJ$dK-SSz3tDr3dRcSK zT4knRh|yk%(Vo+6!L46YtOGmK0&K#wB^1^Y*V>o*jE@+}2mTZ%e|TD43@+&71%2FH zeN=T79>EPCmpz5Vm9?#Y&95QM_S#mzb9>J|*?XFV&zHdxQG!*<+KO%Q{6O zdmNH;iP&R53-;8y)dv%KAhM?gjw*Zp@)-qtx`gFYc0p+3&FxP^vfyvlcysn3Re5vN zV3vbK#u&;Zl&Yy*mcJHG-&P^ZT!jMyDqDd)t*uNdWiXs_t8n9U3L~0g?J8BV_lu%UVrCc}d-DWGf@Po>o1{Ne{V`RyOHhoc(hby}&k!%p#5JD(P; zE-K}+TWN@#@*HjB;eL}9;<9v_qpfh`$tnmAvz4>joG#jCw>w>~lpRHRg;XZxpeZh8 zg)lE%P_jVK*OhY8c4wNWIL?_)+i5epxs(!Un_zQUd1z;`aGYSVz#S~dF%YA}#k;Jj zl+~8X2o@WFp&|54aW*?AIBXfa&1%b-VGX3yc8EkPX3?X5ADr3z!GaRLMo8j0- z@iHCTC{{pNLzMTyR}>1`BOjhQS)S!+(Vk0}qJH=zoV@p+L`8i+(4aas;KGtTYjs2? 
z&#ccih z&y@2>SgtH^^?CH0pTc(Jr|svFn!!T%c~po;b_P|6b_RWEV!$&$S26%){>y^2vP&lG z$FJe!mtPYZ&^nYV6HJDFSuWj2(Y;goLYfM8TDi`&_N@uOn_gGow_R}*;~w02A5NbB zKREf*Ux?_fji4Ay6~T2^3n(aH;-9q^#ywpC!z*q1`n|+un78QKhj=yGm6P?GOH61?{^<+U3l21Np!C6;9sw zD<%0VW3LBRjr_j1FR@{!?PY#e$a97OTtGh9(RjUCDpK4E#thxiMtOJiAQpf)wp6}$ zrBGQYT@O>1tQ}HpD3nioDGTcdrj(`1b4YS|7ofFvE=a;g{WOT&0nyQX3P4$>$abp~ zBX?0*3<*NzBgziR&j>R)d4d_xlQP30p(iu!Y_!}!;+%LyODv z-M?x}UO1tC;jYA)ZaSZ%yArK_54qtp=H$6Q(@mbcbJy?vC^dQEtnDVB{8l-yPW~<& zQzg&ixlr=F9Lgkrm(K%|AAqB49Brv&wye@g(lB(q9XF$nKOI>5WoMi{qW|#G<@5?fZo`*|0uB^U1WoT0$gf_CwO_rLIobp;`d?+zS_jE z0{Ti_cNIKH;U1h&8@6$jOjgLH#Vd04ktMWFqWcU`xW8J+qT8t=2Gx9NWv!x^ znu8-{s5~!JI26~0<{#TH6refWGDdylvC%>2{_7OTG-9#8{{Ge>3XRl&PfKkRW{`cf zTU+s3D^{R|n0DP!z^+EVV(9h2qPk54D!;)JUSZk_wW+cR?Ab}7g=-5^Zy zq**J#wc^dCP)^CT0#w#)E=)NE*a}lwv$-%qlF};~T7foHZ7x*Bn-p5%HdSsioys#ihY)B_U6KYw0kRIwZhGXRJ#q?3QbYb1W_?p;8k#ngEvwTdQ?8=a%#1;ti?(#1tozRB?F7hzO@Hdbyf(OEKu`wLO zFel&#eU3AXm2uB7qYN{{cxJ>hx4WT!2^GS7BTbQJI%D`%4l+gyr{ zlu~(K9B*=Ln)@6xb&G@%zS8xN5fsd3D z*IYI2oi^9_PNk;YHYKefA9HZ-L>BQhm6$Y7CnB?HAG;=-Omt}m(zRxP(MIzu9`=gEwvYmR$i4njdxf)?rIQ$(^J7rdMvjD zI*Is8rSdtXWAc1zk!y+`zb2Uo*5pp*(!Nm`BQm=-FUHJeq^k;T5~@MAe8&WqFeQf}IE7P^)Pds2uXIjy#n zCCt+**oJC)Y(j*)RdP%reX&f=CTWRY6*-ncZiRGWv{2*eR>=|RPbTflnNu}csNmMp zW~=y?L^gTU1@Ng^E@5qFKk{f@n4%+#sl;e0l@;ymR`CyXe>qp17s&p%&IMRcQlN9< zI)m0TwfT@wmQ(a--L|&QhkVNCnyuUBcQr@nQ!twnz49B-@Q_n#w$aH9nFPWINeT7tSSq z7VtB`mpTJ}W)As8TZ>=Y^>?e+-1O9Vxt*S^)oqzx!J0M&ep+F^oy=ffQ^?lY+RE$P z*cO0mGb7)r=4Z6AeQRs7|Fq`Mw(fV5 z31|uaEH{>ocizBo-!uJZKcJjl4R#q(q( znMki^YZ~ge$_}1ErmNtKqJ1^eqpeymAkLC*yQhV{dI4fX?eJ;ZH$a@BDW=mjHyYzp zZQ>GXpKA&B@9W^Cs8{oq&eeKf^GP74waWR}m3DqyJq{4e@bv8L#MbT^>EzV-Dwl9o z+u;Z6+X_!_{rW15MOLOEHrmFX!MawjSDTMbx}*{deJ^$v`AVa!$*J}+Fchl-8z#lC z)b#C~tW6E^29>q5p?<4uh3FSSY#rp7E<@b4oh((`smXLh@$EMDL-ffeA+96GCw!A_ z_kV5Mo+~*|Lafs>4nFQA=$&f?#XupZUFH(wz*E>ui1`V(4Zg3WEWF5H#e9W$tABYS zYF!Gc=dVJ$Re!;Z@-Fb-@z&z;kVY9t&+eHq$e)}*bp>XcVSGzI zlu1F)EP4*1T#aYR4Y`*YCJu!&@XSo0XBs7Bj?X}@#%)EpmpFR%pl7cY(lg95Gl773 zp}vfs!|0ho&)#X2wke_X%^2fB@X)gtJ%`YP$Bc%WMQNbX6KLoe^z4E!%mjR4Jb)O( zq|h%fbQJSN3^N}t!2gsEPbcE25gH9X;#cvT7uEUDJfF-cVmY}a;CpEXx}K>&8l~1B z_As+)dKvnPpW>y9!^6&|{ywp)4qxQIuQp!+?Ra=$czAiv4n3hII||eSE9&%!-5DMg zig9vedzdlQ-o+qzr~;UbJ|aI5qiP=!baHCe%;!f}fl1t@kcxvk593>XH}KYUh?zj` zFOAFa+z}Cih@=8u5S6y9cc@hu^&e*C{_Ei@ulGQfFdVMo8}`chde!-Z`0EyZ7lZ@j zUvNtN^fP1O#Rh#4nWE4Kq{B?jcSc<^sJ%JnSYIB9)o$5m1pPlku(b2Prw4IPwPx!M%{2%2`sD@p)dX zcIVgnhu8W``UkTSK8pTU)W3Bv2c^^D)#2flxf;J^{cVO{MEx7~s(|(clJ6wykD9le zuZTas$$S#!HGMf6_8BOi0finX>Ob4mUu2R(z8*BTFOmaadxe{F!yd&|WkLwwOP6Z; zCCf{F{`i98`h_q={GyxSwdj)wFSrP6nVB!lGoi2r=<~X2`qV}~WfQq{A{DIh<0~<%@=bLHt?HxCW-p5tk(Eb(^u&bLS5h8NWYr<~zDVeGeU|Kvhe1H2 zsJ{YVi3tcP=_A==DW{7{u0icBUih^he?)lo{IocfHx8YK3r+Bv{ehZ&Xvoi_`2l(; z>ci?LP#Mx6aG+p39yw06k2JHThs!U=Ohk!WxTY#p2KiEHP(08>)DG+rlogKh(D2qU Scpu&t|HGaSVOW@nQ2!q=M{ The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. - -## Configuration and command line Options - -The set of dictionary keys holding [DocIdTransform](src/doc_id_transform.py) -configuration for values are as follows: - -* _doc_id_column_name_ - specifies the name of the DataFrame column that holds the generated document IDs. - -## Running -You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. 
- -### Launched Command Line Options -When running the transform with the Spark launcher (i.e. SparkTransformLauncher), -the following command line arguments are available in addition to -the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). - -``` - --doc_id_column_name DOC_ID_COLUMN_NAME - name of the column that holds the generated document ids -``` - -### Running as spark-based application -``` -(venv) cma:src$ python doc_id_local.py -18:32:13 INFO - data factory data_ is using local data access: input_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input output_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/output at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:185" -18:32:13 INFO - data factory data_ max_files -1, n_sample -1 at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:201" -18:32:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:214" -18:32:13 INFO - pipeline id pipeline_id at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:80" -18:32:13 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'} at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:83" -18:32:13 INFO - spark execution config : {'spark_local_config_filepath': '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/config/spark_profile_local.yml', 'spark_kube_config_filepath': 'config/spark_profile_kube.yml'} at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_execution_config.py:42" -24/05/26 18:32:14 WARN Utils: Your hostname, li-7aed0a4c-2d51-11b2-a85c-dfad31db696b.ibm.com resolves to a loopback address: 127.0.0.1; using 192.168.1.223 instead (on interface wlp0s20f3) -24/05/26 18:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address -Setting default log level to "WARN". -To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). -24/05/26 18:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable -18:32:17 INFO - files = ['/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_1.parquet', '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_2.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_launcher.py:184" -24/05/26 18:32:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'. 
-``` - -### Doc ID Statistics -The metadata generated by the Spark `doc_id` transform contains the following statistics: - * `total_docs_count`, `total_columns_count`: total number of documents (rows), and columns in the input table, before the `doc_id` transform ran - * `docs_after_doc_id`, `columns_after_doc_id`: total number of documents (rows), and columns in the output table, after the `doc_id` transform ran - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml deleted file mode 100644 index 369a1bb72..000000000 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_doc_id_transform_spark" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "Doc ID Spark Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/spark/test-data/expected/sample1.parquet b/transforms/universal/doc_id/spark/test-data/expected/sample1.parquet deleted file mode 100644 index 765a6776b209cbc0903e7a09fa3af0f3844ba343..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36668 zcmeHw30zah^LPRXh*+%}44x77)dR>4iC+)G6+l34NYvs>@&bWel7!1*)l$A{y{$*R z>RqpT-&)TeT5qM+t6ufiYOVV9_WR#`$s;exi)|ILzwci@HObrA-I<-4ot>T8eNB<1 zaT#8W6h=fM!;e90LZfwB$9~T0^bL(Rghr#&JZPRYFPinwD+4^y88k)!qf;=b-I(Fc z@NUKQCE!e7TSZy`trJBlfa0AukkLTDG@2jX({p$%4XN^I#qbIg(!3i7GW|34dQ^w0 zy0JK+T!w#uU%)eZzzaH}LVW)Yrodn@C>q0q20j{tFBdD#Q z?9INyBS$aXX7y+Nc<T^ien;>7ub*_i^&Z}syrn#a zB{wkebllWHR5&K{{agC@^IWYeno?QtPAV<2{w z38Qt2P;tUV+;B|A5vs$*LT-4tLah{tg**k9t48@4kE>KDP%$dzVJd-ugNnn$)M0#{ zK&%oeI6{;wU`FsUfhtTHrWA=)DsC9~7OBF;XqZxg@l|4uQlwJwlwy$}T&+a8Vv(3H z;D|9LPb}aI!UTLJUqz^+Un-%F9-i~*$Xx$c41b$8Hc%Oq79FNH=W0~Aq+ZkusH%tj zSRa|;)9N2G3QASqN@uL5Co(+jTJYhg9<&Z#axiEUzZ;va&|0t%qehE`6rm+-VXj7@ z%hl-3SdkvpvN=kFj$O^L4%I{)Nya}`|TSFlO~oudYXMu{HYgh+=!8#=8P}2L_yfU)Y@{#~L^37BM2Mb@SNN zW-;^&JpYK8MU5LQzt{_%7}&SnvY^gCOAA+H(x%r>o%(O??OB!eJ(pDUYM!#4U*1|& z<thy7z1U?ACt-A6*`&Gae|pevFd7qib3%6OwG*aA&tKPV_TAJ4)vVM7 z$su8W6LcfHKmH_bVoqsAcVtq<^}?7JO`dBmE!Yu$W@>|jJ5oEJ{hjxp=;_P#=(&;I z51CT;ZTsc6&$GFc`{(%5=Ab(l>|8j^`*R&*!xyc!vrsjs`H9S(J3~h!OXFtm-1Os> zoT4@JQn{`78&tvStJ4Bardvxk#uk_Dt{%|sOTTX0_w*D7UTql|{8ZiLn(D-ZWv0)P 
zj-PM3ugfgU*`3S1f8G<;Q!IL& z%H$*2^YhKVJr1qu{u>q(Qkk_P{q(=RK3`rDSX{V$LaSj3MOSC;I5Ou@6gKg>{>U$i z_<6ny4zAm-JNE7EiRMS8v#x$I|5Jl6y@$DA;2-%vwi<(Am5<0>M?t~)D#rpmm&Xtid3C$DB34zU(jCQMni_xK|nJq~(V_Be%pPt$+B%sw0c1PT%dLxX^J_=)lkS-A?QK zYexfP=FXw>3Xk<)(D$;&E8%d+zxIUr{xKwa((>=J25#3*KNx(j!SsH`L<7CLPt#i+01`UcFKmPQB(o6BZg17AXB;)9Y{@oTl@sl*{Gh*4@ zE;~m)IPP;PXv629HWCP%FB~t%E*xWaz4~apvf8jE*1YDn;=8Z!c9|30{UhVJh98($ zjBTEf@nHG5`mI`j+@-ATrXH2E(weQk-?!}@uQ6k;u1cAf_-O1`OQYMeqpS0m{AcV} zK9jnnHr>LGI@xJ<_3X+9?Kh0ueXq2%@7+rw!n3>gEU1_9`BSXw=9OI=?>B7af3;~slhTz(&JTFv!DKF!Ox~+%e)8$1 z0W$AA{kAV=+-TTz-};N2)y1Ng&#U&bdK)VyRunIP&u916)Nh*QR9)|PV$xpY?9n&+ z=g>IfroUb^F1*row~8}(s9)muXZlso>v~evH>htKyGikuidk1R8yW^oJ^1b93%h*& zC=J@+;pHp(s7<5KIxc=Nq@-}ZasI4-R|jm{GbJQ)?vWkAVJAQ7kU6nAd;Ef9zUR^t zCO$9jx|$I)ukYsgxXMeP23`DY(TSM2l}Ur=oS4}}_6KXRsQ%Xrql?Ey&)PA(vWcw8 zms-)B?a`ac&)#f5KBBuQx<%Hd*fFPMvqlbH#Y(6*G-Xk0N9CUVqejKWPw23lKCtrJ z{KE-fwP_Mz+!URjpkj^>93QAjOKHEGZhU-iUdP$`s4EZw{j9i53;?!2<=jS zr{H<3@c&Lyk6gs5nmf$2NYK8^fTeRwhxbmH_q-sZX-BDOLGJF>kK4~V-Tq#a$_15+ z4C>PpfBGP$X&1qjHa!cQU1)K?$HWukPJz|({laop@VfS1OP4+SdFLlNy_)9s$iMll zP1VG5T~)iF(gB;eeyZZBpRMmFDVuSQZCYoLb_%Yz%+Qy2ypn#sN1vXLd-`WL98;Qc z_p+sT(c-V92cF&^GW5qijRsxkbc%iN%ff`Jz`ZxWf3I1yFS;O0bL1~}wEir5Ab;$E zUr(;uAKvMTFkUhk8M@u$YpqtW+0^s5kn7i!@x58OJ>xT@eZK3Oe(ma=^kn0C;o3{1 z%X`GEXn3*s)YhDD_8kw+*k){!w>$0Hpp5FEm~oX8DojsyPv7pF{hyR|HwXRPhM!Sl z-jOaEJ7l?N{Jw<&gQhng5_9~c4`Y`s8#lXayOxu;nTK7#Gx)7{v|bX!UU-&CLO6)IVH-H7+pBC#U9JR4<>&ZJ>_26 z%JW^8uf4D!=gWnsGdhYp-`E^{Z+N?DJ@U(L& zqFWxHRIcOh-+1`@qu-A6@r;_>HpAod@CVW=^>pnDzh2rYLDS6JV~==Et9RkO6V(G6 zOiMFtZqlN~cb*jsTOV$>{A0hC(wjqnlditezq?*oY={`6Yg@la@jxFUy%K$Q>dZym zN6bq7{Ygf*xuHMqWRJ?Jj^8NA3O>~-EckZS+P(i?bFZ-Yu1Rv4Rq7?l=*&F6^t(oi zgUshYjmQ|<|A%`!E7iZN=Ee%9Y`m=ZXm+gkmbRkwhtEA{aEC;0>^%PGX$LM9%D#JA zKS8|kz2T1GZ_tf6UK@*;C-qM zTCJf3IAPsbF?y4w2*XRu*%>LZER(epjmZ&BD5yzI5}Jo<^r5UYObe({fmUimkI>0+Z7)6*7gJT1%$4W7!1uz{-CMvN{6qKLcwS|3}p!gtI4R&bNvZ@-e*wp};@3BCt{%>c+V}6K8jBeU%0v9lhCv`?tF?1J-szuJuF~v7mqZ zP9fhZWbG;>yOJIX*Mv8c@{L!x^puJ zOr;~M(~(DMP_cF`91#!+0i<@UieP+PRxT$Mow)&$%Zt)vH3(~)a3WSR^P$Duo?&M5~0 z!di)d$eIW$h=apG@_P&xDc0pj5auJc#f)e)WJdp13_n}&&=(qG7|h0L$_OY$j`l<5 z_iGjAN2A0#%Y!iiL~cpiJhq+}?GI^d)_|ogIqSUqx!pRCTb(LAayTu%Lwn}ro1I_u zEMrzlwtCi&q<3o7Y}lFsqE$l^nxex-aJZuVt9=(%|K2uY*~7KcK2gh#sWyFhVE=;T zv&XML`-ppG`0?LXR6n0|dr61uzm=}*QQq+GC!8B)XJVWGw{5OyiV^gSe!NLdUoF*!0RF*%-}7@sW5OqE6PW21$A;&{w| z6g7lSI4bKN%&x2qVtgRPslsxLu_6;-3$7xoeF8T6AiD-2<35CH`;QvfKa91&KgAs#d4o|@2QgONUOB*m>4yNYdR+F8Bej2?BE6uI3$df$+HhUm{ z3`B+vgv@(+_kV_+k}=&Fi4KfueVvAT10bZ#58;SHcp?@@*p0&h|8>PbDO4}eZDpo= z5H&N7m$)zk?m2KxdO;7E>w(NDM%G#&Gu(ImGOUzL013&FP6^4&AR+1Brr{1thsU4Y z`JU%5d(fvrq^W-x_98wm+V2;}iIGU5bdvnSPtRiFw`R{6T-YhSq&{s3cdA(1bH)!D z3fVKm&-+?)`ZWzeg8VeBF{xYj0U|?Sbm*xMAJY*W)J5{P4p0(Dnx& z_a3=Wu4vNW(3T(1(T2PamvZO*bA$?H2^9*ZIvl8{lBeMFRALcFjS0CtkWf^Fal`mR zR4Cx{Ib4)43d7KFUYL*zGm&Z@U!*`q%!n|RN}#}0TyD5hpavOAHBT*8fezGakmclI zTy>b3BNU>_aDiH_!zze|ReC)t%|}BfMLhV$3-h8ygq@%M<>Hf@VZz_pax$&! 
z6RZ#0B0vmHW=)XCC)K7lk;ZSJzLy5>+a;BJT(Yki z&0X4JL0AL-&{+fhi`JYQg_a)gl~2>s3TZ2Z5jp2h<*m)^-(|t-gcbYJ8cn~|y`@*j zMR%%hj2QgGuxq{h9y`%>!M0VCdR1gToVjWHiHF#ll4S*BEmuw@81A&=>?_Oe{ma>N z3F31n4*qLRivzLSxRcbgH$+<&ub0nilfKlqi!3;xfycez9;c?ysP}EJpE<2BH|?2q zenfSvCLQ|t8$Y_#c^z&12fgLmiOR6}%5l=!MndaU2PHmx^* z=hc%7w?@qwYV`a3(ZK7cI-FCWhY#J$Y4g#P9lmAXzIQRGlekyU3xgUyi5~yH?~nYT zS&f^YFGbD|PrNqD&#&GX@NaBTrD$+ZpRfF+2Pb^upC8@9?A?28Sle#`%7#r|%XYQb~s&(G$w{iw8{;A!Qw57j(DNpsk z+)Fm8H7|+gyD};3af7Myfy=OypN&2%TQ_alD&@?i7v^ld_vDFx1U{S$i!xG#e5WNi+L6UdE+a$+rM@-)QM zYU^3D8b{?n9sx(eCVCAWsalOspI1Zv3M>*@JvjR?!mTb`7~#VG8H;T7?H$-zJron@BA3!%B^sBFu?D7dOz6{p)M^^Y<}R@+X+l(;?Opu!;ub_cWtLq17lq!b~O$ zNC#U$W}3nUZ{k{uNl`|O+0jL>{Dw5-_drAuW-JKW=ac@!-iY+P8J@NNQ8H@@J{qG%MAiFg(Th+5niza@#gA&6`$fo@?cr%2kO z-E?Hft{No=7K;MG%WzZ5z6B%LLCG6xXf&8Iw1CK_TBFBQ6uMo%2FSGkq#oMTJ*1&| z7h!pXUoeXvu#1k&`hkud{~jNcy(e;~lr)3OA`Ude+aEpvKbR;mIzmCQj}62meo#|; z|3?zF_3Sz|fgZ4)jtu(|ga`KG#1h@u`=ccE)hE{Q10?h=g z(mxK8kb2{1RqSXN9i$^q09EehG6>3!a|+5n2SM5WZ5pra*l7N>43!F-Xx@Ob^69}R z=ADz}d*m#AF??C;+uqZXe^QN#>GZYOXOvgTQU1r}0g=A@6DtQqw_Cojm!d`Tk(N8E zZ$~uyC6)ijA6=vG6!iIg)vZg7BU8%4m*ndY-yOou?cHl*`?a~N6F&Jd;^57++gs(? zo%=raZRbU&jLTvQwIZCyMHK=NRuwC`d=O+6D@437kxI!$QISd&#^Z+YEM@6{yjYe*c0$u;GpJlyDmn4?-NCAoiPMgNO=G3%=8P}y-+bpTP1u<8;O)gXe~NxbjP|R?Yc!r zCfvYf_m5qY-s*Ma`S$@6iTlV6ftduCKgn#gUPvBx4`g2cpug^Y-N0v^g*2wONtuu7 zP(}*4rrgoP@e;JPh;(I`#_$Jq$X8Q4(h^&N35}Q6+*2eMTJNEeOKJ}9IusJ;9?6CF zqk`m8@|ioL+z*gdbLh=V4fSa!q5&L-^SKr>jC5jyq8RSD$i*N#PJV<5Q{ov?MkEkv z4ERJ&>sd)9l50d8NSrMTMJ^)8Ov&Yr;UF@E6a^s{J4Ffz5Y9wmZ*HZP*fCq_blh?h z&2Vb1J2rBWbz4M~z`pVAiM3zbroq;&g9}TvfZY=Q8&L*(@wWO>HqO~x7@~rebNF#g za*@NzN(MMDG-Mdrh!N#*ia5?oT7X|*tH5WS>gH7fNS)l}{~x%B{{Q0zYGm&J`TM`f zyIUUbt{uIlYe%jd1LXB@W*~3=>I3rTE+oAAr2^#L!Sh+q>^g;ds5Z<1WbC zIYolh?#pE9HKarS1>M%1DPClzX99_lzK z$a<62n%Xl3bz}7%6a-#xr~@SZCu(1Uj)fG6ETL&+iFhayyiqAp@A=ocqtu3h1#XS{-Om} z^V1)DE!P#p`$r$Q8FMyms*u2Z&rTlOawxNfY&zX0YvIk1I|Rpq5cOT)%Ne z>;iT||C+Pddfv(vt-ZW<@!@w|T+!7O*Df}AkHr-kQR;;2b#=#je}w`ae33s+MlB;G z+%)EwKD2v@u$K<#3*9@uk5XfJod_qr0b9SU^CNCG&ad`+fW?<46x+y?`)d~6ra1)+BM&W+~E@L&XwX;@ueG2d>Q4x4Av7rbL-e@(7BDAOs z#%_#dXi<@&1oDBpuY@GOxeiPFKZ@OYipB9~?fa(KzbinuJZLZ77Rqlr{tWeG)Swy-e1z>pWOPsj(i z7q#)4lDv4GFkg|K3F}x?*`=n$D2YbNPs>;8Q$d-e#P(dQjEWD7kIpp3MW6w>)4gGtp`%@t~+U|I57&&54(KrxUG08cvq<4C3Ar+ zEH?crk%2iR%Sl$N;sE5?)o&NN?K~gM5N(#|8MA;4%icY8kAGHJ6VExY5}&vqKZqx@z#|b6;NOJS^09Fb-WaEGy%EkRoSt6 z8&B&>j!=KpDyf#Eu*rggHkUee#Wy*$$+2R9pPP?n3*6Zc9b6Z3cxlCo>{z2hgWK7< z;vdlcQmt)XK>P1H7r=U=06K@SGjKhlnh!d;Nx_S?+g8{4pi@Z2`F7jTN5=`#&tOizR zdIi?B9N?$v3*E^K%xf;RbwaoDdNZ~K;1zP9?_~2c+}OUFHQDM9yVI#RMI#@s^$Awq zxcK$>zJLq-g)+?QyVup4Q7Gd7IA9}QvOdUjk_(hNZ3*xh zitBi-)yj2p;1dGhQkMn&Q=32Qy5Es!gPOpfMRVj@xYw#1R&tv0{0KCdnLDa=8ddd;5A=4s$!+iy-$zwvV>M zpI#3Ah@4bjvULwmu}63Ezy8YJR>w6cK@0=N0o<&oW;8CjK*0xbfV#DNB}$#R0L;^J zd3Kt8EvK-5%D9AL8853`$;&DPGB(%oJSmfBt6pYn9N=|j2meH-<-iw(`)at})zx|d z;w&n+ds?tp*Mr!QJ9?b<4Is|o6w`5<8;~Vl(wUU>gpH#PdMxHz;p&ZQ?TinqH*0(YxukP1ZEH14~24bUb>={_s z>~O02=%h=vWM`qT6k9IOaUTPNu_|E0tnn)5v|GBOErsSLjVx3V5z{kx3dM7S}F;EcG zHlf)GfTv(Df#)Y8h2VFYg0IB+OPa4B-Wp$;lrAVqERbkmJe(BovCn(hM?t&=)GLqI zb70;@B8jub<3Sn)Bx67ZB{dFwiXy>tltd|!ByV-YaXa5=?#-Yka{AC1fM0Q zu4*fq#((5DPch`-zT+_zP3?;D2uqz^R#dS73DT0sn*_(GVtub|*_siY;m7l4t>J zU-D<|VJEisU{95=;+2B7_@AL`F&KqYdnpdG-_)znQqn797bCkN+axikVE65E3Xp}@+{#WqT z=(HpW*6*50;EPPdao|Lf@Zs`xS$|=z6xf~}L$rSxy@dY(MuM?ZZ1ko?6ZA$~^}(4x zkiU_*FNISUUnH{fGhPyxQtY4)I8#XafGeEMmxpcDus$av{N?h1td1mnSTG-8eX+^L z-^iEo$Hgy)q$k7nU6n+8H~fJ%L!!vexB9865+2(+-=ZW*1q8h|{@_!Sq(6{fkwx+x zEnmS;C{*I$NL@|)l$!R|`39|lgZ>F%d)K`sD3zs@v)N?@HhvTO>x|z3+dJ%?p?!h! 
z&4TS=b$9a>@<*Dblfyi#RLcYV9vEK%g;65dKHsT5&Lon2qhR0CG%fJ`%g~%iuxAom zBz+*gT}y2GMaYYK{z#MY^%PPF`Nea>=b}$IzT{%CwsMsELP?(#J{VuL*rrczY`^fO5}q3X zHq4q+24p9v099v~6Z~|euW;;lGL6j0u6!6wfOQSmhw_^C@i>t#{Dr7v9nL}ke+}{%(rfc)Kxbgy0n{?+H30_< z@0=Jk;V(fWN!EY7p7S{nSO@A4aOgmPN?;s_KkLk(O4sCfbGGsLm1YBV|b4i z>*whfa$n%Z9jW-uBqItp!j4ciUd`%PN8#-yz^!Hn1;l~SnPd8h+^cj4Q%-8_ns zZ{qz1O`i?h-rHyDju#4CUPzA+<{`?^VeB%#jT>Lb$&ZrSvjNd}6llQ=9T@!Tg+CZ} zo@o1WXPvgLI&BKd?$MA>ZrdHbC0nKT%!$~=WwAq$%xNMZe zk;?E^myJ$QREDz%G{vVl5MGJ#5G0LxU`<+NMvcDu`Fr!r}$i>K_g1BAHOV5;ky z9XoVvqqLchZIm(~tm}ZJ#~s@qz{wkW`VHMRHQt#n^3}XhsGQ8EMOnYoX}oni`B)eE z>+SvT6ZBQ*$P>8X%Xq-hA?nf9-XsuNwu}r9r*c(cu$0XSeO=+f1#W5Bb~2mFpUf62 z!a{+|4Vyc6ALEPp;YDi5JS-H3(_Doc#t)1^*-5kXaoMaxbLC;NM5k?)F zDku?h3owL^3N+I>92=OUBP0#LyD;)bLtO#1vI2v2HzEOG%WS1mDudcAEiDa!V1|&ifFot38mo)>VaE{tkxH?YqQz*n-1!XQ7(6=sAhFl;f5-CQYUhG{b`_sa|AGB+cX2MhDU z%3z_umq)PGJ26eA)alkxAJMt3g~~!{uuv{>W;10KRBT9KGqE&{tUFo_d#XltBW}%> ziqu)jR&s)qKN+Yifn@$$iEe+E=eh!Q3^ zA}Y(6@Q5-dqSaEVSQdztVwG6p3KgPKBzWD2-uV4uK_Hfj3u#oq6^fNvVSy+Xaw|l+ zm=}m*hJX^HlFbW4#5DBfxdO2ulmxCKq@j_pEb!F|G=N5)K*+eBD>JNHYVuG29Mq-3NN+j7x=qLf&+4;(q@aXpiLlM?XxTlHO7)wN_qcL`( zhw)MrOZbD6Ou+9YMnjPaCZ_Mh=)OK8%tV+8HpWKu{g|b%U&91&)7}VsBF;t)mtodE zq8B>F*-@gU<4utY3GMA%rw&rY&L z62JHW5el;rCg>$%II;IA;fWqM>m8VL$bsEF>WB6~WH{sx#-Nuc6!8*2zW&oR{6Ndt zahkxxvoNt-EYyy(o*3&rN<>4RZ@eQvi4o0K|ilIH92(xC8IiJGy-lmSM&^NBW~<@t8N12=@JF zJazDaZi=SZiC=j^d=Zvl;3pU&JRWzHV0tvc(80$EHWnqk?5ICD@ErFcjn{j5j{T~J z_3EP-t&f^PHJ~*S4#cAbt=oZLrUicK9^T7Nd05tq@9iG*#E$l8_f7T2!(JxVH^q^A z!xOP!e1iEWO`IKg+(mRVfk5cg$RRHgjfEnNk3CB0hnUg`*j#J&$M|j82=H?;faxC} zha%f4E0)xKs+WoQ*vR1}ZDNe+e172Xy4{TD_$~BaikNT{qo=U{TEVD+zrOC0TY{%% zTyGu=1}4ygP5Fty1mOuyObq;SFXQpB;e#L6U5r_2{WgQ|GB*p-BRmmcHE=S<_DpCg zGdnba=}&1vLIb}I1mY(>fjAKbp8G>Vg7L)slZFwDx_Sh=26Azex+~fXmnEP;N z&O`x*0E)fe@D+@`|MS>g?q_iw)o)=5Z#*3EXYF|G;9vGe33keV>Pj7n*-nRW-LS^) zA2|8S2<6cFgFfBobOV?Z5BEkx@rXxP{Ni?Qa`0&#c?{+is3{wXgd#-9<%NK}6XMjd?8_1)}OK;7;C zfN3nW?vjHqYnb3QFS##)O^f~#6ZH4^LHa=qD4TeA&$HU2-Z3UR=7GUHk&yngm`&%8 zXLSBM#=DOa?)cM32CfB`McG*A>w6=i0O9rzTr=Q~e|^kh_QRUOS5`^_0rQe^GtpB5 z{eWiA5iQ12uW20AO*E#z-C#A*EHTP>qCGERp{H)&^K)HyY&y&uu0Q-$t;3S|`3r}? 
z-#tyZ!$q;+QjX33kyR>H5lh{1r|=stg1XZg{M1+3TSqdu)@k*-qd2}eup0LZzlRZrKZ`kSb9mJ7 zpY}#!A@Pj4?*S0Da&QQ{B93dEj;D6uNB+Wj5wHE?Z!zc72YW7NcijJEPyfMLm@%GE zcp8ZJ2W&TDu04kCZciv4#uK2lnClj>g^`}e|KAJWOzW=HxM=;|omUP#bH(GB#$`FN z3;%@8&}lvpW1`2=gpUXH;ci$?8DAGzZcIa&4_=Q`J+sEY>*@_hLcYkZZ+2sYIsyOhyZ)vNi~i$h zcm35)ETO*^qjqiAX^$K^g8%w!UED7Lm;GH`ogc;@)>0Pp;X5?`ps}=j*8@7{fA1{_ zm27&*ct8urD7x#|PBPxL>)&?ok9UCyrJnvWYt{N=)7+n!K;;vm$Z`E61J8EHU|rZl z>>hAyqUJp(cK^HQit5fIc|eI<5DL3!eFgmG5h7H`Wx*$s{a*nGcO)8S%;sIc*nMXh zv$D&j99Q5f*!>aipq)DZkd$Mk#jdQG;V`_xCv74VXX;sU{?ii_Ys zMOVrdAy068=kZt&{H`G)UIuS#K`57s1@O(h#m?BVOfi=$E`gsuLa>E$bwTJfnpTHn z;ZdTjV>Q{beFoDG5$vvq^__?9pT-VwGhDVXL_~z#u5EivM7EGAF64FJ(MAQKXR`0zjJkcg+O4UUNDTp=e+ zTBp#`AgxnGBYPx=QFQ!7A2BIHt8GXmKQmh(#O`gmC30GwU*ec891}}xJd;?SEdyf| zV1R+h7ga&2&Q9dv*4QZt8#qcxqav<~Wp%zbv=kZKI`|MFY8vvrMNLz1-t;s4EPZ#$ ztv{FEJK7PHk-v|edxZxNx(PcB(TP_&`;s(~L*KB;0MB099!Q(bq2|5VBeD8kOv@KR zEQLy0+>ZUvxbaR4`Rq7(?*Q74UC>0B*cUW$6E#tFTitEwmBVw}_oCR7pcYW;5DF&v zfof%d>Y0-YU4}(srV0@hQI{*mYst5+CZD?+X|mi|75;)@QQp+MUhZ3OPrnl?6&qz8 zdK4Ebu+=J-T(?$f>T7FjV2p;C>n~DuigXZQI$4jF+|yyauY>#{M?UvaMAW+*UX^6R za6o2x?`7S9f->@{63y&Z)|(ohptDjSuL*VZcV?=&+{tt?59_5UlRw9ekKyDSY4Xq0 zh=7K1nnqPc8`2*Ezz!t@P#R>HVUHr_+69F>Ok(U3!^mIH^c%Wrp=&KNrD&HSA`vqt z$tPyW(=+|{E=(27=?=q#07!}$e+qx&4{w8^a~GmLwhr2|3t zL?Gz5z{YJleq!2V8THyiW3-BZZo}V4_?g~U*fc5Vmr8X7HOP#IFnd}!xhO1@L0+Pk zJi5d9*E`5Z=gHS+k*dD0D0-jqS9K0$40G}fvOGHsGAQ#40yrvz={%9*%Cp2EPh3)3 zsmy{;Uz-lfX>f^(jt`&Z5T>*3(4;BvEY~$CyLXTu&S&WFJC))y4iG(0rz|vNqRkn>M$wS#PT48D#m3pJ7K@pJ+_hcc z9JJj=p)`4hwm4Iq6ZgLH341ziO$lj=a`4s+B+xT-#+8O4G8st1(*l*TyUdXL<{j2d zhPODaW(&>RoIE91dCKXuQwR@-Sr9m@#lfXBoXwR=3uc-FJgi*CVd0z>+Gb6et#&SL zccI+9(`5!hWHOW)Quad1$+&DD_dA?b&bcRZzJ-&BLxnV(;r6#=q?#U-)C3{QBobGT#d@RhGQC+wf4g%#+u5 zd|>kHccFmx2RFYCX|jEe&B(8RR4u=r92y!D2JYz<<1F}F`#y3wbM2X;;3uwT_p@{@ zKQIx%ip*zFe&8eLAU~jmKsPNV2$-*l`GK%%KzrT%07_R>AxJ$dK-SSz3tDr3dRcSK zT4knRh|yk%(Vo+6!L46YtOGmK0&K#wB^1^Y*V>o*jE@+}2mTZ%e|TD43@+&71%2FH zeN=T79>EPCmpz5Vm9?#Y&95QM_S#mzb9>J|*?XFV&zHdxQG!*<+KO%Q{6O zdmNH;iP&R53-;8y)dv%KAhM?gjw*Zp@)-qtx`gFYc0p+3&FxP^vfyvlcysn3Re5vN zV3vbK#u&;Zl&Yy*mcJHG-&P^ZT!jMyDqDd)t*uNdWiXs_t8n9U3L~0g?J8BV_lu%UVrCc}d-DWGf@Po>o1{Ne{V`RyOHhoc(hby}&k!%p#5JD(P; zE-K}+TWN@#@*HjB;eL}9;<9v_qpfh`$tnmAvz4>joG#jCw>w>~lpRHRg;XZxpeZh8 zg)lE%P_jVK*OhY8c4wNWIL?_)+i5epxs(!Un_zQUd1z;`aGYSVz#S~dF%YA}#k;Jj zl+~8X2o@WFp&|54aW*?AIBXfa&1%b-VGX3yc8EkPX3?X5ADr3z!GaRLMo8j0- z@iHCTC{{pNLzMTyR}>1`BOjhQS)S!+(Vk0}qJH=zoV@p+L`8i+(4aas;KGtTYjs2? 
z&#ccih z&y@2>SgtH^^?CH0pTc(Jr|svFn!!T%c~po;b_P|6b_RWEV!$&$S26%){>y^2vP&lG z$FJe!mtPYZ&^nYV6HJDFSuWj2(Y;goLYfM8TDi`&_N@uOn_gGow_R}*;~w02A5NbB zKREf*Ux?_fji4Ay6~T2^3n(aH;-9q^#ywpC!z*q1`n|+un78QKhj=yGm6P?GOH61?{^<+U3l21Np!C6;9sw zD<%0VW3LBRjr_j1FR@{!?PY#e$a97OTtGh9(RjUCDpK4E#thxiMtOJiAQpf)wp6}$ zrBGQYT@O>1tQ}HpD3nioDGTcdrj(`1b4YS|7ofFvE=a;g{WOT&0nyQX3P4$>$abp~ zBX?0*3<*NzBgziR&j>R)d4d_xlQP30p(iu!Y_!}!;+%LyODv z-M?x}UO1tC;jYA)ZaSZ%yArK_54qtp=H$6Q(@mbcbJy?vC^dQEtnDVB{8l-yPW~<& zQzg&ixlr=F9Lgkrm(K%|AAqB49Brv&wye@g(lB(q9XF$nKOI>5WoMi{qW|#G<@5?fZo`*|0uB^U1WoT0$gf_CwO_rLIobp;`d?+zS_jE z0{Ti_cNIKH;U1h&8@6$jOjgLH#Vd04ktMWFqWcU`xW8J+qT8t=2Gx9NWv!x^ znu8-{s5~!JI26~0<{#TH6refWGDdylvC%>2{_7OTG-9#8{{Ge>3XRl&PfKkRW{`cf zTU+s3D^{R|n0DP!z^+EVV(9h2qPk54D!;)JUSZk_wW+cR?Ab}7g=-5^Zy zq**J#wc^dCP)^CT0#w#)E=)NE*a}lwv$-%qlF};~T7foHZ7x*Bn-p5%HdSsioys#ihY)B_U6KYw0kRIwZhGXRJ#q?3QbYb1W_?p;8k#ngEvwTdQ?8=a%#1;ti?(#1tozRB?F7hzO@Hdbyf(OEKu`wLO zFel&#eU3AXm2uB7qYN{{cxJ>hx4WT!2^GS7BTbQJI%D`%4l+gyr{ zlu~(K9B*=Ln)@6xb&G@%zS8xN5fsd3D z*IYI2oi^9_PNk;YHYKefA9HZ-L>BQhm6$Y7CnB?HAG;=-Omt}m(zRxP(MIzu9`=gEwvYmR$i4njdxf)?rIQ$(^J7rdMvjD zI*Is8rSdtXWAc1zk!y+`zb2Uo*5pp*(!Nm`BQm=-FUHJeq^k;T5~@MAe8&WqFeQf}IE7P^)Pds2uXIjy#n zCCt+**oJC)Y(j*)RdP%reX&f=CTWRY6*-ncZiRGWv{2*eR>=|RPbTflnNu}csNmMp zW~=y?L^gTU1@Ng^E@5qFKk{f@n4%+#sl;e0l@;ymR`CyXe>qp17s&p%&IMRcQlN9< zI)m0TwfT@wmQ(a--L|&QhkVNCnyuUBcQr@nQ!twnz49B-@Q_n#w$aH9nFPWINeT7tSSq z7VtB`mpTJ}W)As8TZ>=Y^>?e+-1O9Vxt*S^)oqzx!J0M&ep+F^oy=ffQ^?lY+RE$P z*cO0mGb7)r=4Z6AeQRs7|Fq`Mw(fV5 z31|uaEH{>ocizBo-!uJZKcJjl4R#q(q( znMki^YZ~ge$_}1ErmNtKqJ1^eqpeymAkLC*yQhV{dI4fX?eJ;ZH$a@BDW=mjHyYzp zZQ>GXpKA&B@9W^Cs8{oq&eeKf^GP74waWR}m3DqyJq{4e@bv8L#MbT^>EzV-Dwl9o z+u;Z6+X_!_{rW15MOLOEHrmFX!MawjSDTMbx}*{deJ^$v`AVa!$*J}+Fchl-8z#lC z)b#C~tW6E^29>q5p?<4uh3FSSY#rp7E<@b4oh((`smXLh@$EMDL-ffeA+96GCw!A_ z_kV5Mo+~*|Lafs>4nFQA=$&f?#XupZUFH(wz*E>ui1`V(4Zg3WEWF5H#e9W$tABYS zYF!Gc=dVJ$Re!;Z@-Fb-@z&z;kVY9t&+eHq$e)}*bp>XcVSGzI zlu1F)EP4*1T#aYR4Y`*YCJu!&@XSo0XBs7Bj?X}@#%)EpmpFR%pl7cY(lg95Gl773 zp}vfs!|0ho&)#X2wke_X%^2fB@X)gtJ%`YP$Bc%WMQNbX6KLoe^z4E!%mjR4Jb)O( zq|h%fbQJSN3^N}t!2gsEPbcE25gH9X;#cvT7uEUDJfF-cVmY}a;CpEXx}K>&8l~1B z_As+)dKvnPpW>y9!^6&|{ywp)4qxQIuQp!+?Ra=$czAiv4n3hII||eSE9&%!-5DMg zig9vedzdlQ-o+qzr~;UbJ|aI5qiP=!baHCe%;!f}fl1t@kcxvk593>XH}KYUh?zj` zFOAFa+z}Cih@=8u5S6y9cc@hu^&e*C{_Ei@ulGQfFdVMo8}`chde!-Z`0EyZ7lZ@j zUvNtN^fP1O#Rh#4nWE4Kq{B?jcSc<^sJ%JnSYIB9)o$5m1pPlku(b2Prw4IPwPx!M%{2%2`sD@p)dX zcIVgnhu8W``UkTSK8pTU)W3Bv2c^^D)#2flxf;J^{cVO{MEx7~s(|(clJ6wykD9le zuZTas$$S#!HGMf6_8BOi0finX>Ob4mUu2R(z8*BTFOmaadxe{F!yd&|WkLwwOP6Z; zCCf{F{`i98`h_q={GyxSwdj)wFSrP6nVB!lGoi2r=<~X2`qV}~WfQq{A{DIh<0~<%@=bLHt?HxCW-p5tk(Eb(^u&bLS5h8NWYr<~zDVeGeU|Kvhe1H2 zsJ{YVi3tcP=_A==DW{7{u0icBUih^he?)lo{IocfHx8YK3r+Bv{ehZ&Xvoi_`2l(; z>ci?LP#Mx6aG+p39yw06k2JHThs!U=Ohk!WxTY#p2KiEHP(08>)DG+rlogKh(D2qU Scpu&t|HGaSVOW@nQ2!q=M{ list[tuple]: fixtures = [] basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration()) - config = {doc_column_name_cli_param: "contents", - hash_column_name_cli_param: "hash_column", - int_column_name_cli_param: "int_id_column", - start_id_cli_param: 5, - } - return [(launcher, config, basedir + "/input", basedir + "/expected")] \ No newline at end of file + config = { + doc_column_name_cli_param: "contents", + hash_column_name_cli_param: "hash_column", + int_column_name_cli_param: "int_id_column", + start_id_cli_param: 5, + } + return [(launcher, config, basedir + "/input", basedir + "/expected")] diff --git 
a/transforms/universal/doc_id/ray/test/test_doc_id_ray.py b/transforms/universal/doc_id/test/test_doc_id_ray.py similarity index 83% rename from transforms/universal/doc_id/ray/test/test_doc_id_ray.py rename to transforms/universal/doc_id/test/test_doc_id_ray.py index c55342017..d2a3b3900 100644 --- a/transforms/universal/doc_id/ray/test/test_doc_id_ray.py +++ b/transforms/universal/doc_id/test/test_doc_id_ray.py @@ -16,12 +16,13 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) class TestRayDocIDTransform(AbstractTransformLauncherTest): diff --git a/transforms/universal/doc_id/spark/test/test_doc_id_spark.py b/transforms/universal/doc_id/test/test_doc_id_spark.py similarity index 97% rename from transforms/universal/doc_id/spark/test/test_doc_id_spark.py rename to transforms/universal/doc_id/test/test_doc_id_spark.py index 6d945bf9e..7a899419d 100644 --- a/transforms/universal/doc_id/spark/test/test_doc_id_spark.py +++ b/transforms/universal/doc_id/test/test_doc_id_spark.py @@ -16,7 +16,7 @@ AbstractTransformLauncherTest, ) from data_processing_spark.runtime.spark import SparkTransformLauncher -from doc_id_transform_spark import ( +from dpk_doc_id.spark.transform import ( DocIDSparkTransformConfiguration, doc_column_name_cli_param, hash_column_name_cli_param, diff --git a/transforms/universal/doc_id/transform.config b/transforms/universal/doc_id/transform.config deleted file mode 100644 index d3715f3b2..000000000 --- a/transforms/universal/doc_id/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=doc_id - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). 
-DOC_ID_PYTHON_VERSION=$(DPK_VERSION) -DOC_ID_RAY_VERSION=$(DOC_ID_PYTHON_VERSION) -DOC_ID_SPARK_VERSION=$(DOC_ID_PYTHON_VERSION) - From 49a22aea6a8c1c572fc997e0bcbf990871861217 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 14:14:24 -0500 Subject: [PATCH 02/28] added __init__ Signed-off-by: Maroun Touma --- transforms/universal/doc_id/dpk_doc_id/__init__.py | 4 ++++ transforms/universal/doc_id/dpk_doc_id/ray/__init__.py | 0 transforms/universal/doc_id/dpk_doc_id/spark/__init__.py | 0 3 files changed, 4 insertions(+) create mode 100644 transforms/universal/doc_id/dpk_doc_id/__init__.py create mode 100644 transforms/universal/doc_id/dpk_doc_id/ray/__init__.py create mode 100644 transforms/universal/doc_id/dpk_doc_id/spark/__init__.py diff --git a/transforms/universal/doc_id/dpk_doc_id/__init__.py b/transforms/universal/doc_id/dpk_doc_id/__init__.py new file mode 100644 index 000000000..0bedd041c --- /dev/null +++ b/transforms/universal/doc_id/dpk_doc_id/__init__.py @@ -0,0 +1,4 @@ +from .transform import * +from .local_python import * +from .transform_python import * +from .local import * diff --git a/transforms/universal/doc_id/dpk_doc_id/ray/__init__.py b/transforms/universal/doc_id/dpk_doc_id/ray/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/transforms/universal/doc_id/dpk_doc_id/spark/__init__.py b/transforms/universal/doc_id/dpk_doc_id/spark/__init__.py new file mode 100644 index 000000000..e69de29bb From 808521a3a9b7d5e1ae700015d00a5d54012d964e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 15:15:30 -0500 Subject: [PATCH 03/28] Fix typo Signed-off-by: Maroun Touma --- transforms/universal/doc_id/Dockerfile.ray | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index f5bf58cae..377543678 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -14,7 +14,7 @@ COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chown=ray:users dpk_html2parquet/ dpk_html2parquet/ +COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/ COPY --chown=ray:users requirements.txt requirements.txt RUN pip install -r requirements.txt From b8f3b6f96d572dcc45017a33c4eb92ef45ab46f1 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 15:53:54 -0500 Subject: [PATCH 04/28] remove spark unit test for now Signed-off-by: Maroun Touma --- .../doc_id/test/test_doc_id_spark.py | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 transforms/universal/doc_id/test/test_doc_id_spark.py diff --git a/transforms/universal/doc_id/test/test_doc_id_spark.py b/transforms/universal/doc_id/test/test_doc_id_spark.py deleted file mode 100644 index 7a899419d..000000000 --- a/transforms/universal/doc_id/test/test_doc_id_spark.py +++ /dev/null @@ -1,45 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-import os
-
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_spark.runtime.spark import SparkTransformLauncher
-from dpk_doc_id.spark.transform import (
-    DocIDSparkTransformConfiguration,
-    doc_column_name_cli_param,
-    hash_column_name_cli_param,
-    int_column_name_cli_param,
-)
-
-
-class TestSparkDocIDTransform(AbstractTransformLauncherTest):
-    """
-    Extends the super-class to define the test data for the tests defined there.
-    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
-    """
-
-    def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = "../test-data"
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
-        fixtures = []
-        launcher = SparkTransformLauncher(DocIDSparkTransformConfiguration())
-        transform_config = {
-            doc_column_name_cli_param: "contents",
-            hash_column_name_cli_param: "hash_column",
-            int_column_name_cli_param: "int_id_column",
-        }
-
-        fixtures.append((launcher, transform_config, basedir + "/input", basedir + "/expected"))
-        return fixtures

From 3baad3e75e76d11455a448e3689140af853b9099 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 5 Dec 2024 15:54:41 -0500
Subject: [PATCH 05/28] Show example for running ray runtime

Signed-off-by: Maroun Touma
---
 transforms/universal/doc_id/Makefile | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile
index bf0d39543..6429d4c9d 100644
--- a/transforms/universal/doc_id/Makefile
+++ b/transforms/universal/doc_id/Makefile
@@ -15,8 +15,9 @@ TRANSFORM_NAME=$(shell basename `pwd`)
 
-run-cli-say-sample:
-	$(MAKE) RUN_FILE="-m dpk_$(TRANSFORM_NAME).ray.transform" \
-		RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
-		--doc_id_int True " \
-		.transforms.run-src-file
+run-cli-ray-sample:
+	#make venv
+	source venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
+	--run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--doc_id_int True

From 37881ef5d76ff7a15bb6f98c03d09697b3aeb710 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 5 Dec 2024 17:54:49 -0500
Subject: [PATCH 06/28] fixing issues with spark

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets                 |   4 +-
 transforms/universal/doc_id/Makefile          |   9 +-
 transforms/universal/doc_id/README.md         | 155 +++++++++++++++---
 .../test-data/expected-spark/metadata.json    |  46 ++++++
 .../test-data/expected-spark/sample1.parquet  | Bin 0 -> 36668 bytes
 .../doc_id/test/test_doc_id_spark.py          |  45 +++++
 6 files changed, 234 insertions(+), 25 deletions(-)
 create mode 100644 transforms/universal/doc_id/test-data/expected-spark/metadata.json
 create mode 100644 transforms/universal/doc_id/test-data/expected-spark/sample1.parquet
 create mode 100644 transforms/universal/doc_id/test/test_doc_id_spark.py

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index 23475f57f..de840eee2 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -83,7 +83,7 @@ test-image:: .default.build-lib-wheel
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \ test-image-sequence ; \ fi ;\ fi @@ -120,7 +120,7 @@ image:: .default.build-lib-wheel if [ -e Dockerfile.spark ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.spark \ DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \ .defaults.lib-whl-image ; \ fi ; \ fi diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 6429d4c9d..1db88041b 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -15,8 +15,15 @@ TRANSFORM_NAME=$(shell basename `pwd`) +run-cli-spark-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \ + --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --doc_id_int True + run-cli-ray-sample: - #make venv + make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md index 675995623..7146ff4bd 100644 --- a/transforms/universal/doc_id/README.md +++ b/transforms/universal/doc_id/README.md @@ -1,31 +1,41 @@ # Document ID Python Annotator -The Document ID transforms adds a document identification (unique integers and content hashes), which later can be -used in de-duplication operations, per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, +transform configuration, testing and IDE set up. -## Summary +## Contributors +- Boris Lublinsky (blublinsk@ibm.com) -This transform annotates documents with document "ids". -It supports the following transformations of the original data: -* Adding document hash: this enables the addition of a document hash-based id to the data. - The hash is calculated with `hashlib.sha256(doc.encode("utf-8")).hexdigest()`. - To enable this annotation, set `hash_column` to the name of the column, - where you want to store it. -* Adding integer document id: this allows the addition of an integer document id to the data that - is unique across all rows in all tables provided to the `transform()` method. - To enable this annotation, set `int_id_column` to the name of the column, where you want - to store it. +## Description -Document IDs are generally useful for tracking annotations to specific documents. Additionally -[fuzzy deduping](../fdedup) relies on integer IDs to be present. If your dataset does not have -document ID column(s), you can use this transform to create ones. +This transform assigns unique identifiers to the documents in a dataset and supports the following annotations to the +original data: +* **Adding a Document Hash** to each document. The unique hash-based ID is generated using +`hashlib.sha256(doc.encode("utf-8")).hexdigest()`. To store this hash in the data specify the desired column name using +the `hash_column` parameter. +* **Adding an Integer Document ID**: to each document. 
The integer ID is unique across all rows and tables processed by +the `transform()` method. To store this ID in the data, specify the desired column name using the `int_id_column` +parameter. +Document IDs are essential for tracking annotations linked to specific documents. They are also required for processes +like [fuzzy deduplication](../../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID +columns, this transform can be used to generate them. -## Configuration and command line Options +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|--------------------|-----------|---------------------------------------------| +| hash_column | str | Unique hash assigned to each document | +| int_id_column | uint64 | Unique integer ID assigned to each document | + +## Configuration and Command Line Options -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_ray.py) +The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_base.py) configuration for values are as follows: * _doc_column_ - specifies name of the column containing the document (required for ID generation) @@ -35,7 +45,7 @@ configuration for values are as follows: At least one of _hash_column_ or _int_id_column_ must be specified. -## Running +## Usage ### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), @@ -53,11 +63,113 @@ the following command line arguments are available in addition to ``` These correspond to the configuration keys described above. +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/doc_id_transform_python.py using command line args +* `run-local-sample` - runs src/doc_id_local_python.py + +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform. + +### Code example + +[notebook](../doc_id.ipynb) + +### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_doc_id_python.py) +- [Integration test](test/test_doc_id.py) + + +# Document ID Ray Annotator + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Ray Summary +This project wraps the Document ID transform with a Ray runtime. + +## Configuration and command line Options + +Document ID configuration and command line options are the same as for the +[base python transform](../python/README.md). 
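
For intuition, the sketch below shows the two annotations described above applied to a plain `pyarrow` table. This is only an illustration, not the `DocIDTransform` implementation; the column names and `start_id` value simply mirror the test configuration used elsewhere in this patch.

```python
import hashlib

import pyarrow as pa


def annotate_table(
    table: pa.Table, doc_column: str, hash_column: str, int_id_column: str, start_id: int
) -> tuple[pa.Table, int]:
    """Append a sha256 hash column and a unique integer id column to a table."""
    docs = table.column(doc_column).to_pylist()
    # Hash each document exactly as described in the Description section.
    hashes = [hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in docs]
    # Integer ids continue from start_id, keeping them unique across tables.
    int_ids = pa.array(range(start_id, start_id + table.num_rows), type=pa.uint64())
    table = table.append_column(hash_column, pa.array(hashes))
    table = table.append_column(int_id_column, int_ids)
    return table, start_id + table.num_rows


# Hypothetical usage on a tiny in-memory table:
t = pa.table({"contents": ["doc one", "doc two"]})
t, next_id = annotate_table(t, "contents", "hash_column", "int_id_column", start_id=5)
print(t.column_names, next_id)  # ['contents', 'hash_column', 'int_id_column'] 7
```

Passing the returned `next_id` as the `start_id` of the next table is what keeps the integer ids unique across all tables handled by a single run.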
+
+## Building
+
+A [docker file](Dockerfile) can be used to build the docker image. You can use
+
+```shell
+make build
+```
+
+## Driver options
+
+## Configuration and command line Options
+
+See [Python documentation](../python/README.md)
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Ray launcher (i.e. TransformLauncher),
+the following [command line arguments](../python/README.md) are available in addition to
+[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md).
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+# Document ID Spark Annotator
+
+## Summary
+
+This transform assigns a unique integer ID to each row in a Spark DataFrame. It relies on the
+[monotonically_increasing_id](https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.monotonically_increasing_id.html)
+pyspark function to generate the unique integer IDs. As described in the documentation of this function:
+> The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
+
+## Configuration and command line Options
+
+Document ID configuration and command line options are the same as for the
+[base python transform](../python/README.md).
+
+## Running
+You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the
+`test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both
+the new annotated `test1.parquet` file and the `metadata.json` file.
+
+### Launched Command Line Options
+When running the transform with the Spark launcher (i.e. SparkTransformLauncher), the following command line arguments
+are available in addition to the options provided by the
+[python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
+
+```
+ --doc_id_column_name DOC_ID_COLUMN_NAME
+ name of the column that holds the generated document ids
+```
### Running as spark-based application
```
(venv) cma:src$ python doc_id_local.py
@@ -87,4 +199,3 @@ The metadata generated by the Spark `doc_id` transform contains the following st
 To use the transform image to transform your data, please refer to the
 [running images quickstart](../../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
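
To make the Spark behavior concrete, here is a minimal pyspark sketch of the mechanism the Spark runtime relies on. It is not the transform itself; the input and output paths are illustrative, and the column name matches the default used in the tests.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.appName("doc_id_sketch").getOrCreate()
# Paths are illustrative; the transform resolves real paths via its data access factory.
df = spark.read.parquet("test-data/input")
# Ids are unique and increasing, but not consecutive across partitions.
df = df.withColumn("int_id_column", monotonically_increasing_id())
df.write.mode("overwrite").parquet("output")
spark.stop()
```

Because `monotonically_increasing_id` encodes the partition number in the upper bits of each 64-bit id, the ids are unique and increasing yet leave gaps, which is exactly the behavior quoted from the Spark documentation above.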
- diff --git a/transforms/universal/doc_id/test-data/expected-spark/metadata.json b/transforms/universal/doc_id/test-data/expected-spark/metadata.json new file mode 100644 index 000000000..a55c4eff6 --- /dev/null +++ b/transforms/universal/doc_id/test-data/expected-spark/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "doc_id", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-08-03 22:04:58", + "end_time": "2024-08-03 22:05:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "doc_column": "contents", + "hash_column": "hash_column", + "int_column": "int_id_column", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "execution time, min": 0.29759878317515054 + }, + "job_output_stats": { + "source_size": 36132, + "result_size": 36668, + "result_doc_count": 5, + "source_files": 1, + "result_files": 1, + "processing_time": 0.08469605445861816, + "source_doc_count": 5 + }, + "source": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/doc_id/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/doc_id/spark/output", + "type": "path" + } +} diff --git a/transforms/universal/doc_id/test-data/expected-spark/sample1.parquet b/transforms/universal/doc_id/test-data/expected-spark/sample1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..765a6776b209cbc0903e7a09fa3af0f3844ba343 GIT binary patch literal 36668 zcmeHw30zah^LPRXh*+%}44x77)dR>4iC+)G6+l34NYvs>@&bWel7!1*)l$A{y{$*R z>RqpT-&)TeT5qM+t6ufiYOVV9_WR#`$s;exi)|ILzwci@HObrA-I<-4ot>T8eNB<1 zaT#8W6h=fM!;e90LZfwB$9~T0^bL(Rghr#&JZPRYFPinwD+4^y88k)!qf;=b-I(Fc z@NUKQCE!e7TSZy`trJBlfa0AukkLTDG@2jX({p$%4XN^I#qbIg(!3i7GW|34dQ^w0 zy0JK+T!w#uU%)eZzzaH}LVW)Yrodn@C>q0q20j{tFBdD#Q z?9INyBS$aXX7y+Nc<T^ien;>7ub*_i^&Z}syrn#a zB{wkebllWHR5&K{{agC@^IWYeno?QtPAV<2{w z38Qt2P;tUV+;B|A5vs$*LT-4tLah{tg**k9t48@4kE>KDP%$dzVJd-ugNnn$)M0#{ zK&%oeI6{;wU`FsUfhtTHrWA=)DsC9~7OBF;XqZxg@l|4uQlwJwlwy$}T&+a8Vv(3H z;D|9LPb}aI!UTLJUqz^+Un-%F9-i~*$Xx$c41b$8Hc%Oq79FNH=W0~Aq+ZkusH%tj zSRa|;)9N2G3QASqN@uL5Co(+jTJYhg9<&Z#axiEUzZ;va&|0t%qehE`6rm+-VXj7@ z%hl-3SdkvpvN=kFj$O^L4%I{)Nya}`|TSFlO~oudYXMu{HYgh+=!8#=8P}2L_yfU)Y@{#~L^37BM2Mb@SNN zW-;^&JpYK8MU5LQzt{_%7}&SnvY^gCOAA+H(x%r>o%(O??OB!eJ(pDUYM!#4U*1|& z<thy7z1U?ACt-A6*`&Gae|pevFd7qib3%6OwG*aA&tKPV_TAJ4)vVM7 z$su8W6LcfHKmH_bVoqsAcVtq<^}?7JO`dBmE!Yu$W@>|jJ5oEJ{hjxp=;_P#=(&;I z51CT;ZTsc6&$GFc`{(%5=Ab(l>|8j^`*R&*!xyc!vrsjs`H9S(J3~h!OXFtm-1Os> zoT4@JQn{`78&tvStJ4Bardvxk#uk_Dt{%|sOTTX0_w*D7UTql|{8ZiLn(D-ZWv0)P zj-PM3ugfgU*`3S1f8G<;Q!IL& z%H$*2^YhKVJr1qu{u>q(Qkk_P{q(=RK3`rDSX{V$LaSj3MOSC;I5Ou@6gKg>{>U$i z_<6ny4zAm-JNE7EiRMS8v#x$I|5Jl6y@$DA;2-%vwi<(Am5<0>M?t~)D#rpmm&Xtid3C$DB34zU(jCQMni_xK|nJq~(V_Be%pPt$+B%sw0c1PT%dLxX^J_=)lkS-A?QK zYexfP=FXw>3Xk<)(D$;&E8%d+zxIUr{xKwa((>=J25#3*KNx(j!SsH`L<7CLPt#i+01`UcFKmPQB(o6BZg17AXB;)9Y{@oTl@sl*{Gh*4@ zE;~m)IPP;PXv629HWCP%FB~t%E*xWaz4~apvf8jE*1YDn;=8Z!c9|30{UhVJh98($ zjBTEf@nHG5`mI`j+@-ATrXH2E(weQk-?!}@uQ6k;u1cAf_-O1`OQYMeqpS0m{AcV} zK9jnnHr>LGI@xJ<_3X+9?Kh0ueXq2%@7+rw!n3>gEU1_9`BSXw=9OI=?>B7af3;~slhTz(&JTFv!DKF!Ox~+%e)8$1 z0W$AA{kAV=+-TTz-};N2)y1Ng&#U&bdK)VyRunIP&u916)Nh*QR9)|PV$xpY?9n&+ z=g>IfroUb^F1*row~8}(s9)muXZlso>v~evH>htKyGikuidk1R8yW^oJ^1b93%h*& zC=J@+;pHp(s7<5KIxc=Nq@-}ZasI4-R|jm{GbJQ)?vWkAVJAQ7kU6nAd;Ef9zUR^t 
[... base85 "GIT binary patch" data omitted: the literals for the expected-spark and input sample1.parquet files, together with the start of the patch that adds test/test_doc_id_spark.py, are truncated in this copy ...]

+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = "../test-data"
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
+        fixtures = []
+        launcher = SparkTransformLauncher(DocIDSparkTransformConfiguration())
+        transform_config = {
+            doc_column_name_cli_param: "contents",
+            hash_column_name_cli_param: "hash_column",
+            int_column_name_cli_param: "int_id_column",
+        }
+
+        fixtures.append((launcher, transform_config, basedir + "/input", basedir + "/expected-spark"))
+        return fixtures

From 9c869eea586a36ff08a02af04ea12defff8cd4b2 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 6 Dec 2024 05:17:05 -0500
Subject: [PATCH 07/28] remove BASE_IMAGE arg from dockerfile.spark

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets                | 4 ++--
 transforms/universal/doc_id/Dockerfile.spark | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index de840eee2..23475f57f 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -83,7 +83,7 @@ test-image:: .default.build-lib-wheel
 			$(MAKE) DOCKER_FILE=Dockerfile.spark \
 				TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
 				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \
+				BASE_IMAGE=$(SPARK_BASE_IMAGE) \
 				test-image-sequence ; \
 		fi ;\
 	fi
@@ -120,7 +120,7 @@ image:: .default.build-lib-wheel
 		if [ -e Dockerfile.spark ]; then \
 			$(MAKE) DOCKER_FILE=Dockerfile.spark \
 				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \
+				BASE_IMAGE=$(SPARK_BASE_IMAGE) \
 				.defaults.lib-whl-image ; \
 		fi ; \
 	fi
diff --git a/transforms/universal/doc_id/Dockerfile.spark b/transforms/universal/doc_id/Dockerfile.spark
index e8df6c522..70c626a87 100644
--- a/transforms/universal/doc_id/Dockerfile.spark
+++ b/transforms/universal/doc_id/Dockerfile.spark
@@ -1,5 +1,4 @@
-ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
-FROM ${BASE_IMAGE}
+FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
 USER root

 # install pytest

From 6fa0c0450fa29b2db2d1f15bfdc368b2af97d51e Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 6 Dec 2024 06:18:30 -0500
Subject: [PATCH 08/28] added login to quay.io

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index 23475f57f..e77715b80 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -61,6 +61,7 @@ test-image-spark:
 	$(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image
 
 test-image:: .default.build-lib-wheel
+	$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
 		if [ -e Dockerfile.python ]; then \
 			$(MAKE) DOCKER_FILE=Dockerfile.python \
@@ -101,6 +102,7 @@ image-spark:
 
 image:: .default.build-lib-wheel	## Build all possible images unless a specific runtime is specified
+	$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [
"$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ From bbe9a023a99e4ff10241bf3f125042ab7790b2ad Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 06:32:26 -0500 Subject: [PATCH 09/28] debug registry credential Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index fce8faf11..42d85b993 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -114,7 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - make -C transforms/universal/doc_id DOCKER=docker test-image + make -C transforms/universal/doc_id DOCKER=docker DOCKER_REGISTRY_USER=${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY=${{ secrets.DOCKER_REGISTRY_KEY }} test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." fi From b77aaa3b3a2d21f58754906bfc902325dbf2e996 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 06:40:40 -0500 Subject: [PATCH 10/28] use dpk secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 42d85b993..824c9cae4 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -93,8 +93,8 @@ jobs: runs-on: ubuntu-22.04 timeout-minutes: 120 env: - DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} steps: - name: Checkout uses: actions/checkout@v4 @@ -114,7 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - make -C transforms/universal/doc_id DOCKER=docker DOCKER_REGISTRY_USER=${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY=${{ secrets.DOCKER_REGISTRY_KEY }} test-image + make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." fi From 2c52fd504e1a1785e4da9ff30eb15c8a3cdf79e0 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 07:12:48 -0500 Subject: [PATCH 11/28] testing registry user Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 824c9cae4..deee93eda 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -114,6 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi + echo DOCKER_REGISTRY_USER="$DOCKER_REGISTRY_USER" - secret="${{ secrets.DPK_DOCKER_REGISTRY_USER }}" make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 5ffadb43b74f31858635d4d19823b572fc72fbac Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 07:31:29 -0500 Subject: [PATCH 12/28] testing registry user Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index deee93eda..fc589f2ca 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -109,6 +109,9 @@ jobs: sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - name: Test transform image in transforms/universal/doc_id + env: + DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} run: | if [ -e "transforms/universal/doc_id/Makefile" ]; then if [ -d "transforms/universal/doc_id/spark" ]; then From 8b234a4da26e0e554e7a9a3d4a77dbc9c0ec6c70 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:00:03 -0500 Subject: [PATCH 13/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index fc589f2ca..7483935ce 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -93,8 +93,9 @@ jobs: runs-on: ubuntu-22.04 timeout-minutes: 120 env: - DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + DPK_TEST: ${{ secrets.DPK_TEST }} steps: - name: Checkout uses: actions/checkout@v4 @@ -109,15 +110,12 @@ jobs: sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - name: Test transform image in transforms/universal/doc_id - env: - DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} run: | if [ -e "transforms/universal/doc_id/Makefile" ]; then if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo DOCKER_REGISTRY_USER="$DOCKER_REGISTRY_USER" - secret="${{ secrets.DPK_DOCKER_REGISTRY_USER }}" + echo "DPK_TEST=$(DPK_TEST) - $DPK_TEST - ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 878fa42bc338fe5305cc5f7e784ab377a247cd35 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:08:35 -0500 Subject: [PATCH 14/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 7483935ce..ebcb9df63 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST=$(DPK_TEST) - $DPK_TEST - ${{ secrets.DPK_TEST }} " + echo "DPK_TEST= $(DPK_TEST) , $DPK_TEST , ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." From 5632c064ef61f2b96b4e6c4701a176dc7a4579aa Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:24:31 -0500 Subject: [PATCH 15/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index ebcb9df63..4f31422c0 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,9 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) , $DPK_TEST , ${{ secrets.DPK_TEST }} " + echo "DPK_TEST= $(DPK_TEST) + echo " $DPK_TEST " + echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 6da893f6bdda0358571c9b41adec6fcee08c4bad Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:26:42 -0500 Subject: [PATCH 16/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 4f31422c0..5e2c40805 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) + echo "DPK_TEST= $(DPK_TEST) "" echo " $DPK_TEST " echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image From d426439a339cf1d38be1ab2ed52ef806e8e953ca Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:28:28 -0500 Subject: [PATCH 17/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 5e2c40805..11977528b 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) "" + echo "DPK_TEST= $(DPK_TEST) " echo " $DPK_TEST " echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image From e8bb04a86faca37da17bf81f19490aafef1f9ea1 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:41:13 -0500 Subject: [PATCH 18/28] Delete .github/workflows/test-universal-doc_id.yml --- .github/workflows/test-universal-doc_id.yml | 137 -------------------- 1 file changed, 137 deletions(-) delete mode 100644 .github/workflows/test-universal-doc_id.yml diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml deleted file mode 100644 index 11977528b..000000000 --- a/.github/workflows/test-universal-doc_id.yml +++ /dev/null @@ -1,137 +0,0 @@ -# -# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files -# -name: Test - transforms/universal/doc_id - -on: - workflow_dispatch: - push: - branches: - - "dev" - - "releases/**" - tags: - - "*" - paths: - - ".make.*" - - "transforms/.make.transforms" - - "transforms/universal/doc_id/**" - - "data-processing-lib/**" - - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - - "!**.md" - - "!**/doc/**" - - "!**/images/**" - - "!**.gitignore" - pull_request: - branches: - - "dev" - - "releases/**" - paths: - - ".make.*" - - "transforms/.make.transforms" - - "transforms/universal/doc_id/**" - - "data-processing-lib/**" - - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - - "!**.md" - - "!**/doc/**" - - "!**/images/**" - - "!**.gitignore" - -# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre -concurrency: - group: ${{ 
github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - check_if_push_image: - # check whether the Docker images should be pushed to the remote repository - # The images are pushed if it is a merge to dev branch or a new tag is created. - # The latter being part of the release process. - # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 - outputs: - publish_images: ${{ steps.version.outputs.publish_images }} - steps: - - id: version - run: | - publish_images='false' - if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; - then - publish_images='true' - fi - if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; - then - publish_images='true' - fi - echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" - test-src: - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Free up space in github runner - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup - sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true - df -h - - name: Test transform source in transforms/universal/doc_id - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - make -C transforms/universal/doc_id DOCKER=docker test-src - else - echo "transforms/universal/doc_id/Makefile not found - source testing disabled for this transform." - fi - test-image: - needs: [check_if_push_image] - runs-on: ubuntu-22.04 - timeout-minutes: 120 - env: - DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} - DPK_TEST: ${{ secrets.DPK_TEST }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Free up space in github runner - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup - sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true - df -h - - name: Test transform image in transforms/universal/doc_id - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - if [ -d "transforms/universal/doc_id/spark" ]; then - make -C data-processing-lib/spark DOCKER=docker image - fi - echo "DPK_TEST= $(DPK_TEST) " - echo " $DPK_TEST " - echo " ${{ secrets.DPK_TEST }} " - make -C transforms/universal/doc_id DOCKER=docker test-image - else - echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
- fi - - name: Print space - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - docker images - - name: Publish images - if: needs.check_if_push_image.outputs.publish_images == 'true' - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - make -C transforms/universal/doc_id publish - else - echo "transforms/universal/doc_id/Makefile not found - publishing disabled for this transform." - fi From 3647ebc3b9d78d70ff66a760b14c72b516be8a90 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:47:18 -0500 Subject: [PATCH 19/28] restore workflow file --- .github/workflows/test-universal-doc_id.yml | 133 ++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 .github/workflows/test-universal-doc_id.yml diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml new file mode 100644 index 000000000..fce8faf11 --- /dev/null +++ b/.github/workflows/test-universal-doc_id.yml @@ -0,0 +1,133 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/universal/doc_id + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/doc_id/**" + - "data-processing-lib/**" + - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/doc_id/**" + - "data-processing-lib/**" + - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. + # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
+ runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/universal/doc_id + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + make -C transforms/universal/doc_id DOCKER=docker test-src + else + echo "transforms/universal/doc_id/Makefile not found - source testing disabled for this transform." + fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/universal/doc_id + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + if [ -d "transforms/universal/doc_id/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/universal/doc_id DOCKER=docker test-image + else + echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + make -C transforms/universal/doc_id publish + else + echo "transforms/universal/doc_id/Makefile not found - publishing disabled for this transform." 
+ fi From adaf78ffeb2c1a01f194e94ff5c6d73384f6a6e2 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 10:33:14 -0500 Subject: [PATCH 20/28] clear testing of docker login Signed-off-by: Maroun Touma --- transforms/.make.cicd.targets | 2 -- 1 file changed, 2 deletions(-) diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets index e77715b80..23475f57f 100644 --- a/transforms/.make.cicd.targets +++ b/transforms/.make.cicd.targets @@ -61,7 +61,6 @@ test-image-spark: $(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image test-image:: .default.build-lib-wheel - $(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ @@ -102,7 +101,6 @@ image-spark: image:: .default.build-lib-wheel ## Build all possible images unless a specific runtime is specified - $(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ From 0c38a4cd9b48fc333a11cbdc12a348c47de9761d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Dec 2024 11:05:57 +0100 Subject: [PATCH 21/28] fix Makefile failing targets Signed-off-by: Maroun Touma --- transforms/universal/doc_id/Makefile | 4 ++-- transforms/universal/doc_id/dpk_doc_id/transform_python.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 1db88041b..6d7dbfbf0 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -19,12 +19,12 @@ run-cli-spark-sample: make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \ - --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ --doc_id_int True run-cli-ray-sample: make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ - --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ --doc_id_int True diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index 4dd5b4c6f..b8b886227 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -70,7 +70,7 @@ def apply_input_params(self, args: Namespace) -> bool: return super().apply_input_params(args=args) -class DocIDRuntime(DefaultPythonTransformRuntime): +class DocIDPythonRuntime(DefaultPythonTransformRuntime): """ Exact dedup runtime support """ @@ -110,7 +110,7 @@ class DocIDPythonTransformRuntimeConfiguration(PythonTransformRuntimeConfigurati def __init__(self): super().__init__( transform_config=DocIDTransformConfiguration(), - runtime_class=DocIDRuntime, + runtime_class=DocIDPythonRuntime, ) From 87eecf0881da090a0881d9ba681517b286f0eca4 Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Tue, 10 Dec 2024 12:34:20 -0800 Subject: [PATCH 22/28] README changes Signed-off-by: 
SHAHROKH DAIJAVAD --- transforms/universal/doc_id/README.md | 53 ++++++++++++--------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md index 7146ff4bd..0be7b6864 100644 --- a/transforms/universal/doc_id/README.md +++ b/transforms/universal/doc_id/README.md @@ -1,6 +1,6 @@ # Document ID Python Annotator -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. ## Contributors @@ -18,7 +18,7 @@ the `transform()` method. To store this ID in the data, specify the desired colu parameter. Document IDs are essential for tracking annotations linked to specific documents. They are also required for processes -like [fuzzy deduplication](../../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID +like [fuzzy deduplication](../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID columns, this transform can be used to generate them. ## Input Columns Used by This Transform @@ -35,7 +35,7 @@ columns, this transform can be used to generate them. ## Configuration and Command Line Options -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_base.py) +The set of dictionary keys defined in [DocIDTransform](dpk_doc_id/transform.py) configuration for values are as follows: * _doc_column_ - specifies name of the column containing the document (required for ID generation) @@ -50,7 +50,7 @@ At least one of _hash_column_ or _int_id_column_ must be specified. ### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), the following command line arguments are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). +[the options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md). ``` --doc_id_doc_column DOC_ID_DOC_COLUMN doc column name @@ -64,12 +64,11 @@ the following command line arguments are available in addition to These correspond to the configuration keys described above. ### Running the samples -To run the samples, use the following `make` targets +To run the samples, use the following `make` target -* `run-cli-sample` - runs src/doc_id_transform_python.py using command line args -* `run-local-sample` - runs src/doc_id_local_python.py +* `run-cli-sample` - runs dpk_doc_id/transform_python.py using command line args -These targets will activate the virtual environment and set up any configuration needed. +This target will activate the virtual environment and sets up any configuration needed. Use the `-n` option of `make` to see the detail of what is done to run the sample. For example, @@ -85,17 +84,17 @@ To see results of the transform. ### Code example -[notebook](../doc_id.ipynb) +[notebook](doc_id.ipynb) ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. 
## Testing
 
-Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md)
+Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md)
 
 Currently we have:
 - [Unit test](test/test_doc_id_python.py)
@@ -105,7 +104,7 @@ Currently we have:
 # Document ID Ray Annotator
 
 Please see the set of
-[transform project conventions](../../../README.md)
+[transform project conventions](../../README.md#transform-project-conventions)
 for details on general project conventions, transform configuration,
 testing and IDE set up.
@@ -115,31 +114,25 @@ This project wraps the Document ID transform with a Ray runtime.
 
 ## Configuration and command line Options
 Document ID configuration and command line options are the same as for the
-[base python transform](../python/README.md).
+base python transform.
 
 ## Building
 
-A [docker file](Dockerfile) that can be used for building docker image. You can use
+A [docker file](Dockerfile.ray) that can be used for building the ray docker image. You can use
 
 ```shell
 make build
 ```
 
-## Driver options
-
-## Configuration and command line Options
-
-See [Python documentation](../python/README.md)
-
 ## Running
 
 ### Launched Command Line Options
-When running the transform with the Ray launcher (i.e. TransformLauncher),
-the following [command line arguments](../python/README.md) are available in addition to
-[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md).
+When running the transform with the Ray launcher (i.e., RayTransformLauncher), in addition to Python
+command line options,
+[there are options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md).
 
 To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
 
 # Document ID Spark Annotator
 
 ## Summary
 
 This transform assigns a unique integer ID to each row in a Spark DataFrame. It relies on the
-[monotonically_increasing_id](https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.monotonically_increasing_id.html)
+[monotonically_increasing_id](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.monotonically_increasing_id.html)
 pyspark function to generate the unique integer IDs. As described in the documentation of this function:
 > The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
 
 ## Configuration and command line Options
 
 Document ID configuration and command line options are the same as for the
-[base python transform](../python/README.md).
+base python transform.
 
 ## Running
-You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the
+You can run the [local.py](dpk_doc_id/local.py) (spark-based implementation) to transform the
 `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both
 the new annotated `test1.parquet` file and the `metadata.json` file.
 
 ### Launched Command Line Options
-When running the transform with the Spark launcher (i.e. 
SparkTransformLauncher), the following command line arguments +When running the transform with the Spark launcher (i.e., SparkTransformLauncher), the following command line arguments are available in addition to the options provided by the -[python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). +[python launcher](../../../data-processing-lib/doc/python-launcher-options.md). ``` --doc_id_column_name DOC_ID_COLUMN_NAME @@ -197,5 +190,5 @@ The metadata generated by the Spark `doc_id` transform contains the following st ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. From 3b2420bdc7466081cb2bffad8d15e308c3e83a22 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Dec 2024 22:29:33 +0100 Subject: [PATCH 23/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 97 +++++++++++++------ .../doc_id/dpk_doc_id/transform_python.py | 26 ++++- 2 files changed, 91 insertions(+), 32 deletions(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 7ecab7d65..8cf8039d7 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -24,7 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + "%pip install data-prep-toolkit-transforms" ] }, { @@ -51,23 +51,12 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "execution_count": 1, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import sys\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from data_processing.utils import ParamsUtils\n", - "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", - "from doc_id_transform_base import (\n", - " doc_column_name_cli_param,\n", - " hash_column_name_cli_param,\n", - " int_column_name_cli_param,\n", - " start_id_cli_param,\n", - ")" + "from dpk_doc_id.transform_python import DocIDRuntime" ] }, { @@ -82,7 +71,11 @@ "cell_type": "code", "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "outputs": [], "source": [ "\n", @@ -110,24 +103,55 @@ ] }, { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "cell_type": "code", + "execution_count": 2, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "21:42:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n", + "21:42:29 INFO - pipeline id pipeline_id\n", + "21:42:29 INFO - code location None\n", + "21:42:29 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "21:42:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "21:42:29 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "21:42:29 INFO - orchestrator doc_id started at 2024-12-10 21:42:29\n", + "21:42:29 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", + "21:42:32 INFO - Completed 1 files (100.0%) in 0.049 min\n", + "21:42:32 INFO - Done processing 1 files, waiting for flush() completion.\n", + "21:42:32 INFO - done flushing in 0.0 sec\n", + "21:42:32 INFO - Completed execution in 0.049 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "##### ***** Use python runtime to invoke the transform" + "DocIDRuntime(input_folder= \"test-data/input\",\n", + " output_folder= \"output\",\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"hash_column\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " doc_id_start_id= 5).transform()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", "metadata": {}, - "outputs": [], "source": [ - "%%capture\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "launcher = PythonTransformLauncher(runtime_config=DocIDPythonTransformRuntimeConfiguration())\n", - "launcher.launch()" + "##### ***** Use python runtime to invoke the transform" ] }, { @@ -140,13 +164,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['output/sample1.parquet', 'output/metadata.json']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", - "glob.glob(\"python/output/*\")" + "glob.glob(\"output/*\")" ] }, { @@ -174,7 +209,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index b8b886227..0d58c850a 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################
-
+import sys
 from argparse import Namespace
 from typing import Any
 
@@ -20,6 +20,7 @@
     PythonTransformRuntimeConfiguration,
 )
 from data_processing.transform import TransformStatistics
+from data_processing.utils import ParamsUtils
 from dpk_doc_id.transform import (
     DocIDTransformBase,
     DocIDTransformConfigurationBase,
@@ -114,6 +115,29 @@ def __init__(self):
         )
 
 
+class DocIDRuntime:
+    def __init__(self, **kwargs):
+        self.params = {}
+        for key in kwargs:
+            self.params[key] = kwargs[key]
+        # if input_folder and output_folder are specified, then assume they represent the data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:
+            # one or both folder kwargs were not provided; leave params as-is
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=(self.params))
+        # create launcher
+        launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())
     launcher.launch()

From 30912d1a64bfe90e3ac9a180d5ba842bfdcad220 Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Tue, 10 Dec 2024 13:39:52 -0800
Subject: [PATCH 24/28] More changes to README

Signed-off-by: SHAHROKH DAIJAVAD
---
 transforms/universal/doc_id/README.md | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index 0be7b6864..631565e5c 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -63,25 +63,6 @@ the following command line arguments are available in addition to
 ```
 These correspond to the configuration keys described above.
 
-### Running the samples
-To run the samples, use the following `make` target
-
-* `run-cli-sample` - runs dpk_doc_id/transform_python.py using command line args
-
-This target will activate the virtual environment and sets up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform. 
- ### Code example [notebook](doc_id.ipynb) @@ -124,8 +105,6 @@ A [docker file](Dockerfile.ray) that can be used for building docker the ray ima make build ``` -## Running - ### Launched Command Line Options When running the transform with the Ray launcher (i.e., RayTransformLauncher), in addition to Python command line options, From 0cbe63f9f2f43d244f6b7c4b59f03d375d6a1131 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 11 Dec 2024 07:35:15 +0100 Subject: [PATCH 25/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 8cf8039d7..6bfbb6749 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -24,7 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms" + "%pip install data-prep-toolkit-transforms[doc_id]" ] }, { From cabd57766ac4f29d7964c370eebf83be52a2f78e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 12 Dec 2024 11:50:18 +0100 Subject: [PATCH 26/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 41 ++----------------- .../doc_id/dpk_doc_id/transform_python.py | 4 +- 2 files changed, 5 insertions(+), 40 deletions(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 6bfbb6749..9c1e4916f 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -51,12 +51,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], "source": [ - "from dpk_doc_id.transform_python import DocIDRuntime" + "from dpk_doc_id.transform_python import DocID" ] }, { @@ -70,41 +70,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": { - "jupyter": { - "source_hidden": true - } - }, - "outputs": [], - "source": [ - "\n", - "# create parameters\n", - "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", - "output_folder = os.path.join( \"python\", \"output\")\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # execution info\n", - " \"runtime_pipeline_id\": \"pipeline_id\",\n", - " \"runtime_job_id\": \"job_id\",\n", - " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", - " # doc id params\n", - " doc_column_name_cli_param: \"contents\",\n", - " hash_column_name_cli_param: \"hash_column\",\n", - " int_column_name_cli_param: \"int_id_column\",\n", - " start_id_cli_param: 5,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, "outputs": [ @@ -138,7 +103,7 @@ } ], "source": [ - "DocIDRuntime(input_folder= \"test-data/input\",\n", + "DocID(input_folder= \"test-data/input\",\n", " output_folder= \"output\",\n", " doc_id_doc_column= \"contents\",\n", " doc_id_hash_column= \"hash_column\",\n", diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index 0d58c850a..27fc7bc3c 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -71,7 +71,7 @@ def apply_input_params(self, args: Namespace) -> bool: return super().apply_input_params(args=args) -class DocIDPythonRuntime(DefaultPythonTransformRuntime): +class DocIDRuntime(DefaultPythonTransformRuntime): """ Exact dedup runtime support """ @@ -115,7 +115,7 @@ def __init__(self): ) -class DocIDRuntime: +class DocID: def __init__(self, **kwargs): self.params = {} for key in kwargs: From 4750dd749a6fc0122ed2178ec12ace0cf1869186 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 12 Dec 2024 15:04:03 +0100 Subject: [PATCH 27/28] added ray notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id-ray.ipynb | 198 ++++++++++++++++++ transforms/universal/doc_id/doc_id.ipynb | 28 +-- .../doc_id/dpk_doc_id/ray/transform.py | 35 +++- .../doc_id/dpk_doc_id/transform_python.py | 2 +- 4 files changed, 246 insertions(+), 17 deletions(-) create mode 100644 transforms/universal/doc_id/doc_id-ray.ipynb diff --git a/transforms/universal/doc_id/doc_id-ray.ipynb b/transforms/universal/doc_id/doc_id-ray.ipynb new file mode 100644 index 000000000..9bfb99785 --- /dev/null +++ b/transforms/universal/doc_id/doc_id-ray.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit[ray]\n", + "%pip install data-prep-toolkit-transforms[doc_id]" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. 
The set of dictionary keys holding DocIDTransform configuration values is as follows: \n",
+    "* doc_column - specifies name of the column containing the document (required for ID generation)\n",
+    "* hash_column - specifies name of the column created to hold the string document id, if None, id is not generated\n",
+    "* int_id_column - specifies name of the column created to hold the integer document id, if None, id is not generated\n",
+    "* start_id - an id from which the ID generator starts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
+   "metadata": {},
+   "source": [
+    "##### ***** Import required classes and modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-12-12 15:01:10,711\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from dpk_doc_id.ray.transform import DocID"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
+   "metadata": {},
+   "source": [
+    "##### ***** Setup runtime parameters for this transform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "15:01:11 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n",
+      "15:01:11 INFO - pipeline id pipeline_id\n",
+      "15:01:11 INFO - code location None\n",
+      "15:01:11 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "15:01:11 INFO - actor creation delay 0\n",
+      "15:01:11 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "15:01:11 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
+      "15:01:11 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "15:01:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "15:01:11 INFO - Running locally\n",
+      "2024-12-12 15:01:18,744\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - orchestrator started at 2024-12-12 15:01:20\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 13.119487762451172, 'object_store': 2.0}\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - Completed processing 1 files in 0.001 min\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - done flushing in 0.001 sec\n",
+      "15:01:33 INFO - Completed execution in 0.36 min, execution result 0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "DocID(input_folder= \"test-data/input\",\n",
+    "      output_folder= \"output\",\n",
+    "      run_locally= True,\n",
+    "      doc_id_doc_column= \"contents\",\n",
+    "      doc_id_hash_column= \"hash_column\",\n",
+    "      doc_id_int_column= \"int_id_column\",\n",
+    "      doc_id_start_id= 5).transform()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
+   "metadata": {},
+   "source": [
+    "##### ***** Use ray runtime to invoke the transform"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+   "metadata": {},
+   "source": [
+    "##### **** The specified folder will include the transformed parquet files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['output/sample1.parquet', 'output/metadata.json']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import glob\n",
+    "glob.glob(\"output/*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb
index 9c1e4916f..cc007e09a 100644
--- a/transforms/universal/doc_id/doc_id.ipynb
+++ b/transforms/universal/doc_id/doc_id.ipynb
@@ -51,7 +51,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
    "metadata": {},
    "outputs": [],
@@ -69,7 +69,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
    "metadata": {},
    "outputs": [
@@ -77,18 +77,18 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "21:42:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n",
-     "21:42:29 INFO - pipeline id pipeline_id\n",
-     "21:42:29 INFO - code location None\n",
-     "21:42:29 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
-     "21:42:29 INFO - data factory data_ max_files -1, n_sample -1\n",
-     "21:42:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-     "21:42:29 INFO - orchestrator doc_id started at 2024-12-10 21:42:29\n",
-     "21:42:29 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 
0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", - "21:42:32 INFO - Completed 1 files (100.0%) in 0.049 min\n", - "21:42:32 INFO - Done processing 1 files, waiting for flush() completion.\n", - "21:42:32 INFO - done flushing in 0.0 sec\n", - "21:42:32 INFO - Completed execution in 0.049 min, execution result 0\n" + "15:00:23 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n", + "15:00:23 INFO - pipeline id pipeline_id\n", + "15:00:23 INFO - code location None\n", + "15:00:23 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "15:00:23 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:00:23 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:00:23 INFO - orchestrator doc_id started at 2024-12-12 15:00:23\n", + "15:00:23 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", + "15:00:26 INFO - Completed 1 files (100.0%) in 0.047 min\n", + "15:00:26 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:00:26 INFO - done flushing in 0.0 sec\n", + "15:00:26 INFO - Completed execution in 0.047 min, execution result 0\n" ] }, { diff --git a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py index 4ff20b9f5..4c48ddeb2 100644 --- a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py @@ -9,12 +9,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################
-
+import sys
 from typing import Any
 
 import ray
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.utils import UnrecoverableException
+from data_processing.utils import ParamsUtils, UnrecoverableException
 from data_processing_ray.runtime.ray import (
     DefaultRayTransformRuntime,
     RayTransformLauncher,
@@ -107,6 +107,37 @@ def __init__(self):
         super().__init__(transform_config=DocIDRayTransformConfiguration(), runtime_class=DocIDRayRuntime)
 
 
+# Class used by the notebooks to run the doc_id transform on the Ray runtime
+class DocID:
+    def __init__(self, **kwargs):
+        self.params = {}
+        for key in kwargs:
+            self.params[key] = kwargs[key]
+        # if input_folder and output_folder are specified, assume they represent data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:  # folders not (both) provided; leave params unchanged
+            pass
+        try:
+            worker_options = {k: self.params[k] for k in ("num_cpus", "memory")}
+            self.params["runtime_worker_options"] = ParamsUtils.convert_to_ast(worker_options)
+            del self.params["num_cpus"]
+            del self.params["memory"]
+        except KeyError:  # worker options not (both) provided; leave params unchanged
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        # create launcher
+        launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
     launcher.launch()
diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py
index 27fc7bc3c..f97ace554 100644
--- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py
+++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py
@@ -111,7 +111,7 @@ class DocIDPythonTransformRuntimeConfiguration(PythonTransformRuntimeConfigurati
     def __init__(self):
         super().__init__(
             transform_config=DocIDTransformConfiguration(),
-            runtime_class=DocIDPythonRuntime,
+            runtime_class=DocIDRuntime,
         )
 

From 2eb47bda47f362745b27853e98f04668a95492c9 Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Thu, 12 Dec 2024 10:49:40 -0800
Subject: [PATCH 28/28] Added the link to the Ray notebook in README

Signed-off-by: SHAHROKH DAIJAVAD
---
 transforms/universal/doc_id/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index 631565e5c..a29ada704 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -114,6 +114,10 @@ To use the transform image to transform your data, please refer to the
 [running images quickstart](../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
 
+### Code example
+
+[Ray notebook](doc_id-ray.ipynb)
+
 # Document ID Spark Annotator
 
 ## Summary
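
For completeness, here is a minimal usage sketch of the new Ray wrapper, mirroring the invocation cell in doc_id-ray.ipynb and assuming the package layout introduced in this patch series. The wrapper folds input_folder/output_folder into data_local_config and passes all remaining keyword arguments to the Ray launcher as CLI-style options.

```python
# Minimal sketch: invoke the doc_id transform through the DocID Ray wrapper.
# Paths and parameter values follow the notebook cell above.
from dpk_doc_id.ray.transform import DocID

return_code = DocID(
    input_folder="test-data/input",   # folded into data_local_config
    output_folder="output",
    run_locally=True,                 # start a local Ray instance
    doc_id_doc_column="contents",     # column from which the ids are derived
    doc_id_hash_column="hash_column",
    doc_id_int_column="int_id_column",
    doc_id_start_id=5,
).transform()
assert return_code == 0               # 0 indicates successful execution
```

Note that the wrapper only builds runtime_worker_options when num_cpus and memory are passed together: the dict comprehension over ("num_cpus", "memory") raises KeyError, and the whole block is skipped, if either key is missing.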