From 3c9b9997b8830a04f065f20ced1ed937c35affc6 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 14:12:21 -0500 Subject: [PATCH 01/28] refactored code as its own module Signed-off-by: Maroun Touma --- transforms/.make.cicd.targets | 89 ++++++++++-------- .../{python/Dockerfile => Dockerfile.python} | 16 +--- .../doc_id/{ray/Dockerfile => Dockerfile.ray} | 25 +---- .../{spark/Dockerfile => Dockerfile.spark} | 15 +-- transforms/universal/doc_id/Makefile | 87 +++-------------- transforms/universal/doc_id/README.md | 77 +++++++++++++-- .../doc_id_local.py => dpk_doc_id/local.py} | 27 +++--- .../local_python.py} | 14 +-- .../ray/local.py} | 14 +-- .../doc_id_s3_ray.py => dpk_doc_id/ray/s3.py} | 14 +-- .../ray/transform.py} | 22 ++--- .../spark/local.py} | 2 +- .../spark/transform.py} | 18 ++-- .../transform.py} | 19 ++-- .../transform_python.py} | 13 ++- transforms/universal/doc_id/kfp_ray/Makefile | 33 ++++--- .../universal/doc_id/kfp_ray/doc_id_wf.py | 21 +++-- .../universal/doc_id/python/.dockerignore | 1 - transforms/universal/doc_id/python/Makefile | 64 ------------- transforms/universal/doc_id/python/README.md | 49 ---------- .../universal/doc_id/python/pyproject.toml | 46 --------- .../python/test-data/expected/metadata.json | 48 ---------- transforms/universal/doc_id/ray/.dockerignore | 1 - transforms/universal/doc_id/ray/.gitignore | 38 -------- transforms/universal/doc_id/ray/Makefile | 68 ------------- transforms/universal/doc_id/ray/README.md | 31 ------ .../universal/doc_id/ray/pyproject.toml | 46 --------- .../ray/test-data/expected/metadata.json | 60 ------------ .../ray/test-data/expected/sample1.parquet | Bin 36668 -> 0 bytes .../ray/test-data/input/sample1.parquet | Bin 36132 -> 0 bytes .../doc_id/{python => }/requirements.txt | 0 .../universal/doc_id/spark/.dockerignore | 1 - transforms/universal/doc_id/spark/.gitignore | 39 -------- transforms/universal/doc_id/spark/Makefile | 57 ----------- transforms/universal/doc_id/spark/README.md | 59 ------------ .../universal/doc_id/spark/pyproject.toml | 45 --------- .../spark/test-data/expected/sample1.parquet | Bin 36668 -> 0 bytes .../spark/test-data/input/sample1.parquet | Bin 36132 -> 0 bytes .../test-data/expected/metadata.json | 26 ++--- .../test-data/expected/sample1.parquet | Bin .../test-data/input/sample1.parquet | Bin .../doc_id/{python => }/test/test_doc_id.py | 16 ++-- .../{python => }/test/test_doc_id_python.py | 26 ++--- .../doc_id/{ray => }/test/test_doc_id_ray.py | 13 +-- .../{spark => }/test/test_doc_id_spark.py | 2 +- transforms/universal/doc_id/transform.config | 20 ---- 46 files changed, 300 insertions(+), 962 deletions(-) rename transforms/universal/doc_id/{python/Dockerfile => Dockerfile.python} (66%) rename transforms/universal/doc_id/{ray/Dockerfile => Dockerfile.ray} (51%) rename transforms/universal/doc_id/{spark/Dockerfile => Dockerfile.spark} (69%) rename transforms/universal/doc_id/{python/src/doc_id_local.py => dpk_doc_id/local.py} (75%) rename transforms/universal/doc_id/{python/src/doc_id_local_python.py => dpk_doc_id/local_python.py} (84%) rename transforms/universal/doc_id/{ray/src/doc_id_local_ray.py => dpk_doc_id/ray/local.py} (85%) rename transforms/universal/doc_id/{ray/src/doc_id_s3_ray.py => dpk_doc_id/ray/s3.py} (85%) rename transforms/universal/doc_id/{ray/src/doc_id_transform_ray.py => dpk_doc_id/ray/transform.py} (87%) rename transforms/universal/doc_id/{spark/src/doc_id_local_spark.py => dpk_doc_id/spark/local.py} (98%) rename 
transforms/universal/doc_id/{spark/src/doc_id_transform_spark.py => dpk_doc_id/spark/transform.py} (94%) rename transforms/universal/doc_id/{python/src/doc_id_transform_base.py => dpk_doc_id/transform.py} (95%) rename transforms/universal/doc_id/{python/src/doc_id_transform_python.py => dpk_doc_id/transform_python.py} (95%) delete mode 100644 transforms/universal/doc_id/python/.dockerignore delete mode 100644 transforms/universal/doc_id/python/Makefile delete mode 100644 transforms/universal/doc_id/python/README.md delete mode 100644 transforms/universal/doc_id/python/pyproject.toml delete mode 100644 transforms/universal/doc_id/python/test-data/expected/metadata.json delete mode 100644 transforms/universal/doc_id/ray/.dockerignore delete mode 100644 transforms/universal/doc_id/ray/.gitignore delete mode 100644 transforms/universal/doc_id/ray/Makefile delete mode 100644 transforms/universal/doc_id/ray/README.md delete mode 100644 transforms/universal/doc_id/ray/pyproject.toml delete mode 100644 transforms/universal/doc_id/ray/test-data/expected/metadata.json delete mode 100644 transforms/universal/doc_id/ray/test-data/expected/sample1.parquet delete mode 100644 transforms/universal/doc_id/ray/test-data/input/sample1.parquet rename transforms/universal/doc_id/{python => }/requirements.txt (100%) delete mode 100644 transforms/universal/doc_id/spark/.dockerignore delete mode 100644 transforms/universal/doc_id/spark/.gitignore delete mode 100644 transforms/universal/doc_id/spark/Makefile delete mode 100644 transforms/universal/doc_id/spark/README.md delete mode 100644 transforms/universal/doc_id/spark/pyproject.toml delete mode 100644 transforms/universal/doc_id/spark/test-data/expected/sample1.parquet delete mode 100644 transforms/universal/doc_id/spark/test-data/input/sample1.parquet rename transforms/universal/doc_id/{spark => }/test-data/expected/metadata.json (60%) rename transforms/universal/doc_id/{python => }/test-data/expected/sample1.parquet (100%) rename transforms/universal/doc_id/{python => }/test-data/input/sample1.parquet (100%) rename transforms/universal/doc_id/{python => }/test/test_doc_id.py (84%) rename transforms/universal/doc_id/{python => }/test/test_doc_id_python.py (71%) rename transforms/universal/doc_id/{ray => }/test/test_doc_id_ray.py (83%) rename transforms/universal/doc_id/{spark => }/test/test_doc_id_spark.py (97%) delete mode 100644 transforms/universal/doc_id/transform.config diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets index e392e8f36..23475f57f 100644 --- a/transforms/.make.cicd.targets +++ b/transforms/.make.cicd.targets @@ -51,63 +51,78 @@ publish: test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean +test-image-python: + $(MAKE) BUILD_SPECIFIC_RUNTIME=python test-image + +test-image-ray: + $(MAKE) BUILD_SPECIFIC_RUNTIME=ray test-image + +test-image-spark: + $(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image + test-image:: .default.build-lib-wheel - @if [ -e Dockerfile.python ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.python \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ + if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + test-image-sequence ; \ + fi ;\ fi - @if [ -e Dockerfile.ray ]; 
then \ - $(MAKE) DOCKER_FILE=Dockerfile.ray \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ - BASE_IMAGE=$(RAY_BASE_IMAGE) \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \ + if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + test-image-sequence ; \ + fi ;\ fi - @if [ -e Dockerfile.spark ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.spark \ - TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ - test-image-sequence ; \ + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \ + if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + test-image-sequence ; \ + fi ;\ fi -rm -rf data-processing-dist image-python: - @if [ -e Dockerfile.python ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.python \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=python image image-ray: - @if [ -e Dockerfile.ray ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.ray \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ - BASE_IMAGE=$(RAY_BASE_IMAGE) \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=ray image image-spark: - @if [ -e Dockerfile.spark ]; then \ - $(MAKE) DOCKER_FILE=Dockerfile.spark \ - DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ - .defaults.lib-whl-image ; \ - fi + $(MAKE) BUILD_SPECIFIC_RUNTIME=spark image image:: .default.build-lib-wheel ## Build all possible images unless a specific runtime is specified @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ - $(MAKE) image-python ; \ + if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + .defaults.lib-whl-image ; \ + fi ; \ fi @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \ - $(MAKE) image-ray ; \ + if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ + fi ; \ fi @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \ - $(MAKE) image-spark ; \ + if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ + fi ; \ fi -rm -rf data-processing-dist diff --git a/transforms/universal/doc_id/python/Dockerfile b/transforms/universal/doc_id/Dockerfile.python similarity index 66% rename from transforms/universal/doc_id/python/Dockerfile rename to transforms/universal/doc_id/Dockerfile.python index bbedf1eb7..fc634a043 100644 --- a/transforms/universal/doc_id/python/Dockerfile +++ b/transforms/universal/doc_id/Dockerfile.python @@ -2,9 +2,6 @@ FROM docker.io/python:3.10.14-slim-bullseye RUN pip install --upgrade --no-cache-dir pip -# install pytest -RUN pip install --no-cache-dir pytest - # Create a user and use it to run the transform RUN useradd -ms /bin/bash dpk USER dpk @@ -16,19 
+13,10 @@ ARG DPK_WHEEL_FILE_NAME
 COPY --chown=dpk:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
 
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
-COPY --chown=dpk:root README.md README.md
+COPY --chown=dpk:root dpk_doc_id/ dpk_doc_id/
 COPY --chown=dpk:root requirements.txt requirements.txt
-RUN pip install --no-cache-dir -e .
-
-# copy source data
-COPY ./src/doc_id_transform_python.py .
-COPY ./src/doc_id_local.py local/
+RUN pip install --no-cache-dir -r requirements.txt
 
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/dpk
diff --git a/transforms/universal/doc_id/ray/Dockerfile b/transforms/universal/doc_id/Dockerfile.ray
similarity index 51%
rename from transforms/universal/doc_id/ray/Dockerfile
rename to transforms/universal/doc_id/Dockerfile.ray
index f33aedefa..f5bf58cae 100644
--- a/transforms/universal/doc_id/ray/Dockerfile
+++ b/transforms/universal/doc_id/Dockerfile.ray
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
 
 FROM ${BASE_IMAGE}
 
-RUN pip install --upgrade --no-cache-dir pip
+RUN pip install --upgrade --no-cache-dir pip 
 
 # install pytest
 RUN pip install --no-cache-dir pytest
@@ -14,24 +14,9 @@ COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the transform
-COPY --chown=ray:users python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-# Install ray project source
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-COPY --chown=ray:users README.md README.md
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_ray.py .
-
-# copy some of the samples in
-COPY src/doc_id_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/ray
@@ -40,4 +25,4 @@ ENV PYTHONPATH /home/ray
 ARG BUILD_DATE
 ARG GIT_COMMIT
 LABEL build-date=$BUILD_DATE
-LABEL git-commit=$GIT_COMMIT
+LABEL git-commit=$GIT_COMMIT
\ No newline at end of file
diff --git a/transforms/universal/doc_id/spark/Dockerfile b/transforms/universal/doc_id/Dockerfile.spark
similarity index 69%
rename from transforms/universal/doc_id/spark/Dockerfile
rename to transforms/universal/doc_id/Dockerfile.spark
index 3d39ed250..e8df6c522 100644
--- a/transforms/universal/doc_id/spark/Dockerfile
+++ b/transforms/universal/doc_id/Dockerfile.spark
@@ -15,19 +15,12 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
 
 # Install project source
 
-COPY --chown=spark:root src/ src/
-COPY --chown=spark:root pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_spark.py .
+## Copy the python version of the transform
+COPY --chown=spark:root dpk_doc_id/ dpk_doc_id/
+COPY --chown=spark:root requirements.txt requirements.txt
+RUN pip install -r requirements.txt
 
-# copy some of the samples in
-COPY src/doc_id_local_spark.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
 
 USER spark
 
diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile
index be26d3bf4..bf0d39543 100644
--- a/transforms/universal/doc_id/Makefile
+++ b/transforms/universal/doc_id/Makefile
@@ -1,79 +1,22 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
+include $(REPOROOT)/transforms/.make.cicd.targets
 
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
 
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+################################################################################
 
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-set-versions::
-	@# Help: Recursively $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-venv; \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-test; \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-build; \
-	fi
+run-cli-sample:
+	$(MAKE) RUN_FILE="-m dpk_$(TRANSFORM_NAME).ray.transform" \
+		RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
+		--doc_id_int True " \
+		.transforms.run-src-file
diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index c5c785353..675995623 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -1,19 +1,10 @@
-# Doc ID Transform
+# Document ID Python Annotator
 
 The Document ID transform adds document identification (unique integers and content hashes),
 which can later be used in de-duplication operations, per the set of
 [transform project conventions](../../README.md#transform-project-conventions).
-* [pythom](python/README.md) - enables the running of the base python transformation
-in a Python runtime
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [spark](spark/README.md) - enables the running of a spark-based transformation
-in a Spark runtime.
-* [kfp](kfp_ray/README.md) - enables running the ray docker image
-in a kubernetes cluster using a generated `yaml` file.
-
 ## Summary
 
 This transform annotates documents with document "ids".
@@ -31,3 +22,69 @@ Document IDs are generally useful for tracking annotations to specific documents
 [fuzzy deduping](../fdedup) relies on integer IDs to be present. If your dataset does not
 have document ID column(s), you can use this transform to create ones.
+
+## Configuration and command line Options
+
+The set of dictionary keys defined in the [DocIDTransform](dpk_doc_id/transform.py)
+configuration are as follows:
+
+* _doc_column_ - specifies the name of the column containing the document (required for ID generation)
+* _hash_column_ - specifies the name of the column created to hold the string document id; if None, the id is not generated
+* _int_id_column_ - specifies the name of the column created to hold the integer document id; if None, the id is not generated
+* _start_id_ - the id from which the ID generator starts
+
+At least one of _hash_column_ or _int_id_column_ must be specified.
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Ray launcher (i.e. RayTransformLauncher),
+the following command line arguments are available in addition to
+[the options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md).
+```
+  --doc_id_doc_column DOC_ID_DOC_COLUMN
+                        doc column name
+  --doc_id_hash_column DOC_ID_HASH_COLUMN
+                        Compute document hash and place in the given named column
+  --doc_id_int_column DOC_ID_INT_COLUMN
+                        Compute unique integer id and place in the given named column
+  --doc_id_start_id DOC_ID_START_ID
+                        starting integer id
+```
+These correspond to the configuration keys described above.
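+
+For reference, here is a minimal sketch of launching the transform in the pure Python runtime with these parameters. It is condensed from [local_python.py](dpk_doc_id/local_python.py) in this package; the input and output folders are illustrative:
+
+```python
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from dpk_doc_id.transform_python import DocIDPythonTransformRuntimeConfiguration
+
+# illustrative folders -- point these at your data
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+params = {
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "doc_id_doc_column": "contents",
+    "doc_id_hash_column": "hash_column",
+    "doc_id_int_column": "int_id_column",
+    "doc_id_start_id": 0,
+}
+# simulate the command line and launch the transform
+sys.argv = ParamsUtils.dict_to_req(d=params)
+launcher = PythonTransformLauncher(runtime_config=DocIDPythonTransformRuntimeConfiguration())
+launcher.launch()
+```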
+
+
+### Running as a Spark-based application
+```
+(venv) cma:src$ python doc_id_local.py
+18:32:13 INFO - data factory data_ is using local data access: input_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input output_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/output at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:185"
+18:32:13 INFO - data factory data_ max_files -1, n_sample -1 at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:201"
+18:32:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:214"
+18:32:13 INFO - pipeline id pipeline_id at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:80"
+18:32:13 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'} at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:83"
+18:32:13 INFO - spark execution config : {'spark_local_config_filepath': '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/config/spark_profile_local.yml', 'spark_kube_config_filepath': 'config/spark_profile_kube.yml'} at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_execution_config.py:42"
+24/05/26 18:32:14 WARN Utils: Your hostname, li-7aed0a4c-2d51-11b2-a85c-dfad31db696b.ibm.com resolves to a loopback address: 127.0.0.1; using 192.168.1.223 instead (on interface wlp0s20f3)
+24/05/26 18:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/05/26 18:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+18:32:17 INFO - files = ['/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_1.parquet', '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_2.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_launcher.py:184"
+24/05/26 18:32:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+```
+
+### Doc ID Statistics
+The metadata generated by the Spark `doc_id` transform contains the following statistics:
+ * `total_docs_count`, `total_columns_count`: total number of documents (rows) and columns in the input table, before the `doc_id` transform ran
+ * `docs_after_doc_id`, `columns_after_doc_id`: total number of documents (rows) and columns in the output table, after the `doc_id` transform ran
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
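+
+### Launching the Ray runtime from Python
+
+A minimal sketch of launching the Ray runtime from Python, along the lines of [ray/local.py](dpk_doc_id/ray/local.py) in this package; the folder paths and worker settings below are illustrative:
+
+```python
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration
+
+# illustrative folders -- point these at your data
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+worker_options = {"num_cpus": 0.8}
+params = {
+    "run_locally": True,  # start a local Ray cluster
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "runtime_num_workers": 2,
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "doc_id_doc_column": "contents",
+    "doc_id_hash_column": "hash_column",
+    "doc_id_int_column": "int_id_column",
+    "doc_id_start_id": 0,
+}
+# simulate the command line and launch the transform
+sys.argv = ParamsUtils.dict_to_req(d=params)
+launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
+launcher.launch()
+```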
+ diff --git a/transforms/universal/doc_id/python/src/doc_id_local.py b/transforms/universal/doc_id/dpk_doc_id/local.py similarity index 75% rename from transforms/universal/doc_id/python/src/doc_id_local.py rename to transforms/universal/doc_id/dpk_doc_id/local.py index 9d525dfa2..dc688cd1c 100644 --- a/transforms/universal/doc_id/python/src/doc_id_local.py +++ b/transforms/universal/doc_id/dpk_doc_id/local.py @@ -13,13 +13,15 @@ import os from data_processing.data_access import DataAccessLocal -from doc_id_transform_python import DocIDTransform -from doc_id_transform_base import (IDGenerator, - doc_column_name_key, - hash_column_name_key, - int_column_name_key, - id_generator_key, - ) +from dpk_doc_id.transform import ( + IDGenerator, + doc_column_name_key, + hash_column_name_key, + id_generator_key, + int_column_name_key, +) +from dpk_doc_id.transform_python import DocIDTransform + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) @@ -29,11 +31,12 @@ "output_folder": output_folder, } -doc_id_params = {doc_column_name_key: "contents", - hash_column_name_key: "hash_column", - int_column_name_key: "int_id_column", - id_generator_key: IDGenerator(5), - } +doc_id_params = { + doc_column_name_key: "contents", + hash_column_name_key: "hash_column", + int_column_name_key: "int_id_column", + id_generator_key: IDGenerator(5), +} doc_column_name_key = "doc_column" hash_column_name_key = "hash_column" int_column_name_key = "int_column" diff --git a/transforms/universal/doc_id/python/src/doc_id_local_python.py b/transforms/universal/doc_id/dpk_doc_id/local_python.py similarity index 84% rename from transforms/universal/doc_id/python/src/doc_id_local_python.py rename to transforms/universal/doc_id/dpk_doc_id/local_python.py index 1a234b79b..68a2def42 100644 --- a/transforms/universal/doc_id/python/src/doc_id_local_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/local_python.py @@ -15,12 +15,14 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) +from dpk_doc_id.transform_python import DocIDPythonTransformRuntimeConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) diff --git a/transforms/universal/doc_id/ray/src/doc_id_local_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/local.py similarity index 85% rename from transforms/universal/doc_id/ray/src/doc_id_local_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/local.py index 9847da611..2a6e36113 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_local_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/local.py @@ -15,12 +15,14 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from 
dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) diff --git a/transforms/universal/doc_id/ray/src/doc_id_s3_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/s3.py similarity index 85% rename from transforms/universal/doc_id/ray/src/doc_id_s3_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/s3.py index d6f5a63f7..4123a04a6 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_s3_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/s3.py @@ -14,12 +14,14 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) + # create parameters s3_cred = { diff --git a/transforms/universal/doc_id/ray/src/doc_id_transform_ray.py b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py similarity index 87% rename from transforms/universal/doc_id/ray/src/doc_id_transform_ray.py rename to transforms/universal/doc_id/dpk_doc_id/ray/transform.py index 19742c866..4ff20b9f5 100644 --- a/transforms/universal/doc_id/ray/src/doc_id_transform_ray.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py @@ -22,13 +22,14 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from dpk_doc_id.transform import ( + DocIDTransformBase, + DocIDTransformConfigurationBase, + IDGenerator, + id_generator_key, + start_id_key, +) from ray.actor import ActorHandle -from doc_id_transform_base import (IDGenerator, - DocIDTransformBase, - DocIDTransformConfigurationBase, - start_id_key, - id_generator_key, - ) class DocIDRayTransform(DocIDTransformBase): @@ -44,9 +45,7 @@ def __init__(self, config: dict[str, Any]): super().__init__(config) self.id_generator = config.get(id_generator_key, None) if self.id_generator is None and self.int_column is not None: - raise UnrecoverableException( - "There is no id generating actor defined." 
- ) + raise UnrecoverableException("There is no id generating actor defined.") def _get_starting_id(self, n_rows: int) -> int: """ @@ -105,10 +104,7 @@ def __init__(self): class DocIDRayTransformRuntimeConfiguration(RayTransformRuntimeConfiguration): def __init__(self): - super().__init__( - transform_config=DocIDRayTransformConfiguration(), - runtime_class=DocIDRayRuntime - ) + super().__init__(transform_config=DocIDRayTransformConfiguration(), runtime_class=DocIDRayRuntime) if __name__ == "__main__": diff --git a/transforms/universal/doc_id/spark/src/doc_id_local_spark.py b/transforms/universal/doc_id/dpk_doc_id/spark/local.py similarity index 98% rename from transforms/universal/doc_id/spark/src/doc_id_local_spark.py rename to transforms/universal/doc_id/dpk_doc_id/spark/local.py index c9a167783..d6f821aa0 100644 --- a/transforms/universal/doc_id/spark/src/doc_id_local_spark.py +++ b/transforms/universal/doc_id/dpk_doc_id/spark/local.py @@ -14,7 +14,7 @@ from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from doc_id_transform_spark import ( +from dpk_doc_id.spark.transform import ( DocIDSparkTransformConfiguration, doc_column_name_cli_param, hash_column_name_cli_param, diff --git a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py similarity index 94% rename from transforms/universal/doc_id/spark/src/doc_id_transform_spark.py rename to transforms/universal/doc_id/dpk_doc_id/spark/transform.py index beeb77ce5..4af3429b3 100644 --- a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py +++ b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py @@ -14,12 +14,18 @@ from typing import Any import pyarrow as pa -from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import ( + AbstractTableTransform, + TransformConfiguration, + TransformStatistics, +) from data_processing.utils import CLIArgumentProvider, TransformUtils -from data_processing_spark.runtime.spark import SparkTransformLauncher -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) short_name = "doc_id" @@ -137,7 +143,6 @@ def apply_input_params(self, args: Namespace) -> bool: class DocIDSparkTransformRuntime(DefaultSparkTransformRuntime): - def __init__(self, params: dict[str, Any]): """ Create/config this runtime. @@ -146,7 +151,7 @@ def __init__(self, params: dict[str, Any]): super().__init__(params) def get_transform_config( - self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. @@ -161,7 +166,6 @@ def get_transform_config( return self.params | {"partition_index": partition} - class DocIDSparkTransformConfiguration(SparkTransformRuntimeConfiguration): """ Implements the SparkTransformConfiguration for NOOP as required by the PythonTransformLauncher. 
diff --git a/transforms/universal/doc_id/python/src/doc_id_transform_base.py b/transforms/universal/doc_id/dpk_doc_id/transform.py similarity index 95% rename from transforms/universal/doc_id/python/src/doc_id_transform_base.py rename to transforms/universal/doc_id/dpk_doc_id/transform.py index 132a3d964..08d316f48 100644 --- a/transforms/universal/doc_id/python/src/doc_id_transform_base.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform.py @@ -14,17 +14,20 @@ from typing import Any import pyarrow as pa - from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, TransformUtils, UnrecoverableException +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, +) -class IDGenerator(): +class IDGenerator: """ A class maintaining unique integer ids """ - def __init__(self, start: int=0): + def __init__(self, start: int = 0): """ Initialization :param start: starting id number @@ -127,6 +130,7 @@ def __init__(self, transform_class: type[AbstractTableTransform]): transform_class=transform_class, ) from data_processing.utils import get_logger + self.logger = get_logger(__name__) def add_input_params(self, parser: ArgumentParser) -> None: @@ -137,10 +141,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) """ parser.add_argument( - f"--{doc_column_name_cli_param}", - type=str, - default=doc_column_name_default, - help="doc column name" + f"--{doc_column_name_cli_param}", type=str, default=doc_column_name_default, help="doc column name" ) parser.add_argument( f"--{hash_column_name_cli_param}", @@ -174,4 +175,4 @@ def apply_input_params(self, args: Namespace) -> bool: self.params = self.params | captured self.logger.info(f"Doc id parameters are : {self.params}") - return True \ No newline at end of file + return True diff --git a/transforms/universal/doc_id/python/src/doc_id_transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py similarity index 95% rename from transforms/universal/doc_id/python/src/doc_id_transform_python.py rename to transforms/universal/doc_id/dpk_doc_id/transform_python.py index cbc63592c..4dd5b4c6f 100644 --- a/transforms/universal/doc_id/python/src/doc_id_transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -14,18 +14,18 @@ from typing import Any from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics from data_processing.runtime.pure_python import ( DefaultPythonTransformRuntime, + PythonTransformLauncher, PythonTransformRuntimeConfiguration, - PythonTransformLauncher ) -from doc_id_transform_base import ( - IDGenerator, +from data_processing.transform import TransformStatistics +from dpk_doc_id.transform import ( DocIDTransformBase, DocIDTransformConfigurationBase, + IDGenerator, + id_generator_key, start_id_key, - id_generator_key ) @@ -52,7 +52,6 @@ def _get_starting_id(self, n_rows: int) -> int: class DocIDTransformConfiguration(DocIDTransformConfigurationBase): - def __init__(self): super().__init__(transform_class=DocIDTransform) @@ -81,7 +80,7 @@ def __init__(self, params: dict[str, Any]): self.id_generator = None def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: """ Get the 
dictionary of configuration that will be provided to the transform's initializer. diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile index f170326e2..be5a2144f 100644 --- a/transforms/universal/doc_id/kfp_ray/Makefile +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -2,10 +2,20 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows + # Include the common configuration for this transform -include ../transform.config +#include ../transform.config + +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -21,8 +31,6 @@ venv:: build:: -setup:: - test:: test-src:: @@ -33,11 +41,7 @@ publish:: image:: -kind-load-image:: - -docker-load-image:: - -docker-save-image:: +load-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +49,15 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=doc_id_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload -workflow-upload: workflow-build +workflow-upload: @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..dbdb269e9 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -20,7 +20,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest" # the name of the job script -EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_doc_id.ray.transform" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -109,7 +109,14 @@ def doc_id( ray_name: str = "doc_id-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/doc_id/input/', 'output_folder': 'test/doc_id/output/'}", @@ -120,9 +127,9 @@ def doc_id( data_data_sets: str = "", data_files_to_use: str = "['.parquet']", # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: 
dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # doc id parameters doc_id_doc_column: str = "contents", doc_id_hash_column: str = "hash_column", @@ -171,7 +178,9 @@ def doc_id( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -191,7 +200,7 @@ def doc_id( doc_id_doc_column=doc_id_doc_column, doc_id_hash_column=doc_id_hash_column, doc_id_int_column=doc_id_int_column, - doc_id_start_id=doc_id_start_id + doc_id_start_id=doc_id_start_id, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster diff --git a/transforms/universal/doc_id/python/.dockerignore b/transforms/universal/doc_id/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/doc_id/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/doc_id/python/Makefile b/transforms/universal/doc_id/python/Makefile deleted file mode 100644 index 26da1fc8f..000000000 --- a/transforms/universal/doc_id/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DOC_ID_PYTHON_VERSION) TOML_VERSION=$(DOC_ID_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/doc_id/python/README.md b/transforms/universal/doc_id/python/README.md deleted file mode 100644 index dbb02093c..000000000 --- a/transforms/universal/doc_id/python/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Document ID Python Annotator - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build -``` - -## Configuration and command line Options - -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_ray.py) -configuration for values are as follows: - -* _doc_column_ - specifies name of the column containing the document (required for ID generation) -* _hash_column_ - specifies name of the column created to hold the string document id, if None, id is not generated -* _int_id_column_ - specifies name of the column created to hold the integer document id, if None, id is not generated -* _start_id_ - an id from which ID generator starts () - -At least one of _hash_column_ or _int_id_column_ must be specified. - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). -``` - --doc_id_doc_column DOC_ID_DOC_COLUMN - doc column name - --doc_id_hash_column DOC_ID_HASH_COLUMN - Compute document hash and place in the given named column - --doc_id_int_column DOC_ID_INT_COLUMN - Compute unique integer id and place in the given named column - --doc_id_start_id DOC_ID_START_ID - starting integer id -``` -These correspond to the configuration keys described above. - - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml deleted file mode 100644 index 1a962662d..000000000 --- a/transforms/universal/doc_id/python/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_doc_id_transform_python" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "ededup Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/python/test-data/expected/metadata.json b/transforms/universal/doc_id/python/test-data/expected/metadata.json deleted file mode 100644 index 83a938628..000000000 --- a/transforms/universal/doc_id/python/test-data/expected/metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "doc_id", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-08-17 09:57:40", - "end_time": "2024-08-17 09:57:41", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "doc_column": "contents", - "hash_column": "hash_column", - "int_column": "int_id_column", - "start_id": 5, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 36132, - "result_files": 1, - "result_size": 36668, - "processing_time": 0.044, - "source_doc_count": 5, - "result_doc_count": 5, - "final id": 10 - }, - "source": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/python/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/.dockerignore b/transforms/universal/doc_id/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/doc_id/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/doc_id/ray/.gitignore b/transforms/universal/doc_id/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/doc_id/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# 
Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/Makefile b/transforms/universal/doc_id/ray/Makefile deleted file mode 100644 index 79787406b..000000000 --- a/transforms/universal/doc_id/ray/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DOC_ID_PYTHON_VERSION) TOML_VERSION=$(DOC_ID_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --doc_id_int True " \ - .transforms.run-src-file - diff --git a/transforms/universal/doc_id/ray/README.md b/transforms/universal/doc_id/ray/README.md deleted file mode 100644 index c9cb0d15c..000000000 --- a/transforms/universal/doc_id/ray/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Document ID Annotator - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build -``` - -## Driver options - -## Configuration and command line Options - -See [Python documentation](../python/README.md) - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. 
TransformLauncher), -the following [command line arguments](../python/README.md) are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml deleted file mode 100644 index 372f39762..000000000 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_doc_id_transform_ray" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "docid Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "dpk_doc_id_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]>=0.2.3.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/ray/test-data/expected/metadata.json b/transforms/universal/doc_id/ray/test-data/expected/metadata.json deleted file mode 100644 index 1072ffd27..000000000 --- a/transforms/universal/doc_id/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "doc_id", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-08-17 21:12:06", - "end_time": "2024-08-17 21:12:07", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "doc_column": "contents", - "hash_column": "hash_column", - "int_column": "int_id_column", - "start_id": 5, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "number of workers": 2, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 12, - "gpus": 0, - "memory": 14.759533692151308, - "object_store": 2.0, - "execution time, min": 0.00696413516998291 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 36132, - "result_files": 1, - "result_size": 36668, - "processing_time": 0.0373997688293457, - "source_doc_count": 5, - "result_doc_count": 5, - "final id": 10 - }, - "source": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/ray/test-data/input", - "type": "path" - }, - "target": { - "name": "/Users/borisl/IdeaProjects/data-prep-kit/transforms/universal/doc_id/ray/output", - 
"type": "path" - } -} \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/test-data/expected/sample1.parquet b/transforms/universal/doc_id/ray/test-data/expected/sample1.parquet deleted file mode 100644 index e90ec7cba580c97f650012f2bce74b2c1b43506c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36668 zcmeHw2YeI9^04HNF@XRwGF2G3P;9H)5|3iJ$u_oewM&JT z7AK6$2nh5Kd`1s^K}S@G-`~L$6aofCV|dWOpGM$|Mtj?XJnU^Bq`!TTq#-W%LXLXA z_8_NuGRAr&0m;?b^O^uldc_XfY-uJ%fq!+Ei&8-JdsZ*=|Q9hX!Wo;2@!^nSql$=6%$;f>8(%2QbS z2L+vupB97)$7VnE@d(`CkXPu@qet;3TD|$1p&MIFJ+N%hl=+BkdNsctE`+8G#I7>o zw2qM~PK1aXfvGq`b%a>RjfhaFl>)Jlr{HqcC?Df-l?nwaM#Vf#B@l2>aYVQ}oX-=8 zRU!pPh;jwYNIoV|g)76ABC$%v4F}&MRfHG~S1K^RO3YD;R0^I_ED}Vhl_*y%67vNd zF{b2+1$;rcfUo4M2zB&NBh=Bua{(Qh7toRsVAIC>DudFZ!}R9<8Wk?77v6!D-pG&j zkeR+M{~@EGRQ0WN#%g*J!^5rxAAagVYwslogEsNIve^o)1q(H5v{+~nTEZ6g*C=%T zHF`5vq(`-Ej?$oG7jwfnY)sEqp=OlbpCc9u!U{|Vy%%lc&|mjwsg?dyIbd=%X0&&Z ztK6~oKf!~ebwLJc0fqfJ)(?T1`3CzYbfI1Xp$l|>8nUseT^0NYRlxONgm>V0Z{$fM zWKlz?2XBzFH#%xc-I?RbcmZ^WWI+zE^Q6uDcTkHVn;Q82Hg0N%y>Yxv?qV;??TId<9U)yzpB z?oXL8y#Ctm54}G1*t`i&UT?1kAq_Rg(a}J6>^`ea*IjZB_Q6=;1HOvDV z=4?5V^37(W38-*MNl6$`l`yD3KsY9-tmVZzc0?Gjr`cfCD7%|13INohdR3?nE!61q zLQR+!Q<^o!Sg0}|)#qW%t}pG)Y*e#(93I!!pWa+#4Aq;AD2KxpV1&*%+mBXurSUdv z%vucBn2A1t!@QAg&5=1ws4@SXQBdl@j$Vv8t;sqt;Qaf-ZZtX8sBzcGk>RbH#icck zrC;C$M8+;^RDb!!p6H~YK5dr;cludcxEhl-xqj-@fBWB_J-nXhl8T}EVm+Pez+*cXkTYc5soh&VH?{=pq-ozDKw`%lb_<$CnosBVW$ zY5TVQa@+UWyea*1{b+O1omD#*4)gh3$Jp>iEA4Dl&1rTbYv<0eG04*RIXgG~cqO-J z&HOZOtNjL5i2CaEAd~6Vl8tf2WxJ~fbp6u5>-If8gh5wZ1cf|RcfO`N@nD(hv*hFF zo9yd6+j4g2a-W~~g!d53;ws}lRt%Wx@o+Eg*y3qloZLQkC>q%J*sfhspEmyK_lh$4 zDE5MUvtRc^Yr6f0#fA>gUXgM7-(H_DuLvqG+&;18u*9OPvvwSrdng*4^jv@B7e&H+ zzp8`lw(E|4dwY`kQR(cfUo804;79LnE*SVn{*&Do<>oPGI|p_M_dl_rW$?tQl1`zk zZ}nU(oVjFj6Ymv`PPWcU$eFO5cVuDDtcbaRKXgBWT%K%By?DxC_^Jjoh{& z^XzurojsfGeW=_L_jPRfbi*Om;^B!?SM5FiNXK}rpS*ec zw&SO_^%!tRoot-bXl`_rsz4Ry^W}~s7tS}#{`_CVSAI07vhu5C>vM0c$X>Rlo3twT zZ10vIeC+?h?|02(JeqAfvpjkEVb7>7vB~QnezEGv;%PH>`zkJU7#%k7^L@9|`~2F$ zz?ijj=={QC{i^z0)_5fz4*l1jaKAr>#7tiPUG~83+8GB!&efmMw|K&T=Tv{wWBi^; zBR;Hb{3DYc!DE~ZUAA@JnXKSJQ4=Pdt}4Bh&@*Jqo=-B5Zs^yw>WROkLGO{v?snce z>cMf}OTimH|Foe%*lghhF?Qh?v&+>-6O`44Epg^Gw-w)geYf-6kZvCt$2a)EykcCl z#LNfF$JcAw>f_F3Z8miuK0CeX+WURl-0>Pa_UfwC=}C{qeYG^E4Lhbff60HwedRm3 zb6S%v?C6sn=Ty%bUccRj@w@MpmiD=ODO7lN_ns>6%+H_F4Vi{$X}ey=1IY^?BU?my ze17SA^`w3SruEr>aPk*-8!)fz+IYV~%Yds*5*wGUJaT@(6AvbHp=8QlRkM>%FAb3S z`Fp;*x2Ap5G`I44-xHJf8t06;(JzBCb`v7IiDH^(Uy2)hN;Z4c;8m>oWkK4D_}-SmOOzs*0K z_*Lu1k;Y9i8Hp<9grEsQn)K9myXnTq_vUw)Gk?tVPi8zWzke$?N%SDc>x;0?<#!66 zw~YAjWc8>;oXUB_Op65VIuBSnuXIGO#QDz)GMjXeimLkWZuPj`+|%vuH6C6ye33zY zdeToHq&Dd+xYD{uVbcrE?{}YcV*DwvTE1Uct_oS#&THwiXFu=!B)4ai{@wF$K5Jb$ zsa#jtHn?=aCa%A#c-m*{`%21Yo@1NV8KfOUDlRkhI-utpJu`+1y&F|l9+VqRg$kJT-iyf^#iy6otci`8P ztM*59ydq4H3`T}-_xM_?6>K*3_$~DMHDy9CR{tIeSuwufb;-DP^-e~L@w{;Dr7`8* zV^=h|SbS=0?l=36hh=UvHqP6fer-@@b#UzX;S(!NPj=7P?w9kQ)O9xp{oI{yI2}-b?e2#Wv$8htI=lS6 zpQerN6x@Gr&NpgZhfW`~>oIU_kgrcxPW0fhr#j!Gw=KWsv2)d|3z0{bdH6TJue?#( zSGM)L8B4Bpy0gN!P4mb@o!kFQYUG3q(oRf1;yrw7v?VE~a8~m@mVqBk`7&ney|R_( zJ1<{*p(^*wg{Lz+h&$cb9CB|&+vz>_e50B%&Z4-i$LA3bq?PI!+7^;k^^p1L{vt zH*9X)y!m&Y6$@J(ZoB+r{}$4lLw}R5zR|CnURZ319IIu)iFHecIDc=|6X&iu=uV?a+y`?CCTi>Jihe1hKhsC z=Rb|i9NO=Pdpn1#e^UjCd(PwziQ3p{!q3wWTq=}(_q1N3 zc;S1^)|7497P+dR<+&wr2kz_?UG_-HNR>dhB$kyPn;09N85yx&nK2y;1zr{CD#f4&WmzWBPZ4G^8uTWN6=T5rR2#Hf zLkV!gy0T*RCQA{9mzJ|LQ{z}BYbP3$BbqQ!lbR$n57p?ySm~G+P@@8^)P^FRbu<|m z#+lUq6WhNE?vvdZ4lUx&wz+!eCZx|~TEdsn?;Z9^%7S4-MR!U}iD)TBm8FwUE 
zC@X_-C0Wt9E6K{R7>x!n-ZH?>5)^OGvU-_-gBYvCP^1D~vQWJNu(-&|Rg*ynn5o8z zFl%&xcZ4vCFe3)X23U`kVoD2OI+RRwQtxOeKf7xS`!qrEL9iO^=t8W51e^v`M#oE1 zqz=__X_n+;dIv&*e{4lyr8vxub9*Mfr*Xr6WFW)7T2 zM^(v06 zL2mmb2I#agt+Ct@^?;<_!j52n7_qSf|S8FXMxz-`e=dSEpjxh6xV%iwSvx^wEBau6V_ zl>~^aiJ*cwI1D7e$6%3SUH$}NzG7R8k*Py9X67~742W`x3K#6Hj&F7u9fzVUUp2i>B9s2t5VJ$ zzy9na?v)Y8e_K)geDduj?XUk7p4#V;^=6s8yCK7{cLUjferZt7)fkXHw6jctiXiOUDA+Vv4K)9wvuQi}5 zutu?M(LUlE$k7T5GnRp{OChH3E|EmZVtGj^QOQXu3H+pl6j@f9ESet|BkUc|WB#M4 zA#}n~S@&ReWn~cK10hZo*1s4lG6A;WDze%)aHB7>YXCC-L#Vd@sDb^%*i1Qr)QnPx z>BIkmPGbX!n3P2raCbsgI+g-8<+DPStPhN3=6v9<+LVv1bRSS?Jo|5U!Th@osP$%} z-mW(dNqVDKYw~c78Ri+d&=XmxMJ5(PjrqG+|2~F8E&x)eCZ4ndu%x z&5YwEF3f;?4qTI7&;#drATx`RwHC+>_g%jXD?5NCvcSu*1^+@n?6w z=LN_f^sXOe>KBf^NQjT||Ald46jCUiEWhy6v)F{KIWq?rcFZWLM;pSOCf4?t`9r2c z_RKK!L5kN4(Yk;CS8TX@fBwVo(S;RXx1iJ7S=L;8pt?GKSXcG+_{pF*Je~F66>Yq?*SUDNqqJGF+t+C@>Y58=(}aL55PzQ;Sug1GO4tIe8dY z9WLewg{U$@pjIpSA_0fbQwl{&E+$ZNMWS%vxLUL0PF~xN!FHi}*t3uzmvC(mvaJjX z>z_b8rRd*fl4|l&uaZexUrD#i-qTW~#b>HDk#zJEIBdRaMc>ta1TO;Ynwvm7XZrtft$GdPe32%K{l*3$Rv^(-=KBFVcKXZ@U-L|y6 zoWjC;l$SHXKVtqnx{lk%l5_y@zyflRRvzHn((_~J$igwW8W7!3&csD@Mlm^^$I6^c zr)7FD0aBRX4CU}bc@ddl4GIG}cR5&`xS?DT4}S5&y=al)=VyGm_~d4o@OQSHOzZjt z>%;c=e|RTM2qE=shY%xmvUHzMBn{O^r370CE5~nv^Cce0nelXF&p3RjuY!}3Ehic> zs_T*YGT@encN5z2E2R;Uc^kzW#-hwhl<7medW~&rf8*Hq(x832q>_(I_7$W3 zmo~2ouOARLdtgA(nscMk(&IhzX^L6w7T4+NA~%V z)h!#h?;T+L=u)S3vhfdn z=k3@trPsrt%R>=y^r5{Ojhl~&`?TGi#^rZ>R~Rpa2Kv;S?N|QOK_k;+{pPjlz4$w? zo?N&!dhSr8|L2beUO(0ToB}<3=w5E?kEZVMEBp4ni@_bmJ$qaj)Zj_Xg!lb^OZIRm~XbP<92<%sm0UC7HfXy4l1i$=d*q*XVBuG+CE8N`s&u_9yCAkH6}0%d@-s+|;wn zvO(ulnsZo@!xk*MM8DUtz47|O=pQ=B{90EG2)r@3GdEak9ysM@p?G~;zZSyNKTsVn+J^tgf7nff8?P-Ur72Nib z?YR>SeLUGmcdpua=40=ZM>>se*1BwyZfNlSN1TTg)978i^j*BPcH^bg?&R68NJh54 z3IiKkjS&WB@OAZ6P~f#D$cGHoswL$*yqpe0H>4A1IkV_Al?RNR z+nop&C)~DIcJ@!^@FRm#>&5r^p2D!&cCyevv3G#`BKS_$=Abx%+;}J_){-Vq15B;9 zo+YbsRQ}@;cob}+*U*v5)%f&zHRP|rBB9lTvkxP}>cWK+F5I87$X4Ipft^Jzed8cE zESkNE#3FyJ)Tk-KocMEb10C7FzJ@=4A2TI?LKr<9Vl9EIXfS?HGg=f{jq)SRWU_#C zumxnMDO~WzuC3yTxcRUqoh7yMZijiNFU&!PZPi~HiYByl$YkxeDgEllMUNn5m= zj_lY~qvXJ1Q6P93Zc5oVXM{K?c>@iN22+L>5ZP2~^q7i5x9imanf9O5Lz}jTG&G+g zERXODX43+xF9u@%!B?_p2lQ z;~)vC4}Mm~j&{*OI`RZiAI| zJw6{rF+Lw$Dv4l5Mu?SSfkGh=D%2QX7{L+nK$w?{3d2z>LZ}WG3WRD7Sjl;+a1iws zsKX;v9DW2Eu22iqA~ni&cg*|uYLfOfAdbTHH|3)|+~|XwA|t$!X-7dG`Vg*#k^g|9 zQ0hIn!r?GM;p*lMrX0Bc0j^*H1$DaO;f({B)-UR*x#T2?Ilkl^XC*yw2_4ySf{qM1 zfwRT2H%bCueU5BCO`aoIsksQX&yn9x(UD!J+|QAh7%q6{fPyp7&_EDwXy6ZO?p*nW z%pI5^H(^#!!D_nhpP?fw&*0o~?U9m0Oh#u15eA?%8oZ20+~}d8)sKKNYkpBvyX!oO zS|8FWi)r-0t-zBz2fW>LII%Syk-DTL_phwze=#p+T~!?_JFcn@AnFlTULuV!0D7za zmEdH97A&Xm0-`lFw*Mj>xpV;^r_HPXo2Xxb6XH;w@GoZwDJm13pa4;GV)``_CmQ0M zAkxkb%s79Ajy$?dW(Ea{mtm)5#=Ax7br+@ScS!Q9pAQZ@fKw(~i_RL|u`WrwZqbp6 zH*negW0xe#>&V_TAK*Cp;SMiQUz)q?$j~8%AEqB%#o#9pTt{{-1pvzH$P4ZRCKC6N z8vrv2E`O5QXuXg;>>kLx{6PozbzOUQgO5t-gPwH~(wIIbWj>}u8L8l!at9B`OVHLL z(v@KvBLLJPUrp^mOKJ&5gO}IbQzRE!@1c=PY7Xx@6cXni$%Xc#g5*;2nLDD~ACOgZ z=*>zE^=T)f0UU?(xfU{vbYg>|81A^p#UMLQeuN2A;u%s#BoJu~_(V?YSxF_5YeXAJ zoGlAQE+WTF$>omWATop$1tAwZMG6QI&O~BwZl#sjFcTqJouk_;E~fk;BSL1~@M?WEk0q z5#?}-IL=F2pnp)ypl2QH=2Zeno!sUBAGnDA|KkN}WZwVz`@hJ$TORMO9lfP%N3K2+ z^7=P3kT-wz0eN#55?=jM0rKwPc`e^Ipt!k#{9ibwjl9K!(8yajcZ~d>JvNK{51jf$ zUgP8@@|veFk=i&hiPZI3MC9GM91wPWci;Q%-S?(&yx!Vz7v$}nB0=i%BnR?3r!kP% zI8lMTwF3-D?Vm3|YV{BR@~Vk^CMJK0pujNBjcMPp3Bpw?;#lKH&C~3NX@7YRLe|SF%6b4)L8hEiCr@*2!mel|ZiUM$47MANUGYV4% zp!%R-Ob@Dq_W;1QS($O6qTmV=F^*IO+uxfT#)UmO(BlGiq6}p6!Ue8$)fAJ+f=sIj z0o2)sPwM(*Xd$c?k%HrK#LH)0u{r^r(EdFsxD1!*+&x-~p}ru+5GR~!@GcTc 
z+du-m%sLQfWE@`U;YxwqYEI<5=0qZ)fiH@9DIG+G+NHq$q6Jsvj<7d@7oH1nhnIlW zGF}x0KD?}U@qiNzu83@fwTsDi+1eEw3E+g0G*|Qv3qJ|)vPopKySN|Wva~w^YaKOy zt(t1IH2^1+uRBNa+s&>Bt+$?Gc`f6!U-Y9u>Ike|%nrAtTyeXAYZ-gZ1tbd84zSwA z>v#*u6}KzAcJUJjh+Hv~Ax=0+6ZMs5uV)fcb;o*tg#sOXkv~pGEh8k{eEi0SD_$3{6Z+TG z9NU!$SG*)Z?P9bZ*mFf_FL%QCx)QY=3UtLt0yv?3O{QA!CQu;6k0QeITE<7ZDL{eM z3HkPd}vsQS03h*dV5qK}Lp$HG&Xf=u=w5SZmZj5DUQIVkp@`2D{ zP|AgOg*d?=mO}8J1muNr!}!4!G-iM#J-v5MSL-2Y=8$+c{>ein5=k2P2S2?e5`iQt zPZB4QC=7Zad+5}BWUV=`TugJ-Q zbu6l!Qd3g2M5E-V=PUJTpiEL?doET+Cxj=&WSJ786U$Z7>=M{daxPZ|;WD1sq{`u1 za(PzV^4uDn7FiBg3+pQM*(OCa#3wh(b4m+yc@%i1ae(%?@?3tpP$`8pNi?b)p%L_7 zD9=cMxB-pa5-eNH!E&SUeiL+J4XVp7P(|Z#vphFFAN0YIWdM5PO&WQQut32}H!5^W zyua-1bZJs%o<{9NyRr<_k23@Q5p+p3**QX-et8TSYqX%$$}ea7-PF@Z0UiTBa>=x6 zEvgt9$HsTAA~#A%9xIR!C#4`w19_^*$>zv%((_f)*c#cWGfMJ+9BX)*pv|@@ba7^R zP8yJn1aBwTX|)Pzx*9KsJcv%rR^VgGE`;*Lan}WHy7XchKO4{{o_|G#ATsMy-thH*`%0?1?@d z%BkF)EWtdr0NY?u#V6x<>q?Fas4x8Nd^yiLUJ5yyfZX(|oH)IWr*$Pqs6T3zRLfD= zWI;ijOP#vnn;hEYI5EJ_%|~+t?(BySt_!)m^kPL$oKc~{?QC7~59ofW);2Go{db)U zU_DU)ox|4|xSmnX2c6uc;KkW(tLuEwDWu|jyKVkUbF@&wS6N^?FgNk}pUanPmHHH; zg6Ec(f7+(bWG@x-6}r-5u*Rt2yajrxcQ>zrousZy#p^d>dkCM#YP+FLqnJlvjZfEN zgq?J!3vZG>KJYVuFZCz%aZu@YD2#?qmk$ zH5b}Cp<8*q8QTKz3OUesviTWqY+ucqZ1soT>C~H|kq_7UL@RGx{Ca#}zyuSv?l<}b7>~xpFPsYt^w&rN!c-jLJWwF*m;i%}sn8^ciJ3d_RNx=%l>^ zh3^6a5O8mS`-j%Ol3P3TM%FFxny*|{u~O&Wr$>3&LKRP3CVz=fFNc0aZW=Gex(BD& zqdWOue`Rm0;~JD8h5_RMZq`#X8lO_2;Db0o-P*kprA}M`=4rV+C*8i5Q`kRcd}6VT zmtC&pWfuY&o9lR8$PoA4tj^>Ch?&t&STbYtq_vKf-b~?c8%2wch5s0k=IjT$`?&?mKl=Y}&`daaA zH}(Vf$+AIQhlo!|v)%6h+}3?la?S>^&gewo<7NZB6PLjlD2QpB(40iTQ?Qr7^Ak}* z@ViXGSK|C7%~ud_O(;#y5R@bpNHj1WPKx)~=RNGBAi)Cal_%&qFz+If#M|QWAdLc& zF(8AI77zZ4qQG;sL@AM^klNY_XPN|FKov={)7h0n3@Ii(~T;)7`%45o8p z;WKyw&mx!xN``5jXuK;Ke2&4>Jh8BA9;hhk1(QOMdLoGgpCzU)YBn2uM@jPGBc3F( z=7;b{NecXec?KAhugMVsoRX+=A)76NU^qmHM4D-Pj*SD@2~_w`_y|yee^CYy3pd#N ziAjL?3sd#re{T=KX<2wzV07>S{|o=bK$sNTooqEJwxpFyVg#^#$)B}{o!HugJypJn zR|?wVA48X7Fbb#kQXFK*kL!cfW!L?9zg8>-dTBLZ(2*6Ca=>K)!@6l*GgO zVhO|t0g_U@9#9ccF+f$DI|Ts70%|%!?tQzpOYE!a(O^jM-e_Om=CbN*kt2x)XVte;+I3xQ(*fp zN}|0R{y>`{QRL)X{nRuGk8PcA(URl>f?gYc@Tp1CAIPt$B6+Tsuiz&ZDsga>uBLrz zO?&HngVw-7{{*nT>)sNS%2LbO?6LwIzX|UXl@kPGl?yd-jLodB{uyc*%)F>2630aSIIpWuPuKO6+ve&I_cJU0Mrm^G&i$WBQGs?IJa z_~}Mp;n<_>J_w#85duF>DUrkWW#u;h*!0zUD8fEpqM&{`w@=rSax4HIt%UNy`@{7& z8E2~VdV^0TsXb2w8UmHcgMI<{#y`=LM92?Fv%C3&>xW}5EH6d`@t4B!;P|cbb>Tlg ze?UGQoCU!1y6`!=0Pv7qV$&~o{Ktnv%5UPtW`jA$E(LS;rSw9fN7d-F)!sx)Akqxj zopPu z>l&^Pe>&&JxSw^&IpanCBDvica{9qBwc5sNr4F|`DIN)V(;FX7O P(O4sCfbGGsLm1YBV|b4i z>*whfa$n%Z9jW-uBqItp!j4ciUd`%PN8#-yz^!Hn1;l~SnPd8h+^cj4Q%-8_ns zZ{qz1O`i?h-rHyDju#4CUPzA+<{`?^VeB%#jT>Lb$&ZrSvjNd}6llQ=9T@!Tg+CZ} zo@o1WXPvgLI&BKd?$MA>ZrdHbC0nKT%!$~=WwAq$%xNMZe zk;?E^myJ$QREDz%G{vVl5MGJ#5G0LxU`<+NMvcDu`Fr!r}$i>K_g1BAHOV5;ky z9XoVvqqLchZIm(~tm}ZJ#~s@qz{wkW`VHMRHQt#n^3}XhsGQ8EMOnYoX}oni`B)eE z>+SvT6ZBQ*$P>8X%Xq-hA?nf9-XsuNwu}r9r*c(cu$0XSeO=+f1#W5Bb~2mFpUf62 z!a{+|4Vyc6ALEPp;YDi5JS-H3(_Doc#t)1^*-5kXaoMaxbLC;NM5k?)F zDku?h3owL^3N+I>92=OUBP0#LyD;)bLtO#1vI2v2HzEOG%WS1mDudcAEiDa!V1|&ifFot38mo)>VaE{tkxH?YqQz*n-1!XQ7(6=sAhFl;f5-CQYUhG{b`_sa|AGB+cX2MhDU z%3z_umq)PGJ26eA)alkxAJMt3g~~!{uuv{>W;10KRBT9KGqE&{tUFo_d#XltBW}%> ziqu)jR&s)qKN+Yifn@$$iEe+E=eh!Q3^ zA}Y(6@Q5-dqSaEVSQdztVwG6p3KgPKBzWD2-uV4uK_Hfj3u#oq6^fNvVSy+Xaw|l+ zm=}m*hJX^HlFbW4#5DBfxdO2ulmxCKq@j_pEb!F|G=N5)K*+eBD>JNHYVuG29Mq-3NN+j7x=qLf&+4;(q@aXpiLlM?XxTlHO7)wN_qcL`( zhw)MrOZbD6Ou+9YMnjPaCZ_Mh=)OK8%tV+8HpWKu{g|b%U&91&)7}VsBF;t)mtodE zq8B>F*-@gU<4utY3GMA%rw&rY&L z62JHW5el;rCg>$%II;IA;fWqM>m8VL$bsEF>WB6~WH{sx#-Nuc6!8*2zW&oR{6Ndt 
zahkxxvoNt-EYyy(o*3&rN<>4RZ@eQvi4o0K|ilIH92(xC8IiJGy-lmSM&^NBW~<@t8N12=@JF zJazDaZi=SZiC=j^d=Zvl;3pU&JRWzHV0tvc(80$EHWnqk?5ICD@ErFcjn{j5j{T~J z_3EP-t&f^PHJ~*S4#cAbt=oZLrUicK9^T7Nd05tq@9iG*#E$l8_f7T2!(JxVH^q^A z!xOP!e1iEWO`IKg+(mRVfk5cg$RRHgjfEnNk3CB0hnUg`*j#J&$M|j82=H?;faxC} zha%f4E0)xKs+WoQ*vR1}ZDNe+e172Xy4{TD_$~BaikNT{qo=U{TEVD+zrOC0TY{%% zTyGu=1}4ygP5Fty1mOuyObq;SFXQpB;e#L6U5r_2{WgQ|GB*p-BRmmcHE=S<_DpCg zGdnba=}&1vLIb}I1mY(>fjAKbp8G>Vg7L)slZFwDx_Sh=26Azex+~fXmnEP;N z&O`x*0E)fe@D+@`|MS>g?q_iw)o)=5Z#*3EXYF|G;9vGe33keV>Pj7n*-nRW-LS^) zA2|8S2<6cFgFfBobOV?Z5BEkx@rXxP{Ni?Qa`0&#c?{+is3{wXgd#-9<%NK}6XMjd?8_1)}OK;7;C zfN3nW?vjHqYnb3QFS##)O^f~#6ZH4^LHa=qD4TeA&$HU2-Z3UR=7GUHk&yngm`&%8 zXLSBM#=DOa?)cM32CfB`McG*A>w6=i0O9rzTr=Q~e|^kh_QRUOS5`^_0rQe^GtpB5 z{eWiA5iQ12uW20AO*E#z-C#A*EHTP>qCGERp{H)&^K)HyY&y&uu0Q-$t;3S|`3r}? z-#tyZ!$q;+QjX33kyR>H5lh{1r|=stg1XZg{M1+3TSqdu)@k*-qd2}eup0LZzlRZrKZ`kSb9mJ7 zpY}#!A@Pj4?*S0Da&QQ{B93dEj;D6uNB+Wj5wHE?Z!zc72YW7NcijJEPyfMLm@%GE zcp8ZJ2W&TDu04kCZciv4#uK2lnClj>g^`}e|KAJWOzW=HxM=;|omUP#bH(GB#$`FN z3;%@8&}lvpW1`2=gpUXH;ci$?8DAGzZcIa&4_=Q`J+sEY>*@_hLcYkZZ+2sYIsyOhyZ)vNi~i$h zcm35)ETO*^qjqiAX^$K^g8%w!UED7Lm;GH`ogc;@)>0Pp;X5?`ps}=j*8@7{fA1{_ zm27&*ct8urD7x#|PBPxL>)&?ok9UCyrJnvWYt{N=)7+n!K;;vm$Z`E61J8EHU|rZl z>>hAyqUJp(cK^HQit5fIc|eI<5DL3!eFgmG5h7H`Wx*$s{a*nGcO)8S%;sIc*nMXh zv$D&j99Q5f*!>aipq)DZkd$Mk#jdQG;V`_xCv74VXX;sU{?ii_Ys zMOVrdAy068=kZt&{H`G)UIuS#K`57s1@O(h#m?BVOfi=$E`gsuLa>E$bwTJfnpTHn z;ZdTjV>Q{beFoDG5$vvq^__?9pT-VwGhDVXL_~z#u5EivM7EGAF64FJ(MAQKXR`0zjJkcg+O4UUNDTp=e+ zTBp#`AgxnGBYPx=QFQ!7A2BIHt8GXmKQmh(#O`gmC30GwU*ec891}}xJd;?SEdyf| zV1R+h7ga&2&Q9dv*4QZt8#qcxqav<~Wp%zbv=kZKI`|MFY8vvrMNLz1-t;s4EPZ#$ ztv{FEJK7PHk-v|edxZxNx(PcB(TP_&`;s(~L*KB;0MB099!Q(bq2|5VBeD8kOv@KR zEQLy0+>ZUvxbaR4`Rq7(?*Q74UC>0B*cUW$6E#tFTitEwmBVw}_oCR7pcYW;5DF&v zfof%d>Y0-YU4}(srV0@hQI{*mYst5+CZD?+X|mi|75;)@QQp+MUhZ3OPrnl?6&qz8 zdK4Ebu+=J-T(?$f>T7FjV2p;C>n~DuigXZQI$4jF+|yyauY>#{M?UvaMAW+*UX^6R za6o2x?`7S9f->@{63y&Z)|(ohptDjSuL*VZcV?=&+{tt?59_5UlRw9ekKyDSY4Xq0 zh=7K1nnqPc8`2*Ezz!t@P#R>HVUHr_+69F>Ok(U3!^mIH^c%Wrp=&KNrD&HSA`vqt z$tPyW(=+|{E=(27=?=q#07!}$e+qx&4{w8^a~GmLwhr2|3t zL?Gz5z{YJleq!2V8THyiW3-BZZo}V4_?g~U*fc5Vmr8X7HOP#IFnd}!xhO1@L0+Pk zJi5d9*E`5Z=gHS+k*dD0D0-jqS9K0$40G}fvOGHsGAQ#40yrvz={%9*%Cp2EPh3)3 zsmy{;Uz-lfX>f^(jt`&Z5T>*3(4;BvEY~$CyLXTu&S&WFJC))y4iG(0rz|vNqRkn>M$wS#PT48D#m3pJ7K@pJ+_hcc z9JJj=p)`4hwm4Iq6ZgLH341ziO$lj=a`4s+B+xT-#+8O4G8st1(*l*TyUdXL<{j2d zhPODaW(&>RoIE91dCKXuQwR@-Sr9m@#lfXBoXwR=3uc-FJgi*CVd0z>+Gb6et#&SL zccI+9(`5!hWHOW)Quad1$+&DD_dA?b&bcRZzJ-&BLxnV(;r6#=q?#U-)C3{QBobGT#d@RhGQC+wf4g%#+u5 zd|>kHccFmx2RFYCX|jEe&B(8RR4u=r92y!D2JYz<<1F}F`#y3wbM2X;;3uwT_p@{@ zKQIx%ip*zFe&8eLAU~jmKsPNV2$-*l`GK%%KzrT%07_R>AxJ$dK-SSz3tDr3dRcSK zT4knRh|yk%(Vo+6!L46YtOGmK0&K#wB^1^Y*V>o*jE@+}2mTZ%e|TD43@+&71%2FH zeN=T79>EPCmpz5Vm9?#Y&95QM_S#mzb9>J|*?XFV&zHdxQG!*<+KO%Q{6O zdmNH;iP&R53-;8y)dv%KAhM?gjw*Zp@)-qtx`gFYc0p+3&FxP^vfyvlcysn3Re5vN zV3vbK#u&;Zl&Yy*mcJHG-&P^ZT!jMyDqDd)t*uNdWiXs_t8n9U3L~0g?J8BV_lu%UVrCc}d-DWGf@Po>o1{Ne{V`RyOHhoc(hby}&k!%p#5JD(P; zE-K}+TWN@#@*HjB;eL}9;<9v_qpfh`$tnmAvz4>joG#jCw>w>~lpRHRg;XZxpeZh8 zg)lE%P_jVK*OhY8c4wNWIL?_)+i5epxs(!Un_zQUd1z;`aGYSVz#S~dF%YA}#k;Jj zl+~8X2o@WFp&|54aW*?AIBXfa&1%b-VGX3yc8EkPX3?X5ADr3z!GaRLMo8j0- z@iHCTC{{pNLzMTyR}>1`BOjhQS)S!+(Vk0}qJH=zoV@p+L`8i+(4aas;KGtTYjs2? 
z&#ccih z&y@2>SgtH^^?CH0pTc(Jr|svFn!!T%c~po;b_P|6b_RWEV!$&$S26%){>y^2vP&lG z$FJe!mtPYZ&^nYV6HJDFSuWj2(Y;goLYfM8TDi`&_N@uOn_gGow_R}*;~w02A5NbB zKREf*Ux?_fji4Ay6~T2^3n(aH;-9q^#ywpC!z*q1`n|+un78QKhj=yGm6P?GOH61?{^<+U3l21Np!C6;9sw zD<%0VW3LBRjr_j1FR@{!?PY#e$a97OTtGh9(RjUCDpK4E#thxiMtOJiAQpf)wp6}$ zrBGQYT@O>1tQ}HpD3nioDGTcdrj(`1b4YS|7ofFvE=a;g{WOT&0nyQX3P4$>$abp~ zBX?0*3<*NzBgziR&j>R)d4d_xlQP30p(iu!Y_!}!;+%LyODv z-M?x}UO1tC;jYA)ZaSZ%yArK_54qtp=H$6Q(@mbcbJy?vC^dQEtnDVB{8l-yPW~<& zQzg&ixlr=F9Lgkrm(K%|AAqB49Brv&wye@g(lB(q9XF$nKOI>5WoMi{qW|#G<@5?fZo`*|0uB^U1WoT0$gf_CwO_rLIobp;`d?+zS_jE z0{Ti_cNIKH;U1h&8@6$jOjgLH#Vd04ktMWFqWcU`xW8J+qT8t=2Gx9NWv!x^ znu8-{s5~!JI26~0<{#TH6refWGDdylvC%>2{_7OTG-9#8{{Ge>3XRl&PfKkRW{`cf zTU+s3D^{R|n0DP!z^+EVV(9h2qPk54D!;)JUSZk_wW+cR?Ab}7g=-5^Zy zq**J#wc^dCP)^CT0#w#)E=)NE*a}lwv$-%qlF};~T7foHZ7x*Bn-p5%HdSsioys#ihY)B_U6KYw0kRIwZhGXRJ#q?3QbYb1W_?p;8k#ngEvwTdQ?8=a%#1;ti?(#1tozRB?F7hzO@Hdbyf(OEKu`wLO zFel&#eU3AXm2uB7qYN{{cxJ>hx4WT!2^GS7BTbQJI%D`%4l+gyr{ zlu~(K9B*=Ln)@6xb&G@%zS8xN5fsd3D z*IYI2oi^9_PNk;YHYKefA9HZ-L>BQhm6$Y7CnB?HAG;=-Omt}m(zRxP(MIzu9`=gEwvYmR$i4njdxf)?rIQ$(^J7rdMvjD zI*Is8rSdtXWAc1zk!y+`zb2Uo*5pp*(!Nm`BQm=-FUHJeq^k;T5~@MAe8&WqFeQf}IE7P^)Pds2uXIjy#n zCCt+**oJC)Y(j*)RdP%reX&f=CTWRY6*-ncZiRGWv{2*eR>=|RPbTflnNu}csNmMp zW~=y?L^gTU1@Ng^E@5qFKk{f@n4%+#sl;e0l@;ymR`CyXe>qp17s&p%&IMRcQlN9< zI)m0TwfT@wmQ(a--L|&QhkVNCnyuUBcQr@nQ!twnz49B-@Q_n#w$aH9nFPWINeT7tSSq z7VtB`mpTJ}W)As8TZ>=Y^>?e+-1O9Vxt*S^)oqzx!J0M&ep+F^oy=ffQ^?lY+RE$P z*cO0mGb7)r=4Z6AeQRs7|Fq`Mw(fV5 z31|uaEH{>ocizBo-!uJZKcJjl4R#q(q( znMki^YZ~ge$_}1ErmNtKqJ1^eqpeymAkLC*yQhV{dI4fX?eJ;ZH$a@BDW=mjHyYzp zZQ>GXpKA&B@9W^Cs8{oq&eeKf^GP74waWR}m3DqyJq{4e@bv8L#MbT^>EzV-Dwl9o z+u;Z6+X_!_{rW15MOLOEHrmFX!MawjSDTMbx}*{deJ^$v`AVa!$*J}+Fchl-8z#lC z)b#C~tW6E^29>q5p?<4uh3FSSY#rp7E<@b4oh((`smXLh@$EMDL-ffeA+96GCw!A_ z_kV5Mo+~*|Lafs>4nFQA=$&f?#XupZUFH(wz*E>ui1`V(4Zg3WEWF5H#e9W$tABYS zYF!Gc=dVJ$Re!;Z@-Fb-@z&z;kVY9t&+eHq$e)}*bp>XcVSGzI zlu1F)EP4*1T#aYR4Y`*YCJu!&@XSo0XBs7Bj?X}@#%)EpmpFR%pl7cY(lg95Gl773 zp}vfs!|0ho&)#X2wke_X%^2fB@X)gtJ%`YP$Bc%WMQNbX6KLoe^z4E!%mjR4Jb)O( zq|h%fbQJSN3^N}t!2gsEPbcE25gH9X;#cvT7uEUDJfF-cVmY}a;CpEXx}K>&8l~1B z_As+)dKvnPpW>y9!^6&|{ywp)4qxQIuQp!+?Ra=$czAiv4n3hII||eSE9&%!-5DMg zig9vedzdlQ-o+qzr~;UbJ|aI5qiP=!baHCe%;!f}fl1t@kcxvk593>XH}KYUh?zj` zFOAFa+z}Cih@=8u5S6y9cc@hu^&e*C{_Ei@ulGQfFdVMo8}`chde!-Z`0EyZ7lZ@j zUvNtN^fP1O#Rh#4nWE4Kq{B?jcSc<^sJ%JnSYIB9)o$5m1pPlku(b2Prw4IPwPx!M%{2%2`sD@p)dX zcIVgnhu8W``UkTSK8pTU)W3Bv2c^^D)#2flxf;J^{cVO{MEx7~s(|(clJ6wykD9le zuZTas$$S#!HGMf6_8BOi0finX>Ob4mUu2R(z8*BTFOmaadxe{F!yd&|WkLwwOP6Z; zCCf{F{`i98`h_q={GyxSwdj)wFSrP6nVB!lGoi2r=<~X2`qV}~WfQq{A{DIh<0~<%@=bLHt?HxCW-p5tk(Eb(^u&bLS5h8NWYr<~zDVeGeU|Kvhe1H2 zsJ{YVi3tcP=_A==DW{7{u0icBUih^he?)lo{IocfHx8YK3r+Bv{ehZ&Xvoi_`2l(; z>ci?LP#Mx6aG+p39yw06k2JHThs!U=Ohk!WxTY#p2KiEHP(08>)DG+rlogKh(D2qU Scpu&t|HGaSVOW@nQ2!q=M{ The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. - -## Configuration and command line Options - -The set of dictionary keys holding [DocIdTransform](src/doc_id_transform.py) -configuration for values are as follows: - -* _doc_id_column_name_ - specifies the name of the DataFrame column that holds the generated document IDs. - -## Running -You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both the new annotated `test1.parquet` file and the `metadata.json` file. 
- -### Launched Command Line Options -When running the transform with the Spark launcher (i.e. SparkTransformLauncher), -the following command line arguments are available in addition to -the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). - -``` - --doc_id_column_name DOC_ID_COLUMN_NAME - name of the column that holds the generated document ids -``` - -### Running as spark-based application -``` -(venv) cma:src$ python doc_id_local.py -18:32:13 INFO - data factory data_ is using local data access: input_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input output_folder - /home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/output at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:185" -18:32:13 INFO - data factory data_ max_files -1, n_sample -1 at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:201" -18:32:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/data_access/data_access_factory.py:214" -18:32:13 INFO - pipeline id pipeline_id at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:80" -18:32:13 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'} at "/home/cma/de/data-prep-kit/data-processing-lib/ray/src/data_processing/runtime/execution_configuration.py:83" -18:32:13 INFO - spark execution config : {'spark_local_config_filepath': '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/config/spark_profile_local.yml', 'spark_kube_config_filepath': 'config/spark_profile_kube.yml'} at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_execution_config.py:42" -24/05/26 18:32:14 WARN Utils: Your hostname, li-7aed0a4c-2d51-11b2-a85c-dfad31db696b.ibm.com resolves to a loopback address: 127.0.0.1; using 192.168.1.223 instead (on interface wlp0s20f3) -24/05/26 18:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address -Setting default log level to "WARN". -To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). -24/05/26 18:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable -18:32:17 INFO - files = ['/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_1.parquet', '/home/cma/de/data-prep-kit/transforms/universal/doc_id/spark/test-data/input/test_doc_id_2.parquet'] at "/home/cma/de/data-prep-kit/data-processing-lib/spark/src/data_processing_spark/runtime/spark/spark_launcher.py:184" -24/05/26 18:32:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'. 
-``` - -### Doc ID Statistics -The metadata generated by the Spark `doc_id` transform contains the following statistics: - * `total_docs_count`, `total_columns_count`: total number of documents (rows), and columns in the input table, before the `doc_id` transform ran - * `docs_after_doc_id`, `columns_after_doc_id`: total number of documents (rows), and columns in the output table, after the `doc_id` transform ran - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml deleted file mode 100644 index 369a1bb72..000000000 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_doc_id_transform_spark" -version = "0.2.3.dev0" -requires-python = ">=3.10,<3.13" -description = "Doc ID Spark Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/doc_id/spark/test-data/expected/sample1.parquet b/transforms/universal/doc_id/spark/test-data/expected/sample1.parquet deleted file mode 100644 index 765a6776b209cbc0903e7a09fa3af0f3844ba343..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36668 zcmeHw30zah^LPRXh*+%}44x77)dR>4iC+)G6+l34NYvs>@&bWel7!1*)l$A{y{$*R z>RqpT-&)TeT5qM+t6ufiYOVV9_WR#`$s;exi)|ILzwci@HObrA-I<-4ot>T8eNB<1 zaT#8W6h=fM!;e90LZfwB$9~T0^bL(Rghr#&JZPRYFPinwD+4^y88k)!qf;=b-I(Fc z@NUKQCE!e7TSZy`trJBlfa0AukkLTDG@2jX({p$%4XN^I#qbIg(!3i7GW|34dQ^w0 zy0JK+T!w#uU%)eZzzaH}LVW)Yrodn@C>q0q20j{tFBdD#Q z?9INyBS$aXX7y+Nc<T^ien;>7ub*_i^&Z}syrn#a zB{wkebllWHR5&K{{agC@^IWYeno?QtPAV<2{w z38Qt2P;tUV+;B|A5vs$*LT-4tLah{tg**k9t48@4kE>KDP%$dzVJd-ugNnn$)M0#{ zK&%oeI6{;wU`FsUfhtTHrWA=)DsC9~7OBF;XqZxg@l|4uQlwJwlwy$}T&+a8Vv(3H z;D|9LPb}aI!UTLJUqz^+Un-%F9-i~*$Xx$c41b$8Hc%Oq79FNH=W0~Aq+ZkusH%tj zSRa|;)9N2G3QASqN@uL5Co(+jTJYhg9<&Z#axiEUzZ;va&|0t%qehE`6rm+-VXj7@ z%hl-3SdkvpvN=kFj$O^L4%I{)Nya}`|TSFlO~oudYXMu{HYgh+=!8#=8P}2L_yfU)Y@{#~L^37BM2Mb@SNN zW-;^&JpYK8MU5LQzt{_%7}&SnvY^gCOAA+H(x%r>o%(O??OB!eJ(pDUYM!#4U*1|& z<thy7z1U?ACt-A6*`&Gae|pevFd7qib3%6OwG*aA&tKPV_TAJ4)vVM7 z$su8W6LcfHKmH_bVoqsAcVtq<^}?7JO`dBmE!Yu$W@>|jJ5oEJ{hjxp=;_P#=(&;I z51CT;ZTsc6&$GFc`{(%5=Ab(l>|8j^`*R&*!xyc!vrsjs`H9S(J3~h!OXFtm-1Os> zoT4@JQn{`78&tvStJ4Bardvxk#uk_Dt{%|sOTTX0_w*D7UTql|{8ZiLn(D-ZWv0)P 
zj-PM3ugfgU*`3S1f8G<;Q!IL& z%H$*2^YhKVJr1qu{u>q(Qkk_P{q(=RK3`rDSX{V$LaSj3MOSC;I5Ou@6gKg>{>U$i z_<6ny4zAm-JNE7EiRMS8v#x$I|5Jl6y@$DA;2-%vwi<(Am5<0>M?t~)D#rpmm&Xtid3C$DB34zU(jCQMni_xK|nJq~(V_Be%pPt$+B%sw0c1PT%dLxX^J_=)lkS-A?QK zYexfP=FXw>3Xk<)(D$;&E8%d+zxIUr{xKwa((>=J25#3*KNx(j!SsH`L<7CLPt#i+01`UcFKmPQB(o6BZg17AXB;)9Y{@oTl@sl*{Gh*4@ zE;~m)IPP;PXv629HWCP%FB~t%E*xWaz4~apvf8jE*1YDn;=8Z!c9|30{UhVJh98($ zjBTEf@nHG5`mI`j+@-ATrXH2E(weQk-?!}@uQ6k;u1cAf_-O1`OQYMeqpS0m{AcV} zK9jnnHr>LGI@xJ<_3X+9?Kh0ueXq2%@7+rw!n3>gEU1_9`BSXw=9OI=?>B7af3;~slhTz(&JTFv!DKF!Ox~+%e)8$1 z0W$AA{kAV=+-TTz-};N2)y1Ng&#U&bdK)VyRunIP&u916)Nh*QR9)|PV$xpY?9n&+ z=g>IfroUb^F1*row~8}(s9)muXZlso>v~evH>htKyGikuidk1R8yW^oJ^1b93%h*& zC=J@+;pHp(s7<5KIxc=Nq@-}ZasI4-R|jm{GbJQ)?vWkAVJAQ7kU6nAd;Ef9zUR^t zCO$9jx|$I)ukYsgxXMeP23`DY(TSM2l}Ur=oS4}}_6KXRsQ%Xrql?Ey&)PA(vWcw8 zms-)B?a`ac&)#f5KBBuQx<%Hd*fFPMvqlbH#Y(6*G-Xk0N9CUVqejKWPw23lKCtrJ z{KE-fwP_Mz+!URjpkj^>93QAjOKHEGZhU-iUdP$`s4EZw{j9i53;?!2<=jS zr{H<3@c&Lyk6gs5nmf$2NYK8^fTeRwhxbmH_q-sZX-BDOLGJF>kK4~V-Tq#a$_15+ z4C>PpfBGP$X&1qjHa!cQU1)K?$HWukPJz|({laop@VfS1OP4+SdFLlNy_)9s$iMll zP1VG5T~)iF(gB;eeyZZBpRMmFDVuSQZCYoLb_%Yz%+Qy2ypn#sN1vXLd-`WL98;Qc z_p+sT(c-V92cF&^GW5qijRsxkbc%iN%ff`Jz`ZxWf3I1yFS;O0bL1~}wEir5Ab;$E zUr(;uAKvMTFkUhk8M@u$YpqtW+0^s5kn7i!@x58OJ>xT@eZK3Oe(ma=^kn0C;o3{1 z%X`GEXn3*s)YhDD_8kw+*k){!w>$0Hpp5FEm~oX8DojsyPv7pF{hyR|HwXRPhM!Sl z-jOaEJ7l?N{Jw<&gQhng5_9~c4`Y`s8#lXayOxu;nTK7#Gx)7{v|bX!UU-&CLO6)IVH-H7+pBC#U9JR4<>&ZJ>_26 z%JW^8uf4D!=gWnsGdhYp-`E^{Z+N?DJ@U(L& zqFWxHRIcOh-+1`@qu-A6@r;_>HpAod@CVW=^>pnDzh2rYLDS6JV~==Et9RkO6V(G6 zOiMFtZqlN~cb*jsTOV$>{A0hC(wjqnlditezq?*oY={`6Yg@la@jxFUy%K$Q>dZym zN6bq7{Ygf*xuHMqWRJ?Jj^8NA3O>~-EckZS+P(i?bFZ-Yu1Rv4Rq7?l=*&F6^t(oi zgUshYjmQ|<|A%`!E7iZN=Ee%9Y`m=ZXm+gkmbRkwhtEA{aEC;0>^%PGX$LM9%D#JA zKS8|kz2T1GZ_tf6UK@*;C-qM zTCJf3IAPsbF?y4w2*XRu*%>LZER(epjmZ&BD5yzI5}Jo<^r5UYObe({fmUimkI>0+Z7)6*7gJT1%$4W7!1uz{-CMvN{6qKLcwS|3}p!gtI4R&bNvZ@-e*wp};@3BCt{%>c+V}6K8jBeU%0v9lhCv`?tF?1J-szuJuF~v7mqZ zP9fhZWbG;>yOJIX*Mv8c@{L!x^puJ zOr;~M(~(DMP_cF`91#!+0i<@UieP+PRxT$Mow)&$%Zt)vH3(~)a3WSR^P$Duo?&M5~0 z!di)d$eIW$h=apG@_P&xDc0pj5auJc#f)e)WJdp13_n}&&=(qG7|h0L$_OY$j`l<5 z_iGjAN2A0#%Y!iiL~cpiJhq+}?GI^d)_|ogIqSUqx!pRCTb(LAayTu%Lwn}ro1I_u zEMrzlwtCi&q<3o7Y}lFsqE$l^nxex-aJZuVt9=(%|K2uY*~7KcK2gh#sWyFhVE=;T zv&XML`-ppG`0?LXR6n0|dr61uzm=}*QQq+GC!8B)XJVWGw{5OyiV^gSe!NLdUoF*!0RF*%-}7@sW5OqE6PW21$A;&{w| z6g7lSI4bKN%&x2qVtgRPslsxLu_6;-3$7xoeF8T6AiD-2<35CH`;QvfKa91&KgAs#d4o|@2QgONUOB*m>4yNYdR+F8Bej2?BE6uI3$df$+HhUm{ z3`B+vgv@(+_kV_+k}=&Fi4KfueVvAT10bZ#58;SHcp?@@*p0&h|8>PbDO4}eZDpo= z5H&N7m$)zk?m2KxdO;7E>w(NDM%G#&Gu(ImGOUzL013&FP6^4&AR+1Brr{1thsU4Y z`JU%5d(fvrq^W-x_98wm+V2;}iIGU5bdvnSPtRiFw`R{6T-YhSq&{s3cdA(1bH)!D z3fVKm&-+?)`ZWzeg8VeBF{xYj0U|?Sbm*xMAJY*W)J5{P4p0(Dnx& z_a3=Wu4vNW(3T(1(T2PamvZO*bA$?H2^9*ZIvl8{lBeMFRALcFjS0CtkWf^Fal`mR zR4Cx{Ib4)43d7KFUYL*zGm&Z@U!*`q%!n|RN}#}0TyD5hpavOAHBT*8fezGakmclI zTy>b3BNU>_aDiH_!zze|ReC)t%|}BfMLhV$3-h8ygq@%M<>Hf@VZz_pax$&! 
z6RZ#0B0vmHW=)XCC)K7lk;ZSJzLy5>+a;BJT(Yki z&0X4JL0AL-&{+fhi`JYQg_a)gl~2>s3TZ2Z5jp2h<*m)^-(|t-gcbYJ8cn~|y`@*j zMR%%hj2QgGuxq{h9y`%>!M0VCdR1gToVjWHiHF#ll4S*BEmuw@81A&=>?_Oe{ma>N z3F31n4*qLRivzLSxRcbgH$+<&ub0nilfKlqi!3;xfycez9;c?ysP}EJpE<2BH|?2q zenfSvCLQ|t8$Y_#c^z&12fgLmiOR6}%5l=!MndaU2PHmx^* z=hc%7w?@qwYV`a3(ZK7cI-FCWhY#J$Y4g#P9lmAXzIQRGlekyU3xgUyi5~yH?~nYT zS&f^YFGbD|PrNqD&#&GX@NaBTrD$+ZpRfF+2Pb^upC8@9?A?28Sle#`%7#r|%XYQb~s&(G$w{iw8{;A!Qw57j(DNpsk z+)Fm8H7|+gyD};3af7Myfy=OypN&2%TQ_alD&@?i7v^ld_vDFx1U{S$i!xG#e5WNi+L6UdE+a$+rM@-)QM zYU^3D8b{?n9sx(eCVCAWsalOspI1Zv3M>*@JvjR?!mTb`7~#VG8H;T7?H$-zJron@BA3!%B^sBFu?D7dOz6{p)M^^Y<}R@+X+l(;?Opu!;ub_cWtLq17lq!b~O$ zNC#U$W}3nUZ{k{uNl`|O+0jL>{Dw5-_drAuW-JKW=ac@!-iY+P8J@NNQ8H@@J{qG%MAiFg(Th+5niza@#gA&6`$fo@?cr%2kO z-E?Hft{No=7K;MG%WzZ5z6B%LLCG6xXf&8Iw1CK_TBFBQ6uMo%2FSGkq#oMTJ*1&| z7h!pXUoeXvu#1k&`hkud{~jNcy(e;~lr)3OA`Ude+aEpvKbR;mIzmCQj}62meo#|; z|3?zF_3Sz|fgZ4)jtu(|ga`KG#1h@u`=ccE)hE{Q10?h=g z(mxK8kb2{1RqSXN9i$^q09EehG6>3!a|+5n2SM5WZ5pra*l7N>43!F-Xx@Ob^69}R z=ADz}d*m#AF??C;+uqZXe^QN#>GZYOXOvgTQU1r}0g=A@6DtQqw_Cojm!d`Tk(N8E zZ$~uyC6)ijA6=vG6!iIg)vZg7BU8%4m*ndY-yOou?cHl*`?a~N6F&Jd;^57++gs(? zo%=raZRbU&jLTvQwIZCyMHK=NRuwC`d=O+6D@437kxI!$QISd&#^Z+YEM@6{yjYe*c0$u;GpJlyDmn4?-NCAoiPMgNO=G3%=8P}y-+bpTP1u<8;O)gXe~NxbjP|R?Yc!r zCfvYf_m5qY-s*Ma`S$@6iTlV6ftduCKgn#gUPvBx4`g2cpug^Y-N0v^g*2wONtuu7 zP(}*4rrgoP@e;JPh;(I`#_$Jq$X8Q4(h^&N35}Q6+*2eMTJNEeOKJ}9IusJ;9?6CF zqk`m8@|ioL+z*gdbLh=V4fSa!q5&L-^SKr>jC5jyq8RSD$i*N#PJV<5Q{ov?MkEkv z4ERJ&>sd)9l50d8NSrMTMJ^)8Ov&Yr;UF@E6a^s{J4Ffz5Y9wmZ*HZP*fCq_blh?h z&2Vb1J2rBWbz4M~z`pVAiM3zbroq;&g9}TvfZY=Q8&L*(@wWO>HqO~x7@~rebNF#g za*@NzN(MMDG-Mdrh!N#*ia5?oT7X|*tH5WS>gH7fNS)l}{~x%B{{Q0zYGm&J`TM`f zyIUUbt{uIlYe%jd1LXB@W*~3=>I3rTE+oAAr2^#L!Sh+q>^g;ds5Z<1WbC zIYolh?#pE9HKarS1>M%1DPClzX99_lzK z$a<62n%Xl3bz}7%6a-#xr~@SZCu(1Uj)fG6ETL&+iFhayyiqAp@A=ocqtu3h1#XS{-Om} z^V1)DE!P#p`$r$Q8FMyms*u2Z&rTlOawxNfY&zX0YvIk1I|Rpq5cOT)%Ne z>;iT||C+Pddfv(vt-ZW<@!@w|T+!7O*Df}AkHr-kQR;;2b#=#je}w`ae33s+MlB;G z+%)EwKD2v@u$K<#3*9@uk5XfJod_qr0b9SU^CNCG&ad`+fW?<46x+y?`)d~6ra1)+BM&W+~E@L&XwX;@ueG2d>Q4x4Av7rbL-e@(7BDAOs z#%_#dXi<@&1oDBpuY@GOxeiPFKZ@OYipB9~?fa(KzbinuJZLZ77Rqlr{tWeG)Swy-e1z>pWOPsj(i z7q#)4lDv4GFkg|K3F}x?*`=n$D2YbNPs>;8Q$d-e#P(dQjEWD7kIpp3MW6w>)4gGtp`%@t~+U|I57&&54(KrxUG08cvq<4C3Ar+ zEH?crk%2iR%Sl$N;sE5?)o&NN?K~gM5N(#|8MA;4%icY8kAGHJ6VExY5}&vqKZqx@z#|b6;NOJS^09Fb-WaEGy%EkRoSt6 z8&B&>j!=KpDyf#Eu*rggHkUee#Wy*$$+2R9pPP?n3*6Zc9b6Z3cxlCo>{z2hgWK7< z;vdlcQmt)XK>P1H7r=U=06K@SGjKhlnh!d;Nx_S?+g8{4pi@Z2`F7jTN5=`#&tOizR zdIi?B9N?$v3*E^K%xf;RbwaoDdNZ~K;1zP9?_~2c+}OUFHQDM9yVI#RMI#@s^$Awq zxcK$>zJLq-g)+?QyVup4Q7Gd7IA9}QvOdUjk_(hNZ3*xh zitBi-)yj2p;1dGhQkMn&Q=32Qy5Es!gPOpfMRVj@xYw#1R&tv0{0KCdnLDa=8ddd;5A=4s$!+iy-$zwvV>M zpI#3Ah@4bjvULwmu}63Ezy8YJR>w6cK@0=N0o<&oW;8CjK*0xbfV#DNB}$#R0L;^J zd3Kt8EvK-5%D9AL8853`$;&DPGB(%oJSmfBt6pYn9N=|j2meH-<-iw(`)at})zx|d z;w&n+ds?tp*Mr!QJ9?b<4Is|o6w`5<8;~Vl(wUU>gpH#PdMxHz;p&ZQ?TinqH*0(YxukP1ZEH14~24bUb>={_s z>~O02=%h=vWM`qT6k9IOaUTPNu_|E0tnn)5v|GBOErsSLjVx3V5z{kx3dM7S}F;EcG zHlf)GfTv(Df#)Y8h2VFYg0IB+OPa4B-Wp$;lrAVqERbkmJe(BovCn(hM?t&=)GLqI zb70;@B8jub<3Sn)Bx67ZB{dFwiXy>tltd|!ByV-YaXa5=?#-Yka{AC1fM0Q zu4*fq#((5DPch`-zT+_zP3?;D2uqz^R#dS73DT0sn*_(GVtub|*_siY;m7l4t>J zU-D<|VJEisU{95=;+2B7_@AL`F&KqYdnpdG-_)znQqn797bCkN+axikVE65E3Xp}@+{#WqT z=(HpW*6*50;EPPdao|Lf@Zs`xS$|=z6xf~}L$rSxy@dY(MuM?ZZ1ko?6ZA$~^}(4x zkiU_*FNISUUnH{fGhPyxQtY4)I8#XafGeEMmxpcDus$av{N?h1td1mnSTG-8eX+^L z-^iEo$Hgy)q$k7nU6n+8H~fJ%L!!vexB9865+2(+-=ZW*1q8h|{@_!Sq(6{fkwx+x zEnmS;C{*I$NL@|)l$!R|`39|lgZ>F%d)K`sD3zs@v)N?@HhvTO>x|z3+dJ%?p?!h! 
z&4TS=b$9a>@<*Dblfyi#RLcYV9vEK%g;65dKHsT5&Lon2qhR0CG%fJ`%g~%iuxAom zBz+*gT}y2GMaYYK{z#MY^%PPF`Nea>=b}$IzT{%CwsMsELP?(#J{VuL*rrczY`^fO5}q3X zHq4q+24p9v099v~6Z~|euW;;lGL6j0u6!6wfOQSmhw_^C@i>t#{Dr7v9nL}ke+}{%(rfc)Kxbgy0n{?+H30_< z@0=Jk;V(fWN!EY7p7S{nSO@A4aOgmPN?;s_KkLk(O4sCfbGGsLm1YBV|b4i z>*whfa$n%Z9jW-uBqItp!j4ciUd`%PN8#-yz^!Hn1;l~SnPd8h+^cj4Q%-8_ns zZ{qz1O`i?h-rHyDju#4CUPzA+<{`?^VeB%#jT>Lb$&ZrSvjNd}6llQ=9T@!Tg+CZ} zo@o1WXPvgLI&BKd?$MA>ZrdHbC0nKT%!$~=WwAq$%xNMZe zk;?E^myJ$QREDz%G{vVl5MGJ#5G0LxU`<+NMvcDu`Fr!r}$i>K_g1BAHOV5;ky z9XoVvqqLchZIm(~tm}ZJ#~s@qz{wkW`VHMRHQt#n^3}XhsGQ8EMOnYoX}oni`B)eE z>+SvT6ZBQ*$P>8X%Xq-hA?nf9-XsuNwu}r9r*c(cu$0XSeO=+f1#W5Bb~2mFpUf62 z!a{+|4Vyc6ALEPp;YDi5JS-H3(_Doc#t)1^*-5kXaoMaxbLC;NM5k?)F zDku?h3owL^3N+I>92=OUBP0#LyD;)bLtO#1vI2v2HzEOG%WS1mDudcAEiDa!V1|&ifFot38mo)>VaE{tkxH?YqQz*n-1!XQ7(6=sAhFl;f5-CQYUhG{b`_sa|AGB+cX2MhDU z%3z_umq)PGJ26eA)alkxAJMt3g~~!{uuv{>W;10KRBT9KGqE&{tUFo_d#XltBW}%> ziqu)jR&s)qKN+Yifn@$$iEe+E=eh!Q3^ zA}Y(6@Q5-dqSaEVSQdztVwG6p3KgPKBzWD2-uV4uK_Hfj3u#oq6^fNvVSy+Xaw|l+ zm=}m*hJX^HlFbW4#5DBfxdO2ulmxCKq@j_pEb!F|G=N5)K*+eBD>JNHYVuG29Mq-3NN+j7x=qLf&+4;(q@aXpiLlM?XxTlHO7)wN_qcL`( zhw)MrOZbD6Ou+9YMnjPaCZ_Mh=)OK8%tV+8HpWKu{g|b%U&91&)7}VsBF;t)mtodE zq8B>F*-@gU<4utY3GMA%rw&rY&L z62JHW5el;rCg>$%II;IA;fWqM>m8VL$bsEF>WB6~WH{sx#-Nuc6!8*2zW&oR{6Ndt zahkxxvoNt-EYyy(o*3&rN<>4RZ@eQvi4o0K|ilIH92(xC8IiJGy-lmSM&^NBW~<@t8N12=@JF zJazDaZi=SZiC=j^d=Zvl;3pU&JRWzHV0tvc(80$EHWnqk?5ICD@ErFcjn{j5j{T~J z_3EP-t&f^PHJ~*S4#cAbt=oZLrUicK9^T7Nd05tq@9iG*#E$l8_f7T2!(JxVH^q^A z!xOP!e1iEWO`IKg+(mRVfk5cg$RRHgjfEnNk3CB0hnUg`*j#J&$M|j82=H?;faxC} zha%f4E0)xKs+WoQ*vR1}ZDNe+e172Xy4{TD_$~BaikNT{qo=U{TEVD+zrOC0TY{%% zTyGu=1}4ygP5Fty1mOuyObq;SFXQpB;e#L6U5r_2{WgQ|GB*p-BRmmcHE=S<_DpCg zGdnba=}&1vLIb}I1mY(>fjAKbp8G>Vg7L)slZFwDx_Sh=26Azex+~fXmnEP;N z&O`x*0E)fe@D+@`|MS>g?q_iw)o)=5Z#*3EXYF|G;9vGe33keV>Pj7n*-nRW-LS^) zA2|8S2<6cFgFfBobOV?Z5BEkx@rXxP{Ni?Qa`0&#c?{+is3{wXgd#-9<%NK}6XMjd?8_1)}OK;7;C zfN3nW?vjHqYnb3QFS##)O^f~#6ZH4^LHa=qD4TeA&$HU2-Z3UR=7GUHk&yngm`&%8 zXLSBM#=DOa?)cM32CfB`McG*A>w6=i0O9rzTr=Q~e|^kh_QRUOS5`^_0rQe^GtpB5 z{eWiA5iQ12uW20AO*E#z-C#A*EHTP>qCGERp{H)&^K)HyY&y&uu0Q-$t;3S|`3r}? 
z-#tyZ!$q;+QjX33kyR>H5lh{1r|=stg1XZg{M1+3TSqdu)@k*-qd2}eup0LZzlRZrKZ`kSb9mJ7 zpY}#!A@Pj4?*S0Da&QQ{B93dEj;D6uNB+Wj5wHE?Z!zc72YW7NcijJEPyfMLm@%GE zcp8ZJ2W&TDu04kCZciv4#uK2lnClj>g^`}e|KAJWOzW=HxM=;|omUP#bH(GB#$`FN z3;%@8&}lvpW1`2=gpUXH;ci$?8DAGzZcIa&4_=Q`J+sEY>*@_hLcYkZZ+2sYIsyOhyZ)vNi~i$h zcm35)ETO*^qjqiAX^$K^g8%w!UED7Lm;GH`ogc;@)>0Pp;X5?`ps}=j*8@7{fA1{_ zm27&*ct8urD7x#|PBPxL>)&?ok9UCyrJnvWYt{N=)7+n!K;;vm$Z`E61J8EHU|rZl z>>hAyqUJp(cK^HQit5fIc|eI<5DL3!eFgmG5h7H`Wx*$s{a*nGcO)8S%;sIc*nMXh zv$D&j99Q5f*!>aipq)DZkd$Mk#jdQG;V`_xCv74VXX;sU{?ii_Ys zMOVrdAy068=kZt&{H`G)UIuS#K`57s1@O(h#m?BVOfi=$E`gsuLa>E$bwTJfnpTHn z;ZdTjV>Q{beFoDG5$vvq^__?9pT-VwGhDVXL_~z#u5EivM7EGAF64FJ(MAQKXR`0zjJkcg+O4UUNDTp=e+ zTBp#`AgxnGBYPx=QFQ!7A2BIHt8GXmKQmh(#O`gmC30GwU*ec891}}xJd;?SEdyf| zV1R+h7ga&2&Q9dv*4QZt8#qcxqav<~Wp%zbv=kZKI`|MFY8vvrMNLz1-t;s4EPZ#$ ztv{FEJK7PHk-v|edxZxNx(PcB(TP_&`;s(~L*KB;0MB099!Q(bq2|5VBeD8kOv@KR zEQLy0+>ZUvxbaR4`Rq7(?*Q74UC>0B*cUW$6E#tFTitEwmBVw}_oCR7pcYW;5DF&v zfof%d>Y0-YU4}(srV0@hQI{*mYst5+CZD?+X|mi|75;)@QQp+MUhZ3OPrnl?6&qz8 zdK4Ebu+=J-T(?$f>T7FjV2p;C>n~DuigXZQI$4jF+|yyauY>#{M?UvaMAW+*UX^6R za6o2x?`7S9f->@{63y&Z)|(ohptDjSuL*VZcV?=&+{tt?59_5UlRw9ekKyDSY4Xq0 zh=7K1nnqPc8`2*Ezz!t@P#R>HVUHr_+69F>Ok(U3!^mIH^c%Wrp=&KNrD&HSA`vqt z$tPyW(=+|{E=(27=?=q#07!}$e+qx&4{w8^a~GmLwhr2|3t zL?Gz5z{YJleq!2V8THyiW3-BZZo}V4_?g~U*fc5Vmr8X7HOP#IFnd}!xhO1@L0+Pk zJi5d9*E`5Z=gHS+k*dD0D0-jqS9K0$40G}fvOGHsGAQ#40yrvz={%9*%Cp2EPh3)3 zsmy{;Uz-lfX>f^(jt`&Z5T>*3(4;BvEY~$CyLXTu&S&WFJC))y4iG(0rz|vNqRkn>M$wS#PT48D#m3pJ7K@pJ+_hcc z9JJj=p)`4hwm4Iq6ZgLH341ziO$lj=a`4s+B+xT-#+8O4G8st1(*l*TyUdXL<{j2d zhPODaW(&>RoIE91dCKXuQwR@-Sr9m@#lfXBoXwR=3uc-FJgi*CVd0z>+Gb6et#&SL zccI+9(`5!hWHOW)Quad1$+&DD_dA?b&bcRZzJ-&BLxnV(;r6#=q?#U-)C3{QBobGT#d@RhGQC+wf4g%#+u5 zd|>kHccFmx2RFYCX|jEe&B(8RR4u=r92y!D2JYz<<1F}F`#y3wbM2X;;3uwT_p@{@ zKQIx%ip*zFe&8eLAU~jmKsPNV2$-*l`GK%%KzrT%07_R>AxJ$dK-SSz3tDr3dRcSK zT4knRh|yk%(Vo+6!L46YtOGmK0&K#wB^1^Y*V>o*jE@+}2mTZ%e|TD43@+&71%2FH zeN=T79>EPCmpz5Vm9?#Y&95QM_S#mzb9>J|*?XFV&zHdxQG!*<+KO%Q{6O zdmNH;iP&R53-;8y)dv%KAhM?gjw*Zp@)-qtx`gFYc0p+3&FxP^vfyvlcysn3Re5vN zV3vbK#u&;Zl&Yy*mcJHG-&P^ZT!jMyDqDd)t*uNdWiXs_t8n9U3L~0g?J8BV_lu%UVrCc}d-DWGf@Po>o1{Ne{V`RyOHhoc(hby}&k!%p#5JD(P; zE-K}+TWN@#@*HjB;eL}9;<9v_qpfh`$tnmAvz4>joG#jCw>w>~lpRHRg;XZxpeZh8 zg)lE%P_jVK*OhY8c4wNWIL?_)+i5epxs(!Un_zQUd1z;`aGYSVz#S~dF%YA}#k;Jj zl+~8X2o@WFp&|54aW*?AIBXfa&1%b-VGX3yc8EkPX3?X5ADr3z!GaRLMo8j0- z@iHCTC{{pNLzMTyR}>1`BOjhQS)S!+(Vk0}qJH=zoV@p+L`8i+(4aas;KGtTYjs2? 
z&#ccih z&y@2>SgtH^^?CH0pTc(Jr|svFn!!T%c~po;b_P|6b_RWEV!$&$S26%){>y^2vP&lG z$FJe!mtPYZ&^nYV6HJDFSuWj2(Y;goLYfM8TDi`&_N@uOn_gGow_R}*;~w02A5NbB zKREf*Ux?_fji4Ay6~T2^3n(aH;-9q^#ywpC!z*q1`n|+un78QKhj=yGm6P?GOH61?{^<+U3l21Np!C6;9sw zD<%0VW3LBRjr_j1FR@{!?PY#e$a97OTtGh9(RjUCDpK4E#thxiMtOJiAQpf)wp6}$ zrBGQYT@O>1tQ}HpD3nioDGTcdrj(`1b4YS|7ofFvE=a;g{WOT&0nyQX3P4$>$abp~ zBX?0*3<*NzBgziR&j>R)d4d_xlQP30p(iu!Y_!}!;+%LyODv z-M?x}UO1tC;jYA)ZaSZ%yArK_54qtp=H$6Q(@mbcbJy?vC^dQEtnDVB{8l-yPW~<& zQzg&ixlr=F9Lgkrm(K%|AAqB49Brv&wye@g(lB(q9XF$nKOI>5WoMi{qW|#G<@5?fZo`*|0uB^U1WoT0$gf_CwO_rLIobp;`d?+zS_jE z0{Ti_cNIKH;U1h&8@6$jOjgLH#Vd04ktMWFqWcU`xW8J+qT8t=2Gx9NWv!x^ znu8-{s5~!JI26~0<{#TH6refWGDdylvC%>2{_7OTG-9#8{{Ge>3XRl&PfKkRW{`cf zTU+s3D^{R|n0DP!z^+EVV(9h2qPk54D!;)JUSZk_wW+cR?Ab}7g=-5^Zy zq**J#wc^dCP)^CT0#w#)E=)NE*a}lwv$-%qlF};~T7foHZ7x*Bn-p5%HdSsioys#ihY)B_U6KYw0kRIwZhGXRJ#q?3QbYb1W_?p;8k#ngEvwTdQ?8=a%#1;ti?(#1tozRB?F7hzO@Hdbyf(OEKu`wLO zFel&#eU3AXm2uB7qYN{{cxJ>hx4WT!2^GS7BTbQJI%D`%4l+gyr{ zlu~(K9B*=Ln)@6xb&G@%zS8xN5fsd3D z*IYI2oi^9_PNk;YHYKefA9HZ-L>BQhm6$Y7CnB?HAG;=-Omt}m(zRxP(MIzu9`=gEwvYmR$i4njdxf)?rIQ$(^J7rdMvjD zI*Is8rSdtXWAc1zk!y+`zb2Uo*5pp*(!Nm`BQm=-FUHJeq^k;T5~@MAe8&WqFeQf}IE7P^)Pds2uXIjy#n zCCt+**oJC)Y(j*)RdP%reX&f=CTWRY6*-ncZiRGWv{2*eR>=|RPbTflnNu}csNmMp zW~=y?L^gTU1@Ng^E@5qFKk{f@n4%+#sl;e0l@;ymR`CyXe>qp17s&p%&IMRcQlN9< zI)m0TwfT@wmQ(a--L|&QhkVNCnyuUBcQr@nQ!twnz49B-@Q_n#w$aH9nFPWINeT7tSSq z7VtB`mpTJ}W)As8TZ>=Y^>?e+-1O9Vxt*S^)oqzx!J0M&ep+F^oy=ffQ^?lY+RE$P z*cO0mGb7)r=4Z6AeQRs7|Fq`Mw(fV5 z31|uaEH{>ocizBo-!uJZKcJjl4R#q(q( znMki^YZ~ge$_}1ErmNtKqJ1^eqpeymAkLC*yQhV{dI4fX?eJ;ZH$a@BDW=mjHyYzp zZQ>GXpKA&B@9W^Cs8{oq&eeKf^GP74waWR}m3DqyJq{4e@bv8L#MbT^>EzV-Dwl9o z+u;Z6+X_!_{rW15MOLOEHrmFX!MawjSDTMbx}*{deJ^$v`AVa!$*J}+Fchl-8z#lC z)b#C~tW6E^29>q5p?<4uh3FSSY#rp7E<@b4oh((`smXLh@$EMDL-ffeA+96GCw!A_ z_kV5Mo+~*|Lafs>4nFQA=$&f?#XupZUFH(wz*E>ui1`V(4Zg3WEWF5H#e9W$tABYS zYF!Gc=dVJ$Re!;Z@-Fb-@z&z;kVY9t&+eHq$e)}*bp>XcVSGzI zlu1F)EP4*1T#aYR4Y`*YCJu!&@XSo0XBs7Bj?X}@#%)EpmpFR%pl7cY(lg95Gl773 zp}vfs!|0ho&)#X2wke_X%^2fB@X)gtJ%`YP$Bc%WMQNbX6KLoe^z4E!%mjR4Jb)O( zq|h%fbQJSN3^N}t!2gsEPbcE25gH9X;#cvT7uEUDJfF-cVmY}a;CpEXx}K>&8l~1B z_As+)dKvnPpW>y9!^6&|{ywp)4qxQIuQp!+?Ra=$czAiv4n3hII||eSE9&%!-5DMg zig9vedzdlQ-o+qzr~;UbJ|aI5qiP=!baHCe%;!f}fl1t@kcxvk593>XH}KYUh?zj` zFOAFa+z}Cih@=8u5S6y9cc@hu^&e*C{_Ei@ulGQfFdVMo8}`chde!-Z`0EyZ7lZ@j zUvNtN^fP1O#Rh#4nWE4Kq{B?jcSc<^sJ%JnSYIB9)o$5m1pPlku(b2Prw4IPwPx!M%{2%2`sD@p)dX zcIVgnhu8W``UkTSK8pTU)W3Bv2c^^D)#2flxf;J^{cVO{MEx7~s(|(clJ6wykD9le zuZTas$$S#!HGMf6_8BOi0finX>Ob4mUu2R(z8*BTFOmaadxe{F!yd&|WkLwwOP6Z; zCCf{F{`i98`h_q={GyxSwdj)wFSrP6nVB!lGoi2r=<~X2`qV}~WfQq{A{DIh<0~<%@=bLHt?HxCW-p5tk(Eb(^u&bLS5h8NWYr<~zDVeGeU|Kvhe1H2 zsJ{YVi3tcP=_A==DW{7{u0icBUih^he?)lo{IocfHx8YK3r+Bv{ehZ&Xvoi_`2l(; z>ci?LP#Mx6aG+p39yw06k2JHThs!U=Ohk!WxTY#p2KiEHP(08>)DG+rlogKh(D2qU Scpu&t|HGaSVOW@nQ2!q=M{ list[tuple]: fixtures = [] basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration()) - config = {doc_column_name_cli_param: "contents", - hash_column_name_cli_param: "hash_column", - int_column_name_cli_param: "int_id_column", - start_id_cli_param: 5, - } - return [(launcher, config, basedir + "/input", basedir + "/expected")] \ No newline at end of file + config = { + doc_column_name_cli_param: "contents", + hash_column_name_cli_param: "hash_column", + int_column_name_cli_param: "int_id_column", + start_id_cli_param: 5, + } + return [(launcher, config, basedir + "/input", basedir + "/expected")] diff --git 
a/transforms/universal/doc_id/ray/test/test_doc_id_ray.py b/transforms/universal/doc_id/test/test_doc_id_ray.py similarity index 83% rename from transforms/universal/doc_id/ray/test/test_doc_id_ray.py rename to transforms/universal/doc_id/test/test_doc_id_ray.py index c55342017..d2a3b3900 100644 --- a/transforms/universal/doc_id/ray/test/test_doc_id_ray.py +++ b/transforms/universal/doc_id/test/test_doc_id_ray.py @@ -16,12 +16,13 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration -from doc_id_transform_base import (doc_column_name_cli_param, - hash_column_name_cli_param, - int_column_name_cli_param, - start_id_cli_param, - ) +from dpk_doc_id.ray.transform import DocIDRayTransformRuntimeConfiguration +from dpk_doc_id.transform import ( + doc_column_name_cli_param, + hash_column_name_cli_param, + int_column_name_cli_param, + start_id_cli_param, +) class TestRayDocIDTransform(AbstractTransformLauncherTest): diff --git a/transforms/universal/doc_id/spark/test/test_doc_id_spark.py b/transforms/universal/doc_id/test/test_doc_id_spark.py similarity index 97% rename from transforms/universal/doc_id/spark/test/test_doc_id_spark.py rename to transforms/universal/doc_id/test/test_doc_id_spark.py index 6d945bf9e..7a899419d 100644 --- a/transforms/universal/doc_id/spark/test/test_doc_id_spark.py +++ b/transforms/universal/doc_id/test/test_doc_id_spark.py @@ -16,7 +16,7 @@ AbstractTransformLauncherTest, ) from data_processing_spark.runtime.spark import SparkTransformLauncher -from doc_id_transform_spark import ( +from dpk_doc_id.spark.transform import ( DocIDSparkTransformConfiguration, doc_column_name_cli_param, hash_column_name_cli_param, diff --git a/transforms/universal/doc_id/transform.config b/transforms/universal/doc_id/transform.config deleted file mode 100644 index d3715f3b2..000000000 --- a/transforms/universal/doc_id/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=doc_id - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). 
-DOC_ID_PYTHON_VERSION=$(DPK_VERSION) -DOC_ID_RAY_VERSION=$(DOC_ID_PYTHON_VERSION) -DOC_ID_SPARK_VERSION=$(DOC_ID_PYTHON_VERSION) - From 49a22aea6a8c1c572fc997e0bcbf990871861217 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 14:14:24 -0500 Subject: [PATCH 02/28] added __init__ Signed-off-by: Maroun Touma --- transforms/universal/doc_id/dpk_doc_id/__init__.py | 4 ++++ transforms/universal/doc_id/dpk_doc_id/ray/__init__.py | 0 transforms/universal/doc_id/dpk_doc_id/spark/__init__.py | 0 3 files changed, 4 insertions(+) create mode 100644 transforms/universal/doc_id/dpk_doc_id/__init__.py create mode 100644 transforms/universal/doc_id/dpk_doc_id/ray/__init__.py create mode 100644 transforms/universal/doc_id/dpk_doc_id/spark/__init__.py diff --git a/transforms/universal/doc_id/dpk_doc_id/__init__.py b/transforms/universal/doc_id/dpk_doc_id/__init__.py new file mode 100644 index 000000000..0bedd041c --- /dev/null +++ b/transforms/universal/doc_id/dpk_doc_id/__init__.py @@ -0,0 +1,4 @@ +from .transform import * +from .local_python import * +from .transform_python import * +from .local import * diff --git a/transforms/universal/doc_id/dpk_doc_id/ray/__init__.py b/transforms/universal/doc_id/dpk_doc_id/ray/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/transforms/universal/doc_id/dpk_doc_id/spark/__init__.py b/transforms/universal/doc_id/dpk_doc_id/spark/__init__.py new file mode 100644 index 000000000..e69de29bb From 808521a3a9b7d5e1ae700015d00a5d54012d964e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 15:15:30 -0500 Subject: [PATCH 03/28] Fix typo Signed-off-by: Maroun Touma --- transforms/universal/doc_id/Dockerfile.ray | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index f5bf58cae..377543678 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -14,7 +14,7 @@ COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chown=ray:users dpk_html2parquet/ dpk_html2parquet/ +COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/ COPY --chown=ray:users requirements.txt requirements.txt RUN pip install -r requirements.txt From b8f3b6f96d572dcc45017a33c4eb92ef45ab46f1 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 5 Dec 2024 15:53:54 -0500 Subject: [PATCH 04/28] remove spark unit test for now Signed-off-by: Maroun Touma --- .../doc_id/test/test_doc_id_spark.py | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 transforms/universal/doc_id/test/test_doc_id_spark.py diff --git a/transforms/universal/doc_id/test/test_doc_id_spark.py b/transforms/universal/doc_id/test/test_doc_id_spark.py deleted file mode 100644 index 7a899419d..000000000 --- a/transforms/universal/doc_id/test/test_doc_id_spark.py +++ /dev/null @@ -1,45 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-import os
-
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_spark.runtime.spark import SparkTransformLauncher
-from dpk_doc_id.spark.transform import (
-    DocIDSparkTransformConfiguration,
-    doc_column_name_cli_param,
-    hash_column_name_cli_param,
-    int_column_name_cli_param,
-)
-
-
-class TestSparkDocIDTransform(AbstractTransformLauncherTest):
-    """
-    Extends the super-class to define the test data for the tests defined there.
-    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
-    """
-
-    def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = "../test-data"
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
-        fixtures = []
-        launcher = SparkTransformLauncher(DocIDSparkTransformConfiguration())
-        transform_config = {
-            doc_column_name_cli_param: "contents",
-            hash_column_name_cli_param: "hash_column",
-            int_column_name_cli_param: "int_id_column",
-        }
-
-        fixtures.append((launcher, transform_config, basedir + "/input", basedir + "/expected"))
-        return fixtures

From 3baad3e75e76d11455a448e3689140af853b9099 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 5 Dec 2024 15:54:41 -0500
Subject: [PATCH 05/28] Show example for running ray runtime

Signed-off-by: Maroun Touma
---
 transforms/universal/doc_id/Makefile | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile
index bf0d39543..6429d4c9d 100644
--- a/transforms/universal/doc_id/Makefile
+++ b/transforms/universal/doc_id/Makefile
@@ -15,8 +15,9 @@ TRANSFORM_NAME=$(shell basename `pwd`)
 
-run-cli-say-sample:
-	$(MAKE) RUN_FILE="-m dpk_$(TRANSFORM_NAME).ray.transform" \
-		RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
-		--doc_id_int True " \
-		.transforms.run-src-file
+run-cli-ray-sample:
+	#make venv
+	source venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
+	--run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
+	--doc_id_int True

From 37881ef5d76ff7a15bb6f98c03d09697b3aeb710 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 5 Dec 2024 17:54:49 -0500
Subject: [PATCH 06/28] fixing issues with spark

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets                 |   4 +-
 transforms/universal/doc_id/Makefile          |   9 +-
 transforms/universal/doc_id/README.md         | 155 +++++++++++++++---
 .../test-data/expected-spark/metadata.json    |  46 ++++++
 .../test-data/expected-spark/sample1.parquet  | Bin 0 -> 36668 bytes
 .../doc_id/test/test_doc_id_spark.py          |  45 +++++
 6 files changed, 234 insertions(+), 25 deletions(-)
 create mode 100644 transforms/universal/doc_id/test-data/expected-spark/metadata.json
 create mode 100644 transforms/universal/doc_id/test-data/expected-spark/sample1.parquet
 create mode 100644 transforms/universal/doc_id/test/test_doc_id_spark.py

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index 23475f57f..de840eee2 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -83,7 +83,7 @@ test-image:: .default.build-lib-wheel
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \ test-image-sequence ; \ fi ;\ fi @@ -120,7 +120,7 @@ image:: .default.build-lib-wheel if [ -e Dockerfile.spark ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.spark \ DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ - BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \ .defaults.lib-whl-image ; \ fi ; \ fi diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 6429d4c9d..1db88041b 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -15,8 +15,15 @@ TRANSFORM_NAME=$(shell basename `pwd`) +run-cli-spark-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \ + --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --doc_id_int True + run-cli-ray-sample: - #make venv + make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md index 675995623..7146ff4bd 100644 --- a/transforms/universal/doc_id/README.md +++ b/transforms/universal/doc_id/README.md @@ -1,31 +1,41 @@ # Document ID Python Annotator -The Document ID transforms adds a document identification (unique integers and content hashes), which later can be -used in de-duplication operations, per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, +transform configuration, testing and IDE set up. -## Summary +## Contributors +- Boris Lublinsky (blublinsk@ibm.com) -This transform annotates documents with document "ids". -It supports the following transformations of the original data: -* Adding document hash: this enables the addition of a document hash-based id to the data. - The hash is calculated with `hashlib.sha256(doc.encode("utf-8")).hexdigest()`. - To enable this annotation, set `hash_column` to the name of the column, - where you want to store it. -* Adding integer document id: this allows the addition of an integer document id to the data that - is unique across all rows in all tables provided to the `transform()` method. - To enable this annotation, set `int_id_column` to the name of the column, where you want - to store it. +## Description -Document IDs are generally useful for tracking annotations to specific documents. Additionally -[fuzzy deduping](../fdedup) relies on integer IDs to be present. If your dataset does not have -document ID column(s), you can use this transform to create ones. +This transform assigns unique identifiers to the documents in a dataset and supports the following annotations to the +original data: +* **Adding a Document Hash** to each document. The unique hash-based ID is generated using +`hashlib.sha256(doc.encode("utf-8")).hexdigest()`. To store this hash in the data specify the desired column name using +the `hash_column` parameter. +* **Adding an Integer Document ID**: to each document. 
The integer ID is unique across all rows and tables processed by +the `transform()` method. To store this ID in the data, specify the desired column name using the `int_id_column` +parameter. +Document IDs are essential for tracking annotations linked to specific documents. They are also required for processes +like [fuzzy deduplication](../../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID +columns, this transform can be used to generate them. -## Configuration and command line Options +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|--------------------|-----------|---------------------------------------------| +| hash_column | str | Unique hash assigned to each document | +| int_id_column | uint64 | Unique integer ID assigned to each document | + +## Configuration and Command Line Options -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_ray.py) +The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_base.py) configuration for values are as follows: * _doc_column_ - specifies name of the column containing the document (required for ID generation) @@ -35,7 +45,7 @@ configuration for values are as follows: At least one of _hash_column_ or _int_id_column_ must be specified. -## Running +## Usage ### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), @@ -53,11 +63,113 @@ the following command line arguments are available in addition to ``` These correspond to the configuration keys described above. +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/doc_id_transform_python.py using command line args +* `run-local-sample` - runs src/doc_id_local_python.py + +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform. + +### Code example + +[notebook](../doc_id.ipynb) + +### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_doc_id_python.py) +- [Integration test](test/test_doc_id.py) + + +# Document ID Ray Annotator + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Ray Summary +This project wraps the Document ID transform with a Ray runtime. + +## Configuration and command line Options + +Document ID configuration and command line options are the same as for the +[base python transform](../python/README.md). 
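
For intuition, the sketch below shows the two annotations described above applied to a plain `pyarrow` table. This is only an illustration, not the `DocIDTransform` implementation; the column names and `start_id` value simply mirror the test configuration used elsewhere in this patch.

```python
import hashlib

import pyarrow as pa


def annotate_table(
    table: pa.Table, doc_column: str, hash_column: str, int_id_column: str, start_id: int
) -> tuple[pa.Table, int]:
    """Append a sha256 hash column and a unique integer id column to a table."""
    docs = table.column(doc_column).to_pylist()
    # Hash each document exactly as described in the Description section.
    hashes = [hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in docs]
    # Integer ids continue from start_id, keeping them unique across tables.
    int_ids = pa.array(range(start_id, start_id + table.num_rows), type=pa.uint64())
    table = table.append_column(hash_column, pa.array(hashes))
    table = table.append_column(int_id_column, int_ids)
    return table, start_id + table.num_rows


# Hypothetical usage on a tiny in-memory table:
t = pa.table({"contents": ["doc one", "doc two"]})
t, next_id = annotate_table(t, "contents", "hash_column", "int_id_column", start_id=5)
print(t.column_names, next_id)  # ['contents', 'hash_column', 'int_id_column'] 7
```

Passing the returned `next_id` as the `start_id` of the next table is what keeps the integer ids unique across all tables handled by a single run.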
+
+## Building
+
+A [docker file](Dockerfile) can be used to build the docker image. You can use
+
+```shell
+make build
+```
+
+## Driver options
+
+## Configuration and command line Options
+
+See [Python documentation](../python/README.md)
+
+## Running
+
+### Launched Command Line Options
+When running the transform with the Ray launcher (i.e. TransformLauncher),
+the following [command line arguments](../python/README.md) are available in addition to
+[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md).
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+# Document ID Spark Annotator
+
+## Summary
+
+This transform assigns a unique integer ID to each row in a Spark DataFrame. It relies on the
+[monotonically_increasing_id](https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.monotonically_increasing_id.html)
+pyspark function to generate the unique integer IDs. As described in the documentation of this function:
+> The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
+
+## Configuration and command line Options
+
+Document ID configuration and command line options are the same as for the
+[base python transform](../python/README.md).
+
+## Running
+You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the
+`test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both
+the new annotated `test1.parquet` file and the `metadata.json` file.
+
+### Launched Command Line Options
+When running the transform with the Spark launcher (i.e. SparkTransformLauncher), the following command line arguments
+are available in addition to the options provided by the
+[python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
+
+```
+ --doc_id_column_name DOC_ID_COLUMN_NAME
+ name of the column that holds the generated document ids
+```
### Running as spark-based application
```
(venv) cma:src$ python doc_id_local.py
@@ -87,4 +199,3 @@ The metadata generated by the Spark `doc_id` transform contains the following st
 To use the transform image to transform your data, please refer to the
 [running images quickstart](../../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
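
To make the Spark behavior concrete, here is a minimal pyspark sketch of the mechanism the Spark runtime relies on. It is not the transform itself; the input and output paths are illustrative, and the column name matches the default used in the tests.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.appName("doc_id_sketch").getOrCreate()
# Paths are illustrative; the transform resolves real paths via its data access factory.
df = spark.read.parquet("test-data/input")
# Ids are unique and increasing, but not consecutive across partitions.
df = df.withColumn("int_id_column", monotonically_increasing_id())
df.write.mode("overwrite").parquet("output")
spark.stop()
```

Because `monotonically_increasing_id` encodes the partition number in the upper bits of each 64-bit id, the ids are unique and increasing yet leave gaps, which is exactly the behavior quoted from the Spark documentation above.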
- diff --git a/transforms/universal/doc_id/test-data/expected-spark/metadata.json b/transforms/universal/doc_id/test-data/expected-spark/metadata.json new file mode 100644 index 000000000..a55c4eff6 --- /dev/null +++ b/transforms/universal/doc_id/test-data/expected-spark/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "doc_id", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-08-03 22:04:58", + "end_time": "2024-08-03 22:05:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "doc_column": "contents", + "hash_column": "hash_column", + "int_column": "int_id_column", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "execution time, min": 0.29759878317515054 + }, + "job_output_stats": { + "source_size": 36132, + "result_size": 36668, + "result_doc_count": 5, + "source_files": 1, + "result_files": 1, + "processing_time": 0.08469605445861816, + "source_doc_count": 5 + }, + "source": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/doc_id/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/doc_id/spark/output", + "type": "path" + } +} diff --git a/transforms/universal/doc_id/test-data/expected-spark/sample1.parquet b/transforms/universal/doc_id/test-data/expected-spark/sample1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..765a6776b209cbc0903e7a09fa3af0f3844ba343 GIT binary patch literal 36668 zcmeHw30zah^LPRXh*+%}44x77)dR>4iC+)G6+l34NYvs>@&bWel7!1*)l$A{y{$*R z>RqpT-&)TeT5qM+t6ufiYOVV9_WR#`$s;exi)|ILzwci@HObrA-I<-4ot>T8eNB<1 zaT#8W6h=fM!;e90LZfwB$9~T0^bL(Rghr#&JZPRYFPinwD+4^y88k)!qf;=b-I(Fc z@NUKQCE!e7TSZy`trJBlfa0AukkLTDG@2jX({p$%4XN^I#qbIg(!3i7GW|34dQ^w0 zy0JK+T!w#uU%)eZzzaH}LVW)Yrodn@C>q0q20j{tFBdD#Q z?9INyBS$aXX7y+Nc<T^ien;>7ub*_i^&Z}syrn#a zB{wkebllWHR5&K{{agC@^IWYeno?QtPAV<2{w z38Qt2P;tUV+;B|A5vs$*LT-4tLah{tg**k9t48@4kE>KDP%$dzVJd-ugNnn$)M0#{ zK&%oeI6{;wU`FsUfhtTHrWA=)DsC9~7OBF;XqZxg@l|4uQlwJwlwy$}T&+a8Vv(3H z;D|9LPb}aI!UTLJUqz^+Un-%F9-i~*$Xx$c41b$8Hc%Oq79FNH=W0~Aq+ZkusH%tj zSRa|;)9N2G3QASqN@uL5Co(+jTJYhg9<&Z#axiEUzZ;va&|0t%qehE`6rm+-VXj7@ z%hl-3SdkvpvN=kFj$O^L4%I{)Nya}`|TSFlO~oudYXMu{HYgh+=!8#=8P}2L_yfU)Y@{#~L^37BM2Mb@SNN zW-;^&JpYK8MU5LQzt{_%7}&SnvY^gCOAA+H(x%r>o%(O??OB!eJ(pDUYM!#4U*1|& z<thy7z1U?ACt-A6*`&Gae|pevFd7qib3%6OwG*aA&tKPV_TAJ4)vVM7 z$su8W6LcfHKmH_bVoqsAcVtq<^}?7JO`dBmE!Yu$W@>|jJ5oEJ{hjxp=;_P#=(&;I z51CT;ZTsc6&$GFc`{(%5=Ab(l>|8j^`*R&*!xyc!vrsjs`H9S(J3~h!OXFtm-1Os> zoT4@JQn{`78&tvStJ4Bardvxk#uk_Dt{%|sOTTX0_w*D7UTql|{8ZiLn(D-ZWv0)P zj-PM3ugfgU*`3S1f8G<;Q!IL& z%H$*2^YhKVJr1qu{u>q(Qkk_P{q(=RK3`rDSX{V$LaSj3MOSC;I5Ou@6gKg>{>U$i z_<6ny4zAm-JNE7EiRMS8v#x$I|5Jl6y@$DA;2-%vwi<(Am5<0>M?t~)D#rpmm&Xtid3C$DB34zU(jCQMni_xK|nJq~(V_Be%pPt$+B%sw0c1PT%dLxX^J_=)lkS-A?QK zYexfP=FXw>3Xk<)(D$;&E8%d+zxIUr{xKwa((>=J25#3*KNx(j!SsH`L<7CLPt#i+01`UcFKmPQB(o6BZg17AXB;)9Y{@oTl@sl*{Gh*4@ zE;~m)IPP;PXv629HWCP%FB~t%E*xWaz4~apvf8jE*1YDn;=8Z!c9|30{UhVJh98($ zjBTEf@nHG5`mI`j+@-ATrXH2E(weQk-?!}@uQ6k;u1cAf_-O1`OQYMeqpS0m{AcV} zK9jnnHr>LGI@xJ<_3X+9?Kh0ueXq2%@7+rw!n3>gEU1_9`BSXw=9OI=?>B7af3;~slhTz(&JTFv!DKF!Ox~+%e)8$1 z0W$AA{kAV=+-TTz-};N2)y1Ng&#U&bdK)VyRunIP&u916)Nh*QR9)|PV$xpY?9n&+ z=g>IfroUb^F1*row~8}(s9)muXZlso>v~evH>htKyGikuidk1R8yW^oJ^1b93%h*& zC=J@+;pHp(s7<5KIxc=Nq@-}ZasI4-R|jm{GbJQ)?vWkAVJAQ7kU6nAd;Ef9zUR^t 
[... base85 "GIT binary patch" data omitted: the literals for the expected-spark and input sample1.parquet files, together with the start of the patch that adds test/test_doc_id_spark.py, are truncated in this copy ...]

+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = "../test-data"
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
+        fixtures = []
+        launcher = SparkTransformLauncher(DocIDSparkTransformConfiguration())
+        transform_config = {
+            doc_column_name_cli_param: "contents",
+            hash_column_name_cli_param: "hash_column",
+            int_column_name_cli_param: "int_id_column",
+        }
+
+        fixtures.append((launcher, transform_config, basedir + "/input", basedir + "/expected-spark"))
+        return fixtures

From 9c869eea586a36ff08a02af04ea12defff8cd4b2 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 6 Dec 2024 05:17:05 -0500
Subject: [PATCH 07/28] remove BASE_IMAGE arg from dockerfile.spark

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets                | 4 ++--
 transforms/universal/doc_id/Dockerfile.spark | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index de840eee2..23475f57f 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -83,7 +83,7 @@ test-image:: .default.build-lib-wheel
 			$(MAKE) DOCKER_FILE=Dockerfile.spark \
 				TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
 				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \
+				BASE_IMAGE=$(SPARK_BASE_IMAGE) \
 				test-image-sequence ; \
 		fi ;\
 	fi
@@ -120,7 +120,7 @@ image:: .default.build-lib-wheel
 		if [ -e Dockerfile.spark ]; then \
 			$(MAKE) DOCKER_FILE=Dockerfile.spark \
 				DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-				BASE_IMAGE=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)/$(DOCKER_SPARK_BASE_IMAGE) \
+				BASE_IMAGE=$(SPARK_BASE_IMAGE) \
 				.defaults.lib-whl-image ; \
 		fi ; \
 	fi
diff --git a/transforms/universal/doc_id/Dockerfile.spark b/transforms/universal/doc_id/Dockerfile.spark
index e8df6c522..70c626a87 100644
--- a/transforms/universal/doc_id/Dockerfile.spark
+++ b/transforms/universal/doc_id/Dockerfile.spark
@@ -1,5 +1,4 @@
-ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
-FROM ${BASE_IMAGE}
+FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
 USER root

 # install pytest

From 6fa0c0450fa29b2db2d1f15bfdc368b2af97d51e Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Fri, 6 Dec 2024 06:18:30 -0500
Subject: [PATCH 08/28] added login to quay.io

Signed-off-by: Maroun Touma
---
 transforms/.make.cicd.targets | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index 23475f57f..e77715b80 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -61,6 +61,7 @@ test-image-spark:
 	$(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image
 
 test-image:: .default.build-lib-wheel
+	$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
 		if [ -e Dockerfile.python ]; then \
 			$(MAKE) DOCKER_FILE=Dockerfile.python \
@@ -101,6 +102,7 @@ image-spark:
 
 image:: .default.build-lib-wheel	## Build all possible images unless a specific runtime is specified
+	$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
 	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [
"$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ From bbe9a023a99e4ff10241bf3f125042ab7790b2ad Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 06:32:26 -0500 Subject: [PATCH 09/28] debug registry credential Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index fce8faf11..42d85b993 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -114,7 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - make -C transforms/universal/doc_id DOCKER=docker test-image + make -C transforms/universal/doc_id DOCKER=docker DOCKER_REGISTRY_USER=${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY=${{ secrets.DOCKER_REGISTRY_KEY }} test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." fi From b77aaa3b3a2d21f58754906bfc902325dbf2e996 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 06:40:40 -0500 Subject: [PATCH 10/28] use dpk secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 42d85b993..824c9cae4 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -93,8 +93,8 @@ jobs: runs-on: ubuntu-22.04 timeout-minutes: 120 env: - DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} steps: - name: Checkout uses: actions/checkout@v4 @@ -114,7 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - make -C transforms/universal/doc_id DOCKER=docker DOCKER_REGISTRY_USER=${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY=${{ secrets.DOCKER_REGISTRY_KEY }} test-image + make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." fi From 2c52fd504e1a1785e4da9ff30eb15c8a3cdf79e0 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 07:12:48 -0500 Subject: [PATCH 11/28] testing registry user Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 824c9cae4..deee93eda 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -114,6 +114,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi + echo DOCKER_REGISTRY_USER="$DOCKER_REGISTRY_USER" - secret="${{ secrets.DPK_DOCKER_REGISTRY_USER }}" make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 5ffadb43b74f31858635d4d19823b572fc72fbac Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 07:31:29 -0500 Subject: [PATCH 12/28] testing registry user Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index deee93eda..fc589f2ca 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -109,6 +109,9 @@ jobs: sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - name: Test transform image in transforms/universal/doc_id + env: + DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} run: | if [ -e "transforms/universal/doc_id/Makefile" ]; then if [ -d "transforms/universal/doc_id/spark" ]; then From 8b234a4da26e0e554e7a9a3d4a77dbc9c0ec6c70 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:00:03 -0500 Subject: [PATCH 13/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index fc589f2ca..7483935ce 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -93,8 +93,9 @@ jobs: runs-on: ubuntu-22.04 timeout-minutes: 120 env: - DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + DPK_TEST: ${{ secrets.DPK_TEST }} steps: - name: Checkout uses: actions/checkout@v4 @@ -109,15 +110,12 @@ jobs: sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - name: Test transform image in transforms/universal/doc_id - env: - DOCKER_REGISTRY_USER: ${{ secrets.DPK_DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DPK_DOCKER_REGISTRY_KEY }} run: | if [ -e "transforms/universal/doc_id/Makefile" ]; then if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo DOCKER_REGISTRY_USER="$DOCKER_REGISTRY_USER" - secret="${{ secrets.DPK_DOCKER_REGISTRY_USER }}" + echo "DPK_TEST=$(DPK_TEST) - $DPK_TEST - ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 878fa42bc338fe5305cc5f7e784ab377a247cd35 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:08:35 -0500 Subject: [PATCH 14/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 7483935ce..ebcb9df63 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST=$(DPK_TEST) - $DPK_TEST - ${{ secrets.DPK_TEST }} " + echo "DPK_TEST= $(DPK_TEST) , $DPK_TEST , ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." From 5632c064ef61f2b96b4e6c4701a176dc7a4579aa Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:24:31 -0500 Subject: [PATCH 15/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index ebcb9df63..4f31422c0 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,9 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) , $DPK_TEST , ${{ secrets.DPK_TEST }} " + echo "DPK_TEST= $(DPK_TEST) + echo " $DPK_TEST " + echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image else echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
From 6da893f6bdda0358571c9b41adec6fcee08c4bad Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:26:42 -0500 Subject: [PATCH 16/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 4f31422c0..5e2c40805 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) + echo "DPK_TEST= $(DPK_TEST) "" echo " $DPK_TEST " echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image From d426439a339cf1d38be1ab2ed52ef806e8e953ca Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:28:28 -0500 Subject: [PATCH 17/28] testing environment secrets Signed-off-by: Maroun Touma --- .github/workflows/test-universal-doc_id.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 5e2c40805..11977528b 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -115,7 +115,7 @@ jobs: if [ -d "transforms/universal/doc_id/spark" ]; then make -C data-processing-lib/spark DOCKER=docker image fi - echo "DPK_TEST= $(DPK_TEST) "" + echo "DPK_TEST= $(DPK_TEST) " echo " $DPK_TEST " echo " ${{ secrets.DPK_TEST }} " make -C transforms/universal/doc_id DOCKER=docker test-image From e8bb04a86faca37da17bf81f19490aafef1f9ea1 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:41:13 -0500 Subject: [PATCH 18/28] Delete .github/workflows/test-universal-doc_id.yml --- .github/workflows/test-universal-doc_id.yml | 137 -------------------- 1 file changed, 137 deletions(-) delete mode 100644 .github/workflows/test-universal-doc_id.yml diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml deleted file mode 100644 index 11977528b..000000000 --- a/.github/workflows/test-universal-doc_id.yml +++ /dev/null @@ -1,137 +0,0 @@ -# -# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files -# -name: Test - transforms/universal/doc_id - -on: - workflow_dispatch: - push: - branches: - - "dev" - - "releases/**" - tags: - - "*" - paths: - - ".make.*" - - "transforms/.make.transforms" - - "transforms/universal/doc_id/**" - - "data-processing-lib/**" - - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - - "!**.md" - - "!**/doc/**" - - "!**/images/**" - - "!**.gitignore" - pull_request: - branches: - - "dev" - - "releases/**" - paths: - - ".make.*" - - "transforms/.make.transforms" - - "transforms/universal/doc_id/**" - - "data-processing-lib/**" - - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - - "!**.md" - - "!**/doc/**" - - "!**/images/**" - - "!**.gitignore" - -# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre -concurrency: - group: ${{ 
github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - check_if_push_image: - # check whether the Docker images should be pushed to the remote repository - # The images are pushed if it is a merge to dev branch or a new tag is created. - # The latter being part of the release process. - # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 - outputs: - publish_images: ${{ steps.version.outputs.publish_images }} - steps: - - id: version - run: | - publish_images='false' - if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; - then - publish_images='true' - fi - if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; - then - publish_images='true' - fi - echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" - test-src: - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Free up space in github runner - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup - sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true - df -h - - name: Test transform source in transforms/universal/doc_id - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - make -C transforms/universal/doc_id DOCKER=docker test-src - else - echo "transforms/universal/doc_id/Makefile not found - source testing disabled for this transform." - fi - test-image: - needs: [check_if_push_image] - runs-on: ubuntu-22.04 - timeout-minutes: 120 - env: - DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} - DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} - DPK_TEST: ${{ secrets.DPK_TEST }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Free up space in github runner - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup - sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true - df -h - - name: Test transform image in transforms/universal/doc_id - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - if [ -d "transforms/universal/doc_id/spark" ]; then - make -C data-processing-lib/spark DOCKER=docker image - fi - echo "DPK_TEST= $(DPK_TEST) " - echo " $DPK_TEST " - echo " ${{ secrets.DPK_TEST }} " - make -C transforms/universal/doc_id DOCKER=docker test-image - else - echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." 
- fi - - name: Print space - # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - run: | - df -h - docker images - - name: Publish images - if: needs.check_if_push_image.outputs.publish_images == 'true' - run: | - if [ -e "transforms/universal/doc_id/Makefile" ]; then - make -C transforms/universal/doc_id publish - else - echo "transforms/universal/doc_id/Makefile not found - publishing disabled for this transform." - fi From 3647ebc3b9d78d70ff66a760b14c72b516be8a90 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 09:47:18 -0500 Subject: [PATCH 19/28] restore workflow file --- .github/workflows/test-universal-doc_id.yml | 133 ++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 .github/workflows/test-universal-doc_id.yml diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml new file mode 100644 index 000000000..fce8faf11 --- /dev/null +++ b/.github/workflows/test-universal-doc_id.yml @@ -0,0 +1,133 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/universal/doc_id + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/doc_id/**" + - "data-processing-lib/**" + - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/doc_id/**" + - "data-processing-lib/**" + - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. + # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
+ runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/universal/doc_id + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + make -C transforms/universal/doc_id DOCKER=docker test-src + else + echo "transforms/universal/doc_id/Makefile not found - source testing disabled for this transform." + fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/universal/doc_id + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + if [ -d "transforms/universal/doc_id/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/universal/doc_id DOCKER=docker test-image + else + echo "transforms/universal/doc_id/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/universal/doc_id/Makefile" ]; then + make -C transforms/universal/doc_id publish + else + echo "transforms/universal/doc_id/Makefile not found - publishing disabled for this transform." 
+ fi From adaf78ffeb2c1a01f194e94ff5c6d73384f6a6e2 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 6 Dec 2024 10:33:14 -0500 Subject: [PATCH 20/28] clear testing of docker login Signed-off-by: Maroun Touma --- transforms/.make.cicd.targets | 2 -- 1 file changed, 2 deletions(-) diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets index e77715b80..23475f57f 100644 --- a/transforms/.make.cicd.targets +++ b/transforms/.make.cicd.targets @@ -61,7 +61,6 @@ test-image-spark: $(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image test-image:: .default.build-lib-wheel - $(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ @@ -102,7 +101,6 @@ image-spark: image:: .default.build-lib-wheel ## Build all possible images unless a specific runtime is specified - $(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ if [ -e Dockerfile.python ]; then \ $(MAKE) DOCKER_FILE=Dockerfile.python \ From 0c38a4cd9b48fc333a11cbdc12a348c47de9761d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Dec 2024 11:05:57 +0100 Subject: [PATCH 21/28] fix Makefile failing targets Signed-off-by: Maroun Touma --- transforms/universal/doc_id/Makefile | 4 ++-- transforms/universal/doc_id/dpk_doc_id/transform_python.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 1db88041b..6d7dbfbf0 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -19,12 +19,12 @@ run-cli-spark-sample: make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \ - --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ --doc_id_int True run-cli-ray-sample: make venv source venv/bin/activate && \ $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ - --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \ + --run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ --doc_id_int True diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index 4dd5b4c6f..b8b886227 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -70,7 +70,7 @@ def apply_input_params(self, args: Namespace) -> bool: return super().apply_input_params(args=args) -class DocIDRuntime(DefaultPythonTransformRuntime): +class DocIDPythonRuntime(DefaultPythonTransformRuntime): """ Exact dedup runtime support """ @@ -110,7 +110,7 @@ class DocIDPythonTransformRuntimeConfiguration(PythonTransformRuntimeConfigurati def __init__(self): super().__init__( transform_config=DocIDTransformConfiguration(), - runtime_class=DocIDRuntime, + runtime_class=DocIDPythonRuntime, ) From 87eecf0881da090a0881d9ba681517b286f0eca4 Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Tue, 10 Dec 2024 12:34:20 -0800 Subject: [PATCH 22/28] README changes Signed-off-by: 
SHAHROKH DAIJAVAD --- transforms/universal/doc_id/README.md | 53 ++++++++++++--------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md index 7146ff4bd..0be7b6864 100644 --- a/transforms/universal/doc_id/README.md +++ b/transforms/universal/doc_id/README.md @@ -1,6 +1,6 @@ # Document ID Python Annotator -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. ## Contributors @@ -18,7 +18,7 @@ the `transform()` method. To store this ID in the data, specify the desired colu parameter. Document IDs are essential for tracking annotations linked to specific documents. They are also required for processes -like [fuzzy deduplication](../../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID +like [fuzzy deduplication](../fdedup/README.md), which depend on the presence of integer IDs. If your dataset lacks document ID columns, this transform can be used to generate them. ## Input Columns Used by This Transform @@ -35,7 +35,7 @@ columns, this transform can be used to generate them. ## Configuration and Command Line Options -The set of dictionary keys defined in [DocIDTransform](src/doc_id_transform_base.py) +The set of dictionary keys defined in [DocIDTransform](dpk_doc_id/transform.py) configuration for values are as follows: * _doc_column_ - specifies name of the column containing the document (required for ID generation) @@ -50,7 +50,7 @@ At least one of _hash_column_ or _int_id_column_ must be specified. ### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), the following command line arguments are available in addition to -[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). +[the options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md). ``` --doc_id_doc_column DOC_ID_DOC_COLUMN doc column name @@ -64,12 +64,11 @@ the following command line arguments are available in addition to These correspond to the configuration keys described above. ### Running the samples -To run the samples, use the following `make` targets +To run the samples, use the following `make` target -* `run-cli-sample` - runs src/doc_id_transform_python.py using command line args -* `run-local-sample` - runs src/doc_id_local_python.py +* `run-cli-sample` - runs dpk_doc_id/transform_python.py using command line args -These targets will activate the virtual environment and set up any configuration needed. +This target will activate the virtual environment and sets up any configuration needed. Use the `-n` option of `make` to see the detail of what is done to run the sample. For example, @@ -85,17 +84,17 @@ To see results of the transform. ### Code example -[notebook](../doc_id.ipynb) +[notebook](doc_id.ipynb) ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. 
## Testing
 
-Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md)
+Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md)
 
 Currently we have:
 - [Unit test](test/test_doc_id_python.py)
@@ -105,7 +104,7 @@ Currently we have:
 # Document ID Ray Annotator
 
 Please see the set of
-[transform project conventions](../../../README.md)
+[transform project conventions](../../README.md#transform-project-conventions)
 for details on general project conventions, transform configuration,
 testing and IDE set up.
@@ -115,31 +114,25 @@ This project wraps the Document ID transform with a Ray runtime.
 
 ## Configuration and command line Options
 Document ID configuration and command line options are the same as for the
-[base python transform](../python/README.md).
+base python transform.
 
 ## Building
 
-A [docker file](Dockerfile) that can be used for building docker image. You can use
+A [docker file](Dockerfile.ray) that can be used for building the ray docker image. You can use
 
 ```shell
 make build
 ```
 
-## Driver options
-
-## Configuration and command line Options
-
-See [Python documentation](../python/README.md)
-
 ## Running
 
 ### Launched Command Line Options
-When running the transform with the Ray launcher (i.e. TransformLauncher),
-the following [command line arguments](../python/README.md) are available in addition to
-[the options provided by the ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md).
+When running the transform with the Ray launcher (i.e., RayTransformLauncher), in addition to Python
+command line options,
+[there are options provided by the ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md).
 
 To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
 
 # Document ID Spark Annotator
 
 ## Summary
 
 This transform assigns a unique integer ID to each row in a Spark DataFrame. It relies on the
-[monotonically_increasing_id](https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.monotonically_increasing_id.html)
+[monotonically_increasing_id](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.monotonically_increasing_id.html)
 pyspark function to generate the unique integer IDs. As described in the documentation of this function:
 > The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
 
 ## Configuration and command line Options
 
 Document ID configuration and command line options are the same as for the
-[base python transform](../python/README.md).
+base python transform.
 
 ## Running
-You can run the [doc_id_local.py](src/doc_id_local_spark.py) (spark-based implementation) to transform the
+You can run the [local.py](dpk_doc_id/local.py) (spark-based implementation) to transform the
 `test1.parquet` file in [test input data](test-data/input) to an `output` directory. The directory will contain both
 the new annotated `test1.parquet` file and the `metadata.json` file.
 
 ### Launched Command Line Options
-When running the transform with the Spark launcher (i.e. 
SparkTransformLauncher), the following command line arguments +When running the transform with the Spark launcher (i.e., SparkTransformLauncher), the following command line arguments are available in addition to the options provided by the -[python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). +[python launcher](../../../data-processing-lib/doc/python-launcher-options.md). ``` --doc_id_column_name DOC_ID_COLUMN_NAME @@ -197,5 +190,5 @@ The metadata generated by the Spark `doc_id` transform contains the following st ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. From 3b2420bdc7466081cb2bffad8d15e308c3e83a22 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Dec 2024 22:29:33 +0100 Subject: [PATCH 23/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 97 +++++++++++++------ .../doc_id/dpk_doc_id/transform_python.py | 26 ++++- 2 files changed, 91 insertions(+), 32 deletions(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 7ecab7d65..8cf8039d7 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -24,7 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + "%pip install data-prep-toolkit-transforms" ] }, { @@ -51,23 +51,12 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "execution_count": 1, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import sys\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from data_processing.utils import ParamsUtils\n", - "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", - "from doc_id_transform_base import (\n", - " doc_column_name_cli_param,\n", - " hash_column_name_cli_param,\n", - " int_column_name_cli_param,\n", - " start_id_cli_param,\n", - ")" + "from dpk_doc_id.transform_python import DocIDRuntime" ] }, { @@ -82,7 +71,11 @@ "cell_type": "code", "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "outputs": [], "source": [ "\n", @@ -110,24 +103,55 @@ ] }, { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "cell_type": "code", + "execution_count": 2, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "21:42:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n", + "21:42:29 INFO - pipeline id pipeline_id\n", + "21:42:29 INFO - code location None\n", + "21:42:29 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "21:42:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "21:42:29 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "21:42:29 INFO - orchestrator doc_id started at 2024-12-10 21:42:29\n", + "21:42:29 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", + "21:42:32 INFO - Completed 1 files (100.0%) in 0.049 min\n", + "21:42:32 INFO - Done processing 1 files, waiting for flush() completion.\n", + "21:42:32 INFO - done flushing in 0.0 sec\n", + "21:42:32 INFO - Completed execution in 0.049 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "##### ***** Use python runtime to invoke the transform" + "DocIDRuntime(input_folder= \"test-data/input\",\n", + " output_folder= \"output\",\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"hash_column\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " doc_id_start_id= 5).transform()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", "metadata": {}, - "outputs": [], "source": [ - "%%capture\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "launcher = PythonTransformLauncher(runtime_config=DocIDPythonTransformRuntimeConfiguration())\n", - "launcher.launch()" + "##### ***** Use python runtime to invoke the transform" ] }, { @@ -140,13 +164,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['output/sample1.parquet', 'output/metadata.json']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", - "glob.glob(\"python/output/*\")" + "glob.glob(\"output/*\")" ] }, { @@ -174,7 +209,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index b8b886227..0d58c850a 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################
-
+import sys
 from argparse import Namespace
 from typing import Any
 
@@ -20,6 +20,7 @@
     PythonTransformRuntimeConfiguration,
 )
 from data_processing.transform import TransformStatistics
+from data_processing.utils import ParamsUtils
 from dpk_doc_id.transform import (
     DocIDTransformBase,
     DocIDTransformConfigurationBase,
@@ -114,6 +115,29 @@ def __init__(self):
         )
 
 
+class DocIDRuntime:
+    def __init__(self, **kwargs):
+        self.params = {}
+        for key in kwargs:
+            self.params[key] = kwargs[key]
+        # if input_folder and output_folder are specified, then assume they represent the data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:
+            # one or both folder kwargs were not provided; leave params as-is
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=(self.params))
+        # create launcher
+        launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())
     launcher.launch()

From 30912d1a64bfe90e3ac9a180d5ba842bfdcad220 Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Tue, 10 Dec 2024 13:39:52 -0800
Subject: [PATCH 24/28] More changes to README

Signed-off-by: SHAHROKH DAIJAVAD
---
 transforms/universal/doc_id/README.md | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index 0be7b6864..631565e5c 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -63,25 +63,6 @@ the following command line arguments are available in addition to
 ```
 These correspond to the configuration keys described above.
 
-### Running the samples
-To run the samples, use the following `make` target
-
-* `run-cli-sample` - runs dpk_doc_id/transform_python.py using command line args
-
-This target will activate the virtual environment and sets up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform. 
- ### Code example [notebook](doc_id.ipynb) @@ -124,8 +105,6 @@ A [docker file](Dockerfile.ray) that can be used for building docker the ray ima make build ``` -## Running - ### Launched Command Line Options When running the transform with the Ray launcher (i.e., RayTransformLauncher), in addition to Python command line options, From 0cbe63f9f2f43d244f6b7c4b59f03d375d6a1131 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 11 Dec 2024 07:35:15 +0100 Subject: [PATCH 25/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 8cf8039d7..6bfbb6749 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -24,7 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms" + "%pip install data-prep-toolkit-transforms[doc_id]" ] }, { From cabd57766ac4f29d7964c370eebf83be52a2f78e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 12 Dec 2024 11:50:18 +0100 Subject: [PATCH 26/28] fix notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id.ipynb | 41 ++----------------- .../doc_id/dpk_doc_id/transform_python.py | 4 +- 2 files changed, 5 insertions(+), 40 deletions(-) diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb index 6bfbb6749..9c1e4916f 100644 --- a/transforms/universal/doc_id/doc_id.ipynb +++ b/transforms/universal/doc_id/doc_id.ipynb @@ -51,12 +51,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], "source": [ - "from dpk_doc_id.transform_python import DocIDRuntime" + "from dpk_doc_id.transform_python import DocID" ] }, { @@ -70,41 +70,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": { - "jupyter": { - "source_hidden": true - } - }, - "outputs": [], - "source": [ - "\n", - "# create parameters\n", - "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", - "output_folder = os.path.join( \"python\", \"output\")\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # execution info\n", - " \"runtime_pipeline_id\": \"pipeline_id\",\n", - " \"runtime_job_id\": \"job_id\",\n", - " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", - " # doc id params\n", - " doc_column_name_cli_param: \"contents\",\n", - " hash_column_name_cli_param: \"hash_column\",\n", - " int_column_name_cli_param: \"int_id_column\",\n", - " start_id_cli_param: 5,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, "outputs": [ @@ -138,7 +103,7 @@ } ], "source": [ - "DocIDRuntime(input_folder= \"test-data/input\",\n", + "DocID(input_folder= \"test-data/input\",\n", " output_folder= \"output\",\n", " doc_id_doc_column= \"contents\",\n", " doc_id_hash_column= \"hash_column\",\n", diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index 0d58c850a..27fc7bc3c 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -71,7 +71,7 @@ def apply_input_params(self, args: Namespace) -> bool: return super().apply_input_params(args=args) -class DocIDPythonRuntime(DefaultPythonTransformRuntime): +class DocIDRuntime(DefaultPythonTransformRuntime): """ Exact dedup runtime support """ @@ -115,7 +115,7 @@ def __init__(self): ) -class DocIDRuntime: +class DocID: def __init__(self, **kwargs): self.params = {} for key in kwargs: From 4750dd749a6fc0122ed2178ec12ace0cf1869186 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 12 Dec 2024 15:04:03 +0100 Subject: [PATCH 27/28] added ray notebook Signed-off-by: Maroun Touma --- transforms/universal/doc_id/doc_id-ray.ipynb | 198 ++++++++++++++++++ transforms/universal/doc_id/doc_id.ipynb | 28 +-- .../doc_id/dpk_doc_id/ray/transform.py | 35 +++- .../doc_id/dpk_doc_id/transform_python.py | 2 +- 4 files changed, 246 insertions(+), 17 deletions(-) create mode 100644 transforms/universal/doc_id/doc_id-ray.ipynb diff --git a/transforms/universal/doc_id/doc_id-ray.ipynb b/transforms/universal/doc_id/doc_id-ray.ipynb new file mode 100644 index 000000000..9bfb99785 --- /dev/null +++ b/transforms/universal/doc_id/doc_id-ray.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit[ray]\n", + "%pip install data-prep-toolkit-transforms[doc_id]" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. 
The set of dictionary keys holding DocIDTransform configuration values is as follows: \n",
+    "* doc_column - specifies name of the column containing the document (required for ID generation)\n",
+    "* hash_column - specifies name of the column created to hold the string document id, if None, id is not generated\n",
+    "* int_id_column - specifies name of the column created to hold the integer document id, if None, id is not generated\n",
+    "* start_id - an id from which the ID generator starts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
+   "metadata": {},
+   "source": [
+    "##### ***** Import required classes and modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-12-12 15:01:10,711\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from dpk_doc_id.ray.transform import DocID"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
+   "metadata": {},
+   "source": [
+    "##### ***** Setup runtime parameters for this transform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "15:01:11 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n",
+      "15:01:11 INFO - pipeline id pipeline_id\n",
+      "15:01:11 INFO - code location None\n",
+      "15:01:11 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "15:01:11 INFO - actor creation delay 0\n",
+      "15:01:11 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "15:01:11 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
+      "15:01:11 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "15:01:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "15:01:11 INFO - Running locally\n",
+      "2024-12-12 15:01:18,744\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - orchestrator started at 2024-12-12 15:01:20\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 13.119487762451172, 'object_store': 2.0}\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:20 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - Completed processing 1 files in 0.001 min\n",
+      "\u001b[36m(orchestrate pid=85413)\u001b[0m 15:01:23 INFO - done flushing in 0.001 sec\n",
+      "15:01:33 INFO - Completed execution in 0.36 min, execution result 0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "DocID(input_folder= \"test-data/input\",\n",
+    "      output_folder= \"output\",\n",
+    "      run_locally= True,\n",
+    "      doc_id_doc_column= \"contents\",\n",
+    "      doc_id_hash_column= \"hash_column\",\n",
+    "      doc_id_int_column= \"int_id_column\",\n",
+    "      doc_id_start_id= 5).transform()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
+   "metadata": {},
+   "source": [
+    "##### ***** Use ray runtime to invoke the transform"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+   "metadata": {},
+   "source": [
+    "##### **** The specified folder will include the transformed parquet files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['output/sample1.parquet', 'output/metadata.json']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import glob\n",
+    "glob.glob(\"output/*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/transforms/universal/doc_id/doc_id.ipynb b/transforms/universal/doc_id/doc_id.ipynb
index 9c1e4916f..cc007e09a 100644
--- a/transforms/universal/doc_id/doc_id.ipynb
+++ b/transforms/universal/doc_id/doc_id.ipynb
@@ -51,7 +51,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
    "metadata": {},
    "outputs": [],
@@ -69,7 +69,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
    "metadata": {},
    "outputs": [
@@ -77,18 +77,18 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "21:42:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n",
-     "21:42:29 INFO - pipeline id pipeline_id\n",
-     "21:42:29 INFO - code location None\n",
-     "21:42:29 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
-     "21:42:29 INFO - data factory data_ max_files -1, n_sample -1\n",
-     "21:42:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-     "21:42:29 INFO - orchestrator doc_id started at 2024-12-10 21:42:29\n",
-     "21:42:29 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 
0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", - "21:42:32 INFO - Completed 1 files (100.0%) in 0.049 min\n", - "21:42:32 INFO - Done processing 1 files, waiting for flush() completion.\n", - "21:42:32 INFO - done flushing in 0.0 sec\n", - "21:42:32 INFO - Completed execution in 0.049 min, execution result 0\n" + "15:00:23 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column', 'start_id': 5}\n", + "15:00:23 INFO - pipeline id pipeline_id\n", + "15:00:23 INFO - code location None\n", + "15:00:23 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "15:00:23 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:00:23 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:00:23 INFO - orchestrator doc_id started at 2024-12-12 15:00:23\n", + "15:00:23 INFO - Number of files is 1, source profile {'max_file_size': 0.034458160400390625, 'min_file_size': 0.034458160400390625, 'total_file_size': 0.034458160400390625}\n", + "15:00:26 INFO - Completed 1 files (100.0%) in 0.047 min\n", + "15:00:26 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:00:26 INFO - done flushing in 0.0 sec\n", + "15:00:26 INFO - Completed execution in 0.047 min, execution result 0\n" ] }, { diff --git a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py index 4ff20b9f5..4c48ddeb2 100644 --- a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py @@ -9,12 +9,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################
-
+import sys
 from typing import Any
 
 import ray
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.utils import UnrecoverableException
+from data_processing.utils import ParamsUtils, UnrecoverableException
 from data_processing_ray.runtime.ray import (
     DefaultRayTransformRuntime,
     RayTransformLauncher,
@@ -107,6 +107,37 @@ def __init__(self):
         super().__init__(transform_config=DocIDRayTransformConfiguration(), runtime_class=DocIDRayRuntime)
 
 
+# Class used by the notebooks to run the doc_id transform on the Ray runtime
+class DocID:
+    def __init__(self, **kwargs):
+        self.params = {}
+        for key in kwargs:
+            self.params[key] = kwargs[key]
+        # if input_folder and output_folder are specified, assume they represent data_local_config
+        try:
+            local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            del self.params["input_folder"]
+            del self.params["output_folder"]
+        except KeyError:  # folders not (both) provided; leave params unchanged
+            pass
+        try:
+            worker_options = {k: self.params[k] for k in ("num_cpus", "memory")}
+            self.params["runtime_worker_options"] = ParamsUtils.convert_to_ast(worker_options)
+            del self.params["num_cpus"]
+            del self.params["memory"]
+        except KeyError:  # worker options not (both) provided; leave params unchanged
+            pass
+
+    def transform(self):
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        # create launcher
+        launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
+        # launch
+        return_code = launcher.launch()
+        return return_code
+
+
 if __name__ == "__main__":
     launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())
     launcher.launch()
diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py
index 27fc7bc3c..f97ace554 100644
--- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py
+++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py
@@ -111,7 +111,7 @@ class DocIDPythonTransformRuntimeConfiguration(PythonTransformRuntimeConfigurati
     def __init__(self):
         super().__init__(
             transform_config=DocIDTransformConfiguration(),
-            runtime_class=DocIDPythonRuntime,
+            runtime_class=DocIDRuntime,
         )
 

From 2eb47bda47f362745b27853e98f04668a95492c9 Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Thu, 12 Dec 2024 10:49:40 -0800
Subject: [PATCH 28/28] Added the link to the Ray notebook in README

Signed-off-by: SHAHROKH DAIJAVAD
---
 transforms/universal/doc_id/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/transforms/universal/doc_id/README.md b/transforms/universal/doc_id/README.md
index 631565e5c..a29ada704 100644
--- a/transforms/universal/doc_id/README.md
+++ b/transforms/universal/doc_id/README.md
@@ -114,6 +114,10 @@ To use the transform image to transform your data, please refer to the
 [running images quickstart](../../../doc/quick-start/run-transform-image.md),
 substituting the name of this transform image and runtime as appropriate.
 
+### Code example
+
+[Ray notebook](doc_id-ray.ipynb)
+
 # Document ID Spark Annotator
 
 ## Summary
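
For completeness, here is a minimal usage sketch of the new Ray wrapper, mirroring the invocation cell in doc_id-ray.ipynb and assuming the package layout introduced in this patch series. The wrapper folds input_folder/output_folder into data_local_config and passes all remaining keyword arguments to the Ray launcher as CLI-style options.

```python
# Minimal sketch: invoke the doc_id transform through the DocID Ray wrapper.
# Paths and parameter values follow the notebook cell above.
from dpk_doc_id.ray.transform import DocID

return_code = DocID(
    input_folder="test-data/input",   # folded into data_local_config
    output_folder="output",
    run_locally=True,                 # start a local Ray instance
    doc_id_doc_column="contents",     # column from which the ids are derived
    doc_id_hash_column="hash_column",
    doc_id_int_column="int_id_column",
    doc_id_start_id=5,
).transform()
assert return_code == 0               # 0 indicates successful execution
```

Note that the wrapper only builds runtime_worker_options when num_cpus and memory are passed together: the dict comprehension over ("num_cpus", "memory") raises KeyError, and the whole block is skipped, if either key is missing.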