
Refactor doc_id with its own dpk_ module name #860

Merged: 30 commits, merged on Dec 17, 2024

Commits (30)
3c9b999
refactored code as its own module
touma-I Dec 5, 2024
49a22ae
added __init__
touma-I Dec 5, 2024
24b6d9d
merge with latest from dev
touma-I Dec 5, 2024
808521a
Fix typo
touma-I Dec 5, 2024
b8f3b6f
remove spark unit test for now
touma-I Dec 5, 2024
3baad3e
Show example for running ray runtime
touma-I Dec 5, 2024
37881ef
fixing issues with spark
touma-I Dec 5, 2024
9c869ee
remove BASE_IMAGE arg from dockerfile.spark
touma-I Dec 6, 2024
6fa0c04
added login to quay.io
touma-I Dec 6, 2024
bbe9a02
debug registry credential
touma-I Dec 6, 2024
b77aaa3
use dpk secrets
touma-I Dec 6, 2024
2c52fd5
testing registry user
touma-I Dec 6, 2024
5ffadb4
testing registry user
touma-I Dec 6, 2024
8b234a4
testing environment secrets
touma-I Dec 6, 2024
878fa42
testing environment secrets
touma-I Dec 6, 2024
5632c06
testing environment secrets
touma-I Dec 6, 2024
6da893f
testing environment secrets
touma-I Dec 6, 2024
d426439
testing environment secrets
touma-I Dec 6, 2024
e8bb04a
Delete .github/workflows/test-universal-doc_id.yml
touma-I Dec 6, 2024
3647ebc
restore workflow file
touma-I Dec 6, 2024
adaf78f
clear testing of docker login
touma-I Dec 6, 2024
0c38a4c
fix Makefile failing targets
touma-I Dec 10, 2024
fd0b261
merged with dev
touma-I Dec 10, 2024
87eecf0
README changes
shahrokhDaijavad Dec 10, 2024
3b2420b
fix notebook
touma-I Dec 10, 2024
30912d1
More changes to README
shahrokhDaijavad Dec 10, 2024
0cbe63f
fix notebook
touma-I Dec 11, 2024
cabd577
fix notebook
touma-I Dec 12, 2024
4750dd7
added ray notebook
touma-I Dec 12, 2024
2eb47bd
Added the link to the Ray notebook in README
shahrokhDaijavad Dec 12, 2024
89 changes: 52 additions & 37 deletions transforms/.make.cicd.targets
@@ -51,63 +51,78 @@ publish:

 test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean

+test-image-python:
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=python test-image
+
+test-image-ray:
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=ray test-image
+
+test-image-spark:
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image
+
 test-image:: .default.build-lib-wheel
-    @if [ -e Dockerfile.python ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.python \
-        TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
-        test-image-sequence ; \
+    @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
+        if [ -e Dockerfile.python ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.python \
+            TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+            test-image-sequence ; \
+        fi ;\
     fi
-    @if [ -e Dockerfile.ray ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.ray \
-        TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
-        BASE_IMAGE=$(RAY_BASE_IMAGE) \
-        test-image-sequence ; \
+    @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
+        if [ -e Dockerfile.ray ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.ray \
+            TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+            BASE_IMAGE=$(RAY_BASE_IMAGE) \
+            test-image-sequence ; \
+        fi ;\
     fi
-    @if [ -e Dockerfile.spark ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.spark \
-        TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-        BASE_IMAGE=$(SPARK_BASE_IMAGE) \
-        test-image-sequence ; \
+    @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
+        if [ -e Dockerfile.spark ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.spark \
+            TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+            BASE_IMAGE=$(SPARK_BASE_IMAGE) \
+            test-image-sequence ; \
+        fi ;\
     fi
     -rm -rf data-processing-dist


 image-python:
-    @if [ -e Dockerfile.python ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.python \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
-        .defaults.lib-whl-image ; \
-    fi
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=python image

 image-ray:
-    @if [ -e Dockerfile.ray ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.ray \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
-        BASE_IMAGE=$(RAY_BASE_IMAGE) \
-        .defaults.lib-whl-image ; \
-    fi
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=ray image

 image-spark:
-    @if [ -e Dockerfile.spark ]; then \
-        $(MAKE) DOCKER_FILE=Dockerfile.spark \
-        DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
-        BASE_IMAGE=$(SPARK_BASE_IMAGE) \
-        .defaults.lib-whl-image ; \
-    fi
+    $(MAKE) BUILD_SPECIFIC_RUNTIME=spark image

 image:: .default.build-lib-wheel
+    ## Build all possible images unless a specific runtime is specified
     @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
-        $(MAKE) image-python ; \
+        if [ -e Dockerfile.python ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.python \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+            .defaults.lib-whl-image ; \
+        fi ; \
     fi
     @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
-        $(MAKE) image-ray ; \
+        if [ -e Dockerfile.ray ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.ray \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+            BASE_IMAGE=$(RAY_BASE_IMAGE) \
+            .defaults.lib-whl-image ; \
+        fi ; \
     fi
     @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
-        $(MAKE) image-spark ; \
+        if [ -e Dockerfile.spark ]; then \
+            $(MAKE) DOCKER_FILE=Dockerfile.spark \
+            DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+            BASE_IMAGE=$(SPARK_BASE_IMAGE) \
+            .defaults.lib-whl-image ; \
+        fi ; \
     fi
     -rm -rf data-processing-dist
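
The reworked targets let a build focus on one runtime: with BUILD_SPECIFIC_RUNTIME unset, image and test-image iterate over every Dockerfile.* present; setting it (or calling the per-runtime wrappers) narrows the run. For example, assuming a transform directory with the matching Dockerfile:

$ make BUILD_SPECIFIC_RUNTIME=spark test-image   # test only the spark image
$ make image-ray                                 # shorthand for BUILD_SPECIFIC_RUNTIME=ray image
$ make image                                     # nothing set: build all available images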

Dockerfile.python

@@ -2,9 +2,6 @@ FROM docker.io/python:3.10.14-slim-bullseye

 RUN pip install --upgrade --no-cache-dir pip

-# install pytest
-RUN pip install --no-cache-dir pytest
-
 # Create a user and use it to run the transform
 RUN useradd -ms /bin/bash dpk
 USER dpk
@@ -16,19 +13,10 @@ ARG DPK_WHEEL_FILE_NAME
 COPY --chown=dpk:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
-COPY --chown=dpk:root README.md README.md
-RUN pip install --no-cache-dir -e .
-
-# copy source data
-COPY ./src/doc_id_transform_python.py .
-COPY ./src/doc_id_local.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=dpk:root dpk_doc_id/ dpk_doc_id/
+COPY --chown=dpk:root requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt

 # Set environment
 ENV PYTHONPATH /home/dpk
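
A hedged sketch of building this image by hand; normally the CI targets create the data-processing-dist directory and pass DPK_WHEEL_FILE_NAME automatically, and the wheel name and tag below are placeholders:

$ docker build -f Dockerfile.python \
    --build-arg DPK_WHEEL_FILE_NAME=data_prep_toolkit-<version>-py3-none-any.whl \
    -t doc_id-python:dev .
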
Dockerfile.ray

@@ -2,7 +2,7 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

 FROM ${BASE_IMAGE}

-RUN pip install --upgrade --no-cache-dir pip
+RUN pip install --upgrade --no-cache-dir pip

 # install pytest
 RUN pip install --no-cache-dir pytest
@@ -14,24 +14,9 @@
 COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

-## Copy the python version of the transform
-COPY --chown=ray:users python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-# Install ray project source
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-COPY --chown=ray:users README.md README.md
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_ray.py .
-
-# copy some of the samples in
-COPY src/doc_id_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install -r requirements.txt

 # Set environment
 ENV PYTHONPATH /home/ray
@@ -40,4 +25,4 @@ ENV PYTHONPATH /home/ray

 ARG BUILD_DATE
 ARG GIT_COMMIT
 LABEL build-date=$BUILD_DATE
-LABEL git-commit=$GIT_COMMIT
+LABEL git-commit=$GIT_COMMIT
Dockerfile.spark

@@ -1,5 +1,4 @@
-ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
-FROM ${BASE_IMAGE}
+FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest

 USER root
 # install pytest
@@ -15,19 +14,12 @@
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]


 # Install project source
-COPY --chown=spark:root src/ src/
-COPY --chown=spark:root pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/doc_id_transform_spark.py .
-
-# copy some of the samples in
-COPY src/doc_id_local_spark.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+## Copy the python version of the transform
+COPY --chown=spark:root dpk_doc_id/ dpk_doc_id/
+COPY --chown=spark:root requirements.txt requirements.txt
+RUN pip install -r requirements.txt

 USER spark
105 changes: 28 additions & 77 deletions transforms/universal/doc_id/Makefile
Collaborator:
The target run-cli-spark-sample doesn't work:

$ make run-cli-spark-sample
. . .
python3.11 -m dpk_doc_id.spark.transform \
                --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}"  \
                --doc_id_int True
usage: transform.py [-h] [--doc_id_doc_column DOC_ID_DOC_COLUMN] [--doc_id_hash_column DOC_ID_HASH_COLUMN] [--doc_id_int_column DOC_ID_INT_COLUMN] [--data_s3_cred DATA_S3_CRED]
                    [--data_s3_config DATA_S3_CONFIG] [--data_local_config DATA_LOCAL_CONFIG] [--data_max_files DATA_MAX_FILES] [--data_checkpointing DATA_CHECKPOINTING]
                    [--data_files_to_checkpoint DATA_FILES_TO_CHECKPOINT] [--data_data_sets DATA_DATA_SETS] [--data_files_to_use DATA_FILES_TO_USE] [--data_num_samples DATA_NUM_SAMPLES]
                    [--runtime_parallelization RUNTIME_PARALLELIZATION] [--runtime_pipeline_id RUNTIME_PIPELINE_ID] [--runtime_job_id RUNTIME_JOB_ID] [--runtime_code_location RUNTIME_CODE_LOCATION]
transform.py: error: unrecognized arguments: --run_locally True
make: *** [Makefile:20: run-cli-spark-sample] Error 2

Author:
Thanks @cmadam. Is this something that you can help fix? If not, don't worry, I will tackle it in the next iteration.
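
As merged, the run-cli-spark-sample target (see the Makefile diff below) no longer passes --run_locally, which the spark CLI does not accept; a sketch of the working invocation from the transform root:

$ python -m dpk_doc_id.spark.transform \
    --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
    --doc_id_int True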

Collaborator:
The target run-cli-ray-sample doesn't work:

$ make run-cli-ray-sample
. . .
python3.11 -m dpk_doc_id.ray.transform \
                --run_locally True --data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}"  \
                --doc_id_int True
17:42:02 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': None, 'int_column': 'True', 'start_id': 0}
17:42:02 INFO - pipeline id pipeline_id
17:42:02 INFO - code location None
17:42:02 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}
17:42:02 INFO - actor creation delay 0
17:42:02 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}
17:42:02 INFO - data factory data_ is using local data access: input_folder - ../test-data/input output_folder - ../output
17:42:02 INFO - data factory data_ max_files -1, n_sample -1
17:42:02 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']
17:42:02 INFO - Running locally
2024-12-06 17:42:05,111	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
(orchestrate pid=165088) 17:42:07 INFO - orchestrator started at 2024-12-06 17:42:07
(orchestrate pid=165088) 17:42:07 ERROR - No input files to process - exiting
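
Here the CLI parses and the Ray orchestrator starts, but exits with "No input files to process". The likely cause is the relative input path: this run passed '../test-data/input', while the merged run-cli-ray-sample target below uses paths relative to the transform root. A sketch of an invocation consistent with the merged target:

$ cd transforms/universal/doc_id
$ python -m dpk_doc_id.ray.transform \
    --run_locally True \
    --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
    --doc_id_int True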

@@ -1,79 +1,30 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
 include $(REPOROOT)/.make.defaults

-setup::
-    @# Help: Recursively make $@ all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-clean::
-    @# Help: Recursively make $@ all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-build::
-    @# Help: Recursively make $@ in subdirs
-    $(MAKE) RULE=$@ .recurse
-venv::
-    @# Help: Recursively make $@ in subdirs
-    $(MAKE) RULE=$@ .recurse
-
-image::
-    @# Help: Recursively make $@ in all subdirs
-    @$(MAKE) RULE=$@ .recurse
-
-publish::
-    @# Help: Recursively make $@ in all subdirs
-    @$(MAKE) RULE=$@ .recurse
-
-test-image::
-    @# Help: Recursively make $@ in all subdirs
-    @$(MAKE) RULE=$@ .recurse
-
-test::
-    @# Help: Recursively make $@ in all subdirs
-    @$(MAKE) RULE=$@ .recurse
-
-test-src::
-    @# Help: Recursively make $@ in all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-set-versions::
-    @# Help: Recursively $@ in all subdirs
-    @$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-    @# Help: Recursively make $@ in all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-    @# Help: Recursively make $@ in all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-    @# Help: Recursively make $@ in all subdirs
-    $(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-    if [ -e kfp_ray ]; then \
-        $(MAKE) -C kfp_ray workflow-venv; \
-    fi
-
-.PHONY: workflow-test
-workflow-test:
-    if [ -e kfp_ray ]; then \
-        $(MAKE) -C kfp_ray workflow-test; \
-    fi
-
-.PHONY: workflow-upload
-workflow-upload:
-    if [ -e kfp_ray ]; then \
-        $(MAKE) -C kfp_ray workflow-upload; \
-    fi
-
-.PHONY: workflow-build
-workflow-build:
-    if [ -e kfp_ray ]; then \
-        $(MAKE) -C kfp_ray workflow-build; \
-    fi
-
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+run-cli-spark-sample:
+    make venv
+    source venv/bin/activate && \
+    $(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \
+        --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
+        --doc_id_int True
+
+run-cli-ray-sample:
+    make venv
+    source venv/bin/activate && \
+    $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
+        --run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
+        --doc_id_int True
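
These samples assume the transform root as the working directory, since TRANSFORM_NAME is derived via basename `pwd` and expands dpk_$(TRANSFORM_NAME) to dpk_doc_id. A usage sketch:

$ cd transforms/universal/doc_id
$ make run-cli-ray-sample   # make venv runs first; output is written to ./output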