diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/code/code2parquet/Dockerfile.python
similarity index 61%
rename from transforms/code/code2parquet/python/Dockerfile
rename to transforms/code/code2parquet/Dockerfile.python
index 7abaaeb45a..9f38097b72 100644
--- a/transforms/code/code2parquet/python/Dockerfile
+++ b/transforms/code/code2parquet/Dockerfile.python
@@ -10,28 +10,18 @@
 RUN useradd -ms /bin/bash dpk
 USER dpk
 WORKDIR /home/dpk
 ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=dpk:root data-processing-dist data-processing-dist
+COPY --chown=dpk:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
 # END OF STEPS destined for a data-prep-kit base image
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
-COPY --chown=dpk:root requirements.txt requirements.txt
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/code2parquet_transform_python.py .
-
-# copy some of the samples in
-COPY ./src/code2parquet_local.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chown=dpk:users requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 # Set environment
 ENV PYTHONPATH /home/dpk
diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/Dockerfile.ray
similarity index 54%
rename from transforms/code/code2parquet/ray/Dockerfile
rename to transforms/code/code2parquet/Dockerfile.ray
index 6681f09d50..b8e52425b0 100644
--- a/transforms/code/code2parquet/ray/Dockerfile
+++ b/transforms/code/code2parquet/Dockerfile.ray
@@ -1,41 +1,27 @@
 ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
-
 FROM ${BASE_IMAGE}
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod g=u /home/ray
+RUN chown ray:root /home/ray && chmod 775 /home/ray
 USER ray
 RUN pip install --upgrade --no-cache-dir pip
 # install pytest
 RUN pip install --no-cache-dir pytest
-
 ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
 COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-# Install ray project source
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py .
-
-# copy some of the samples in
-COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/
-# copy test
-COPY --chmod=775 --chown=ray:root test/ test/
-COPY --chmod=775 --chown=ray:root test-data/ test-data/
+COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/code/code2parquet/Makefile b/transforms/code/code2parquet/Makefile
index 027d29644f..21c4727efe 100644
--- a/transforms/code/code2parquet/Makefile
+++ b/transforms/code/code2parquet/Makefile
@@ -1,79 +1,32 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
-
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-set-versions:
-	@# Help: Recursively $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-venv; \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-test; \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-build; \
-	fi
-
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+
+
+run-cli-sample:
+	make venv
+	source venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
+	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}"
+
+
+
+run-ray-cli-sample:
+	make venv
+	source venv/bin/activate && \
+	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
+	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
+	--run_locally True
+
diff --git a/transforms/code/code2parquet/README.md b/transforms/code/code2parquet/README.md
deleted file mode 100644
index 56051ef160..0000000000
--- a/transforms/code/code2parquet/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Code2Parquet Transform
-This code2parquet transform is designed to convert raw particularly ZIP files contain programming files (.py, .c, .java, etc) ,
-into Parquet format.
-Per the set of
-[transform project conventions](../../README.md#transform-project-conventions)
-the following runtimes are available:
-
-* [python](python/README.md) - provides the base python-based transformation
-implementation and python runtime.
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [kfp_ray](kfp_ray/README.md) - enables running the ray docker image
-in a kubernetes cluster using a generated `yaml` file.
diff --git a/transforms/code/code2parquet/code2parquet-ray.ipynb b/transforms/code/code2parquet/code2parquet-ray.ipynb
new file mode 100644
index 0000000000..c608ba4f3e
--- /dev/null
+++ b/transforms/code/code2parquet/code2parquet-ray.ipynb
@@ -0,0 +1,199 @@
+{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fd7d20d2-1001-420a-a6b6-1efbca17ae8b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-02-13 16:49:38,711	INFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "from dpk_code2parquet.ray.transform import Code2Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5957a088-10a0-41fd-9b82-c329f088f2c8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "16:49:41 INFO - data factory code2parquet_ is using local configuration without input/output path\n", + "16:49:41 INFO - data factory code2parquet_ max_files -1, n_sample -1\n", + "16:49:41 INFO - data factory code2parquet_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "16:49:41 INFO - pipeline id pipeline_id\n", + "16:49:41 INFO - code location None\n", + "16:49:41 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "16:49:41 INFO - actor creation delay 0\n", + "16:49:41 INFO - job details {'job category': 'preprocessing', 'job name': 'code2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "16:49:41 INFO - data factory data_ is using local data access: input_folder - ./test-data/input/ output_folder - code2parquet_output\n", + "16:49:41 INFO - data factory data_ max_files -1, n_sample -1\n", + "16:49:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.zip'], files to checkpoint ['.parquet']\n", + "16:49:41 INFO - Running locally\n", + "2025-02-13 16:49:44,109\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:45 INFO - orchestrator started at 2025-02-13 16:49:45\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:45 INFO - Number of files is 3, source profile {'max_file_size': 27.35206699371338, 'min_file_size': 0.11289310455322266, 'total_file_size': 32.31587600708008}\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:45 INFO - Cluster resources: {'cpus': 16, 'gpus': 0, 'memory': 32.681750488467515, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:45 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:46 INFO - Completed 1 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:46 INFO - Completed 2 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:46 INFO - Completed 2 files (66.667%) in 0.001 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:46 INFO - Completed processing 3 files in 0.002 min\n", + "\u001b[36m(orchestrate pid=95855)\u001b[0m 16:49:46 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/application-java.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/fabric-gateway-java-2.1.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/fabric-sdk-java-2.1.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-protobuf-1.23.0.jar is empty. 
content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/protobuf-java-util-3.10.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/api-common-1.9.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/milagro-crypto-java-0.4.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-stub-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-netty-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-core-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-protobuf-lite-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-api-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/guava-29.0-jre.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/failureaccess-1.0.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/perfmark-api-0.17.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/jsr305-3.0.2.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/checker-qual-2.11.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/error_prone_annotations-2.3.4.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/j2objc-annotations-1.3.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/cloudant-client-2.19.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-tcnative-boringssl-static-2.0.30.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-codec-http2-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/protobuf-java-3.10.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/bcpkix-jdk15on-1.62.jar is empty. 
content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/httpclient-4.5.12.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-logging-1.2.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-cli-1.4.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-compress-1.20.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/cloudant-http-2.19.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-io-2.6.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/apache-log4j-extras-1.2.17.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/log4j-1.2.17.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/futures-extra-4.2.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/javax.json-1.1.4.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/snakeyaml-1.26.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/jaxb-api-2.3.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/javax.annotation-api-1.3.2.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/gson-2.8.5.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-codec-1.11.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-handler-proxy-4.1.38.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/proto-google-common-protos-1.12.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-codec-http-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-handler-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-codec-socks-4.1.38.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-codec-4.1.49.Final.jar is empty. 
content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-transport-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-buffer-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-resolver-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/netty-common-4.1.49.Final.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/bcprov-jdk15on-1.62.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/httpcore-4.4.13.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/auto-value-annotations-1.7.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/commons-math3-3.6.1.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/javax.activation-api-1.2.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/annotations-4.1.1.4.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/opencensus-contrib-grpc-metrics-0.21.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/opencensus-api-0.21.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/grpc-context-1.23.0.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file application-java/lib/animal-sniffer-annotations-1.17.jar is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/daf/input/ds1/sample2.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/daf/input/ds1/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/daf/input/ds2/sample3.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/daf/output/ds1/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/input/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/ray/noop/input/subdir/test1.parquet is empty. 
content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/ray/noop/input/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/ray/noop/expected/subdir/test1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/ray/noop/expected/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/input_multiple/sample2.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/input_multiple/sample3.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/test-data/data_processing/input_multiple/sample1.parquet is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/src/data_processing_ray/test_support/__init__.py is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file ray/data-processing-lib.zip is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file environments-master/cfortunes/diebenkorn_notes.dat is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file environments-master/cfortunes/obliquestrategies.dat is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file environments-master/commands/grel is empty. content , skipping\n", + "\u001b[36m(RayTransformFileProcessor pid=95871)\u001b[0m 16:49:46 WARNING - file environments-master/commands/ldid is empty. 
content , skipping\n", + "16:49:56 INFO - Completed execution in 0.255 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code2Parquet(\n", + " input_folder=\"./test-data/input/\",\n", + " output_folder=\"code2parquet_output\",\n", + " data_files_to_use = ['.zip',],\n", + " supported_languages = \"./test-data/languages/lang_extensions.json\",\n", + " detect_programming_lang = True,\n", + " run_locally = True\n", + " ).transform()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "96b4b004-cb01-46d1-bff9-b8b08de2e691", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "application-java.parquet\n", + "data-processing-lib.parquet\n", + "https___github.com_00000o1_environments_archive_refs_heads_master.parquet\n", + "metadata.json\n" + ] + } + ], + "source": [ + "!ls output/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/code2parquet/code2parquet.ipynb b/transforms/code/code2parquet/code2parquet.ipynb new file mode 100644 index 0000000000..52944835e5 --- /dev/null +++ b/transforms/code/code2parquet/code2parquet.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9d0cf445-a5e8-4e1f-a8de-55bb51143597", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_code2parquet.transform_python import Code2Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f194bdb9-cbc4-4435-9639-2559bc39cbdc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "16:43:58 INFO - data factory code2parquet_ is using local configuration without input/output path\n", + "16:43:58 INFO - data factory code2parquet_ max_files -1, n_sample -1\n", + "16:43:58 INFO - data factory code2parquet_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "16:43:58 INFO - pipeline id pipeline_id\n", + "16:43:58 INFO - code location None\n", + "16:43:58 INFO - data factory data_ is using local data access: input_folder - ./test-data/input/ output_folder - code2parquet_output\n", + "16:43:58 INFO - data factory data_ max_files -1, n_sample -1\n", + "16:43:58 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.zip'], files to checkpoint ['.parquet']\n", + "16:43:58 INFO - orchestrator code2parquet started at 2025-02-13 16:43:58\n", + "16:43:58 INFO - Number of files is 3, source profile {'max_file_size': 27.35206699371338, 'min_file_size': 0.11289310455322266, 'total_file_size': 32.31587600708008}\n", + "16:43:59 WARNING - file application-java/lib/application-java.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/fabric-gateway-java-2.1.1.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/fabric-sdk-java-2.1.1.jar is empty. 
content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-protobuf-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/protobuf-java-util-3.10.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/api-common-1.9.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/milagro-crypto-java-0.4.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-stub-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-netty-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-core-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-protobuf-lite-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-api-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/guava-29.0-jre.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/failureaccess-1.0.1.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/perfmark-api-0.17.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/jsr305-3.0.2.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/checker-qual-2.11.1.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/error_prone_annotations-2.3.4.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/j2objc-annotations-1.3.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/cloudant-client-2.19.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-tcnative-boringssl-static-2.0.30.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-codec-http2-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/protobuf-java-3.10.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/bcpkix-jdk15on-1.62.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/httpclient-4.5.12.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-logging-1.2.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-cli-1.4.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-compress-1.20.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/cloudant-http-2.19.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-io-2.6.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/apache-log4j-extras-1.2.17.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/log4j-1.2.17.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/futures-extra-4.2.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/javax.json-1.1.4.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/snakeyaml-1.26.jar is empty. 
content , skipping\n", + "16:43:59 WARNING - file application-java/lib/jaxb-api-2.3.1.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/javax.annotation-api-1.3.2.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/gson-2.8.5.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-codec-1.11.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-handler-proxy-4.1.38.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/proto-google-common-protos-1.12.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-codec-http-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-handler-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-codec-socks-4.1.38.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-codec-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-transport-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-buffer-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-resolver-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/netty-common-4.1.49.Final.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/bcprov-jdk15on-1.62.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/httpcore-4.4.13.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/auto-value-annotations-1.7.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/commons-math3-3.6.1.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/javax.activation-api-1.2.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/annotations-4.1.1.4.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/opencensus-contrib-grpc-metrics-0.21.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/opencensus-api-0.21.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/grpc-context-1.23.0.jar is empty. content , skipping\n", + "16:43:59 WARNING - file application-java/lib/animal-sniffer-annotations-1.17.jar is empty. content , skipping\n", + "16:43:59 INFO - Completed 1 files (33.33%) in 0.002 min\n", + "16:43:59 WARNING - file ray/test-data/data_processing/daf/input/ds1/sample2.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/daf/input/ds1/sample1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/daf/input/ds2/sample3.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/daf/output/ds1/sample1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/input/sample1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/ray/noop/input/subdir/test1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/ray/noop/input/sample1.parquet is empty. 
content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/ray/noop/expected/subdir/test1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/ray/noop/expected/sample1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/input_multiple/sample2.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/input_multiple/sample3.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/test-data/data_processing/input_multiple/sample1.parquet is empty. content , skipping\n", + "16:43:59 WARNING - file ray/src/data_processing_ray/test_support/__init__.py is empty. content , skipping\n", + "16:43:59 WARNING - file ray/data-processing-lib.zip is empty. content , skipping\n", + "16:43:59 INFO - Completed 2 files (66.67%) in 0.002 min\n", + "16:43:59 WARNING - file environments-master/cfortunes/diebenkorn_notes.dat is empty. content , skipping\n", + "16:43:59 WARNING - file environments-master/cfortunes/obliquestrategies.dat is empty. content , skipping\n", + "16:43:59 WARNING - file environments-master/commands/grel is empty. content , skipping\n", + "16:43:59 WARNING - file environments-master/commands/ldid is empty. content , skipping\n", + "16:43:59 INFO - Completed 3 files (100.0%) in 0.002 min\n", + "16:43:59 INFO - Done processing 3 files, waiting for flush() completion.\n", + "16:43:59 INFO - done flushing in 0.0 sec\n", + "16:43:59 INFO - Completed execution in 0.003 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SD: {'data_files_to_use': ['.zip'], 'data_local_config': \"{'input_folder': './test-data/input/', 'output_folder': 'code2parquet_output'}\", 'code2parquet_detect_programming_lang': True, 'code2parquet_supported_langs_file': './test-data/languages/lang_extensions.json'}\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code2Parquet(\n", + " input_folder=\"./test-data/input/\",\n", + " output_folder=\"code2parquet_output\",\n", + " data_files_to_use = ['.zip',],\n", + " supported_languages = \"./test-data/languages/lang_extensions.json\",\n", + " detect_programming_lang = True\n", + ").transform()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "72581e48-ab1e-46fa-9ebb-a7864001fa0e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/shivdeep/workspace/projects/current/dpk-newapi/transforms/code/code2parquet\n" + ] + } + ], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22afe1c0-1c16-4176-9c79-871a86cf88fa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/code2parquet/python/src/code2parquet_local.py b/transforms/code/code2parquet/dpk_code2parquet/local.py similarity index 96% rename from transforms/code/code2parquet/python/src/code2parquet_local.py rename to 
transforms/code/code2parquet/dpk_code2parquet/local.py index 8ebd4370b8..05bbdc98d0 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local.py +++ b/transforms/code/code2parquet/dpk_code2parquet/local.py @@ -13,13 +13,13 @@ import ast import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from data_processing.data_access import DataAccessFactory, DataAccessLocal +from dpk_code2parquet.transform import ( # domain_key,; snapshot_key, CodeToParquetTransform, data_factory_key, detect_programming_lang_key, supported_langs_file_key, ) -from data_processing.data_access import DataAccessFactory, DataAccessLocal supported_languages_file = os.path.abspath( diff --git a/transforms/code/code2parquet/python/src/code2parquet_local_python.py b/transforms/code/code2parquet/dpk_code2parquet/local_python.py similarity index 93% rename from transforms/code/code2parquet/python/src/code2parquet_local_python.py rename to transforms/code/code2parquet/dpk_code2parquet/local_python.py index 66713a02f3..fe8e35941b 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local_python.py +++ b/transforms/code/code2parquet/dpk_code2parquet/local_python.py @@ -14,13 +14,13 @@ import os import sys -from code2parquet_transform import ( # domain_key,; snapshot_key, +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from dpk_code2parquet.transform import ( detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils +from dpk_code2parquet.transform_python import CodeToParquetPythonConfiguration # create parameters diff --git a/transforms/code/code2parquet/dpk_code2parquet/ray/__init__.py b/transforms/code/code2parquet/dpk_code2parquet/ray/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py b/transforms/code/code2parquet/dpk_code2parquet/ray/transform.py similarity index 71% rename from transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py rename to transforms/code/code2parquet/dpk_code2parquet/ray/transform.py index 5c81ca910b..dace36239e 100644 --- a/transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py +++ b/transforms/code/code2parquet/dpk_code2parquet/ray/transform.py @@ -10,18 +10,13 @@ # limitations under the License. 
################################################################################ +import os +import sys from typing import Any import ray -from code2parquet_transform import ( - CodeToParquetTransform, - CodeToParquetTransformConfiguration, - data_factory_key, - get_supported_languages, - supported_langs_file_key, - supported_languages_key, -) from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import ParamsUtils, get_logger from data_processing_ray.runtime.ray import ( DefaultRayTransformRuntime, RayTransformLauncher, @@ -29,6 +24,18 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from dpk_code2parquet.transform import ( + CodeToParquetTransform, + CodeToParquetTransformConfiguration, + data_factory_key, + detect_programming_lang_cli_key, + detect_programming_lang_default, + detect_programming_lang_key, + get_supported_languages, + supported_langs_file_cli_key, + supported_langs_file_key, + supported_languages_key, +) from ray.actor import ActorHandle @@ -118,6 +125,48 @@ def __init__(self): ) +class Code2Parquet: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + + detect_programming_lang_key = "detect_programming_lang" + + if detect_programming_lang_key not in self.params: + self.params[detect_programming_lang_cli_key] = detect_programming_lang_default + else: + self.params[detect_programming_lang_cli_key] = self.params[detect_programming_lang_key] + del self.params[detect_programming_lang_key] + + if "supported_languages" not in self.params: + supported_languages_file = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../test-data/languages/lang_extensions.json") + ) + self.params[supported_langs_file_cli_key] = supported_languages_file + else: + self.params[supported_langs_file_cli_key] = self.params["supported_languages"] + del self.params["supported_languages"] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = RayTransformLauncher(CodeToParquetRayConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + if __name__ == "__main__": launcher = RayTransformLauncher(CodeToParquetRayConfiguration()) launcher.launch() diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform.py b/transforms/code/code2parquet/dpk_code2parquet/transform.py similarity index 100% rename from transforms/code/code2parquet/python/src/code2parquet_transform.py rename to transforms/code/code2parquet/dpk_code2parquet/transform.py diff --git a/transforms/code/code2parquet/dpk_code2parquet/transform_python.py b/transforms/code/code2parquet/dpk_code2parquet/transform_python.py new file mode 100644 index 0000000000..fe4fd002e9 --- /dev/null +++ b/transforms/code/code2parquet/dpk_code2parquet/transform_python.py @@ -0,0 +1,88 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import ParamsUtils, get_logger +from dpk_code2parquet.transform import ( + CodeToParquetTransform, + CodeToParquetTransformConfiguration, + data_factory_key, + detect_programming_lang_cli_key, + detect_programming_lang_default, + detect_programming_lang_key, + get_supported_languages, + supported_langs_file_cli_key, + supported_langs_file_key, +) + + +logger = get_logger(__name__) + + +class CodeToParquetPythonConfiguration(PythonTransformRuntimeConfiguration): + def __init__(self): + super().__init__(transform_config=CodeToParquetTransformConfiguration(transform_class=CodeToParquetTransform)) + + +class Code2Parquet: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + + detect_programming_lang_key = "detect_programming_lang" + + if detect_programming_lang_key not in self.params: + self.params[detect_programming_lang_cli_key] = detect_programming_lang_default + else: + self.params[detect_programming_lang_cli_key] = self.params[detect_programming_lang_key] + del self.params[detect_programming_lang_key] + + if "supported_languages" not in self.params: + supported_languages_file = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../test-data/languages/lang_extensions.json") + ) + self.params[supported_langs_file_cli_key] = supported_languages_file + else: + self.params[supported_langs_file_cli_key] = self.params["supported_languages"] + del self.params["supported_languages"] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/transforms/code/code2parquet/kfp_ray/Makefile b/transforms/code/code2parquet/kfp_ray/Makefile index 847a743b8a..7244ce1427 100644 --- a/transforms/code/code2parquet/kfp_ray/Makefile +++ b/transforms/code/code2parquet/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is 
built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -15,29 +20,8 @@ workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} .PHONY: clean clean: @# Help: Clean up the virtual environment. - rm -rf ${REPOROOT}/transforms/venv + rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -publish:: - -image:: - -test-image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,14 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=code2parquet_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index c5cba02308..30fa74f146 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -24,7 +24,7 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_code2parquet.ray.transform" task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest" @@ -101,10 +101,17 @@ def compute_exec_params_func( ) def code2parquet( ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster - ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/code2parquet/input', 'output_folder': 'test/code2parquet/output/'}", @@ -113,9 +120,9 @@ def code2parquet( data_num_samples: int = -1, data_files_to_use: str = "['.zip']", # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # code to parquet code2parquet_supported_langs_file: str = "test/code2parquet/languages/lang_extensions.json", 
code2parquet_detect_programming_lang: bool = True, @@ -173,13 +180,17 @@ def code2parquet( # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": - print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - "same version of the same pipeline !!!") + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!" + ) run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/code2parquet/python/.dockerignore b/transforms/code/code2parquet/python/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/code2parquet/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/code2parquet/python/.gitignore b/transforms/code/code2parquet/python/.gitignore deleted file mode 100644 index 17cee1df3a..0000000000 --- a/transforms/code/code2parquet/python/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/code2parquet/python/Makefile b/transforms/code/code2parquet/python/Makefile deleted file mode 100644 index e27e402c7c..0000000000 --- a/transforms/code/code2parquet/python/Makefile +++ /dev/null @@ -1,71 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE2PARQUET_PYTHON_VERSION) TOML_VERSION=$(CODE2PARQUET_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ - RUN_ARGS=" \ - --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ - --data_files_to_use \"['.zip']\" \ - --code2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ - --code2parquet_detect_programming_lang True " \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/code2parquet/python/README.md b/transforms/code/code2parquet/python/README.md deleted file mode 100644 index 33df6ec045..0000000000 --- a/transforms/code/code2parquet/python/README.md +++ /dev/null @@ -1,135 +0,0 @@ -# Code2Parquet - -## Summary -This code2parquet transform is designed to convert raw particularly ZIP files contain programming files (.py, .c, .java, etc) , -into Parquet format. -As a transform It is built to handle concurrent processing of Ray-based -multiple files using multiprocessing for efficient execution. -Each file contained within the ZIP is transformed into a distinct row within the Parquet dataset, adhering to the below schema. - -**title:** (string) - -- **Description:** Path to the file within the ZIP archive. -- **Example:** `"title": "data/file.txt"` - -**document:** (string) - -- **Description:** Name of the ZIP file containing the current file. -- **Example:** `"document": "example.zip"` - -**repo_name:** - -- **Description:** The name of the repository to which the code belongs. This should match the name of the zip file containing the repository. -- **Example:** `"repo_name": "example"` - -**contents:** (string) - -- **Description:** Content of the file, converted to a string. -- **Example:** `"contents": "This is the content of the file."` - -**document_id:** (string) - -- **Description:** Unique identifier computed as a uuid. -- **Example:** `"document_id": "b1e4a879-41c5-4a6d-a4a8-0d7a53ec7e8f"` - -**ext:** (string) - -- **Description:** File extension extracted from the file path. -- **Example:** `"ext": ".txt"` - -**hash:** (string) - -- **Description:** sha256 hash value computed from the file content string. -- **Example:** `"hash": "a1b2c3d4"` - -**size:** (int64) - -- **Description:** Size of the file content in bytes. -- **Example:** `"size": 1024` - -**date_acquired:** (string) - -- **Description:** Timestamp indicating when the file was processed. -- **Example:** `"date_acquired": "2024-03-25T12:00:00"` - -**snapshot:** (string)(optional) - -- **Description:** Name indicating which dataset it belong to. -- **Example:** `"snapshot": "github"` - -**programming_language:** (string)(optional) - -- **Description:** Programming language detected using the file extension. -- **Example:** `"programming_language": "Java"` - -**domain:** (string)(optional) - -- **Description:** Name indicating which domain it belong to, whether code, natural language etc.. 
-- **Example:** `"domain": "code"` - - - -## Configuration - -The set of dictionary keys holding [code2parquet](src/code2parquet_transform.py) -configuration for values are as follows: - -The transform can be configured with the following key/value pairs -from the configuration dictionary. -* `supported_languages` - a dictionary mapping file extensions to language names. -* `supported_langs_file` - used if `supported_languages` key is not provided, - and specifies the path to a JSON file containing the mapping of languages - to extensions. The json file is expected to contain a dictionary of - languages names as keys, with values being a list of strings specifying the - associated extensions. As an example, see - [lang_extensions](test-data/languages/lang_extensions.json) . -* `data_access_factory` - used to create the DataAccess instance used to read -the file specified in `supported_langs_file`. -* `detect_programming_lang` - a flag that indicates if the language:extension mappings - should be applied in a new column value named `programming_language`. -* `domain` - optional value assigned to the imported data in the 'domain' column. -* `snapshot` - optional value assigned to the imported data in the 'snapshot' column. - -## Running - -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/launcher-options.md). - -* `--code2parquet_supported_langs_file` - set the `supported_langs_file` configuration key. -* `--code2parquet_detect_programming_lang` - set the `detect_programming_lang` configuration key. -* `--code2parquet_domain` - set the `domain` configuration key. -* `--code2parquet_snapshot` - set the `snapshot` configuration key. - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/code2parquet_transform_ray.py using command line args -* `run-local-sample` - runs src/code2parquet.py -* `run-s3-sample` - runs src/code2parquet.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. ---------------------------------- - - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
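The deleted README above documents both the output schema and the `code2parquet_*` CLI options. As a quick sanity check against that schema, the following minimal sketch reads one produced Parquet file with pyarrow and reports any documented columns that are missing; the `output/application-java.parquet` path is only a placeholder for wherever a prior run wrote its results, and the column list is taken from the removed README rather than from any new behaviour.

```python
# Verify a code2parquet result against the columns documented in the removed README.
import pyarrow.parquet as pq

EXPECTED_COLUMNS = {
    "title", "document", "repo_name", "contents", "document_id",
    "ext", "hash", "size", "date_acquired",
    # "snapshot", "programming_language" and "domain" are optional columns,
    # emitted only when the corresponding configuration keys are set.
}

table = pq.read_table("output/application-java.parquet")  # placeholder path
present = set(table.column_names)
print(f"{table.num_rows} rows; columns: {sorted(present)}")
missing = EXPECTED_COLUMNS - present
if missing:
    print(f"warning: documented columns not found: {sorted(missing)}")
```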
diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml deleted file mode 100644 index 9e895b44b7..0000000000 --- a/transforms/code/code2parquet/python/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_code2parquet_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "code2parquet Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt deleted file mode 100644 index d871e3142f..0000000000 --- a/transforms/code/code2parquet/python/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -data-prep-toolkit>=0.2.3 -parameterized -pandas diff --git a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py b/transforms/code/code2parquet/python/src/code2parquet_s3_python.py deleted file mode 100644 index ca26b19cdf..0000000000 --- a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py +++ /dev/null @@ -1,61 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import ast -import sys - -from code2parquet_transform import ( # domain_key,; snapshot_key, - detect_programming_lang_cli_key, - supported_langs_file_cli_key, -) -from code2parquet_transform_python import CodeToParquetPythonConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import GB, ParamsUtils - - -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} -s3_conf = { - "input_folder": "test/ingest_2_parquet/input", - "output_folder": "test/ingest_2_parquet/output", -} -worker_options = {"num_cpus": 0.8, "memory": 2 * GB} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -ingest_config = { - supported_langs_file_cli_key: "test/ingest_2_parquet/languages/lang_extensions.json", - detect_programming_lang_cli_key: True, - # snapshot_key: "github", - # domain_key: "code", - "code2parquet_s3_cred": ParamsUtils.convert_to_ast(s3_cred), -} - -params = { - # Data access. Only required parameters are specified - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_files_to_use": ast.literal_eval("['.zip']"), - # orchestrator - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -} - -if __name__ == "__main__": - sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config)) - # create launcher - launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) - # launch - launcher.launch() diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py b/transforms/code/code2parquet/python/src/code2parquet_transform_python.py deleted file mode 100644 index ea09a18084..0000000000 --- a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py +++ /dev/null @@ -1,39 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -from code2parquet_transform import ( - CodeToParquetTransform, - CodeToParquetTransformConfiguration, - data_factory_key, - get_supported_languages, - supported_langs_file_key, -) -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) -from data_processing.utils import get_logger - - -logger = get_logger(__name__) - - -class CodeToParquetPythonConfiguration(PythonTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=CodeToParquetTransformConfiguration(transform_class=CodeToParquetTransform)) - - -if __name__ == "__main__": - # launcher = NOOPRayLauncher() - launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) - logger.info("Launching noop transform") - launcher.launch() diff --git a/transforms/code/code2parquet/ray/.dockerignore b/transforms/code/code2parquet/ray/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/code2parquet/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/code2parquet/ray/.gitignore b/transforms/code/code2parquet/ray/.gitignore deleted file mode 100644 index 17cee1df3a..0000000000 --- a/transforms/code/code2parquet/ray/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/code2parquet/ray/Makefile b/transforms/code/code2parquet/ray/Makefile deleted file mode 100644 index 42383457f9..0000000000 --- a/transforms/code/code2parquet/ray/Makefile +++ /dev/null @@ -1,70 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# set the version of python transform that this depends on. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE2PARQUET_PYTHON_VERSION) TOML_VERSION=$(CODE2PARQUET_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True \ - --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ - --data_files_to_use \"['.zip']\" \ - --code2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ - --code2parquet_detect_programming_lang True " \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/code2parquet/ray/README.md b/transforms/code/code2parquet/ray/README.md deleted file mode 100644 index 658b6cdf64..0000000000 --- a/transforms/code/code2parquet/ray/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# NOOP Ray Transform -Please see the set of -[transform project conventions](../../../README.md#transform-project-conventions) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -This project wraps the [code2parquet transform](../python) with a Ray runtime. - -## Configuration and command line Options - -code2parquet transform configuration and command line options are the same as for the base python transform. - -## Running - -### Launched Command Line Options -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[launcher options](../../../../data-processing-lib/doc/launcher-options.md) are available. - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/code2parquet_transform.py using command line args -* `run-local-sample` - runs src/code2parquet_local_ray.py -* `run-s3-sample` - runs src/code2parquet_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
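The run targets in the deleted Ray Makefile and README ultimately drove the sample launcher `src/code2parquet_local_ray.py`, which is itself removed later in this patch. For reference, a condensed sketch of that pre-refactor invocation is shown below; it uses the module names being deleted here, so it applies only to the old `ray/src` layout, and the folder paths are placeholders.

```python
# Condensed from the removed code2parquet_local_ray.py: launch the Ray runtime locally.
import ast
import sys

from code2parquet_transform import (
    detect_programming_lang_cli_key,
    supported_langs_file_cli_key,
)
from code2parquet_transform_ray import CodeToParquetRayConfiguration
from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher

params = {
    "run_locally": True,
    "data_local_config": ParamsUtils.convert_to_ast(
        {"input_folder": "test-data/input", "output_folder": "output"}
    ),
    "data_files_to_use": ast.literal_eval("['.zip']"),
    supported_langs_file_cli_key: "test-data/languages/lang_extensions.json",
    detect_programming_lang_cli_key: True,
}

if __name__ == "__main__":
    # Convert the dict to CLI-style arguments and launch the Ray transform.
    sys.argv = ParamsUtils.dict_to_req(d=params)
    RayTransformLauncher(CodeToParquetRayConfiguration()).launch()
```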
diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml deleted file mode 100644 index 55157457cb..0000000000 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ /dev/null @@ -1,48 +0,0 @@ -[project] -name = "dpk_code2parquet_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "code2parquet Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[ray]>=0.2.4.dev0", - "dpk-code2parquet-transform-python==0.2.4.dev0", - "parameterized", - "pandas", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/code2parquet/ray/src/code2parquet_local_ray.py b/transforms/code/code2parquet/ray/src/code2parquet_local_ray.py deleted file mode 100644 index 1f2e4a0087..0000000000 --- a/transforms/code/code2parquet/ray/src/code2parquet_local_ray.py +++ /dev/null @@ -1,63 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import ast -import os -import sys - -from code2parquet_transform import ( - detect_programming_lang_cli_key, - supported_langs_file_cli_key, -) -from code2parquet_transform_ray import CodeToParquetRayConfiguration -from data_processing.utils import GB, ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher - - -# create parameters -supported_languages_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../test-data/languages/lang_extensions.json") -) -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) -local_conf = { - "input_folder": input_folder, - "output_folder": output_folder, -} -worker_options = {"num_cpus": 0.8, "memory": 2 * GB} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -ingest_config = { - supported_langs_file_cli_key: supported_languages_file, - detect_programming_lang_cli_key: True, -} - -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_local_config": ParamsUtils.convert_to_ast(local_conf), - "data_files_to_use": ast.literal_eval("['.zip']"), - # orchestrator - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 3, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -} - -if __name__ == "__main__": - sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config)) - # create launcher - launcher = RayTransformLauncher(CodeToParquetRayConfiguration()) - # launch - launcher.launch() diff --git a/transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py b/transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py deleted file mode 100644 index 783edd60c6..0000000000 --- a/transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py +++ /dev/null @@ -1,64 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import ast -import sys - -from code2parquet_transform import ( - detect_programming_lang_cli_key, - supported_langs_file_cli_key, -) -from code2parquet_transform_ray import CodeToParquetRayConfiguration -from data_processing.utils import GB, ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher - - -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} -s3_conf = { - "input_folder": "test/ingest_2_parquet/input", - "output_folder": "test/ingest_2_parquet/output", -} -worker_options = {"num_cpus": 0.8, "memory": 2 * GB} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -ingest_config = { - supported_langs_file_cli_key: "test/ingest_2_parquet/languages/lang_extensions.json", - detect_programming_lang_cli_key: True, - "code2parquet_s3_cred": ParamsUtils.convert_to_ast(s3_cred), -} - -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_files_to_use": ast.literal_eval("['.zip']"), - # orchestrator - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 3, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -} - -if __name__ == "__main__": - sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config)) - # create launcher - launcher = RayTransformLauncher(CodeToParquetRayConfiguration()) - # launch - launcher.launch() diff --git a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet b/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet deleted file mode 100644 index 68be60a255..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet b/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet deleted file mode 100644 index f9c39bb6f8..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet deleted file mode 100644 index bee0b0abcb..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/metadata.json b/transforms/code/code2parquet/ray/test-data/expected/metadata.json deleted file mode 100644 index 5c2c6d0a0a..0000000000 --- a/transforms/code/code2parquet/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-07-25 15:38:20", - "end_time": "2024-07-25 15:38:21", - "status": "success" - }, - "code": null, - 
"job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": null, - "domain": null, - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 70167, - "processing_time": 1.5678541660308838, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetbl3prm61", - "type": "path" - } -} diff --git a/transforms/code/code2parquet/ray/test-data/input/application-java.zip b/transforms/code/code2parquet/ray/test-data/input/application-java.zip deleted file mode 100755 index 7cb7cd976c..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/input/application-java.zip and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/input/data-processing-lib.zip b/transforms/code/code2parquet/ray/test-data/input/data-processing-lib.zip deleted file mode 100644 index 069bc536a5..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/input/data-processing-lib.zip and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip b/transforms/code/code2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip deleted file mode 100644 index ba239d5523..0000000000 Binary files a/transforms/code/code2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip and /dev/null differ diff --git a/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json b/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json deleted file mode 100644 index 45c20ff929..0000000000 --- a/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json +++ /dev/null @@ -1,566 +0,0 @@ -{ - "ABAP": [".abap"], - "AGS Script": [".ash"], - "AMPL": [".ampl"], - "ANTLR": [".g4"], - "API Blueprint": [".apib"], - "APL": [".apl", ".dyalog"], - "ASP": [".asp", ".asax", ".ascx", ".ashx", ".asmx", ".aspx", ".axd"], - "ATS": [".dats", ".hats", ".sats"], - "ActionScript": [".as"], - "Ada": [".adb", ".ada", ".ads"], - "Agda": [".agda"], - "Alloy": [".als"], - "ApacheConf": [".apacheconf", ".vhost"], - "AppleScript": [".applescript", ".scpt"], - "Arc": [".arc"], - "Arduino": [".ino"], - "AsciiDoc": [".asciidoc", ".adoc"], - "AspectJ": [".aj"], - "Assembly": [".asm", ".a51", ".nasm"], - "Augeas": [".aug"], - "AutoHotkey": [".ahk", ".ahkl"], - "AutoIt": [".au3"], - "Awk": [".awk", ".auk", ".gawk", ".mawk", ".nawk"], - "Batchfile": [".bat", ".cmd"], - "Befunge": [".befunge"], - "Bison": [".bison"], - "BitBake": [".bb"], - "BlitzBasic": [".decls"], - "BlitzMax": [".bmx"], - "Bluespec": [".bsv"], - "Boo": [".boo"], - "Brainfuck": [".bf"], - "Brightscript": [".brs"], - "Bro": [".bro"], - "C": [".c", ".cats", ".h", ".idc", ".w"], - "C#": [".cs", ".cake", ".cshtml", ".csx"], - "C++": [ - ".cpp", - ".c++", - ".cc", - ".cp", - ".cxx", - ".h++", - ".hh", - ".hpp", - ".hxx", - ".inl", - ".ipp", - ".tcc", - ".tpp", - ".C", - ".H" - ], - "C-ObjDump": [".c-objdump"], - "C2hs Haskell": [".chs"], - "CLIPS": [".clp"], - "CMake": [".cmake", ".cmake.in"], - "COBOL": 
[".cob", ".cbl", ".ccp", ".cobol", ".cpy"], - "CSS": [".css"], - "CSV": [".csv"], - "Cap'n Proto": [".capnp"], - "CartoCSS": [".mss"], - "Ceylon": [".ceylon"], - "Chapel": [".chpl"], - "ChucK": [".ck"], - "Cirru": [".cirru"], - "Clarion": [".clw"], - "Clean": [".icl", ".dcl"], - "Click": [".click"], - "Clojure": [".clj", ".boot", ".cl2", ".cljc", ".cljs", ".cljs.hl", ".cljscm", ".cljx", ".hic"], - "CoffeeScript": [".coffee", "._coffee", ".cjsx", ".cson", ".iced"], - "ColdFusion": [".cfm", ".cfml"], - "ColdFusion CFC": [".cfc"], - "Common Lisp": [".lisp", ".asd", ".lsp", ".ny", ".podsl", ".sexp"], - "Component Pascal": [".cps"], - "Coq": [".coq"], - "Cpp-ObjDump": [".cppobjdump", ".c++-objdump", ".c++objdump", ".cpp-objdump", ".cxx-objdump"], - "Creole": [".creole"], - "Crystal": [".cr"], - "Csound": [".csd"], - "Cucumber": [".feature"], - "Cuda": [".cu", ".cuh"], - "Cycript": [".cy"], - "Cython": [".pyx", ".pxd", ".pxi"], - "D": [".di"], - "D-ObjDump": [".d-objdump"], - "DIGITAL Command Language": [".com"], - "DM": [".dm"], - "DNS Zone": [".zone", ".arpa"], - "Darcs Patch": [".darcspatch", ".dpatch"], - "Dart": [".dart"], - "Diff": [".diff", ".patch"], - "Dockerfile": [".dockerfile", "Dockerfile"], - "Dogescript": [".djs"], - "Dylan": [".dylan", ".dyl", ".intr", ".lid"], - "E": [".E"], - "ECL": [".ecl", ".eclxml"], - "Eagle": [".sch", ".brd"], - "Ecere Projects": [".epj"], - "Eiffel": [".e"], - "Elixir": [".ex", ".exs"], - "Elm": [".elm"], - "Emacs Lisp": [".el", ".emacs", ".emacs.desktop"], - "EmberScript": [".em", ".emberscript"], - "Erlang": [".erl", ".escript", ".hrl", ".xrl", ".yrl"], - "F#": [".fs", ".fsi", ".fsx"], - "FLUX": [".flux"], - "FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"], - "Factor": [".factor"], - "Fancy": [".fy", ".fancypack"], - "Fantom": [".fan"], - "Formatted": [".eam.fs"], - "Forth": [".fth", ".4th", ".forth", ".frt"], - "FreeMarker": [".ftl"], - "G-code": [".g", ".gco", ".gcode"], - "GAMS": [".gms"], - "GAP": [".gap", ".gi"], - "GAS": [".s"], - "GDScript": [".gd"], - "GLSL": [ - ".glsl", - ".fp", - ".frag", - ".frg", - ".fsh", - ".fshader", - ".geo", - ".geom", - ".glslv", - ".gshader", - ".shader", - ".vert", - ".vrx", - ".vsh", - ".vshader" - ], - "Genshi": [".kid"], - "Gentoo Ebuild": [".ebuild"], - "Gentoo Eclass": [".eclass"], - "Gettext Catalog": [".po", ".pot"], - "Glyph": [".glf"], - "Gnuplot": [".gp", ".gnu", ".gnuplot", ".plot", ".plt"], - "Go": [".go"], - "Golo": [".golo"], - "Gosu": [".gst", ".gsx", ".vark"], - "Grace": [".grace"], - "Gradle": [".gradle"], - "Grammatical Framework": [".gf"], - "GraphQL": [".graphql"], - "Graphviz (DOT)": [".dot", ".gv"], - "Groff": [ - ".man", - ".1", - ".1in", - ".1m", - ".1x", - ".2", - ".3", - ".3in", - ".3m", - ".3qt", - ".3x", - ".4", - ".5", - ".6", - ".7", - ".8", - ".9", - ".me", - ".rno", - ".roff" - ], - "Groovy": [".groovy", ".grt", ".gtpl", ".gvy"], - "Groovy Server Pages": [".gsp"], - "HCL": [".hcl", ".tf"], - "HLSL": [".hlsl", ".fxh", ".hlsli"], - "HTML": [".html", ".htm", ".html.hl", ".xht", ".xhtml"], - "HTML+Django": [".mustache", ".jinja"], - "HTML+EEX": [".eex"], - "HTML+ERB": [".erb", ".erb.deface"], - "HTML+PHP": [".phtml"], - "HTTP": [".http"], - "Haml": [".haml", ".haml.deface"], - "Handlebars": [".handlebars", ".hbs"], - "Harbour": [".hb"], - "Haskell": [".hs", ".hsc"], - "Haxe": [".hx", ".hxsl"], - "Hy": [".hy"], - "IDL": [".dlm"], - "IGOR Pro": [".ipf"], - "INI": [".ini", ".cfg", ".prefs", ".properties"], - "IRC log": [".irclog", ".weechatlog"], - "Idris": [".idr", 
".lidr"], - "Inform 7": [".ni", ".i7x"], - "Inno Setup": [".iss"], - "Io": [".io"], - "Ioke": [".ik"], - "Isabelle": [".thy"], - "J": [".ijs"], - "JFlex": [".flex", ".jflex"], - "JSON": [".json", ".geojson", ".lock", ".topojson"], - "JSON5": [".json5"], - "JSONLD": [".jsonld"], - "JSONiq": [".jq"], - "JSX": [".jsx"], - "Jade": [".jade"], - "Jasmin": [".j"], - "Java": [".java"], - "Java Server Pages": [".jsp"], - "JavaScript": [ - ".js", - "._js", - ".bones", - ".es6", - ".jake", - ".jsb", - ".jscad", - ".jsfl", - ".jsm", - ".jss", - ".njs", - ".pac", - ".sjs", - ".ssjs", - ".xsjs", - ".xsjslib" - ], - "Julia": [".jl"], - "Jupyter Notebook": [".ipynb"], - "KRL": [".krl"], - "KiCad": [".kicad_pcb"], - "Kit": [".kit"], - "Kotlin": [".kt", ".ktm", ".kts"], - "LFE": [".lfe"], - "LLVM": [".ll"], - "LOLCODE": [".lol"], - "LSL": [".lsl", ".lslp"], - "LabVIEW": [".lvproj"], - "Lasso": [".lasso", ".las", ".lasso8", ".lasso9", ".ldml"], - "Latte": [".latte"], - "Lean": [".lean", ".hlean"], - "Less": [".less"], - "Lex": [".lex"], - "LilyPond": [".ly", ".ily"], - "Linker Script": [".ld", ".lds"], - "Liquid": [".liquid"], - "Literate Agda": [".lagda"], - "Literate CoffeeScript": [".litcoffee"], - "Literate Haskell": [".lhs"], - "LiveScript": [".ls", "._ls"], - "Logos": [".xm", ".x", ".xi"], - "Logtalk": [".lgt", ".logtalk"], - "LookML": [".lookml"], - "Lua": [".lua", ".nse", ".pd_lua", ".rbxs", ".wlua"], - "M": [".mumps"], - "M4": [".m4"], - "MAXScript": [".mcr"], - "MTML": [".mtml"], - "MUF": [".muf"], - "Makefile": [".mak", ".mk", ".mkfile", "Makefile"], - "Mako": [".mako", ".mao"], - "Maple": [".mpl"], - "Markdown": [".md", ".markdown", ".mkd", ".mkdn", ".mkdown", ".ron"], - "Mask": [".mask"], - "Mathematica": [".mathematica", ".cdf", ".ma", ".mt", ".nb", ".nbp", ".wl", ".wlt"], - "Matlab": [".matlab"], - "Max": [".maxpat", ".maxhelp", ".maxproj", ".mxt", ".pat"], - "MediaWiki": [".mediawiki", ".wiki"], - "Metal": [".metal"], - "MiniD": [".minid"], - "Mirah": [".druby", ".duby", ".mir", ".mirah"], - "Modelica": [".mo"], - "Module Management System": [".mms", ".mmk"], - "Monkey": [".monkey"], - "MoonScript": [".moon"], - "Myghty": [".myt"], - "NSIS": [".nsi", ".nsh"], - "NetLinx": [".axs", ".axi"], - "NetLinx+ERB": [".axs.erb", ".axi.erb"], - "NetLogo": [".nlogo"], - "Nginx": [".nginxconf"], - "Nimrod": [".nim", ".nimrod"], - "Ninja": [".ninja"], - "Nit": [".nit"], - "Nix": [".nix"], - "Nu": [".nu"], - "NumPy": [".numpy", ".numpyw", ".numsc"], - "OCaml": [".ml", ".eliom", ".eliomi", ".ml4", ".mli", ".mll", ".mly"], - "ObjDump": [".objdump"], - "Objective-C++": [".mm"], - "Objective-J": [".sj"], - "Octave": [".oct"], - "Omgrofl": [".omgrofl"], - "Opa": [".opa"], - "Opal": [".opal"], - "OpenCL": [".cl", ".opencl"], - "OpenEdge ABL": [".p"], - "OpenSCAD": [".scad"], - "Org": [".org"], - "Ox": [".ox", ".oxh", ".oxo"], - "Oxygene": [".oxygene"], - "Oz": [".oz"], - "PAWN": [".pwn"], - "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"], - "POV-Ray SDL": [".pov"], - "Pan": [".pan"], - "Papyrus": [".psc"], - "Parrot": [".parrot"], - "Parrot Assembly": [".pasm"], - "Parrot Internal Representation": [".pir"], - "Pascal": [".pas", ".dfm", ".dpr", ".lpr"], - "Perl": [".pl", ".al", ".perl", ".ph", ".plx", ".pm", ".psgi", ".t"], - "Perl6": [".6pl", ".6pm", ".nqp", ".p6", ".p6l", ".p6m", ".pl6", ".pm6"], - "Pickle": [".pkl"], - "PigLatin": [".pig"], - "Pike": [".pike", ".pmod"], - "PLI": [".PLI", ".pli", ".inc", ".INC"], - "Pod": [".pod"], - "PogoScript": [".pogo"], - "Pony": [".pony"], - 
"PostScript": [".ps", ".eps"], - "PowerShell": [".ps1", ".psd1", ".psm1"], - "Processing": [".pde"], - "Prolog": [".prolog", ".yap"], - "Propeller Spin": [".spin"], - "Protocol Buffer": [".proto"], - "Public Key": [".pub"], - "Pure Data": [".pd"], - "PureBasic": [".pb", ".pbi"], - "PureScript": [".purs"], - "Python": [".py", ".bzl", ".gyp", ".lmi", ".pyde", ".pyp", ".pyt", ".pyw", ".tac", ".wsgi", ".xpy"], - "Python traceback": [".pytb"], - "QML": [".qml", ".qbs"], - "QMake": [".pri"], - "R": [".r", ".rd", ".rsx"], - "RAML": [".raml"], - "RDoc": [".rdoc"], - "REALbasic": [".rbbas", ".rbfrm", ".rbmnu", ".rbres", ".rbtbar", ".rbuistate"], - "RHTML": [".rhtml"], - "RMarkdown": [".rmd"], - "Racket": [".rkt", ".rktd", ".rktl", ".scrbl"], - "Ragel in Ruby Host": [".rl"], - "Raw token data": [".raw"], - "Rebol": [".reb", ".r2", ".r3", ".rebol"], - "Red": [".red", ".reds"], - "Redcode": [".cw"], - "Ren'Py": [".rpy"], - "RenderScript": [".rsh"], - "RobotFramework": [".robot"], - "Rouge": [".rg"], - "Ruby": [ - ".rb", - ".builder", - ".gemspec", - ".god", - ".irbrc", - ".jbuilder", - ".mspec", - ".podspec", - ".rabl", - ".rake", - ".rbuild", - ".rbw", - ".rbx", - ".ru", - ".ruby", - ".thor", - ".watchr" - ], - "Rust": [".rs", ".rs.in"], - "SAS": [".sas"], - "SCSS": [".scss"], - "SMT": [".smt2", ".smt"], - "SPARQL": [".sparql", ".rq"], - "SQF": [".sqf", ".hqf"], - "SQL": [ - ".pls", - ".pck", - ".pkb", - ".pks", - ".plb", - ".plsql", - ".sql", - ".cql", - ".ddl", - ".prc", - ".tab", - ".udf", - ".viw", - ".db2" - ], - "STON": [".ston"], - "SVG": [".svg"], - "Sage": [".sage", ".sagews"], - "SaltStack": [".sls"], - "Sass": [".sass"], - "Scala": [".scala", ".sbt"], - "Scaml": [".scaml"], - "Scheme": [".scm", ".sld", ".sps", ".ss"], - "Scilab": [".sci", ".sce"], - "Self": [".self"], - "Shell": [".sh", ".bash", ".bats", ".command", ".ksh", ".sh.in", ".tmux", ".tool", ".zsh"], - "ShellSession": [".sh-session"], - "Shen": [".shen"], - "Slash": [".sl"], - "Slim": [".slim"], - "Smali": [".smali"], - "Smalltalk": [".st"], - "Smarty": [".tpl"], - "Solidity": [".sol"], - "SourcePawn": [".sp", ".sma"], - "Squirrel": [".nut"], - "Stan": [".stan"], - "Standard ML": [".ML", ".fun", ".sig", ".sml"], - "Stata": [".do", ".ado", ".doh", ".ihlp", ".mata", ".matah", ".sthlp"], - "Stylus": [".styl"], - "SuperCollider": [".scd"], - "Swift": [".swift"], - "SystemVerilog": [".sv", ".svh", ".vh"], - "TOML": [".toml"], - "TXL": [".txl"], - "Tcl": [".tcl", ".adp", ".tm"], - "Tcsh": [".tcsh", ".csh"], - "TeX": [ - ".tex", - ".aux", - ".bbx", - ".bib", - ".cbx", - ".dtx", - ".ins", - ".lbx", - ".ltx", - ".mkii", - ".mkiv", - ".mkvi", - ".sty", - ".toc" - ], - "Tea": [".tea"], - "Text": [".txt", ".no"], - "Textile": [".textile"], - "Thrift": [".thrift"], - "Turing": [".tu"], - "Turtle": [".ttl"], - "Twig": [".twig"], - "TypeScript": [".ts", ".tsx"], - "Unified Parallel C": [".upc"], - "Unity3D Asset": [".anim", ".asset", ".mat", ".meta", ".prefab", ".unity"], - "Uno": [".uno"], - "UnrealScript": [".uc"], - "UrWeb": [".ur", ".urs"], - "VCL": [".vcl"], - "VHDL": [".vhdl", ".vhd", ".vhf", ".vhi", ".vho", ".vhs", ".vht", ".vhw"], - "Vala": [".vala", ".vapi"], - "Verilog": [".veo"], - "VimL": [".vim"], - "Visual Basic": [".vb", ".bas", ".frm", ".frx", ".vba", ".vbhtml", ".vbs"], - "Volt": [".volt"], - "Vue": [".vue"], - "Web Ontology Language": [".owl"], - "WebAssembly": [".wat"], - "WebIDL": [".webidl"], - "X10": [".x10"], - "XC": [".xc"], - "XML": [ - ".xml", - ".ant", - ".axml", - ".ccxml", - ".clixml", - ".cproject", - ".csl", - 
".csproj", - ".ct", - ".dita", - ".ditamap", - ".ditaval", - ".dll.config", - ".dotsettings", - ".filters", - ".fsproj", - ".fxml", - ".glade", - ".grxml", - ".iml", - ".ivy", - ".jelly", - ".jsproj", - ".kml", - ".launch", - ".mdpolicy", - ".mxml", - ".nproj", - ".nuspec", - ".odd", - ".osm", - ".plist", - ".props", - ".ps1xml", - ".psc1", - ".pt", - ".rdf", - ".rss", - ".scxml", - ".srdf", - ".storyboard", - ".stTheme", - ".sublime-snippet", - ".targets", - ".tmCommand", - ".tml", - ".tmLanguage", - ".tmPreferences", - ".tmSnippet", - ".tmTheme", - ".ui", - ".urdf", - ".ux", - ".vbproj", - ".vcxproj", - ".vssettings", - ".vxml", - ".wsdl", - ".wsf", - ".wxi", - ".wxl", - ".wxs", - ".x3d", - ".xacro", - ".xaml", - ".xib", - ".xlf", - ".xliff", - ".xmi", - ".xml.dist", - ".xproj", - ".xsd", - ".xul", - ".zcml" - ], - "XPages": [".xsp-config", ".xsp.metadata"], - "XProc": [".xpl", ".xproc"], - "XQuery": [".xquery", ".xq", ".xql", ".xqm", ".xqy"], - "XS": [".xs"], - "XSLT": [".xslt", ".xsl"], - "Xojo": [".xojo_code", ".xojo_menu", ".xojo_report", ".xojo_script", ".xojo_toolbar", ".xojo_window"], - "Xtend": [".xtend"], - "YAML": [".yml", ".reek", ".rviz", ".sublime-syntax", ".syntax", ".yaml", ".yaml-tmlanguage"], - "YANG": [".yang"], - "Yacc": [".y", ".yacc", ".yy"], - "Zephir": [".zep"], - "Zig": [".zig"], - "Zimpl": [".zimpl", ".zmpl", ".zpl"], - "desktop": [".desktop", ".desktop.in"], - "eC": [".ec", ".eh"], - "edn": [".edn"], - "fish": [".fish"], - "mupad": [".mu"], - "nesC": [".nc"], - "ooc": [".ooc"], - "reStructuredText": [".rst", ".rest", ".rest.txt", ".rst.txt"], - "wisp": [".wisp"], - "xBase": [".prg", ".prw"] -} diff --git a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py b/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py deleted file mode 100644 index e05cba502a..0000000000 --- a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py +++ /dev/null @@ -1,61 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import ast -import os - -from code2parquet_transform import ( - detect_programming_lang_cli_key, - detect_programming_lang_key, - domain_cli_key, - snapshot_cli_key, - supported_langs_file_cli_key, - supported_langs_file_key, -) -from code2parquet_transform_ray import CodeToParquetRayConfiguration -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher - - -class TestRayIngestToParquetTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
- """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - lang_supported_file = os.path.abspath( - os.path.join( - basedir, - "languages/lang_extensions.json", - ) - ) - config = { - "run_locally": True, - "data_files_to_use": ast.literal_eval("['.zip']"), - supported_langs_file_cli_key: lang_supported_file, - detect_programming_lang_cli_key: True, - } - fixtures = [ - ( - RayTransformLauncher(CodeToParquetRayConfiguration()), - config, - basedir + "/input", - basedir + "/expected", - # this is added as a fixture to remove these 2 columns from comparison - ["date_acquired", "document_id"], - ) - ] - return fixtures diff --git a/transforms/code/code2parquet/requirements.txt b/transforms/code/code2parquet/requirements.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet b/transforms/code/code2parquet/test-data/expected/application-java.parquet similarity index 100% rename from transforms/code/code2parquet/python/test-data/expected/application-java.parquet rename to transforms/code/code2parquet/test-data/expected/application-java.parquet diff --git a/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet b/transforms/code/code2parquet/test-data/expected/data-processing-lib.parquet similarity index 100% rename from transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet rename to transforms/code/code2parquet/test-data/expected/data-processing-lib.parquet diff --git a/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/code/code2parquet/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet similarity index 100% rename from transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet rename to transforms/code/code2parquet/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet diff --git a/transforms/code/code2parquet/python/test-data/expected/metadata.json b/transforms/code/code2parquet/test-data/expected/metadata.json similarity index 100% rename from transforms/code/code2parquet/python/test-data/expected/metadata.json rename to transforms/code/code2parquet/test-data/expected/metadata.json diff --git a/transforms/code/code2parquet/python/test-data/input/application-java.zip b/transforms/code/code2parquet/test-data/input/application-java.zip similarity index 100% rename from transforms/code/code2parquet/python/test-data/input/application-java.zip rename to transforms/code/code2parquet/test-data/input/application-java.zip diff --git a/transforms/code/code2parquet/python/test-data/input/data-processing-lib.zip b/transforms/code/code2parquet/test-data/input/data-processing-lib.zip similarity index 100% rename from transforms/code/code2parquet/python/test-data/input/data-processing-lib.zip rename to transforms/code/code2parquet/test-data/input/data-processing-lib.zip diff --git a/transforms/code/code2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip b/transforms/code/code2parquet/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip similarity index 100% rename from 
transforms/code/code2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip rename to transforms/code/code2parquet/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip diff --git a/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json b/transforms/code/code2parquet/test-data/languages/lang_extensions.json similarity index 100% rename from transforms/code/code2parquet/python/test-data/languages/lang_extensions.json rename to transforms/code/code2parquet/test-data/languages/lang_extensions.json diff --git a/transforms/code/code2parquet/python/test/test_code2parquet.py b/transforms/code/code2parquet/test/test_code2parquet.py similarity index 97% rename from transforms/code/code2parquet/python/test/test_code2parquet.py rename to transforms/code/code2parquet/test/test_code2parquet.py index 22524264bd..6255ab72a3 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet.py +++ b/transforms/code/code2parquet/test/test_code2parquet.py @@ -12,7 +12,11 @@ import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from data_processing.data_access import DataAccessFactory +from data_processing.test_support import get_files_in_folder +from data_processing.test_support.transform import AbstractBinaryTransformTest +from data_processing.utils import TransformUtils +from dpk_code2parquet.transform import ( # domain_key,; snapshot_key, CodeToParquetTransform, data_factory_key, detect_programming_lang_key, @@ -20,10 +24,6 @@ snapshot_key, supported_langs_file_key, ) -from data_processing.data_access import DataAccessFactory -from data_processing.test_support import get_files_in_folder -from data_processing.test_support.transform import AbstractBinaryTransformTest -from data_processing.utils import TransformUtils class TestIngestToParquetTransform(AbstractBinaryTransformTest): diff --git a/transforms/code/code2parquet/python/test/test_code2parquet_python.py b/transforms/code/code2parquet/test/test_code2parquet_python.py similarity index 93% rename from transforms/code/code2parquet/python/test/test_code2parquet_python.py rename to transforms/code/code2parquet/test/test_code2parquet_python.py index cee24e09f1..a909124132 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet_python.py +++ b/transforms/code/code2parquet/test/test_code2parquet_python.py @@ -13,17 +13,17 @@ import ast import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from dpk_code2parquet.transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, domain_cli_key, snapshot_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) +from dpk_code2parquet.transform_python import CodeToParquetPythonConfiguration class TestPythonIngestToParquetTransform(AbstractTransformLauncherTest): diff --git a/transforms/code/code2parquet/transform.config b/transforms/code/code2parquet/transform.config deleted file mode 100644 index 2049a2261d..0000000000 --- a/transforms/code/code2parquet/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be 
included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=code2parquet - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_RAY_VERSION=$(CODE2PARQUET_PYTHON_VERSION) -CODE2PARQUET_SPARK_VERSION=$(CODE2PARQUET_PYTHON_VERSION) - diff --git a/transforms/code/proglang_select/python/Dockerfile b/transforms/code/license_select/Dockerfile.python similarity index 61% rename from transforms/code/proglang_select/python/Dockerfile rename to transforms/code/license_select/Dockerfile.python index e96a50c985..9f38097b72 100644 --- a/transforms/code/proglang_select/python/Dockerfile +++ b/transforms/code/license_select/Dockerfile.python @@ -10,28 +10,18 @@ RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chown=dpk:root data-processing-dist data-processing-dist +COPY --chown=dpk:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root requirements.txt requirements.txt -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/proglang_select_transform_python.py . 
- -# copy some of the samples in -COPY ./src/proglang_select_local.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chown=dpk:users requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/Dockerfile.ray similarity index 53% rename from transforms/code/license_select/ray/Dockerfile rename to transforms/code/license_select/Dockerfile.ray index d7b3be5f80..b8e52425b0 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/Dockerfile.ray @@ -1,10 +1,9 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod g=u /home/ray +RUN chown ray:root /home/ray && chmod 775 /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -12,31 +11,23 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -COPY --chmod=775 --chown=ray:root README.md README.md -RUN pip install --no-cache-dir -e . -# copy source data -COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . -COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -# copy test -COPY --chmod=775 --chown=ray:root test/ test/ -COPY --chmod=775 --chown=ray:root test-data/ test-data/ +# Set environment +ENV PYTHONPATH /home/ray # Put these at the end since they seem to upset the docker cache. ARG BUILD_DATE ARG GIT_COMMIT LABEL build-date=$BUILD_DATE LABEL git-commit=$GIT_COMMIT - diff --git a/transforms/code/license_select/Makefile b/transforms/code/license_select/Makefile index 04b1cc4512..21c4727efe 100644 --- a/transforms/code/license_select/Makefile +++ b/transforms/code/license_select/Makefile @@ -1,71 +1,32 @@ REPOROOT=../../.. 
# Use make help, to see the available rules -include $(REPOROOT)/.make.defaults - -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse - -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi - +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ + + + +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" + + + +run-ray-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --run_locally True + diff --git a/transforms/code/license_select/README.md b/transforms/code/license_select/README.md deleted file mode 100644 index 1e01366289..0000000000 --- a/transforms/code/license_select/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# License Select - -The License Select transform checks if the `license` of input data is in approved/denied list. It is implemented as per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: - -* [python](python/README.md) - provides the base python-based transformation -implementation. -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. 
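The replacement Makefile above now runs the transform as a module (`python -m dpk_license_select.transform_python` and `python -m dpk_license_select.ray.transform`). A rough programmatic counterpart of the two new sample targets, based on the `LicenseSelect` helper classes added to `dpk_license_select/transform_python.py` and `dpk_license_select/ray/transform.py` later in this patch, might look like the sketch below; the folder values are just the sample paths from the Makefile, not required settings.

```python
# Programmatic counterparts of the new make targets (a sketch, not definitive usage).
from dpk_license_select.transform_python import LicenseSelect as LicenseSelectPython
from dpk_license_select.ray.transform import LicenseSelect as LicenseSelectRay

# run-cli-sample: pure-python runtime; input/output folders are folded into
# data_local_config, and lc_licenses_file falls back to the bundled sample file.
LicenseSelectPython(input_folder="test-data/input", output_folder="output").transform()

# run-ray-cli-sample: same parameters, plus run_locally to start a local Ray cluster.
LicenseSelectRay(
    input_folder="test-data/input", output_folder="output", run_locally=True
).transform()
```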
- diff --git a/transforms/code/license_select/python/src/license_select_local.py b/transforms/code/license_select/dpk_license_select/local.py similarity index 96% rename from transforms/code/license_select/python/src/license_select_local.py rename to transforms/code/license_select/dpk_license_select/local.py index a16cbb27a8..6de731b56c 100644 --- a/transforms/code/license_select/python/src/license_select_local.py +++ b/transforms/code/license_select/dpk_license_select/local.py @@ -14,7 +14,7 @@ import os from data_processing.data_access import DataAccessLocal -from license_select_transform import LicenseSelectTransform +from dpk_license_select.transform import LicenseSelectTransform input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) diff --git a/transforms/code/license_select/python/src/license_select_local_python.py b/transforms/code/license_select/dpk_license_select/local_python.py similarity index 96% rename from transforms/code/license_select/python/src/license_select_local_python.py rename to transforms/code/license_select/dpk_license_select/local_python.py index 2306e00ac0..b4a2754cce 100644 --- a/transforms/code/license_select/python/src/license_select_local_python.py +++ b/transforms/code/license_select/dpk_license_select/local_python.py @@ -16,7 +16,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from license_select_transform import LicenseSelectTransformConfiguration +from dpk_license_select.transform import LicenseSelectTransformConfiguration input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) diff --git a/transforms/code/license_select/dpk_license_select/ray/__init__.py b/transforms/code/license_select/dpk_license_select/ray/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/license_select/dpk_license_select/ray/transform.py b/transforms/code/license_select/dpk_license_select/ray/transform.py new file mode 100644 index 0000000000..abf2cb54f3 --- /dev/null +++ b/transforms/code/license_select/dpk_license_select/ray/transform.py @@ -0,0 +1,69 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +from data_processing.utils import ParamsUtils + +################################################################################ +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from dpk_license_select.transform import LicenseSelectTransformConfiguration + + +class LicenseSelectRayTransformConfiguration(RayTransformRuntimeConfiguration): + def __init__(self): + super().__init__(transform_config=LicenseSelectTransformConfiguration()) + + +class LicenseSelect: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + default_language_column = "license" + if "lc_license_column_name" not in self.params: + self.params["lc_license_column_name"] = default_language_column + if "lc_licenses_file" not in self.params: + approved_license_file = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../test-data/sample_approved_licenses.json") + ) + + self.params["lc_licenses_file"] = approved_license_file + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + +def main(): + launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) + launcher.launch() + + +if __name__ == "__main__": + main() diff --git a/transforms/code/license_select/python/src/license_select_transform.py b/transforms/code/license_select/dpk_license_select/transform.py similarity index 98% rename from transforms/code/license_select/python/src/license_select_transform.py rename to transforms/code/license_select/dpk_license_select/transform.py index a43d399a37..0052d6cf41 100644 --- a/transforms/code/license_select/python/src/license_select_transform.py +++ b/transforms/code/license_select/dpk_license_select/transform.py @@ -23,7 +23,10 @@ get_logger, str2bool, ) -from transformer import AllowLicenseStatusTransformer, DenyLicenseStatusTransformer +from dpk_license_select.transformer import ( + AllowLicenseStatusTransformer, + DenyLicenseStatusTransformer, +) logger = get_logger(__name__) @@ -48,6 +51,7 @@ LICENSE_COLUMN_DEFAULT = "license" LICENSES_KEY = "licenses" + def _get_supported_licenses(license_file: str, data_access: DataAccess) -> list[str]: logger.info(f"Getting supported licenses from file {license_file}") licenses_list = None @@ -119,6 +123,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab new_table = self.transformer.transform(table) return [new_table], {} + class LicenseSelectTransformConfiguration(TransformConfiguration): def __init__(self): super().__init__(name="license_select", transform_class=LicenseSelectTransform) @@ -159,13 +164,13 @@ def add_input_params(self, parser: ArgumentParser) -> None: def apply_input_params(self, args: Namespace) -> bool: if not self.daf.apply_input_params(args): return False - + captured = CLIArgumentProvider.capture_parameters(args, CLI_PREFIX, False) license_column_name = 
captured.get(LICENSE_COLUMN_NAME_KEY) allow_licenses = captured.get(ALLOW_NO_LICENSE_KEY) deny_licenses = captured.get(DENY_LICENSES_KEY, False) licenses_file = captured.get(LICENSES_FILE_KEY) - + # Read licenses from allow-list or deny-list data_access = self.daf.create_data_access() licenses = _get_supported_licenses(licenses_file, data_access) diff --git a/transforms/code/license_select/dpk_license_select/transform_python.py b/transforms/code/license_select/dpk_license_select/transform_python.py new file mode 100644 index 0000000000..5e9bff93d7 --- /dev/null +++ b/transforms/code/license_select/dpk_license_select/transform_python.py @@ -0,0 +1,65 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher + +################################################################################ +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import ParamsUtils +from dpk_license_select.transform import LicenseSelectTransformConfiguration + + +class LicenseSelectPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + def __init__(self): + super().__init__(transform_config=LicenseSelectTransformConfiguration()) + + +class LicenseSelect: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + default_language_column = "license" + if "lc_license_column_name" not in self.params: + self.params["lc_license_column_name"] = default_language_column + if "lc_licenses_file" not in self.params: + approved_license_file = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../test-data/sample_approved_licenses.json") + ) + + self.params["lc_licenses_file"] = approved_license_file + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(LicenseSelectPythonTransformConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(LicenseSelectPythonTransformConfiguration()) + launcher.launch() diff --git a/transforms/code/license_select/python/src/transformer.py b/transforms/code/license_select/dpk_license_select/transformer.py similarity index 100% rename from transforms/code/license_select/python/src/transformer.py rename to transforms/code/license_select/dpk_license_select/transformer.py diff --git a/transforms/code/license_select/kfp_ray/Makefile b/transforms/code/license_select/kfp_ray/Makefile index 28e244faa9..7244ce1427 100644 --- 
a/transforms/code/license_select/kfp_ray/Makefile +++ b/transforms/code/license_select/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -17,27 +22,6 @@ clean: @# Help: Clean up the virtual environment. rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,14 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=license_select_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index f29f1c839c..60f596d606 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -24,7 +24,7 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "license_select_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_license_select.ray.transform" task_image = "quay.io/dataprep1/data-prep-kit/license_select-ray:latest" @@ -94,7 +94,7 @@ def compute_exec_params_func( ) def license_select( ray_name: str = "license_select-kfp-ray", # name of Ray cluster - ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -114,7 +114,7 @@ def license_select( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "runtime_pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # license select parameters lc_license_column_name: str = "license", lc_licenses_file: str = "test/license_select/sample_approved_licenses.json", @@ -161,8 +161,10 @@ def license_select( # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": - print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - "same version of the same pipeline !!!") + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!" + ) run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER diff --git a/transforms/code/license_select/license_select-ray.ipynb b/transforms/code/license_select/license_select-ray.ipynb new file mode 100644 index 0000000000..e6a7f1be6f --- /dev/null +++ b/transforms/code/license_select/license_select-ray.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "fd7d20d2-1001-420a-a6b6-1efbca17ae8b", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_license_select.ray.transform import LicenseSelect" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5957a088-10a0-41fd-9b82-c329f088f2c8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:42:21 INFO - Running locally\n", + "15:42:21 INFO - data factory lc_ is using local configuration without input/output path\n", + "15:42:21 INFO - data factory lc_ max_files -1, n_sample -1\n", + "15:42:21 INFO - data factory lc_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:42:21 INFO - Getting supported licenses from file ./test-data/sample_approved_licenses.json\n", + "15:42:21 INFO - Read a list of 171 licenses.\n", + "15:42:21 INFO - data factory data_ is using local data access: input_folder - ./test-data/input output_folder - output\n", + "15:42:21 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:42:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:42:21 INFO - pipeline id pipeline_id\n", + "15:42:21 INFO - code location None\n", + "15:42:21 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "15:42:21 INFO - actor creation delay 0\n", + "15:42:21 INFO - job details {'job category': 'preprocessing', 'job name': 'license_select', 'job type': 'ray', 'job id': 'job_id'}\n", + "2025-02-13 15:42:21,811\tINFO worker.py:1568 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...\n", + "2025-02-13 15:42:21,815\tINFO worker.py:1744 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=65550)\u001b[0m 15:42:21 INFO - orchestrator started at 2025-02-13 15:42:21\n", + "\u001b[36m(orchestrate pid=65550)\u001b[0m 15:42:21 ERROR - No input files to process - exiting\n", + "15:42:31 INFO - Completed execution in 0.16988400220870972 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "LicenseSelect(\n", + " input_folder = \"./test-data/input\",\n", + " output_folder = \"output\",\n", + " lc_licenses_file = \"./test-data/sample_approved_licenses.json\",\n", + " run_locally = True\n", + " ).transform()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "96b4b004-cb01-46d1-bff9-b8b08de2e691", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dockerfile.python Untitled1.ipynb \u001b[34moutput\u001b[m\u001b[m \u001b[34mvenv\u001b[m\u001b[m\n", + "Dockerfile.ray \u001b[34mdpk_license_select\u001b[m\u001b[m requirements.txt\n", + "Makefile \u001b[34mkfp_ray\u001b[m\u001b[m \u001b[34mtest\u001b[m\u001b[m\n", + "Untitled.ipynb \u001b[34mout\u001b[m\u001b[m \u001b[34mtest-data\u001b[m\u001b[m\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed625663-fefc-4eae-8551-439fc6b9bda0", + "metadata": {}, + "outputs": [], + "source": [ + "!ls test" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/license_select/license_select.ipynb b/transforms/code/license_select/license_select.ipynb new file mode 100644 index 0000000000..53e71c891e --- /dev/null +++ b/transforms/code/license_select/license_select.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "abf3d328-83e1-4627-9ec9-d2d16ec0c08a", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_license_select.transform_python import LicenseSelect" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4783b780-ab14-4856-940d-5a79d657bd1e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:38:17 INFO - data factory lc_ is using local configuration without input/output path\n", + "15:38:17 INFO - data factory lc_ max_files -1, n_sample -1\n", + "15:38:17 INFO - data factory lc_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:38:17 INFO - Getting supported licenses from file /Users/shivdeep/workspace/projects/current/dpk-newapi/transforms/code/license_select/test-data/sample_approved_licenses.json\n", + "15:38:17 INFO - Read a list of 171 licenses.\n", + "15:38:17 INFO - pipeline id pipeline_id\n", + "15:38:17 INFO - job details {'job category': 'preprocessing', 'job name': 'license_select', 'job type': 'pure python', 'job id': 'job_id'}\n", + "15:38:17 INFO - code location None\n", + "15:38:17 INFO - data factory data_ is using local data access: input_folder - 
./test-data/input output_folder - output\n", + "15:38:17 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:38:17 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:38:17 INFO - orchestrator license_select started at 2025-02-13 15:38:17\n", + "15:38:17 INFO - Number of files is 2, source profile {'max_file_size': 0.0034952163696289062, 'min_file_size': 0.0031719207763671875, 'total_file_size': 0.006667137145996094}\n", + "15:38:17 INFO - Completed 1 files (50.0%) in 0.00031479994455973305 min\n", + "15:38:17 INFO - Completed 2 files (100.0%) in 0.0003341992696126302 min\n", + "15:38:17 INFO - done flushing in 5.7220458984375e-06 sec\n", + "15:38:17 INFO - Completed execution in 0.0033513466517130536 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "LicenseSelect(\n", + " input_folder = \"./test-data/input\",\n", + " output_folder = \"output\",\n", + " ).transform()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c12bb468-e860-4a6a-b40d-9d9e0a03efdb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/license_select/python/.dockerignore b/transforms/code/license_select/python/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/license_select/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/license_select/python/Makefile b/transforms/code/license_select/python/Makefile deleted file mode 100644 index 2152710687..0000000000 --- a/transforms/code/license_select/python/Makefile +++ /dev/null @@ -1,73 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -# Use default rule inherited from makefile.common -clean:: .transforms.clean - -# Use default rule inherited from makefile.common -test:: .transforms.python-test - -# Use default rule inherited from makefile.common -image:: .transforms.python-image - -# Use default rule inherited from makefile.common -venv:: .transforms.python-venv - -test-src:: .transforms.test-src - -test-image:: .transforms.python-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ - RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --lc_license_column_name license \ - --lc_licenses_file ../test-data/sample_approved_licenses.json" \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-sample - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=${LICENSE_SELECT_PYTHON_VERSION} TOML_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -setup:: .transforms.setup -run-local-sample: .transforms.run-local-sample -run-local-python-sample: .transforms.run-local-python-sample - -load-image:: .transforms.load-image - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/license_select/python/README.md b/transforms/code/license_select/python/README.md deleted file mode 100644 index c35baace75..0000000000 --- a/transforms/code/license_select/python/README.md +++ /dev/null @@ -1,99 +0,0 @@ -# License Select - -Please see the set of -[transform project conventions](../../../README.md#transform-project-conventions) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary - -The License Select transform checks if the `license` of input data is in approved/denied list. It is implemented as per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -This filter scans the license column of an input dataset and appends the `license_status` column to the dataset. - -The type of the license column can be either string or list of strings. For strings, the license name is checked against the list of approved licenses. For list of strings, each license name in the list is checked against the list of approved licenses, and all must be approved. - -If the license is approved, the license_status column contains True; otherwise False. - -## Configuration and command line Options - -The set of dictionary keys holding license_select configuration for values are as follows: - -The transform can be configured with the following key/value pairs from the configuration dictionary. - -```python -# Sample params dictionary passed to the transform - -{ -"license_select_params" : { - "license_column_name": "license", - "deny_licenses": False, - "licenses": [ 'MIT', 'Apache'], - "allow_no_license": False, - } -} -``` - -**license_column_name** - The name of the column with licenses. 
- -**deny_licenses** - A boolean value, True for denied licesnes, False for approved licenses. - -**licenses** - A list of licenses used as approve/deny list. - -**allow_no_license** - A boolean value, used to retain the values with no license in the column `license_column_name` - - -## Running - -### Launcher Command Line Options - -The following command line arguments are available in addition to -the options provided by the [launcher](../../../../data-processing-lib/doc/launcher-options.md). - - `--lc_license_column_name` - set the name of the column holds license to process - - `--lc_allow_no_license` - allow entries with no associated license (default: false) - - `--lc_licenses_file` - S3 or local path to allowed/denied licenses JSON file - - `--lc_deny_licenses` - allow all licences except those in licenses_file (default: false) - -- The optional `lc_license_column_name` parameter is used to specify the column name in the input dataset that contains the license information. The default column name is license. - -- The optional `lc_allow_no_license` option allows any records without a license to be accepted by the filter. If this option is not set, records without a license are rejected. - -- The required `lc_licenses_file` options allows a list of licenses to be specified. An S3 or local file path should be supplied (including bucket name, for example: bucket-name/path/to/licenses.json) with the file contents being a JSON list of strings. For example: - - >[ - 'Apache-2.0', - 'MIT' - ] - -- The optional `lc_deny_licenses` flag is used when `lc_licenses_file` specifies the licenses that will be rejected, with all other licenses being accepted. These parameters do not affect handling of records with no license information, which is dictated by the allow_no_license option. - - -### Running the samples - -To run the samples, use the following make targets - -`run-cli-sample` - -`run-local-python-sample` - -These targets will activate the virtual environment and set up any configuration needed. Use the -n option of make to see the detail of what is done to run the sample. - -For example, -``` -make run-cli-sample - -``` -... -Then - -ls output -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
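
For reference, the `lc_*` options documented in the README removed above map onto keyword arguments of the new `LicenseSelect` wrapper this change adds in `dpk_license_select/transform_python.py`. A minimal sketch of the pure-python invocation, with illustrative folder paths:

```python
# Minimal sketch of the new python-runtime API (dpk_license_select.transform_python).
# Keyword names follow the lc_* CLI options documented above; paths are illustrative.
from dpk_license_select.transform_python import LicenseSelect

return_code = LicenseSelect(
    input_folder="test-data/input",
    output_folder="output",
    lc_license_column_name="license",
    lc_licenses_file="test-data/sample_approved_licenses.json",
).transform()
print(return_code)  # 0 indicates a successful run
```
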
diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml deleted file mode 100644 index 4fa39803c2..0000000000 --- a/transforms/code/license_select/python/pyproject.toml +++ /dev/null @@ -1,47 +0,0 @@ -[project] -name = "dpk_license_select_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "License Select Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, - { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt deleted file mode 100644 index 013ce90111..0000000000 --- a/transforms/code/license_select/python/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/code/license_select/python/src/license_select_transform_python.py b/transforms/code/license_select/python/src/license_select_transform_python.py deleted file mode 100644 index 3ceaf7f325..0000000000 --- a/transforms/code/license_select/python/src/license_select_transform_python.py +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from data_processing.runtime.pure_python import PythonTransformLauncher - -################################################################################ -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) -from license_select_transform import LicenseSelectTransformConfiguration - - -class LicenseSelectPythonTransformConfiguration(PythonTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=LicenseSelectTransformConfiguration()) - - -if __name__ == "__main__": - launcher = PythonTransformLauncher(LicenseSelectPythonTransformConfiguration()) - launcher.launch() diff --git a/transforms/code/license_select/ray/.dockerignore b/transforms/code/license_select/ray/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/license_select/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/license_select/ray/Makefile b/transforms/code/license_select/ray/Makefile deleted file mode 100644 index 687302af4a..0000000000 --- a/transforms/code/license_select/ray/Makefile +++ /dev/null @@ -1,74 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=$(RAY_BASE_IMAGE) - -# Use default rule inherited from makefile.common -clean:: .transforms.clean - -# Use default rule inherited from makefile.common -test:: .transforms.ray-test - -# Use default rule inherited from makefile.common -image:: .transforms.ray-image - -# Use default rule inherited from makefile.common -venv:: .transforms.ray-venv - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=${LICENSE_SELECT_PYTHON_VERSION} TOML_VERSION=$(LICENSE_SELECT_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-cli-ray-sample:. 
- $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../../python/test-data/input', 'output_folder' : '../output'}\" \ - --lc_license_column_name license \ - --lc_licenses_file ../test-data/sample_approved_licenses.json" \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -load-image:: .transforms.load-image diff --git a/transforms/code/license_select/ray/README.md b/transforms/code/license_select/ray/README.md deleted file mode 100644 index b41d5b0800..0000000000 --- a/transforms/code/license_select/ray/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# License Select - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary - -This project wraps the [license select transform](../python/README.md) with a Ray runtime. - -## Running - -### Launcher Command Line Options - -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[launcher options](../../../../data-processing-lib/doc/launcher-options.md) are available. - -### Running the samples - -To run the samples, use the following `make` targets - -* `run-cli-ray-sample` -* `run-local-ray-sample` -* `run-s3-ray-sample` - * Requires prior invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, - -```shell -make run-cli-ray-sample -... -``` - -Then - -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
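
The ray sample targets removed above are superseded by the `dpk_license_select.ray` module; the notebook added later in this change drives it the same way. A minimal sketch, assuming a local Ray run and the relocated test data (paths are illustrative):

```python
# Minimal sketch of the Ray-runtime wrapper (dpk_license_select.ray.transform).
# run_locally=True requests a local Ray cluster for the run; paths are illustrative.
from dpk_license_select.ray.transform import LicenseSelect

LicenseSelect(
    input_folder="test-data/input",
    output_folder="output",
    lc_licenses_file="test-data/sample_approved_licenses.json",
    run_locally=True,
).transform()
```
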
diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml deleted file mode 100644 index 0617ab2ae7..0000000000 --- a/transforms/code/license_select/ray/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_license_select_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "License Select Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, - { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, -] -dependencies = [ - "dpk-license-select-transform-python==0.2.4.dev0", - "data-prep-toolkit[ray]>=0.2.4.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/license_select/ray/src/license_select_local_ray.py b/transforms/code/license_select/ray/src/license_select_local_ray.py deleted file mode 100644 index 10fdced1de..0000000000 --- a/transforms/code/license_select/ray/src/license_select_local_ray.py +++ /dev/null @@ -1,61 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os -import sys -from pathlib import Path - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from license_select_transform_ray import LicenseSelectRayTransformConfiguration - - -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) -approved_licenses_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../test-data/sample_approved_licenses.json") -) - - -local_conf = { - "input_folder": input_folder, - "output_folder": output_folder, -} - -# create launcher -launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) - - -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. 
Only required parameters are specified - "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # orchestrator - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 3, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # license select configuration - "lc_license_column_name": "license", - "lc_licenses_file": approved_licenses_file, -} - -if __name__ == "__main__": - Path(output_folder).mkdir(parents=True, exist_ok=True) - sys.argv = ParamsUtils.dict_to_req(d=params) - # launch - launcher.launch() diff --git a/transforms/code/license_select/ray/src/license_select_s3_ray.py b/transforms/code/license_select/ray/src/license_select_s3_ray.py deleted file mode 100644 index 6d26df93cc..0000000000 --- a/transforms/code/license_select/ray/src/license_select_s3_ray.py +++ /dev/null @@ -1,56 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from license_select_transform_ray import LicenseSelectRayTransformConfiguration - - -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} -s3_conf = { - "input_folder": "test-data/input", - "output_folder": "test-data/output", -} - -# create launcher -launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) - - -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - # orchestrator - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 3, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - "lc_license_column_name": "license", - "lc_licenses_file": "test-data/sample_approved_licenses.json", -} - -if __name__ == "__main__": - sys.argv = ParamsUtils.dict_to_req(d=params) - # launch - launcher.launch() diff --git a/transforms/code/license_select/ray/src/license_select_transform_ray.py b/transforms/code/license_select/ray/src/license_select_transform_ray.py deleted file mode 100644 index de384997a7..0000000000 --- a/transforms/code/license_select/ray/src/license_select_transform_ray.py +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -################################################################################ -from data_processing_ray.runtime.ray import RayTransformLauncher -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, -) -from license_select_transform import LicenseSelectTransformConfiguration - - -class LicenseSelectRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=LicenseSelectTransformConfiguration()) - - -def main(): - launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) - launcher.launch() - - -if __name__ == "__main__": - main() diff --git a/transforms/code/license_select/ray/test-data/expected/metadata.json b/transforms/code/license_select/ray/test-data/expected/metadata.json deleted file mode 100644 index 105043eabf..0000000000 --- a/transforms/code/license_select/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,214 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "license_select", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-06-27 15:53:31", - "end_time": "2024-06-27 15:53:31", - "status": "success" - }, - "code": null, - "job_input_params": { - "license_select_params": { - "license_column_name": "license", - "allow_no_license": false, - "licenses": [ - "MIT", - "Apache-2.0", - "BSD-3-Clause", - "Unlicense", - "CC0-1.0", - "BSD-2-Clause", - "CC-BY-4.0", - "CC-BY-3.0", - "0BSD", - "WTFPL", - "MIT-0", - "ISC", - "ADSL", - "BSL-1.0", - "Zlib", - "FTL", - "MS-PL", - "BSD-2-Clause-FreeBSD", - "FSFAP", - "BSD-Source-Code", - "Apache-1.1", - "BSD-4-Clause", - "Ruby", - "MulanPSL-1.0", - "BSD-1-Clause", - "X11", - "Condor-1.1", - "PostgreSQL", - "CECILL-B", - "Intel", - "Vim", - "Naumen", - "OML", - "BSD-3-Clause-Clear", - "AML", - "PHP-3.01", - "OpenSSL", - "PSF-2.0", - "Xnet", - "Linux-OpenIB", - "BSD-3-Clause-LBNL", - "UPL-1.0", - "BlueOak-1.0.0", - "Info-ZIP", - "BSD-4-Clause-UC", - "bzip2-1.0.6", - "W3C", - "W3C-20150513", - "DOC", - "ICU", - "CC-BY-2.0", - "curl", - "MTLL", - "OLDAP-2.2.1", - "ECL-2.0", - "Adobe-Glyph", - "BSD-2-Clause-Patent", - "IJG", - "PHP-3.0", - "ZPL-2.1", - "MIT-advertising", - "NCSA", - "Fair", - "BSD-3-Clause-Attribution", - "OLDAP-2.3", - "NLPL", - "BSD-3-Clause-Open-MPI", - "Python-2.0", - "NASA-1.3", - "TCL", - "BSD-3-Clause-No-Nuclear-Warranty", - "ImageMagick", - "Net-SNMP", - "OLDAP-2.5", - "MIT-feh", - "OLDAP-2.4", - "MITNFA", - "libpng-2.0", - "EFL-2.0", - "OLDAP-2.7", - "IBM-pibs", - "libtiff", - "OLDAP-2.8", - "Adobe-2006", - "BSD-2-Clause-NetBSD", - "zlib-acknowledgement", - "OLDAP-2.6", - "BSD-3-Clause-No-Nuclear-License-2014", - "OLDAP-1.4", - "Libpng", - "MIT-CMU", - "JasPer-2.0", - "Zend-2.0", - "TCP-wrappers", - "XFree86-1.1", - "FSFUL", - "OLDAP-1.3", - "SGI-B-2.0", - "NetCDF", - "Zed", - "ZPL-2.0", - "Apache-1.0", - "CC-BY-1.0", - "OLDAP-2.1", - "OLDAP-1.2", - "OLDAP-2.0", - "NTP", - "AMPAS", - "Barr", - "mpich2", - "ANTLR-PD", - "Xerox", - "Spencer-94", - "AMDPLPA", - "BSD-3-Clause-No-Nuclear-License", - "HPND", - 
"ECL-1.0", - "MirOS", - "Qhull", - "ZPL-1.1", - "TU-Berlin-2.0", - "Spencer-86", - "SMLNJ", - "xinetd", - "OLDAP-2.2.2", - "MIT-enna", - "Font-exception-2.0", - "FSFULLR", - "TU-Berlin-1.0", - "xpp", - "NRL", - "W3C-19980720", - "EFL-1.0", - "eGenix", - "Unicode-DFS-2016", - "SWL", - "Spencer-99", - "Plexus", - "VSL-1.0", - "Leptonica", - "Unicode-DFS-2015", - "Mup", - "Giftware", - "OLDAP-2.2", - "APAFML", - "NBPL-1.0", - "OLDAP-1.1", - "Entessa", - "Multics", - "Newsletr", - "psutils", - "bzip2-1.0.5", - "Afmparse", - "diffmark", - "BSD-2-Clause-Views", - "DSDP", - "MIT-Modern-Variant", - "ANTLR-PD-fallback", - "Bahyph", - "BSD-3-Clause-Modification", - "BSD-4-Clause-Shortened", - "HTMLTIDY", - "MIT-open-group", - "MulanPSL-2.0", - "OLDAP-2.0.1", - "Saxpath", - "Borceux", - "Crossword", - "CrystalStacker", - "Rdisc", - "Wsuipa" - ], - "deny": false - }, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".parquet"] - }, - "job_output_stats": { - "source_files": 2, - "source_size": 6991, - "result_files": 2, - "result_size": 7511, - "processing_time": 0.032202959060668945, - "source_doc_count": 4, - "result_doc_count": 4 - }, - "source": { - "name": "../test-data/input", - "type": "path" - }, - "target": { - "name": "../output", - "type": "path" - } -} diff --git a/transforms/code/license_select/ray/test-data/expected/sample_1.parquet b/transforms/code/license_select/ray/test-data/expected/sample_1.parquet deleted file mode 100644 index 6ef75f6fb1..0000000000 Binary files a/transforms/code/license_select/ray/test-data/expected/sample_1.parquet and /dev/null differ diff --git a/transforms/code/license_select/ray/test-data/expected/sample_2.parquet b/transforms/code/license_select/ray/test-data/expected/sample_2.parquet deleted file mode 100644 index 4cf36d37d0..0000000000 Binary files a/transforms/code/license_select/ray/test-data/expected/sample_2.parquet and /dev/null differ diff --git a/transforms/code/license_select/ray/test-data/input/sample_1.parquet b/transforms/code/license_select/ray/test-data/input/sample_1.parquet deleted file mode 100644 index 51fd1b49b0..0000000000 Binary files a/transforms/code/license_select/ray/test-data/input/sample_1.parquet and /dev/null differ diff --git a/transforms/code/license_select/ray/test-data/input/sample_2.parquet b/transforms/code/license_select/ray/test-data/input/sample_2.parquet deleted file mode 100644 index 33ee7a6323..0000000000 Binary files a/transforms/code/license_select/ray/test-data/input/sample_2.parquet and /dev/null differ diff --git a/transforms/code/license_select/ray/test-data/sample_approved_licenses.json b/transforms/code/license_select/ray/test-data/sample_approved_licenses.json deleted file mode 100644 index 6b4a376987..0000000000 --- a/transforms/code/license_select/ray/test-data/sample_approved_licenses.json +++ /dev/null @@ -1,173 +0,0 @@ -[ - "MIT", - "Apache-2.0", - "BSD-3-Clause", - "Unlicense", - "CC0-1.0", - "BSD-2-Clause", - "CC-BY-4.0", - "CC-BY-3.0", - "0BSD", - "WTFPL", - "MIT-0", - "ISC", - "ADSL", - "BSL-1.0", - "Zlib", - "FTL", - "MS-PL", - "BSD-2-Clause-FreeBSD", - "FSFAP", - "BSD-Source-Code", - "Apache-1.1", - "BSD-4-Clause", - "Ruby", - "MulanPSL-1.0", - "BSD-1-Clause", - "X11", - "Condor-1.1", - "PostgreSQL", - "CECILL-B", - "Intel", - "Vim", - "Naumen", - "OML", - "BSD-3-Clause-Clear", - "AML", - "PHP-3.01", - "OpenSSL", - "PSF-2.0", - "Xnet", - "Linux-OpenIB", - "BSD-3-Clause-LBNL", - "UPL-1.0", - "BlueOak-1.0.0", - "Info-ZIP", - "BSD-4-Clause-UC", - "bzip2-1.0.6", 
- "W3C", - "W3C-20150513", - "DOC", - "ICU", - "CC-BY-2.0", - "curl", - "MTLL", - "OLDAP-2.2.1", - "ECL-2.0", - "Adobe-Glyph", - "BSD-2-Clause-Patent", - "IJG", - "PHP-3.0", - "ZPL-2.1", - "MIT-advertising", - "NCSA", - "Fair", - "BSD-3-Clause-Attribution", - "OLDAP-2.3", - "NLPL", - "BSD-3-Clause-Open-MPI", - "Python-2.0", - "NASA-1.3", - "TCL", - "BSD-3-Clause-No-Nuclear-Warranty", - "ImageMagick", - "Net-SNMP", - "OLDAP-2.5", - "MIT-feh", - "OLDAP-2.4", - "MITNFA", - "libpng-2.0", - "EFL-2.0", - "OLDAP-2.7", - "IBM-pibs", - "libtiff", - "OLDAP-2.8", - "Adobe-2006", - "BSD-2-Clause-NetBSD", - "zlib-acknowledgement", - "OLDAP-2.6", - "BSD-3-Clause-No-Nuclear-License-2014", - "OLDAP-1.4", - "Libpng", - "MIT-CMU", - "JasPer-2.0", - "Zend-2.0", - "TCP-wrappers", - "XFree86-1.1", - "FSFUL", - "OLDAP-1.3", - "SGI-B-2.0", - "NetCDF", - "Zed", - "ZPL-2.0", - "Apache-1.0", - "CC-BY-1.0", - "OLDAP-2.1", - "OLDAP-1.2", - "OLDAP-2.0", - "NTP", - "AMPAS", - "Barr", - "mpich2", - "ANTLR-PD", - "Xerox", - "Spencer-94", - "AMDPLPA", - "BSD-3-Clause-No-Nuclear-License", - "HPND", - "ECL-1.0", - "MirOS", - "Qhull", - "ZPL-1.1", - "TU-Berlin-2.0", - "Spencer-86", - "SMLNJ", - "xinetd", - "OLDAP-2.2.2", - "MIT-enna", - "Font-exception-2.0", - "FSFULLR", - "TU-Berlin-1.0", - "xpp", - "NRL", - "W3C-19980720", - "EFL-1.0", - "eGenix", - "Unicode-DFS-2016", - "SWL", - "Spencer-99", - "Plexus", - "VSL-1.0", - "Leptonica", - "Unicode-DFS-2015", - "Mup", - "Giftware", - "OLDAP-2.2", - "APAFML", - "NBPL-1.0", - "OLDAP-1.1", - "Entessa", - "Multics", - "Newsletr", - "psutils", - "bzip2-1.0.5", - "Afmparse", - "diffmark", - "BSD-2-Clause-Views", - "DSDP", - "MIT-Modern-Variant", - "ANTLR-PD-fallback", - "Bahyph", - "BSD-3-Clause-Modification", - "BSD-4-Clause-Shortened", - "HTMLTIDY", - "MIT-open-group", - "MulanPSL-2.0", - "OLDAP-2.0.1", - "Saxpath", - "Borceux", - "Crossword", - "CrystalStacker", - "Rdisc", - "Wsuipa" -] diff --git a/transforms/code/license_select/ray/test/test_license_select_ray.py b/transforms/code/license_select/ray/test/test_license_select_ray.py deleted file mode 100644 index 2d3190e698..0000000000 --- a/transforms/code/license_select/ray/test/test_license_select_ray.py +++ /dev/null @@ -1,40 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from license_select_transform import LICENSE_COLUMN_NAME_CLI_KEY, LICENSES_FILE_CLI_KEY -from license_select_transform_ray import LicenseSelectRayTransformConfiguration - - -class TestPythonLicenseSelect(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
- """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = "../test-data" - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) - fixtures = [] - launcher = RayTransformLauncher(LicenseSelectRayTransformConfiguration()) - config = { - "run_locally": True, - LICENSE_COLUMN_NAME_CLI_KEY: "license", - LICENSES_FILE_CLI_KEY: os.path.join(basedir, "sample_approved_licenses.json"), - } - fixtures.append((launcher, config, basedir + "/input", basedir + "/expected")) - return fixtures diff --git a/transforms/code/license_select/requirements.txt b/transforms/code/license_select/requirements.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/license_select/python/test-data/expected/metadata.json b/transforms/code/license_select/test-data/expected/metadata.json similarity index 100% rename from transforms/code/license_select/python/test-data/expected/metadata.json rename to transforms/code/license_select/test-data/expected/metadata.json diff --git a/transforms/code/license_select/python/test-data/expected/sample_1.parquet b/transforms/code/license_select/test-data/expected/sample_1.parquet similarity index 100% rename from transforms/code/license_select/python/test-data/expected/sample_1.parquet rename to transforms/code/license_select/test-data/expected/sample_1.parquet diff --git a/transforms/code/license_select/python/test-data/expected/sample_2.parquet b/transforms/code/license_select/test-data/expected/sample_2.parquet similarity index 100% rename from transforms/code/license_select/python/test-data/expected/sample_2.parquet rename to transforms/code/license_select/test-data/expected/sample_2.parquet diff --git a/transforms/code/license_select/python/test-data/input/sample_1.parquet b/transforms/code/license_select/test-data/input/sample_1.parquet similarity index 100% rename from transforms/code/license_select/python/test-data/input/sample_1.parquet rename to transforms/code/license_select/test-data/input/sample_1.parquet diff --git a/transforms/code/license_select/python/test-data/input/sample_2.parquet b/transforms/code/license_select/test-data/input/sample_2.parquet similarity index 100% rename from transforms/code/license_select/python/test-data/input/sample_2.parquet rename to transforms/code/license_select/test-data/input/sample_2.parquet diff --git a/transforms/code/license_select/python/test-data/sample_approved_licenses.json b/transforms/code/license_select/test-data/sample_approved_licenses.json similarity index 100% rename from transforms/code/license_select/python/test-data/sample_approved_licenses.json rename to transforms/code/license_select/test-data/sample_approved_licenses.json diff --git a/transforms/code/license_select/python/test/test_license_select.py b/transforms/code/license_select/test/test_license_select.py similarity index 98% rename from transforms/code/license_select/python/test/test_license_select.py rename to transforms/code/license_select/test/test_license_select.py index fc0b6a9ba3..74c7509e53 100644 --- a/transforms/code/license_select/python/test/test_license_select.py +++ b/transforms/code/license_select/test/test_license_select.py @@ -15,7 +15,7 @@ import pyarrow as pa from data_processing.test_support.transform import AbstractTableTransformTest from data_processing.transform import get_transform_config -from license_select_transform import ( +from dpk_license_select.transform import ( LICENSE_COLUMN_NAME_CLI_KEY, LICENSE_SELECT_PARAMS, LICENSES_FILE_CLI_KEY, diff --git 
a/transforms/code/license_select/python/test/test_license_select_python.py b/transforms/code/license_select/test/test_license_select_python.py similarity index 89% rename from transforms/code/license_select/python/test/test_license_select_python.py rename to transforms/code/license_select/test/test_license_select_python.py index 329ba2862c..1a00f6d016 100644 --- a/transforms/code/license_select/python/test/test_license_select_python.py +++ b/transforms/code/license_select/test/test_license_select_python.py @@ -16,8 +16,13 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from license_select_transform import LICENSE_COLUMN_NAME_CLI_KEY, LICENSES_FILE_CLI_KEY -from license_select_transform_python import LicenseSelectPythonTransformConfiguration +from dpk_license_select.transform import ( + LICENSE_COLUMN_NAME_CLI_KEY, + LICENSES_FILE_CLI_KEY, +) +from dpk_license_select.transform_python import ( + LicenseSelectPythonTransformConfiguration, +) class TestPythonLicenseSelect(AbstractTransformLauncherTest): diff --git a/transforms/code/license_select/transform.config b/transforms/code/license_select/transform.config deleted file mode 100644 index bba10d3e5f..0000000000 --- a/transforms/code/license_select/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=license_select - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION) -LICENSE_SELECT_RAY_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) -LICENSE_SELECT_SPARK_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) - diff --git a/transforms/code/license_select/python/Dockerfile b/transforms/code/proglang_select/Dockerfile.python similarity index 59% rename from transforms/code/license_select/python/Dockerfile rename to transforms/code/proglang_select/Dockerfile.python index 8e9071ab2e..9f38097b72 100644 --- a/transforms/code/license_select/python/Dockerfile +++ b/transforms/code/proglang_select/Dockerfile.python @@ -1,6 +1,7 @@ FROM docker.io/python:3.10.14-slim-bullseye -RUN pip install --upgrade --no-cache-dir pip +RUN pip install --upgrade --no-cache-dir pip + # install pytest RUN pip install --no-cache-dir pytest @@ -9,26 +10,21 @@ RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chown=dpk:root data-processing-dist data-processing-dist +COPY --chown=dpk:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root requirements.txt requirements.txt -RUN pip install --no-cache-dir -e . - -# copy source data -COPY ./src/license_select_transform_python.py . -COPY ./src/license_select_local.py local/ +COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chown=dpk:users requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +# Set environment +ENV PYTHONPATH /home/dpk # Put these at the end since they seem to upset the docker cache. ARG BUILD_DATE diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/Dockerfile.ray similarity index 51% rename from transforms/code/proglang_select/ray/Dockerfile rename to transforms/code/proglang_select/Dockerfile.ray index 7f457ed4e6..b8e52425b0 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/Dockerfile.ray @@ -1,10 +1,9 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod g=u /home/ray +RUN chown ray:root /home/ray && chmod 775 /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip @@ -12,31 +11,17 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest ARG DPK_WHEEL_FILE_NAME +ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt - -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY --chmod=775 --chown=ray:root ./src/proglang_select_transform_ray.py . - -# copy some of the samples in -COPY --chmod=775 --chown=ray:root ./src/proglang_select_local_ray.py local/ -# copy test -COPY --chmod=775 --chown=ray:root test/ test/ -COPY --chmod=775 --chown=ray:root test-data/ test-data/ +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index 9e222ee795..21c4727efe 100644 --- a/transforms/code/proglang_select/Makefile +++ b/transforms/code/proglang_select/Makefile @@ -1,79 +1,32 @@ REPOROOT=../../.. 
# Use make help, to see the available rules -include $(REPOROOT)/.make.defaults - -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse - -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi - +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ + + + +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" + + + +run-ray-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --run_locally True + diff --git a/transforms/code/proglang_select/README.md b/transforms/code/proglang_select/README.md index 92d8f6164d..e69de29bb2 100644 --- a/transforms/code/proglang_select/README.md +++ b/transforms/code/proglang_select/README.md @@ -1,14 +0,0 @@ -# Programming Language Selection Transform -The Programming Language Selection Transform -annotates input parquet files to add a True/False column indicating if the row's language matches -one of those specified in the transform configuration. -Per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: - -* [python](python/README.md) - provides the base python-based transformation -implementation. 
-* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp_ray](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. diff --git a/transforms/code/proglang_select/dpk_proglang_select/__init__.py b/transforms/code/proglang_select/dpk_proglang_select/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/proglang_select/python/src/proglang_select_transform_python.py b/transforms/code/proglang_select/dpk_proglang_select/local.py similarity index 93% rename from transforms/code/proglang_select/python/src/proglang_select_transform_python.py rename to transforms/code/proglang_select/dpk_proglang_select/local.py index ec1d3fa68d..55ce0f9bcb 100644 --- a/transforms/code/proglang_select/python/src/proglang_select_transform_python.py +++ b/transforms/code/proglang_select/dpk_proglang_select/local.py @@ -16,7 +16,7 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from proglang_select_transform import ProgLangSelectTransformConfiguration +from dpk_proglang_select.transform import ProgLangSelectTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/code/proglang_select/python/src/proglang_select_local_python.py b/transforms/code/proglang_select/dpk_proglang_select/local_python.py similarity index 98% rename from transforms/code/proglang_select/python/src/proglang_select_local_python.py rename to transforms/code/proglang_select/dpk_proglang_select/local_python.py index a7119a242e..c362e3acc3 100644 --- a/transforms/code/proglang_select/python/src/proglang_select_local_python.py +++ b/transforms/code/proglang_select/dpk_proglang_select/local_python.py @@ -58,4 +58,4 @@ # create launcher launcher = PythonTransformLauncher(ProgLangSelectPythonConfiguration()) # launch - launcher.launch() + launcher.launch() #!/usr/bin/env python diff --git a/transforms/code/proglang_select/dpk_proglang_select/ray/__init__.py b/transforms/code/proglang_select/dpk_proglang_select/ray/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py b/transforms/code/proglang_select/dpk_proglang_select/ray/transform.py similarity index 65% rename from transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py rename to transforms/code/proglang_select/dpk_proglang_select/ray/transform.py index 1395658802..f856415f4d 100644 --- a/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py +++ b/transforms/code/proglang_select/dpk_proglang_select/ray/transform.py @@ -10,26 +10,32 @@ # limitations under the License. 
################################################################################ +import os +import sys from typing import Any from data_processing.data_access import DataAccessFactoryBase -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformLauncher, -) +from data_processing.utils import ParamsUtils, get_logger +from data_processing_ray.runtime.ray import RayTransformLauncher from data_processing_ray.runtime.ray.runtime_configuration import ( + DefaultRayTransformRuntime, RayTransformRuntimeConfiguration, ) -from proglang_select_transform import ( +from dpk_proglang_select.transform import ( ProgLangSelectTransformConfiguration, _get_supported_languages, lang_allowed_langs_file_key, lang_allowed_languages, lang_data_factory_key, + lang_lang_column_key, + lang_output_column_key, ) from ray.actor import ActorHandle +logger = get_logger(__name__) + + class ProgLangSelectRuntime(DefaultRayTransformRuntime): """ Language selector runtime support @@ -83,6 +89,44 @@ def __init__(self): super().__init__(transform_config=ProgLangSelectTransformConfiguration(), runtime_class=ProgLangSelectRuntime) +class ProglangSelect: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + default_language_column = "language" + default_annotated_column = "lang_selected" + if lang_lang_column_key not in self.params: + self.params[lang_lang_column_key] = default_language_column + if lang_allowed_langs_file_key not in self.params: + self.params[lang_allowed_langs_file_key] = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "../../test-data/languages/allowed-code-languages.txt", + ) + ) + if lang_output_column_key not in self.params: + self.params[lang_output_column_key] = default_annotated_column + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = RayTransformLauncher(ProgLangSelectRayConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + if __name__ == "__main__": launcher = RayTransformLauncher(ProgLangSelectRayConfiguration()) launcher.launch() diff --git a/transforms/code/proglang_select/python/src/proglang_select_transform.py b/transforms/code/proglang_select/dpk_proglang_select/transform.py similarity index 100% rename from transforms/code/proglang_select/python/src/proglang_select_transform.py rename to transforms/code/proglang_select/dpk_proglang_select/transform.py diff --git a/transforms/code/proglang_select/dpk_proglang_select/transform_python.py b/transforms/code/proglang_select/dpk_proglang_select/transform_python.py new file mode 100644 index 0000000000..1c6572d8de --- /dev/null +++ b/transforms/code/proglang_select/dpk_proglang_select/transform_python.py @@ -0,0 +1,77 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import ( + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import ParamsUtils, get_logger +from dpk_proglang_select.transform import ( + ProgLangSelectTransformConfiguration, + lang_allowed_langs_file_key, + lang_lang_column_key, + lang_output_column_key, +) + + +logger = get_logger(__name__) + + +class ProgLangSelectPythonConfiguration(PythonTransformRuntimeConfiguration): + def __init__(self): + super().__init__(transform_config=ProgLangSelectTransformConfiguration()) + + +class ProglangSelect: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + # create parameters + default_language_column = "language" + default_annotated_column = "lang_selected" + if lang_lang_column_key not in self.params: + self.params[lang_lang_column_key] = default_language_column + if lang_allowed_langs_file_key not in self.params: + self.params[lang_allowed_langs_file_key] = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "../test-data/languages/allowed-code-languages.txt", + ) + ) + if lang_output_column_key not in self.params: + self.params[lang_output_column_key] = default_annotated_column + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(ProgLangSelectPythonConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(ProgLangSelectPythonConfiguration()) + launcher.launch() diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile index b8a21bca83..7244ce1427 100644 --- a/transforms/code/proglang_select/kfp_ray/Makefile +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -15,29 +20,8 @@ workflow-venv: 
.check_python_version ${WORKFLOW_VENV_ACTIVATE} .PHONY: clean clean: @# Help: Clean up the virtual environment. - rm -rf ${REPOROOT}/transforms/venv + rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,14 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=proglang_select_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 5a6d1d20cd..ab8603c515 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "proglang_select_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_proglang_select.ray.transform" task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" @@ -35,7 +35,6 @@ component_spec_path = os.getenv("KFP_COMPONENT_SPEC_PATH", DEFAULT_KFP_COMPONENT_SPEC_PATH) - # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -94,10 +93,17 @@ def compute_exec_params_func( ) def lang_select( ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster - ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/proglang_select/input/', 'output_folder': 'test/proglang_select/output/'}", @@ -105,9 +111,9 @@ def lang_select( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # Proglang match parameters proglang_select_allowed_langs_file: str = "test/proglang_select/languages/allowed-code-languages.txt", proglang_select_language_column: str = "language", @@ -160,13 +166,17 @@ def lang_select( # https://github.com/kubeflow/pipelines/issues/10187. 
Therefore, meantime the user is requested to insert # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": - print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - "same version of the same pipeline !!!") + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!" + ) run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/proglang_select/proglang-select-ray.ipynb b/transforms/code/proglang_select/proglang-select-ray.ipynb new file mode 100644 index 0000000000..40ab5c8fec --- /dev/null +++ b/transforms/code/proglang_select/proglang-select-ray.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install \"data-prep-toolkit-transforms[proglang_select]\"" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding ProglangSelect configuration for values are as follows: \n", + "* proglang_select_allowed_langs_file - specifies path to allowed languages file.\n", + "* proglang_select_language_column - specifies column name that contains programming language. 
By default, \"language\" is used.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_proglang_select.ray.transform import ProglangSelect\n", + "from data_processing.utils import GB" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters and invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95737436", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "DocQuality(input_folder='test-data/input',\n", + " output_folder= 'output',\n", + " run_locally= True,\n", + " num_cpus= 0.8,\n", + " memory= 2 * GB,\n", + " runtime_num_workers = 3,\n", + " runtime_creation_delay = 0,\n", + " proglang_select_allowed_langs_file=\"./test-data/languages/allowed-code-languages.tx\").transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/proglang_select/proglang-select.ipynb b/transforms/code/proglang_select/proglang-select.ipynb new file mode 100644 index 0000000000..beced755e9 --- /dev/null +++ b/transforms/code/proglang_select/proglang-select.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install \"data-prep-toolkit-transforms[proglang_select]\"" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": {}, + "source": [ + "##### **** Configure the transform parameters. 
The set of dictionary keys holding ProglangSelect configuration for values are as follows: \n", + "* proglang_select_allowed_langs_file - specifies path to allowed languages file.\n", + "* proglang_select_language_column - specifies column name that contains programming language. By default, \"language\" is used.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_proglang_select.transform_python import ProglangSelect" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters and invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95737436", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "ProglangSelect(\n", + " input_folder= \"./test-data/input\",\n", + " output_folder= \"./output\"\n", + ").transform()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/proglang_select/python/.dockerignore b/transforms/code/proglang_select/python/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/proglang_select/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/proglang_select/python/.gitignore b/transforms/code/proglang_select/python/.gitignore deleted file mode 100644 index 17cee1df3a..0000000000 --- a/transforms/code/proglang_select/python/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/proglang_select/python/Makefile b/transforms/code/proglang_select/python/Makefile deleted file mode 100644 index 7d64e0a904..0000000000 --- a/transforms/code/proglang_select/python/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. 
- -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.python-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) TOML_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - - -run-cli-sample: #.transforms.run-cli-python-sample - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ - RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --proglang_select_language_column language \ - --proglang_select_output_column lang_selected \ - --proglang_select_allowed_langs_file ../test-data/languages/allowed-code-languages.txt " \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/proglang_select/python/README.md b/transforms/code/proglang_select/python/README.md deleted file mode 100644 index e7fbf65e4a..0000000000 --- a/transforms/code/proglang_select/python/README.md +++ /dev/null @@ -1,79 +0,0 @@ -# Programming Language Select - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary - -This is a transform which can be used while preprocessing code data. It allows the -user to specify the programming languages for which the data should be identifies as matching -a defined set of programming languages. -It adds a new annotation column which can specify boolean True/False based on whether the rows belong to the -specified programming languages. The rows which belongs to the programming languages which are -not matched are annotated as False. - -It requires a text file specifying the allowed languages. It is specified by the -command line param `proglang_select_allowed_langs_file`. -A sample file is included at `test-data/languages/allowed-code-languages.lst`. -The column specifying programming languages is to be specified by -commandline params `proglang_select_language_column`. 
- -## Configuration and command line Options - -The set of dictionary keys holding configuration for values are as follows: - -* _proglang_select_allowed_langs_file_ - specifies the location of the list of supported languages -* _proglang_select_language_column_ - specifies the name of the column containing the language -* _proglang_select_output_column_ - specifies the name of the annotation column appended to the parquet. -* _proglang_select_return_known_ - specifies whether to return supported or unsupported languages - -## Running - -### Launched Command Line Options -The following command line arguments are available in addition to -the options provided by the [launcher](../../../../data-processing-lib/doc/launcher-options.md). - -``` - --proglang_select_allowed_langs_file PROGLANG_MATCH_ALLOWED_LANGS_FILE - Path to file containing the list of languages to be matched. - --proglang_select_language_column PROGLANG_MATCH_LANGUAGE_COLUMN - The column name holding the name of the programming language assigned to the document - --proglang_select_output_column PROGLANG_MATCH_OUTPUT_COLUMN - The column name to add and that contains the matching information - --proglang_select_s3_cred PROGLANG_MATCH_S3_CRED - AST string of options for s3 credentials. Only required for S3 data access. - access_key: access key help text - secret_key: secret key help text - url: optional s3 url - region: optional s3 region``` -``` - - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/proglang_select_transform.py using command line args -* `run-local-sample` - runs src/proglang_select_local.py -* `run-local-python-sample` - runs src/proglang_select_local_python.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
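The README removed above documented the `proglang_select_*` options and make targets against the old `src/` layout; with this patch the same run goes through the `dpk_proglang_select` package. A minimal sketch of a programmatic launch with those options, assuming the repository's bundled test data and illustrative local paths:

```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_proglang_select.transform_python import ProgLangSelectPythonConfiguration

# Same options the deleted README described, expressed as launcher parameters;
# folder locations and the allowed-languages file are illustrative.
params = {
    "data_local_config": ParamsUtils.convert_to_ast(
        {"input_folder": "test-data/input", "output_folder": "output"}
    ),
    "proglang_select_allowed_langs_file": "test-data/languages/allowed-code-languages.txt",
    "proglang_select_language_column": "language",
    "proglang_select_output_column": "lang_selected",
}
sys.argv = ParamsUtils.dict_to_req(d=params)
# Launch the pure-Python runtime configured for the proglang_select transform.
PythonTransformLauncher(ProgLangSelectPythonConfiguration()).launch()
```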
diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml deleted file mode 100644 index c35fdb1209..0000000000 --- a/transforms/code/proglang_select/python/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -[project] -name = "dpk_proglang_select_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Programming Language Selection Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt deleted file mode 100644 index 013ce90111..0000000000 --- a/transforms/code/proglang_select/python/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/code/proglang_select/python/src/proglang_select_local.py b/transforms/code/proglang_select/python/src/proglang_select_local.py deleted file mode 100644 index 065e34364e..0000000000 --- a/transforms/code/proglang_select/python/src/proglang_select_local.py +++ /dev/null @@ -1,51 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os - -from data_processing.data_access import DataAccessFactory, DataAccessLocal -from proglang_select_transform import ( - ProgLangSelectTransform, - lang_allowed_langs_file_key, - lang_data_factory_key, - lang_lang_column_key, - lang_output_column_key, -) - - -# create parameters -language_column_name = "language" -annotated_column_name = "lang_selected" - -selected_languages_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../test-data/languages/allowed-code-languages.txt") -) -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) - -params = { - lang_allowed_langs_file_key: selected_languages_file, - lang_lang_column_key: language_column_name, - lang_output_column_key: annotated_column_name, - lang_data_factory_key: DataAccessFactory(), # Expect to create DataAccessLocal -} -if __name__ == "__main__": - # Here we show how to run outside of ray - # Create and configure the transform. - transform = ProgLangSelectTransform(params) - # Use the local data access to read a parquet table. - data_access = DataAccessLocal() - table, _ = data_access.get_table(os.path.join(input_folder, "test1.parquet")) - print(f"input table: {table}") - # Transform the table - table_list, metadata = transform.transform(table) - print(f"\noutput table: {table_list}") - print(f"output metadata : {metadata}") diff --git a/transforms/code/proglang_select/ray/.dockerignore b/transforms/code/proglang_select/ray/.dockerignore deleted file mode 100644 index f7275bbbd0..0000000000 --- a/transforms/code/proglang_select/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/proglang_select/ray/.gitignore b/transforms/code/proglang_select/ray/.gitignore deleted file mode 100644 index 17cee1df3a..0000000000 --- a/transforms/code/proglang_select/ray/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/Makefile b/transforms/code/proglang_select/ray/Makefile deleted file mode 100644 index 20315a2347..0000000000 --- a/transforms/code/proglang_select/ray/Makefile +++ /dev/null @@ -1,69 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) TOML_VERSION=$(PROGLANG_SELECT_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - - -run-cli-sample: .transforms.run-cli-ray-sample - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --proglang_select_language_column language \ - --proglang_select_output_column lang_selected \ - --proglang_select_allowed_langs_file ../test-data/languages/allowed-code-languages.txt " \ - .transforms.run-src-file - -#run-local-sample: .transforms.run-local-sample - -run-local-ray-sample: .transforms.run-local-ray-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/proglang_select/ray/README.md b/transforms/code/proglang_select/ray/README.md deleted file mode 100644 index 1afbeef3a5..0000000000 --- a/transforms/code/proglang_select/ray/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Programming Language Select - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -This project enables the [python malware transform](../python) to be run in a Ray runtime. -Please see the [python project](../python) for details on the transform implementation and use. - -## Configuration and Command Line Options - -Transform configuration options are the same as the base python transform. - -## Running - -### Launched Command Line Options -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[launcher options](../../../../data-processing-lib/doc/launcher-options.md) are available. - - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/proglang_select_transform_ray.py using command line args -* `run-local-ray-sample` - runs src/proglang_select_local_ray.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
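The Ray README removed above pointed at `src/proglang_select_transform_ray.py`; its replacement is the `ProglangSelect` wrapper added in `dpk_proglang_select/ray/transform.py`. A minimal sketch of a local Ray run through that wrapper; the worker count and paths are illustrative, and additional launcher arguments can be passed as keywords since the wrapper forwards all kwargs:

```python
from dpk_proglang_select.ray.transform import ProglangSelect

# input_folder/output_folder are folded into data_local_config by the wrapper;
# the allowed-languages file defaults to the bundled test-data list if omitted.
return_code = ProglangSelect(
    input_folder="test-data/input",
    output_folder="output",
    run_locally=True,
    runtime_num_workers=2,
).transform()
print("exit code:", return_code)
```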
diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml deleted file mode 100644 index f70dae6a1d..0000000000 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_proglang_select_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Programming Language Selection Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, -] -dependencies = [ - "dpk-proglang-select-transform-python==0.2.4.dev0", - "data-prep-toolkit[ray]>=0.2.4.dev0", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/proglang_select/ray/src/proglang_select_local_ray.py b/transforms/code/proglang_select/ray/src/proglang_select_local_ray.py deleted file mode 100644 index 509b058519..0000000000 --- a/transforms/code/proglang_select/ray/src/proglang_select_local_ray.py +++ /dev/null @@ -1,67 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from proglang_select_transform import ( - lang_allowed_langs_file_key, - lang_lang_column_key, - lang_output_column_key, -) -from proglang_select_transform_ray import ProgLangSelectRayConfiguration - - -# create parameters -language_column_name = "language" -annotated_column_name = "lang_selected" - -selected_languages_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../test-data/languages/allowed-code-languages.txt") -) -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) -local_conf = { - "input_folder": input_folder, - "output_folder": output_folder, -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -langselect_config = { - lang_allowed_langs_file_key: selected_languages_file, - lang_lang_column_key: language_column_name, - lang_output_column_key: annotated_column_name, -} -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # orchestrator - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 3, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # lanuage selection specific parameters - **langselect_config, -} - -if __name__ == "__main__": - sys.argv = ParamsUtils.dict_to_req(d=params) - # create launcher - launcher = RayTransformLauncher(ProgLangSelectRayConfiguration()) - # launch - launcher.launch() diff --git a/transforms/code/proglang_select/ray/test-data/expected/metadata.json b/transforms/code/proglang_select/ray/test-data/expected/metadata.json deleted file mode 100644 index 57a98827e4..0000000000 --- a/transforms/code/proglang_select/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "lang_select", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-12 07:45:26", - "end_time": "2024-03-12 07:45:28", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "ls_language_column": "language", - "ls_allowed_langs_file": "/Users/boris/Projects/fm-data-engineering/transforms/code/select_language/test-data/languages/allowed-code-languages.txt", - "ls_return_known": true, - "checkpointing": false, - "max_files": -1, - "number of workers": 1, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 16, - "gpus": 0, - "memory": 11.76053390558809, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 31, - "result_files": 1, - "result_size": 32, - "table_processing": 0.017200946807861328, - "documents with supported languages": 2, - "documents with unsupported languages": 0 - }, - "source": { - "name": "/Users/boris/Projects/fm-data-engineering/transforms/code/select_language/test-data/input", - "type": "path" - }, - "target": { - "name": 
"/Users/boris/Projects/fm-data-engineering/transforms/code/select_language/test-data/output", - "type": "path" - } -} diff --git a/transforms/code/proglang_select/ray/test-data/expected/test1.parquet b/transforms/code/proglang_select/ray/test-data/expected/test1.parquet deleted file mode 100644 index e36035b740..0000000000 Binary files a/transforms/code/proglang_select/ray/test-data/expected/test1.parquet and /dev/null differ diff --git a/transforms/code/proglang_select/ray/test-data/input/test1.parquet b/transforms/code/proglang_select/ray/test-data/input/test1.parquet deleted file mode 100644 index 176f7598ff..0000000000 Binary files a/transforms/code/proglang_select/ray/test-data/input/test1.parquet and /dev/null differ diff --git a/transforms/code/proglang_select/ray/test-data/languages/allowed-code-languages.txt b/transforms/code/proglang_select/ray/test-data/languages/allowed-code-languages.txt deleted file mode 100644 index 7a79e86387..0000000000 --- a/transforms/code/proglang_select/ray/test-data/languages/allowed-code-languages.txt +++ /dev/null @@ -1,4 +0,0 @@ -Java -C -Go -ABAP diff --git a/transforms/code/proglang_select/ray/test/test_proglang_select_ray.py b/transforms/code/proglang_select/ray/test/test_proglang_select_ray.py deleted file mode 100644 index 832fbff7a4..0000000000 --- a/transforms/code/proglang_select/ray/test/test_proglang_select_ray.py +++ /dev/null @@ -1,57 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from proglang_select_transform import ( - lang_allowed_langs_file_key, - lang_lang_column_key, - lang_output_column_key, -) -from proglang_select_transform_ray import ProgLangSelectRayConfiguration - - -class TestRayProgLangSelectTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - languages_file = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "../test-data/languages/allowed-code-languages.txt", - ) - ) - config = { - "run_locally": True, - # When running in ray, our Runtime's get_transform_config() method will load the domains using - # the orchestrator's DataAccess/Factory. So we don't need to provide the lang_select_local_config configuration. 
- lang_allowed_langs_file_key: languages_file, - lang_lang_column_key: "language", - lang_output_column_key: "allowed_languages", - } - fixtures = [ - ( - RayTransformLauncher(ProgLangSelectRayConfiguration()), - config, - basedir + "/input", - basedir + "/expected", - ) - ] - return fixtures diff --git a/transforms/code/proglang_select/requirements.txt b/transforms/code/proglang_select/requirements.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transforms/code/proglang_select/python/test-data/expected/metadata.json b/transforms/code/proglang_select/test-data/expected/metadata.json similarity index 100% rename from transforms/code/proglang_select/python/test-data/expected/metadata.json rename to transforms/code/proglang_select/test-data/expected/metadata.json diff --git a/transforms/code/proglang_select/python/test-data/expected/test1.parquet b/transforms/code/proglang_select/test-data/expected/test1.parquet similarity index 100% rename from transforms/code/proglang_select/python/test-data/expected/test1.parquet rename to transforms/code/proglang_select/test-data/expected/test1.parquet diff --git a/transforms/code/proglang_select/python/test-data/input/test1.parquet b/transforms/code/proglang_select/test-data/input/test1.parquet similarity index 100% rename from transforms/code/proglang_select/python/test-data/input/test1.parquet rename to transforms/code/proglang_select/test-data/input/test1.parquet diff --git a/transforms/code/proglang_select/python/test-data/languages/allowed-code-languages.txt b/transforms/code/proglang_select/test-data/languages/allowed-code-languages.txt similarity index 100% rename from transforms/code/proglang_select/python/test-data/languages/allowed-code-languages.txt rename to transforms/code/proglang_select/test-data/languages/allowed-code-languages.txt diff --git a/transforms/code/proglang_select/python/test/test_proglang_select.py b/transforms/code/proglang_select/test/test_proglang_select.py similarity index 98% rename from transforms/code/proglang_select/python/test/test_proglang_select.py rename to transforms/code/proglang_select/test/test_proglang_select.py index b3caa02dda..39e2ca4475 100644 --- a/transforms/code/proglang_select/python/test/test_proglang_select.py +++ b/transforms/code/proglang_select/test/test_proglang_select.py @@ -15,7 +15,7 @@ import pyarrow as pa from data_processing.test_support.transform import AbstractTableTransformTest from data_processing.transform import get_transform_config -from proglang_select_transform import ( +from dpk_proglang_select.transform import ( ProgLangSelectTransform, ProgLangSelectTransformConfiguration, lang_allowed_langs_file_key, diff --git a/transforms/code/proglang_select/python/test/test_proglang_select_python.py b/transforms/code/proglang_select/test/test_proglang_select_python.py similarity index 94% rename from transforms/code/proglang_select/python/test/test_proglang_select_python.py rename to transforms/code/proglang_select/test/test_proglang_select_python.py index c6b528021a..1c1d13465e 100644 --- a/transforms/code/proglang_select/python/test/test_proglang_select_python.py +++ b/transforms/code/proglang_select/test/test_proglang_select_python.py @@ -16,12 +16,12 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from proglang_select_transform import ( +from dpk_proglang_select.transform import ( lang_allowed_langs_file_key, lang_lang_column_key, lang_output_column_key, ) -from proglang_select_transform_python import 
ProgLangSelectPythonConfiguration +from dpk_proglang_select.transform_python import ProgLangSelectPythonConfiguration class TestPythonProgLangSelectTransform(AbstractTransformLauncherTest): diff --git a/transforms/code/proglang_select/transform.config b/transforms/code/proglang_select/transform.config deleted file mode 100644 index c32cb9775b..0000000000 --- a/transforms/code/proglang_select/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=proglang_select - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION) -PROGLANG_SELECT_RAY_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) -PROGLANG_SELECT_SPARK_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) -
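With the per-runtime `transform.config` files gone, the transform name and versions come from the shared CI/CD targets, and the simplest end-user entry point is the wrapper class rather than a per-runtime project. A minimal sketch of the pure-Python use, matching the invocation shown in the new `proglang-select.ipynb` (paths are illustrative):

```python
from dpk_proglang_select.transform_python import ProglangSelect

# Read parquet from the test input folder, annotate the default
# "lang_selected" column, and write the results to ./output.
ProglangSelect(
    input_folder="test-data/input",
    output_folder="output",
).transform()
```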