Merge pull request #966 from touma-I/profiler-simplify
refactored profiler transform
touma-I authored Jan 24, 2025
2 parents 3ca9926 + c90157e commit d96c014
Showing 50 changed files with 561 additions and 861 deletions.
8 changes: 4 additions & 4 deletions scripts/k8s-setup/populate_minio.sh
@@ -40,9 +40,9 @@ mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/test-data/input/t
 mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/test-data/input/ kfp/test/doc_id/input
 mc cp --recursive ${REPOROOT}/transforms/universal/ededup/test-data/input/ kfp/test/ededup/input
 mc cp --recursive ${REPOROOT}/transforms/universal/fdedup/ray/test-data/input/ kfp/test/fdedup/input
-mc cp --recursive ${REPOROOT}/transforms/universal/filter/ray/test-data/input/ kfp/test/filter/input
-mc cp --recursive ${REPOROOT}/transforms/universal/noop/ray/test-data/input/ kfp/test/noop/input
+mc cp --recursive ${REPOROOT}/transforms/universal/filter/test-data/input/ kfp/test/filter/input
+mc cp --recursive ${REPOROOT}/transforms/universal/noop/test-data/input/ kfp/test/noop/input
 mc cp --recursive ${REPOROOT}/transforms/universal/tokenization/test-data/ds01/input/ kfp/test/tokenization/ds01/input
-mc cp --recursive ${REPOROOT}/transforms/universal/profiler/ray/test-data/input/ kfp/test/profiler/input
-mc cp --recursive ${REPOROOT}/transforms/universal/resize/ray/test-data/input/ kfp/test/resize/input
+mc cp --recursive ${REPOROOT}/transforms/universal/profiler/test-data/input/ kfp/test/profiler/input
+mc cp --recursive ${REPOROOT}/transforms/universal/resize/test-data/input/ kfp/test/resize/input
 mc cp --recursive ${REPOROOT}/transforms/universal/hap/test-data/input/ kfp/test/hap/input
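With the ray-specific test-data paths flattened, a quick way to confirm the uploads landed where the KFP tests expect them is to list the target prefix — an illustrative check, assuming the same mc alias and bucket layout used by the script above:

# list what was just copied for the profiler (illustrative verification step)
mc ls --recursive kfp/test/profiler/input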
5 changes: 5 additions & 0 deletions transforms/README-list.md
@@ -7,6 +7,8 @@ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README
 `python -m pip install data-prep-toolkit-transforms[all]`
 or
 `python -m pip install data-prep-toolkit-transforms[ray, all]`
+or
+`python -m pip install data-prep-toolkit-transforms[language]`
 
 
 installing the python transforms will also install `data-prep-toolkit`
@@ -41,6 +43,9 @@ Note: This list includes the transforms that were part of the release starting w
 
 ## Release notes:
 
+### 1.0.0.a6
+Added Profiler
+Added Resize
 ### 1.0.0.a5
 Added Pii Redactor
 Relax fasttext requirement >= 0.9.2
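Since 1.0.0a6 is a pre-release, the simplest way to pick up the newly added profiler and resize packages is to pin the exact version — an illustrative command, with the version string taken from the pyproject.toml bump below:

python -m pip install data-prep-toolkit-transforms==1.0.0a6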
16 changes: 10 additions & 6 deletions transforms/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_transforms"
-version = "1.0.0a5"
+version = "1.0.0a6"
 requires-python = ">=3.10,<3.13"
 keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Transforms using Ray"
@@ -33,8 +33,8 @@ all = { file = [
 
 "language/pii_redactor/requirements.txt",
 
-"universal/profiler/python/requirements.txt",
-"universal/resize/python/requirements.txt",
+"universal/profiler/requirements.txt",
+"universal/resize/requirements.txt",
 
 "language/lang_id/requirements.txt",
 "language/doc_quality/requirements.txt",
@@ -78,7 +78,9 @@ language = { file = [
 "universal/fdedup/requirements.txt",
 "universal/hap/requirements.txt",
 "universal/tokenization/requirements.txt",
-"universal/web2parquet/requirements.txt"
+"universal/web2parquet/requirements.txt",
+"universal/profiler/requirements.txt",
+"universal/resize/requirements.txt"
 ]}
 
 # pyproject.toml must be in a parent and cannot be in sibling
@@ -90,8 +92,8 @@ license_select = { file = ["code/license_select/python/requirements.txt"]}
 code_quality = { file = ["code/code_quality/python/requirements.txt"]}
 code2parquet = {file = ["code/code2parquet/python/requirements.txt"]}
 
-profiler = { file = ["universal/profiler/python/requirements.txt"]}
-resize = { file = ["universal/resize/python/requirements.txt"]}
+profiler = { file = ["universal/profiler/requirements.txt"]}
+resize = { file = ["universal/resize/requirements.txt"]}
 
 ######## Named transforms
 doc_chunk = { file = ["language/doc_chunk/requirements.txt"]}
@@ -138,6 +140,8 @@ dpk_tokenization = "universal/tokenization/dpk_tokenization"
 dpk_similarity = "language/similarity/dpk_similarity"
 dpk_filter = "universal/filter/dpk_filter"
 dpk_code_profiler = "code/code_profiler/dpk_code_profiler"
+dpk_profiler = "universal/profiler/dpk_profiler"
+dpk_resize = "universal/resize/dpk_resize"
 
 
 #[tool.setuptools.package-data]
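With the profiler and resize extras now resolving to the consolidated requirements.txt files, the per-transform install path stays the same from the user's side — a sketch using the profiler extra defined above (the quotes guard the brackets from the shell):

python -m pip install 'data-prep-toolkit-transforms[profiler]'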
10 changes: 10 additions & 0 deletions transforms/universal/profiler/.dockerignore
@@ -0,0 +1,10 @@
+venv/
+test-data/output
+output/*
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+
@@ -10,26 +10,18 @@ RUN useradd -ms /bin/bash dpk
 USER dpk
 WORKDIR /home/dpk
 ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=dpk:root data-processing-dist data-processing-dist
+COPY --chown=dpk:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
 
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
-COPY --chown=dpk:root README.md README.md
-COPY --chown=dpk:root requirements.txt requirements.txt
+# END OF STEPS destined for a data-prep-kit base image
 
-RUN pip install --no-cache-dir -e .
-
-# copy source data
-COPY ./src/profiler_transform_python.py .
-COPY ./src/profiler_local.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chown=dpk:users requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/dpk
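Since the image now installs from requirements.txt instead of an editable pyproject.toml build, the container's Python environment can be approximated locally — a sketch, assuming it is run from the transform's directory and that the wheel baked into the image corresponds to the published data-prep-toolkit package:

# approximate the image's environment outside docker (illustrative)
pip install data-prep-toolkit
pip install --no-cache-dir -r requirements.txt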
@@ -1,5 +1,4 @@
 ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
-
 FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
@@ -11,33 +10,18 @@ RUN pip install --upgrade --no-cache-dir pip
 
 # install pytest
 RUN pip install --no-cache-dir pytest
 
 ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
 COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-# Install ray project source
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
-COPY --chmod=775 --chown=ray:root README.md README.md
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY src/profiler_transform_ray.py .
-
-# copy some of the samples in
-COPY src/profiler_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/ray
34 changes: 34 additions & 0 deletions transforms/universal/profiler/Dockerfile.spark
@@ -0,0 +1,34 @@
+FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
+
+USER root
+# install pytest
+RUN pip install --no-cache-dir pytest
+
+WORKDIR ${SPARK_HOME}/work-dir
+ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
+
+# Copy and install data processing libraries
+# These are expected to be placed in the docker context before this is run (see the make image).
+COPY --chown=spark:users data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+
+# Install project source
+
+## Copy the python version of the tansform
+COPY --chown=spark:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chown=spark:users requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+
+USER spark
+
+# Set environment
+ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${PYTHONPATH}
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
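All of the profiler's Dockerfiles are now parameterized the same way, so one build recipe covers every runtime. A hypothetical manual build of this spark image — normally the shared make targets supply both build args, and the wheel file name and image tag here are placeholders:

docker build -f Dockerfile.spark \
  --build-arg TRANSFORM_NAME=profiler \
  --build-arg DPK_WHEEL_FILE_NAME=<wheel-file-in-docker-context>.whl \
  -t dpk-profiler-spark .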
86 changes: 14 additions & 72 deletions transforms/universal/profiler/Makefile
@@ -1,79 +1,21 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
+include $(REPOROOT)/transforms/.make.cicd.targets
 
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+# Until we make runtime.py the standard supported by Makefile infra
+TRANSFORM_PYTHON_SRC="-m dpk_$(TRANSFORM_NAME).runtime"
+TRANSFORM_RAY_SRC="-m dpk_$(TRANSFORM_NAME).ray.runtime"
+TRANSFORM_SPARK_SRC="-m dpk_$(TRANSFORM_NAME).spark.runtime"
 
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
 
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
+################################################################################
 
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-set-versions:
-	@# Help: Recursively $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-venv; \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-test; \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-build; \
-	fi
 
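Because TRANSFORM_NAME resolves to the containing directory (profiler here), the *_SRC variables point the shared CICD targets at the dpk_profiler entry points. The same modules can be exercised outside of make — a sketch, assuming the standard argparse-style DPK launchers so that --help is available:

# what TRANSFORM_PYTHON_SRC expands to for this transform (illustrative)
python -m dpk_profiler.runtime --help
# ray variant, per TRANSFORM_RAY_SRC
python -m dpk_profiler.ray.runtime --help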