diff --git a/.github/workflows/test-language-gneissweb_classification.yml b/.github/workflows/test-language-gneissweb_classification.yml new file mode 100644 index 000000000..16e942d60 --- /dev/null +++ b/.github/workflows/test-language-gneissweb_classification.yml @@ -0,0 +1,133 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/language/gneissweb_classification + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/language/gneissweb_classification/**" + - "data-processing-lib/**" + - "!transforms/language/gneissweb_classification/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/language/gneissweb_classification/**" + - "data-processing-lib/**" + - "!transforms/language/gneissweb_classification/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. 
+ # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. + runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/language/gneissweb_classification + run: | + if [ -e "transforms/language/gneissweb_classification/Makefile" ]; then + make -C transforms/language/gneissweb_classification DOCKER=docker test-src + else + echo "transforms/language/gneissweb_classification/Makefile not found - source testing disabled for this transform." 
+ fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/language/gneissweb_classification + run: | + if [ -e "transforms/language/gneissweb_classification/Makefile" ]; then + if [ -d "transforms/language/gneissweb_classification/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/language/gneissweb_classification DOCKER=docker test-image + else + echo "transforms/language/gneissweb_classification/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/language/gneissweb_classification/Makefile" ]; then + make -C transforms/language/gneissweb_classification publish + else + echo "transforms/language/gneissweb_classification/Makefile not found - publishing disabled for this transform." 
+ fi diff --git a/transforms/language/gneissweb_classification/Dockerfile.python b/transforms/language/gneissweb_classification/Dockerfile.python new file mode 100644 index 000000000..420edae58 --- /dev/null +++ b/transforms/language/gneissweb_classification/Dockerfile.python @@ -0,0 +1,46 @@ +FROM docker.io/python:3.11.11-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk +ARG DPK_WHEEL_FILE_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-dist/ data-processing-dist/ +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} + +# END OF STEPS destined for a data-prep-kit base image + +# set up environment required to install and use huggingface and fasttext +USER root +RUN apt update && apt install gcc g++ -y +RUN mkdir -p /home/dpk/.cache/huggingface/hub && chmod -R 777 /home/dpk/.cache/huggingface/hub +USER dpk + +COPY --chown=dpk:root dpk_gneissweb_classification/ dpk_gneissweb_classification/ +COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# clean up apt +USER root +RUN apt-get remove gcc g++ -y \ + && apt clean \ + && rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/* +USER dpk + + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. 
+ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/language/gneissweb_classification/Dockerfile.ray b/transforms/language/gneissweb_classification/Dockerfile.ray new file mode 100644 index 000000000..42f12a118 --- /dev/null +++ b/transforms/language/gneissweb_classification/Dockerfile.ray @@ -0,0 +1,46 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py311 + +FROM ${BASE_IMAGE} + +# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images +USER root +RUN chown ray:root /home/ray && chmod 775 /home/ray +USER ray + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# set up environment required to install and use huggingface and fasttext +USER root +RUN sudo apt update && sudo apt install gcc g++ -y +RUN mkdir -p /home/ray/.cache/huggingface/hub && chmod -R 777 /home/ray/.cache/huggingface/hub +USER ray + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] + + +COPY --chmod=775 --chown=ray:root dpk_gneissweb_classification/ dpk_gneissweb_classification/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# clean up apt +USER root +RUN sudo apt remove gcc g++ -y \ + && sudo apt clean \ + && sudo rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/* +USER ray + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. 
+ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/language/gneissweb_classification/Makefile b/transforms/language/gneissweb_classification/Makefile new file mode 100644 index 000000000..f3a088ff3 --- /dev/null +++ b/transforms/language/gneissweb_classification/Makefile @@ -0,0 +1,36 @@ +REPOROOT=../../.. +# Use make help, to see the available rules +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ + + + +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --gcls_model_credential "PUT YOUR OWN HUGGINGFACE CREDENTIAL" \ + --gcls_model_file_name "model.bin" \ + --gcls_model_url "facebook/fasttext-language-identification" \ + --gcls_content_column_name "text" + +run-cli-ray-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ + --run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --gcls_model_credential "PUT YOUR OWN HUGGINGFACE CREDENTIAL" \ + --gcls_model_file_name "model.bin" \ + --gcls_model_url "facebook/fasttext-language-identification" \ + --gcls_content_column_name "text" diff --git a/transforms/language/gneissweb_classification/README.md b/transforms/language/gneissweb_classification/README.md new file mode 100644 index 000000000..4c22cf033 --- 
/dev/null
+++ b/transforms/language/gneissweb_classification/README.md
@@ -0,0 +1,79 @@
+# Gneissweb Classification Transform
+The Gneissweb Classification transform serves as a simple exemplar to demonstrate the development
+of a simple 1:1 transform.
+Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up.
+
+## Summary
+This transform will classify each text, with a confidence score, using a fasttext classification model such as [ref](https://huggingface.co/facebook/fasttext-language-identification).
+
+## Configuration and command line Options
+
+The set of dictionary keys holding [ClassificationTransform](dpk_gneissweb_classification/transform.py)
+configuration for values are as follows:
+
+| Configuration Parameters | Default | Description |
+|------------|----------|--------------|
+| gcls_model_credential | _unset_ | specifies the credential used to get the model. This will be a huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |
+| gcls_model_file_name | _unset_ | specifies the filename of the model to fetch, like `model.bin` |
+| gcls_model_url | _unset_ | specifies the url where the model is located. For fasttext, this will be the repo name of the model, like `facebook/fasttext-language-identification` |
+| gcls_content_column_name | `contents` | specifies the name of the column containing documents |
+| gcls_output_label_column_name | `label` | specifies the name of the output column to hold predicted classes |
+| gcls_output_score_column_name | `score` | specifies the name of the output column to hold the score of prediction |
+
+## Running
+
+### Launched Command Line Options
+The following command line arguments are available in addition to
+the options provided by
+the [launcher](../../../data-processing-lib/doc/launcher-options.md).
+The prefix gcls is the short name for Gneissweb CLaSsification.
+```
+  --gcls_model_credential GCLS_MODEL_CREDENTIAL   the credential used to get the model. This will be a huggingface token.
+  --gcls_model_file_name GCLS_MODEL_FILE_NAME   filename of the model to fetch, like `model.bin`
+  --gcls_model_url GCLS_MODEL_URL   the url where the model is located. For fasttext, this will be the repo name of the model, like `facebook/fasttext-language-identification`
+  --gcls_content_column_name GCLS_CONTENT_COLUMN_NAME   name of the column containing documents
+  --gcls_output_label_column_name GCLS_OUTPUT_LABEL_COLUMN_NAME   column name to store classification results
+  --gcls_output_score_column_name GCLS_OUTPUT_SCORE_COLUMN_NAME   column name to store the score of prediction
+```
+These correspond to the configuration keys described above.
+
+### Code example
+Here is a sample [notebook](gneissweb_classification.ipynb)
+
+## Troubleshooting guide
+
+For M1 Mac users, if you see the following error during the make command, `error: command '/usr/bin/clang' failed with exit code 1`, you should follow [this step](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac)
+
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+# Gneissweb Classification Ray Transform
+Please see the set of
+[transform project conventions](../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This project wraps the gneissweb classification transform with a Ray runtime.
+
+## Configuration and command line Options
+
+Gneissweb Classification configuration and command line options are the same as for the base python transform.
+ +### Launched Command Line Options +In addition to those available to the transform as defined here, +the set of +[launcher options](../../../data-processing-lib/doc/launcher-options.md) are available. + +### Code example (Ray version) +Here is a sample [notebook](gneissweb_classification-ray.ipynb) + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/classification_models.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/classification_models.py new file mode 100644 index 000000000..f24a06487 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/classification_models.py @@ -0,0 +1,63 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+import math
+from abc import ABCMeta, abstractmethod
+
+import fasttext
+import numpy as np
+from huggingface_hub import hf_hub_download
+import os
+from langcodes import standardize_tag
+
+
+class ClassificationModel(metaclass=ABCMeta):
+    @abstractmethod
+    def detect_label(self, text: str) -> tuple[str, float]:
+        pass
+
+
+class NoopModel(ClassificationModel):
+    def detect_label(self, text: str) -> tuple[str, float]:  # pylint: disable=unused-argument
+        return "en", 0.0
+
+
+class FastTextModel(ClassificationModel):
+    def __init__(self, url, file_name, credential):
+        model_path = hf_hub_download(repo_id=url, filename=file_name, token=credential)
+        self.nlp = fasttext.load_model(model_path)
+        self.url = url
+
+    def detect_label(self, text: str) -> tuple[str, float]:
+        if self.url == "facebook/fasttext-language-identification":
+            label, score = self.nlp.predict(
+                text.replace("\n", " "), 1
+            )  # replace newline to avoid ERROR: predict processes one line at a time (remove '\n') skipping the file
+            return standardize_tag(label[0].replace("__label__", "")), math.floor(score[0] * 1000) / 1000
+        elif self.url == "mlfoundations/fasttext-oh-eli5":
+            label, score = self.nlp.predict(" ".join(text.strip().splitlines()))
+            score = score[0]
+            if label[0] == "__label__cc":  # predict() returns a tuple of labels; compare its first element
+                score = 1 - score
+            return label[0].replace("__label__", ""), score
+
+        else:
+            label, score = self.nlp.predict(
+                text.replace("\n", " "), 1
+            )  # replace newline to avoid ERROR: predict processes one line at a time (remove '\n') skipping the file
+            return label[0].replace("__label__", ""), math.floor(score[0] * 1000) / 1000
+
+
+class ClassificationModelFactory:
+    @staticmethod
+    def create_model(url: str, file_name: str, credential: str) -> ClassificationModel:
+        return FastTextModel(url, file_name, credential)
diff --git 
a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local.py new file mode 100644 index 000000000..c5de1a4d4 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.data_access import DataAccessLocal +from dpk_gneissweb_classification.transform import ( + ClassificationTransform, + content_column_name_key, + model_credential_key, + model_file_name_key, + model_url_key, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) + +classification_params = { + model_credential_key: "PUT YOUR OWN HUGGINGFACE CREDENTIAL", + model_file_name_key: "model.bin", + model_url_key:"facebook/fasttext-language-identification", + content_column_name_key: "text", +} +if __name__ == "__main__": + # Here we show how to run outside of the runtime + # Create and configure the transform. + transform = ClassificationTransform(classification_params) + # Use the local data access to read a parquet table. 
+ data_access = DataAccessLocal() + table, _ = data_access.get_table(os.path.join(input_folder, "test_01.parquet")) + print(f"input table: {table}") + # Transform the table + try: + table_list, metadata = transform.transform(table) + print(f"\noutput table: {table_list}") + print(f"output metadata : {metadata}") + except Exception as e: + print(f"Exception executing transform {e}") diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local_python.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local_python.py new file mode 100644 index 000000000..bc2845d9e --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/local_python.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from dpk_gneissweb_classification.transform import ( + content_column_name_cli_param, + model_credential_cli_param, + model_file_name_cli_param, + model_url_cli_param, +) +from dpk_gneissweb_classification.transform_python import ClassificationPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # classification params + model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL", + model_file_name_cli_param: "model.bin", + model_url_cli_param: "facebook/fasttext-language-identification", + content_column_name_cli_param: "text", +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=ClassificationPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/nlp.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/nlp.py new file mode 100644 index 000000000..a4bfe3ac5 --- /dev/null +++ 
b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/nlp.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +import pyarrow as pa +from data_processing.utils import TransformUtils, get_logger +from dpk_gneissweb_classification.classification_models import ClassificationModel + + +logger = get_logger(__name__) + + +def get_label_ds_pa( + table: pa.table, + nlp: ClassificationModel, + content_column_name: str, + output_label_column_name: str, + output_score_column_name: str, +) -> tuple[pa.table, dict[str, Any]]: + detected_label = pa.Table.from_pylist( + list( + map( + lambda r: {"label": r[0], "score": r[1]}, + map(nlp.detect_label, table[content_column_name].to_pylist()), + ) + ) + ) + stats = pa.table([detected_label["label"]], names=["label"]).group_by("label").aggregate([("label", "count")]) + stats_dict = {} + for batch in stats.to_batches(): + d = batch.to_pydict() + for label, count in zip(d["label"], d["label_count"]): + stats_dict[label] = count + result = TransformUtils.add_column(table=table, name=output_label_column_name, content=detected_label["label"]) + result = TransformUtils.add_column(table=result, name=output_score_column_name, content=detected_label["score"]) + return result, stats_dict diff --git 
a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/local.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/local.py new file mode 100644 index 000000000..a77a6bc76 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/local.py @@ -0,0 +1,64 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from dpk_gneissweb_classification.ray.transform import ClassificationRayTransformConfiguration +from dpk_gneissweb_classification.transform import ( + content_column_name_cli_param, + model_credential_cli_param, + model_file_name_cli_param, + model_url_cli_param, + output_label_column_name_cli_param, + output_score_column_name_cli_param, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # classification params + model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL", + model_file_name_cli_param: "model.bin", + model_url_cli_param:"facebook/fasttext-language-identification", + content_column_name_cli_param: "text", + output_label_column_name_cli_param: "ft_label", + output_score_column_name_cli_param: "ft_score", +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(ClassificationRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/s3.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/s3.py new file mode 100644 index 000000000..af91ca4c0 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/s3.py @@ -0,0 +1,73 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from dpk_gneissweb_classification.ray.transform import ClassificationRayTransformConfiguration +from dpk_gneissweb_classification.transform import ( + content_column_name_cli_param, + model_credential_cli_param, + model_file_name_cli_param, + model_url_cli_param, + output_label_column_name_cli_param, + output_score_column_name_cli_param, +) + + + +print(os.environ) +# create launcher +launcher = RayTransformLauncher(ClassificationRayTransformConfiguration()) +# create parameters +s3_cred = { + "access_key": "localminioaccesskey", + "secret_key": "localminiosecretkey", + "url": "http://localhost:9000", +} +s3_conf = { + "input_folder": "test/gneissweb_classification/input", + "output_folder": "test/gneissweb_classification/output", +} + + +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), + "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # classification params + model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL", + model_file_name_cli_param: "model.bin", + model_url_cli_param:"facebook/fasttext-language-identification", + content_column_name_cli_param: "text", + output_label_column_name_cli_param: "ft_label", + output_score_column_name_cli_param: "ft_score", +} +sys.argv = ParamsUtils.dict_to_req(d=params) +# for arg in sys.argv: +# print(arg) + +# launch +launcher.launch() diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/transform.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/transform.py new file mode 100644 index 000000000..fbd56dd99 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/ray/transform.py @@ -0,0 +1,75 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import sys + +from data_processing.utils import ParamsUtils, get_logger +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from dpk_gneissweb_classification.transform import ClassificationTransformConfiguration + + +logger = get_logger(__name__) + + +class ClassificationRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Classification as required by the RayTransformLauncher. + Classification does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClassificationTransformConfiguration()) + + +# Class used by the notebooks to ingest binary files and create parquet files +class Classification: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + try: + worker_options = {k: self.params[k] for k in ("num_cpus", "memory")} + self.params["runtime_worker_options"] = ParamsUtils.convert_to_ast(worker_options) + del self.params["num_cpus"] + del self.params["memory"] + except: + pass + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = RayTransformLauncher(ClassificationRayTransformConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + 
+if __name__ == "__main__": + launcher = RayTransformLauncher(ClassificationRayTransformConfiguration()) + logger.info("Launching classification transform") + launcher.launch() diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform.py new file mode 100644 index 000000000..4825d16cd --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform.py @@ -0,0 +1,171 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from argparse import ArgumentParser, Namespace +from typing import Any + +import pyarrow as pa + +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils +from dpk_gneissweb_classification.classification_models import ClassificationModelFactory, ClassificationModel +from dpk_gneissweb_classification.nlp import get_label_ds_pa + + +short_name = "gcls" +cli_prefix = f"{short_name}_" +model_credential_key = "model_credential" +model_file_name_key = "model_file_name" +model_url_key = "model_url" +content_column_name_key = "content_column_name" +output_label_column_name_key = "output_label_column_name" +output_score_column_name_key = "output_score_column_name" +model_credential_cli_param = f"{cli_prefix}{model_credential_key}" +model_file_name_cli_param = f"{cli_prefix}{model_file_name_key}" +model_url_cli_param = f"{cli_prefix}{model_url_key}" +content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}" +output_label_column_name_cli_param = f"{cli_prefix}{output_label_column_name_key}" +output_score_column_name_cli_param = f"{cli_prefix}{output_score_column_name_key}" + +default_content_column_name = "contents" +default_output_label_column_name = "lang" +default_output_score_column_name = "score" + + +class ClassificationTransform(AbstractTableTransform): + """ + Implements a simple copy of a pyarrow Table. + """ + + nlp_classfication: ClassificationModel + content_column_name: str + output_label_column_name: str + output_score_column_name: str + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, ClassificationTransformRuntime. 
If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + # Make sure that the param name corresponds to the name used in apply_input_params method + # of ClassificationTransformConfiguration class + super().__init__(config) + self.nlp_classfication = self._get_nlp_classfication(config) + self.content_column_name = config.get(content_column_name_cli_param, default_content_column_name) + self.output_label_column_name = config.get(output_label_column_name_cli_param, default_output_label_column_name) + self.output_score_column_name = config.get(output_score_column_name_cli_param, default_output_score_column_name) + + @staticmethod + def _get_nlp_classfication(config) -> ClassificationModel: + nlp_classfication: ClassificationModel + + model_credential = config.get(model_credential_cli_param) + model_file_name = config.get(model_file_name_cli_param) + model_url = config.get(model_url_cli_param) + + if model_credential is None or len(model_credential) == 0: + raise ValueError("model_credential_cli_param is not specified.") + elif model_file_name is None or len(model_credential) == 0: + raise ValueError("model_file_name_cli_param is not specified.") + else: + nlp_classfication = ClassificationModelFactory.create_model(url=model_url, file_name = model_file_name, credential=model_credential) + + return nlp_classfication + + def transform(self, table: pa.Table, file_name: str | None = None) -> tuple[list[pa.Table], dict[str, Any]]: # pylint:disable=unused-argument + """ + Put Transform-specific to convert one Table to 0 or more tables. It also returns + a dictionary of execution statistics - arbitrary dictionary + This implementation makes no modifications so effectively implements a copy of the + input parquet to the output folder, without modification. 
+ """ + TransformUtils.validate_columns(table, [self.content_column_name]) + if self.output_label_column_name in table.schema.names: + raise Exception(f"column to store label ({self.output_label_column_name}) already exist") + if self.output_score_column_name in table.schema.names: + raise Exception( + f"column to store score of label ({self.output_score_column_name}) already exist" + ) + self.logger.debug(f"Transforming one table with {len(table)} rows") + table, stats = get_label_ds_pa( + table, + self.nlp_classfication, + self.content_column_name, + self.output_label_column_name, + self.output_score_column_name, + ) + self.logger.debug(f"Transformed one table with {len(table)} rows") + return [table], stats + + +class ClassificationTransformConfiguration(TransformConfiguration): + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=ClassificationTransform, + ) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the ClassificationTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{model_credential_cli_param}", + required=True, + help="Credential to access huggingface model", + ) + parser.add_argument( + f"--{model_file_name_cli_param}", + type=str, + default="", + help="filename of model", + ) + parser.add_argument(f"--{model_url_cli_param}", help="Url to model") + parser.add_argument( + f"--{content_column_name_cli_param}", + default=default_content_column_name, + help="Column name to get content", + ) + parser.add_argument( + f"--{output_label_column_name_cli_param}", + default=default_output_label_column_name, + help="Column name to store label", + ) + parser.add_argument( + f"--{output_score_column_name_cli_param}", + default=default_output_score_column_name, + help="Column name to store the score", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, True) + self.params = self.params | captured + self.logger.info(f"parameters are : {self.params}") + return True diff --git a/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform_python.py b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform_python.py new file mode 100644 index 000000000..381e1d172 --- /dev/null +++ b/transforms/language/gneissweb_classification/dpk_gneissweb_classification/transform_python.py @@ -0,0 +1,66 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import ParamsUtils, get_logger +from dpk_gneissweb_classification.transform import ClassificationTransformConfiguration + + +logger = get_logger(__name__) + + +class ClassificationPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for classification as required by the PythonTransformLauncher. + Classification does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClassificationTransformConfiguration()) + + +class Classification: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(ClassificationPythonTransformConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(ClassificationPythonTransformConfiguration()) + logger.info("Launching classification transform") + launcher.launch() diff --git a/transforms/language/gneissweb_classification/gneissweb_classification-ray.ipynb b/transforms/language/gneissweb_classification/gneissweb_classification-ray.ipynb new file mode 100644 index 000000000..a22ebae54 --- /dev/null +++ b/transforms/language/gneissweb_classification/gneissweb_classification-ray.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install 'data-prep-toolkit[ray]'\n", + "%pip install 'data-prep-toolkit-transforms[gneissweb_classification]'" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n", + "| Configuration Parameters | Default | Description |\n", + "|------------|----------|--------------|\n", + "| gcls_model_credential | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n", + "| gcls_model_file_name | _unset_ | specifies what filename of model you use to get model, like `model.bin` |\n", + "| gcls_model_url | _unset_ | specifies url that model locates. 
For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n", + "| gcls_content_column_name | `contents` | specifies name of the column containing documents |\n", + "| gcls_output_label_column_name | `label` | specifies name of the output column to hold predicted classes |\n", + "| gcls_output_score_column_name | `score` | specifies name of the output column to hold score of prediction |" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_gneissweb_classification.ray.transform import Classification" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "09:56:06 INFO - parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_file_name': 'model.bin', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_label_column_name': 'lang', 'output_score_column_name': 'score'}\n", + "09:56:06 INFO - pipeline id pipeline_id\n", + "09:56:06 INFO - code location None\n", + "09:56:06 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "09:56:06 INFO - actor creation delay 0\n", + "09:56:06 INFO - job details {'job category': 'preprocessing', 'job name': 'gcls', 'job type': 'ray', 'job id': 'job_id'}\n", + "09:56:06 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", 
+ "09:56:06 INFO - data factory data_ max_files -1, n_sample -1\n", + "09:56:06 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "09:56:06 INFO - Running locally\n", + "2025-01-27 09:56:08,919\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:09 INFO - orchestrator started at 2025-01-27 09:56:09\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:09 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:09 INFO - Cluster resources: {'cpus': 10, 'gpus': 0, 'memory': 28.60002746619284, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:09 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=97047)\u001b[0m Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:12 INFO - Completed 1 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:12 INFO - Completed 2 files in 0.006 min\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:12 INFO - Completed 2 files (66.667%) in 0.006 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:12 INFO - Completed processing 3 files in 0.008 min\n", + "\u001b[36m(orchestrate pid=97043)\u001b[0m 09:56:12 INFO - done flushing in 0.001 sec\n", + "09:56:22 INFO - Completed execution in 0.26 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "Classification(input_folder= \"test-data/input\",\n", + " output_folder= \"output\",\n", + " gcls_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n", + " gcls_model_file_name= \"model.bin\",\n", + " gcls_model_url= \"facebook/fasttext-language-identification\",\n", + " run_locally= True,\n", + " gcls_content_column_name= \"text\").transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/test_03.parquet',\n", + " 'output/test_02.parquet',\n", + " 'output/metadata.json',\n", + " 'output/test_01.parquet']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dpk-outer", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/gneissweb_classification/gneissweb_classification.ipynb b/transforms/language/gneissweb_classification/gneissweb_classification.ipynb new file mode 100644 index 000000000..17a5a2e7b --- /dev/null +++ b/transforms/language/gneissweb_classification/gneissweb_classification.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit\n", + "%pip install 'data-prep-toolkit-transforms[gneissweb_classificationo]'\n", + "%pip install pandas" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n", + "| Configuration Parameters | Default | Description |\n", + "|------------|----------|--------------|\n", + "| gcls_model_credential | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n", + "| gcls_model_file_name | _unset_ | specifies what filename of model you use to get model, like `model.bin` |\n", + "| gcls_model_url | _unset_ | specifies url that model locates. 
For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n", + "| gcls_content_column_name | `contents` | specifies name of the column containing documents |\n", + "| gcls_output_label_column_name | `label` | specifies name of the output column to hold predicted classes |\n", + "| gcls_output_score_column_name | `score` | specifies name of the output column to hold score of prediction |" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_gneissweb_classification.transform_python import Classification" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "09:52:55 INFO - parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_file_name': 'model.bin', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_label_column_name': 'lang', 'output_score_column_name': 'score'}\n", + "09:52:55 INFO - pipeline id pipeline_id\n", + "09:52:55 INFO - code location None\n", + "09:52:55 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "09:52:55 INFO - data factory data_ max_files -1, n_sample -1\n", + "09:52:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "09:52:55 INFO - 
orchestrator gcls started at 2025-01-27 09:52:55\n", + "09:52:55 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", + "09:52:57 INFO - Completed 1 files (33.33%) in 0.01 min\n", + "09:52:57 INFO - Completed 2 files (66.67%) in 0.011 min\n", + "09:52:57 INFO - Completed 3 files (100.0%) in 0.014 min\n", + "09:52:57 INFO - Done processing 3 files, waiting for flush() completion.\n", + "09:52:57 INFO - done flushing in 0.0 sec\n", + "09:52:57 INFO - Completed execution in 0.029 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "Classification(input_folder= \"test-data/input\",\n", + " output_folder= \"output\",\n", + " gcls_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n", + " gcls_model_file_name= \"model.bin\",\n", + " gcls_model_url= \"facebook/fasttext-language-identification\",\n", + " gcls_content_column_name= \"text\").transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/test_03.parquet',\n", + " 'output/test_02.parquet',\n", + " 'output/metadata.json',\n", + " 'output/test_01.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | text | \n", + "count() | \n", + "lang | \n", + "score | \n", + "
---|---|---|---|---|
0 | \n", + "- Notice of name-email change.doc | \n", + "6 | \n", + "en | \n", + "0.858 | \n", + "
1 | \n", + "- Nov13ENAOnly.doc | \n", + "2 | \n", + "de | \n", + "0.264 | \n", + "
2 | \n", + "- OHIO_C~1.XLS | \n", + "2 | \n", + "de | \n", + "0.603 | \n", + "
3 | \n", + "- Oneok(5-30)final.doc | \n", + "1 | \n", + "vi | \n", + "0.152 | \n", + "
4 | \n", + "- OpeningBrief.doc | \n", + "6 | \n", + "ko-Hang | \n", + "0.365 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
195 | \n", + "- invite.doc | \n", + "2 | \n", + "ro | \n", + "0.717 | \n", + "
196 | \n", + "- issues wrt portland and calgary signing shor... | \n", + "2 | \n", + "en | \n", + "0.997 | \n", + "
197 | \n", + "- jan3102.XLS | \n", + "2 | \n", + "de | \n", + "0.399 | \n", + "
198 | \n", + "- job market.gif | \n", + "2 | \n", + "en | \n", + "0.791 | \n", + "
199 | \n", + "- kick~1.mpe | \n", + "4 | \n", + "eo | \n", + "0.253 | \n", + "
200 rows × 4 columns
\n", + "