Enabling gneissweb_classification transform by using multiple fasttext classifiers simultaneously #1046

Status: Open. This pull request wants to merge 28 commits into the base branch dev.

Commits (28):
f2ba989
option to use multithreading.Pool for better throughput
issei-ibm Feb 10, 2025
d86c51b
add nlp_parallel.py
issei-ibm Feb 10, 2025
c13b1f7
multiple models
ran-iwamoto Feb 13, 2025
f7d4244
multiple models
ran-iwamoto Feb 13, 2025
746ea35
multiple models
ran-iwamoto Feb 13, 2025
474207c
multiple classifiers
ran-iwamoto Feb 13, 2025
b933e80
multiple classifiers
ran-iwamoto Feb 13, 2025
7647336
multiple classifiers
ran-iwamoto Feb 13, 2025
109cb2a
multiple classifiers
ran-iwamoto Feb 13, 2025
a0d0d5e
multiple classifiers
ran-iwamoto Feb 13, 2025
cdccdfc
multiple classifiers
ran-iwamoto Feb 13, 2025
50220a2
update makefile for using multiple classifiers
ran-iwamoto Feb 14, 2025
d46b112
update makefile for using multiple classifiers
ran-iwamoto Feb 14, 2025
13f5c4f
update makefile for using multiple classifiers
ran-iwamoto Feb 14, 2025
dc27df6
change dataset
ran-iwamoto Feb 17, 2025
b523412
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
be23e2a
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
1e15ce8
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
ec14211
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
b2bb30c
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
2aac220
for using gneissweb med classifier
ran-iwamoto Feb 17, 2025
8d8a0e6
Used multiple transforms in the Python notebook
shahrokhDaijavad Feb 17, 2025
6cff917
changed comment lines of the ray notebook
shahrokhDaijavad Feb 17, 2025
dcafee8
Merge branch 'IBM:dev' into issue1034
ran-iwamoto Feb 21, 2025
d0187a8
n_processes parameter
ran-iwamoto Feb 21, 2025
3d5fbc2
n_processes parameter
ran-iwamoto Feb 21, 2025
8ba3b39
n_processes parameter
ran-iwamoto Feb 21, 2025
14ff60c
add contributer
ran-iwamoto Feb 21, 2025
10 changes: 6 additions & 4 deletions transforms/language/gneissweb_classification/Makefile
@@ -21,8 +21,9 @@ run-cli-sample:
$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--gcls_model_credential "PUT YOUR OWN HUGGINGFACE CREDENTIAL" \
--gcls_model_file_name "model.bin" \
--gcls_model_url "facebook/fasttext-language-identification" \
--gcls_model_file_name "['fasttext_medical.bin']" \
--gcls_model_url "['ibm-granite/GneissWeb.Med_classifier']"\
--gcls_output_label_column_name "['label']" \
--gcls_content_column_name "text"

run-cli-ray-sample:
@@ -31,6 +32,7 @@ run-cli-ray-sample:
$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
--run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
--gcls_model_credential "PUT YOUR OWN HUGGINGFACE CREDENTIAL" \
--gcls_model_file_name "model.bin" \
--gcls_model_url "facebook/fasttext-language-identification" \
--gcls_model_file_name "['fasttext_medical.bin']" \
--gcls_model_url "['ibm-granite/GneissWeb.Med_classifier']"\
--gcls_output_label_column_name "['label']" \
--gcls_content_column_name "text"
33 changes: 22 additions & 11 deletions transforms/language/gneissweb_classification/README.md
@@ -3,8 +3,17 @@ The Gneissweb Classification transform serves as a simple exemplar to demonstrat
of a simple 1:1 transform.
Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up.

## Contributors

- Ran Iwamoto ([email protected])

## Summary
This transform will classify each text with confidence score with fasttext classification model such as [ref](https://huggingface.co/facebook/fasttext-language-identification).
This transform classifies each text, attaching a confidence score, using fastText classification models such as:
- [ibm-granite/GneissWeb.Quality_annotator](https://huggingface.co/ibm-granite/GneissWeb.Quality_annotator)
- [ibm-granite/GneissWeb.Sci_classifier](https://huggingface.co/ibm-granite/GneissWeb.Sci_classifier)
- [ibm-granite/GneissWeb.Tech_classifier](https://huggingface.co/ibm-granite/GneissWeb.Tech_classifier)
- [ibm-granite/GneissWeb.Edu_classifier](https://huggingface.co/ibm-granite/GneissWeb.Edu_classifier)
- [ibm-granite/GneissWeb.Med_classifier](https://huggingface.co/ibm-granite/GneissWeb.Med_classifier)
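
For orientation, the sketch below shows roughly what a single fastText classifier does per document, using the `fasttext` package directly. The transform wraps this behind its own model factory; the model file name here is only an example.

```
import fasttext

# Load one downloaded classifier (file name is illustrative).
model = fasttext.load_model("fasttext_medical.bin")

# fastText returns the top-k labels together with their confidence scores.
# Newlines must be stripped before calling predict().
text = "Patients received 5 mg of the drug twice daily."
labels, scores = model.predict(text.replace("\n", " "), k=1)
print(labels[0], float(scores[0]))  # e.g. __label__med 0.97 (label names depend on the model)
```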

## Configuration and command line Options

@@ -13,12 +22,13 @@ configuration for values are as follows:

| Configuration Parameters | Default | Description |
|------------|----------|--------------|
| gcls_model_credential | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |
| gcls_model_file_name | _unset_ | specifies what filename of model you use to get model, like `model.bin` |
| gcls_model_url | _unset_ | specifies url that model locates. For fasttext, this will be repo name of the model, like `facebook/fasttext-language-identification` |
| gcls_model_credential | _unset_ | specifies the credential used to download the models. This is a Hugging Face token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |
| gcls_model_file_name | _unset_ | specifies the file names of the models, e.g. [`fasttext_gneissweb_quality_annotator.bin`,`fasttext_science.bin`,`fasttext_technology_computing.bin`,`fasttext_education.bin`,`fasttext_medical.bin`] |
| gcls_model_url | _unset_ | specifies where the models are located. For fastText, this is the Hugging Face repo name of each model, e.g. [`ibm-granite/GneissWeb.Quality_annotator`,`ibm-granite/GneissWeb.Sci_classifier`,`ibm-granite/GneissWeb.Tech_classifier`,`ibm-granite/GneissWeb.Edu_classifier`,`ibm-granite/GneissWeb.Med_classifier`] |
| gcls_n_processes | 1 | number of processes. Must be a positive integer |
| gcls_content_column_name | `contents` | specifies name of the column containing documents |
| gcls_output_lablel_column_name | `label` | specifies name of the output column to hold predicted classes|
| gcls_output_score_column_name | `score` | specifies name of the output column to hold score of prediction |
| gcls_output_label_column_name | [`label_quality`,`label_sci`,`label_tech`,`label_edu`,`label_med`] | specifies the names of the output columns that hold the predicted classes |
| gcls_output_score_column_name | [`score_quality`,`score_sci`,`score_tech`,`score_edu`,`score_med`] | specifies the names of the output columns that hold the prediction scores |
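
The list-valued parameters are positional: the i-th model URL, model file name, label column and score column all refer to the same classifier. A small sketch of that pairing (model and column names are illustrative):

```
# Entries at the same index belong to the same classifier.
model_urls = ["ibm-granite/GneissWeb.Med_classifier", "ibm-granite/GneissWeb.Edu_classifier"]
model_files = ["fasttext_medical.bin", "fasttext_education.bin"]
label_columns = ["label_med", "label_edu"]
score_columns = ["score_med", "score_edu"]

for url, file_name, label_col, score_col in zip(model_urls, model_files, label_columns, score_columns):
    print(f"{url} ({file_name}) -> columns {label_col}, {score_col}")
```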

## Running

@@ -28,12 +38,13 @@ the options provided by
the [launcher](../../../data-processing-lib/doc/launcher-options.md).
The prefix gcls is a short name for Gneissweb CLaSsification.
```
--gcls_model_credential GCLS_MODEL_CREDENTIAL the credential you use to get model. This will be huggingface token.
--gcls_model_file_name GCLS_MODEL_KIND filename of model you use to get model. Currently,like `model.bin`
--gcls_model_url GCLS_MODEL_URL url that model locates. For fasttext, this will be repo name of the model, like `facebook/fasttext-language-identification`
--gcls_model_credential GCLS_MODEL_CREDENTIAL the credential used to download the models. This is a Hugging Face token.
--gcls_model_file_name GCLS_MODEL_KIND file names of the models, e.g. [`fasttext_gneissweb_quality_annotator.bin`,`fasttext_science.bin`,`fasttext_technology_computing.bin`,`fasttext_education.bin`,`fasttext_medical.bin`]
--gcls_model_url GCLS_MODEL_URL locations of the models. For fastText, this is the Hugging Face repo name of each model, e.g. [`ibm-granite/GneissWeb.Quality_annotator`,`ibm-granite/GneissWeb.Sci_classifier`,`ibm-granite/GneissWeb.Tech_classifier`,`ibm-granite/GneissWeb.Edu_classifier`,`ibm-granite/GneissWeb.Med_classifier`]
--gcls_content_column_name GCLS_CONTENT_COLUMN_NAME A name of the column containing documents
--gcls_output_lable_column_name GCLS_OUTPUT_LABEL_COLUMN_NAME Column name to store classification results
--gcls_output_score_column_name GCLS_OUTPUT_SCORE_COLUMN_NAME Column name to store the score of prediction
--gcls_output_label_column_name GCLS_OUTPUT_LABEL_COLUMN_NAME column names that store the classification results, e.g. [`label_quality`,`label_sci`,`label_tech`,`label_edu`,`label_med`]
--gcls_output_score_column_name GCLS_OUTPUT_SCORE_COLUMN_NAME column names that store the prediction scores, e.g. [`score_quality`,`score_sci`,`score_tech`,`score_edu`,`score_med`]
--gcls_n_processes NUMBER_OF_PROCESSES number of processes (a positive integer). A larger value gives better throughput at the cost of higher memory consumption
```
These correspond to the configuration keys described above.
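
The same keys can also be set programmatically. The sketch below is modeled on the launcher examples later in this diff and runs two classifiers in one pass; the Hugging Face token, file names and output column names are placeholders, the key spellings are assumed to match the flags above, and the import paths for the launcher utilities are assumed to match the rest of the repository.

```
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_gneissweb_classification.transform_python import ClassificationPythonTransformConfiguration

local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "gcls_model_credential": "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
    "gcls_model_file_name": ["fasttext_medical.bin", "fasttext_education.bin"],
    "gcls_model_url": ["ibm-granite/GneissWeb.Med_classifier", "ibm-granite/GneissWeb.Edu_classifier"],
    "gcls_output_label_column_name": ["label_med", "label_edu"],
    "gcls_output_score_column_name": ["score_med", "score_edu"],
    "gcls_content_column_name": "text",
    "gcls_n_processes": 1,
}

# Simulate the command line and launch the pure-Python runtime.
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(runtime_config=ClassificationPythonTransformConfiguration())
launcher.launch()
```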

@@ -15,22 +15,30 @@
from data_processing.data_access import DataAccessLocal
from dpk_gneissweb_classification.transform import (
ClassificationTransform,
content_column_name_key,
model_credential_key,
model_file_name_key,
model_url_key,
content_column_name_cli_param,
model_credential_cli_param,
model_file_name_cli_param,
model_url_cli_param,
n_processes_cli_param,
output_label_column_name_cli_param,
output_score_column_name_cli_param
)


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))

classification_params = {
model_credential_key: "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
model_file_name_key: "model.bin",
model_url_key:"facebook/fasttext-language-identification",
content_column_name_key: "text",
model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
model_file_name_cli_param: ["['fasttext_medical.bin']"],
model_url_cli_param:["['ibm-granite/GneissWeb.Med_classifier']"],
output_label_column_name_cli_param:["['label_med']"],
output_score_column_name_cli_param:["['score']"],
content_column_name_cli_param: "text",
n_processes_cli_param: 1,
}


if __name__ == "__main__":
# Here we show how to run outside of the runtime
# Create and configure the transform.
@@ -20,6 +20,9 @@
model_credential_cli_param,
model_file_name_cli_param,
model_url_cli_param,
n_processes_cli_param,
output_label_column_name_cli_param,
output_score_column_name_cli_param
)
from dpk_gneissweb_classification.transform_python import ClassificationPythonTransformConfiguration

@@ -41,9 +44,12 @@
"runtime_code_location": ParamsUtils.convert_to_ast(code_location),
# classification params
model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
model_file_name_cli_param: "model.bin",
model_url_cli_param: "facebook/fasttext-language-identification",
model_file_name_cli_param:["fasttext_medical.bin"],
model_url_cli_param: ["ibm-granite/GneissWeb.Med_classifier"],
output_label_column_name_cli_param:["label_med"],
output_score_column_name_cli_param:["score"],
content_column_name_cli_param: "text",
n_processes_cli_param: 1,
}
if __name__ == "__main__":
# Set the simulated command line args
@@ -52,3 +58,4 @@
launcher = PythonTransformLauncher(runtime_config=ClassificationPythonTransformConfiguration())
# Launch the ray actor(s) to process the input
launcher.launch()

@@ -0,0 +1,83 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from typing import Any
from functools import partial

import pyarrow as pa
import multiprocessing

from data_processing.utils import TransformUtils, get_logger
from dpk_gneissweb_classification.classification_models import ClassificationModel
from dpk_gneissweb_classification.classification_models import ClassificationModelFactory

logger = get_logger(__name__)

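# Module-level model handle: each Pool worker process loads its own copy of the
# classifier via init_global_model() below, so no model object has to be shared
# between processes.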
global_model: ClassificationModel | None = None

def init_global_model(url: str, file_name: str, credential: str):
    global global_model
    global_model = ClassificationModelFactory.create_model(url, file_name, credential)


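# Runs inside a worker process: classify every text in a chunk with the model
# loaded by init_global_model().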
def _process(text_list):
    return [global_model.detect_label(r) for r in text_list]


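# Split the texts into num_chunks slices of num_rows // num_chunks items each;
# the final slice additionally absorbs the remainder.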
def split_lists(text_list: list[str] | tuple[str, ...], num_chunks: int) -> list[list[str]]:
    num_rows = len(text_list)
    chunk_size = num_rows // num_chunks

    chunks: list[list[str]] = []
    i = 0
    while i < num_chunks:
        if i == num_chunks - 1:
            remainder = num_rows % num_chunks
        else:
            remainder = 0
        chunk = text_list[i * chunk_size : i * chunk_size + chunk_size + remainder]
        chunks.append(list(chunk))
        i += 1

    return chunks


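# Classify the content column of an Arrow table in parallel and append one label
# column and one score column, returning the new table plus per-label counts.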
def get_label_ds_pa_parallel(
    table: pa.Table,
    content_column_name: str,
    output_label_column_name: str,
    output_score_column_name: str,
    n_processes: int = 4,
    url: str = None,
    file_name: str = None,
    credential: str = None
) -> tuple[pa.Table, dict[str, Any]]:

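    # Extract the content column and split it into one chunk of texts per worker.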
    table_chunks = split_lists(table[content_column_name].to_pylist(), n_processes)

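    # A 'spawn' context gives each worker a clean interpreter; the initializer loads
    # the classifier once per worker before any chunks are processed.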
    with multiprocessing.get_context("spawn").Pool(n_processes, initializer=init_global_model, initargs=(url, file_name, credential)) as p:
        pool_results = p.map(_process, table_chunks)
    classification_results = []
    for result in pool_results:
        classification_results += result
    labels, scores = zip(*classification_results)
    detected_label = {"label": list(labels), "score": list(scores)}

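    # Count how many rows received each label; the counts are returned as transform metadata.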
    stats = pa.table([detected_label["label"]], names=["label"]).group_by("label").aggregate([("label", "count")])
    stats_dict = {}
    for batch in stats.to_batches():
        d = batch.to_pydict()
        for label, count in zip(d["label"], d["label_count"]):
            stats_dict[label] = count
    result = TransformUtils.add_column(table=table, name=output_label_column_name, content=detected_label["label"])
    result = TransformUtils.add_column(table=result, name=output_score_column_name, content=detected_label["score"])
    return result, stats_dict
@@ -23,6 +23,7 @@
model_url_cli_param,
output_label_column_name_cli_param,
output_score_column_name_cli_param,
n_processes_cli_param
)


@@ -49,11 +50,13 @@
"runtime_code_location": ParamsUtils.convert_to_ast(code_location),
# classification params
model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
model_file_name_cli_param: "model.bin",
model_url_cli_param:"facebook/fasttext-language-identification",
model_file_name_cli_param: ["fasttext_medical.bin"],
model_url_cli_param:["ibm-granite/GneissWeb.Med_classifier"],
content_column_name_cli_param: "text",
output_label_column_name_cli_param: "ft_label",
output_score_column_name_cli_param: "ft_score",
output_label_column_name_cli_param: ["ft_label"],
output_score_column_name_cli_param: ["ft_score"],
n_processes_cli_param: 1,

}
if __name__ == "__main__":
# Set the simulated command line args
@@ -62,3 +65,5 @@
launcher = RayTransformLauncher(ClassificationRayTransformConfiguration())
# Launch the ray actor(s) to process the input
launcher.launch()


@@ -23,6 +23,7 @@
model_url_cli_param,
output_label_column_name_cli_param,
output_score_column_name_cli_param,
n_processes_cli_param
)


@@ -59,11 +60,12 @@
"runtime_code_location": ParamsUtils.convert_to_ast(code_location),
# classification params
model_credential_cli_param: "PUT YOUR OWN HUGGINGFACE CREDENTIAL",
model_file_name_cli_param: "model.bin",
model_url_cli_param:"facebook/fasttext-language-identification",
content_column_name_cli_param: "text",
output_label_column_name_cli_param: "ft_label",
output_score_column_name_cli_param: "ft_score",
model_file_name_cli_param: ["fasttext_medical.bin"],
model_url_cli_param:["ibm-granite/GneissWeb.Med_classifier"],
content_column_name_cli_param: ["text"],
output_label_column_name_cli_param: ["ft_label"],
output_score_column_name_cli_param: ["ft_score"],
n_processes_cli_param: 1,
}
sys.argv = ParamsUtils.dict_to_req(d=params)
# for arg in sys.argv: