feat: #641 - first draft implementation, python only
juancappi committed Sep 29, 2024
1 parent 9c09652 commit 43d36f1
Showing 7 changed files with 129 additions and 6 deletions.
4 changes: 3 additions & 1 deletion transforms/language/doc_chunk/python/README.md
@@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `fixed_size` for chunking text into fixed-size windows of tokens, where both the window size and the overlap between consecutive windows are measured in tokens (see the sketch after this table). |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for a chunk in the dl_json chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of each chunk, in tokens, for the fixed-size chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between consecutive chunks for the fixed-size chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
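For orientation, a minimal sketch (not part of this commit) of selecting the fixed-size chunker through the transform's parameter dict; the key names mirror the commented-out lines in the local runner changed below:

```python
# Hypothetical params for the fixed-size token chunker; key names use the
# doc_chunk_ CLI prefix, matching the local-runner example in this commit.
params = {
    "doc_chunk_chunking_type": "fixed_size",          # select the token-window chunker
    "doc_chunk_chunk_size_tokens": 128,               # window length, in tokens
    "doc_chunk_chunk_overlap_tokens": 30,             # tokens shared by consecutive windows
    "doc_chunk_output_chunk_column_name": "chunk_text",
}
```

With the defaults, each window advances `128 - 30 = 98` tokens, so a 300-token document yields roughly `ceil((300 - 30) / 98) = 3` chunks.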
69 changes: 68 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
Original file line number Diff line number Diff line change
@@ -11,9 +11,10 @@
################################################################################

from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
@@ -66,3 +67,69 @@ def chunk(self, content: str) -> Iterator[dict]:
            yield {
                self.output_chunk_column_name: node.text,
            }


class FixedTokenSizeChunker(ChunkingExecutor):
    """
    Chunks input text into fixed-size token windows, with a configurable overlap between consecutive windows (both measured in tokens).

    Args:
        output_chunk_column_name (str): Name of the output column containing the text of each chunk.
        output_chunk_column_id (str): Name of the output column containing the ID of each chunk.
        chunk_size_tokens (int): Length of each chunk, in tokens.
        chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks.

    Attributes:
        output_chunk_column_name (str)
        output_chunk_column_id (str)
        chunk_size_tokens (int)
        chunk_overlap_tokens (int)
    """

    def __init__(
        self,
        output_chunk_column_name: str,
        output_chunk_column_id: str,
        chunk_size_tokens: int,
        chunk_overlap_tokens: int,
    ):
        self.output_chunk_column_name = output_chunk_column_name
        self.output_chunk_column_id = output_chunk_column_id
        self.chunk_size = chunk_size_tokens
        self.chunk_overlap = chunk_overlap_tokens

    def _chunk_text(self, text: str) -> List[str]:
        """
        Internal method to chunk text using TokenTextSplitter.

        Args:
            text (str): Input text to be chunked.

        Returns:
            List[str]: List of text chunks.
        """
        text_splitter = TokenTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_text(text)

    def chunk(self, text: str) -> Iterator[Dict]:
        """
        Chunks input text into fixed-size token windows with overlap, yielding one dictionary per chunk.

        Args:
            text (str): Input text to be chunked.

        Yields:
            Dict: Mapping with the chunk id and the chunk text.
        """
        for chunk_id, chunk in enumerate(self._chunk_text(text)):
            yield {
                self.output_chunk_column_id: chunk_id,
                self.output_chunk_column_name: chunk,
            }
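For reference, a minimal sketch (not part of the commit) of driving the new chunker directly; the input string is illustrative and the column names mirror the transform's defaults:

```python
# Hypothetical direct use of FixedTokenSizeChunker as defined above;
# column names and input text are illustrative.
from doc_chunk_chunkers import FixedTokenSizeChunker

chunker = FixedTokenSizeChunker(
    output_chunk_column_name="contents",
    output_chunk_column_id="chunk_id",
    chunk_size_tokens=128,
    chunk_overlap_tokens=30,
)
for row in chunker.chunk("some long document text ..."):
    print(row["chunk_id"], row["contents"][:40])
```

One design note: `_chunk_text` builds a fresh `TokenTextSplitter` on every call; hoisting it into `__init__` would avoid re-instantiating it for each document.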
@@ -22,6 +22,7 @@
# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_fixed_size"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
@@ -39,6 +40,11 @@
    # doc_chunk params
    # "doc_chunk_chunking_type": "li_markdown",
    "doc_chunk_chunking_type": "dl_json",
    # "doc_chunk_chunking_type": "fixed_size",
    # fixed-size params
    # "doc_chunk_output_chunk_column_name": "chunk_text",
    # "doc_chunk_chunk_size_tokens": 128,
    # "doc_chunk_chunk_overlap_tokens": 30
}
if __name__ == "__main__":
    # Set the simulated command line args
37 changes: 34 additions & 3 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, FixedTokenSizeChunker


short_name = "doc_chunk"
@@ -27,7 +27,10 @@
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
output_chunk_column_id_key = "output_chunk_column_id"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
@@ -41,11 +44,13 @@
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"

chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"

class chunking_types(str, enum.Enum):
    LI_MARKDOWN = "li_markdown"
    DL_JSON = "dl_json"
    FIXED_SIZE = "fixed_size"

    def __str__(self):
        return str(self.value)
@@ -56,11 +61,13 @@ def __str__(self):
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"

default_chunk_size_tokens = 128
default_chunk_overlap_tokens = 30

class DocChunkTransform(AbstractTableTransform):
"""
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
        self.content_column_name = config.get(content_column_name_key, default_content_column_name)
        self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
        self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
        self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
        self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

        # Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
        )
        self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

        # Parameters for fixed-size chunking with overlap
        self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
        self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)

        # Initialize chunker

        self.chunker: ChunkingExecutor
@@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
            self.chunker = LIMarkdown(
                output_chunk_column_name=self.output_chunk_column_name,
            )
        elif self.chunking_type == chunking_types.FIXED_SIZE:
            self.chunker = FixedTokenSizeChunker(
                output_chunk_column_name=self.output_chunk_column_name,
                output_chunk_column_id=self.output_chunk_column_id,
                chunk_size_tokens=self.chunk_size_tokens,
                chunk_overlap_tokens=self.chunk_overlap_tokens,
            )
        else:
            raise RuntimeError(f"{self.chunking_type=} is not valid.")

@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
            default=default_output_bbox_column_name,
            help="Column name to store the bbox of the chunk",
        )
        parser.add_argument(
            f"--{chunk_size_tokens_cli_param}",
            default=default_chunk_size_tokens,
            type=int,
            help="Size of each chunk, in tokens, for the fixed-size chunker.",
        )
        parser.add_argument(
            f"--{chunk_overlap_tokens_cli_param}",
            default=default_chunk_overlap_tokens,
            type=int,
            help="Number of tokens overlapping between chunks for the fixed-size chunker.",
        )

    def apply_input_params(self, args: Namespace) -> bool:
        """
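For orientation (not part of the commit), the new flags can be exercised through the simulated command line used by this repo's local runners; the script name below is illustrative, and the flag names follow from the `cli_prefix` definitions above:

```python
# Hypothetical simulated CLI invocation; the script name is illustrative,
# the flag names are derived from the cli_prefix constants in this file.
import sys

sys.argv = [
    "doc_chunk_transform_python.py",
    "--doc_chunk_chunking_type", "fixed_size",
    "--doc_chunk_chunk_size_tokens", "64",
    "--doc_chunk_chunk_overlap_tokens", "16",
]
```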
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,11 @@
from data_processing.test_support.launch.transform_test import (
    AbstractTransformLauncherTest,
)
from doc_chunk_transform import chunking_type_cli_param, chunking_types
from doc_chunk_transform import (
    chunking_type_cli_param,
    output_chunk_column_name_cli_param,
    chunking_types,
)
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration


@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir + "/expected_md",
)
)

# Run with fixed size token chunker
fixtures.append(
(
launcher,
{
chunking_type_cli_param: chunking_types.FIXED_SIZE,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_fixed_size",
basedir + "/expected_fixed_size",
)
)
return fixtures
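For context, a sketch of the row shape the fixed-size fixture should produce; the values are made up, `chunk_text` follows the column override above, and the propagated source-id column is an assumption based on the transform's defaults:

```python
# Hypothetical expected-output row for the fixed_size fixture; values are
# invented, and source_document_id is assumed from the transform defaults.
expected_row = {
    "chunk_id": 0,
    "chunk_text": "first 128-token window of the source document ...",
    "source_document_id": "doc-0001",
}
```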
