feat: #641 - first draft implementation, python only
juancappi committed Sep 29, 2024
1 parent 9c09652 commit 43d36f1
Showing 7 changed files with 129 additions and 6 deletions.
4 changes: 3 additions & 1 deletion transforms/language/doc_chunk/python/README.md
@@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `fixed_size` for chunking text into fixed-size windows of tokens, where both the window size and the overlap between consecutive windows are measured in tokens (see the sketch after this table). |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for a chunk in the dl_json chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of each chunk, in tokens, for the fixed-size chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between consecutive chunks for the fixed-size chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
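For orientation, a minimal sketch (not part of this commit) of selecting the fixed-size chunker through the transform's parameter dict; the key names mirror the commented-out lines in the local runner changed below:

```python
# Hypothetical params for the fixed-size token chunker; key names use the
# doc_chunk_ CLI prefix, matching the local-runner example in this commit.
params = {
    "doc_chunk_chunking_type": "fixed_size",          # select the token-window chunker
    "doc_chunk_chunk_size_tokens": 128,               # window length, in tokens
    "doc_chunk_chunk_overlap_tokens": 30,             # tokens shared by consecutive windows
    "doc_chunk_output_chunk_column_name": "chunk_text",
}
```

With the defaults, each window advances `128 - 30 = 98` tokens, so a 300-token document yields roughly `ceil((300 - 30) / 98) = 3` chunks.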
69 changes: 68 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
Original file line number Diff line number Diff line change
@@ -11,9 +11,10 @@
################################################################################

from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
@@ -66,3 +67,69 @@ def chunk(self, content: str) -> Iterator[dict]:
            yield {
                self.output_chunk_column_name: node.text,
            }


class FixedTokenSizeChunker(ChunkingExecutor):
    """
    Chunks input text into fixed-size token windows, with a configurable overlap between consecutive windows (both measured in tokens).

    Args:
        output_chunk_column_name (str): Name of the output column containing the text of each chunk.
        output_chunk_column_id (str): Name of the output column containing the ID of each chunk.
        chunk_size_tokens (int): Length of each chunk, in tokens.
        chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks.

    Attributes:
        output_chunk_column_name (str)
        output_chunk_column_id (str)
        chunk_size_tokens (int)
        chunk_overlap_tokens (int)
    """

    def __init__(
        self,
        output_chunk_column_name: str,
        output_chunk_column_id: str,
        chunk_size_tokens: int,
        chunk_overlap_tokens: int,
    ):
        self.output_chunk_column_name = output_chunk_column_name
        self.output_chunk_column_id = output_chunk_column_id
        self.chunk_size = chunk_size_tokens
        self.chunk_overlap = chunk_overlap_tokens

    def _chunk_text(self, text: str) -> List[str]:
        """
        Internal method to chunk text using TokenTextSplitter.

        Args:
            text (str): Input text to be chunked.

        Returns:
            List[str]: List of text chunks.
        """
        text_splitter = TokenTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_text(text)

    def chunk(self, text: str) -> Iterator[Dict]:
        """
        Chunks input text into fixed-size token windows with overlap, yielding one dictionary per chunk.

        Args:
            text (str): Input text to be chunked.

        Yields:
            Dict: Mapping with the chunk id and the chunk text.
        """
        for chunk_id, chunk in enumerate(self._chunk_text(text)):
            yield {
                self.output_chunk_column_id: chunk_id,
                self.output_chunk_column_name: chunk,
            }
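For reference, a minimal sketch (not part of the commit) of driving the new chunker directly; the input string is illustrative and the column names mirror the transform's defaults:

```python
# Hypothetical direct use of FixedTokenSizeChunker as defined above;
# column names and input text are illustrative.
from doc_chunk_chunkers import FixedTokenSizeChunker

chunker = FixedTokenSizeChunker(
    output_chunk_column_name="contents",
    output_chunk_column_id="chunk_id",
    chunk_size_tokens=128,
    chunk_overlap_tokens=30,
)
for row in chunker.chunk("some long document text ..."):
    print(row["chunk_id"], row["contents"][:40])
```

One design note: `_chunk_text` builds a fresh `TokenTextSplitter` on every call; hoisting it into `__init__` would avoid re-instantiating it for each document.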
@@ -22,6 +22,7 @@
# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_fixed_size"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
@@ -39,6 +40,11 @@
    # doc_chunk params
    # "doc_chunk_chunking_type": "li_markdown",
    "doc_chunk_chunking_type": "dl_json",
    # "doc_chunk_chunking_type": "fixed_size",
    # fixed-size params
    # "doc_chunk_output_chunk_column_name": "chunk_text",
    # "doc_chunk_chunk_size_tokens": 128,
    # "doc_chunk_chunk_overlap_tokens": 30
}
if __name__ == "__main__":
    # Set the simulated command line args
37 changes: 34 additions & 3 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, FixedTokenSizeChunker


short_name = "doc_chunk"
@@ -27,7 +27,10 @@
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
output_chunk_column_id_key = "output_chunk_column_id"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
@@ -41,11 +44,13 @@
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"

chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"

class chunking_types(str, enum.Enum):
    LI_MARKDOWN = "li_markdown"
    DL_JSON = "dl_json"
    FIXED_SIZE = "fixed_size"

    def __str__(self):
        return str(self.value)
@@ -56,11 +61,13 @@ def __str__(self):
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"

default_chunk_size_tokens = 128
default_chunk_overlap_tokens = 30

class DocChunkTransform(AbstractTableTransform):
"""
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
        self.content_column_name = config.get(content_column_name_key, default_content_column_name)
        self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
        self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
        self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
        self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

        # Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
        )
        self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

        # Parameters for fixed-size chunking with overlap
        self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
        self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)

        # Initialize chunker

        self.chunker: ChunkingExecutor
@@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
            self.chunker = LIMarkdown(
                output_chunk_column_name=self.output_chunk_column_name,
            )
        elif self.chunking_type == chunking_types.FIXED_SIZE:
            self.chunker = FixedTokenSizeChunker(
                output_chunk_column_name=self.output_chunk_column_name,
                output_chunk_column_id=self.output_chunk_column_id,
                chunk_size_tokens=self.chunk_size_tokens,
                chunk_overlap_tokens=self.chunk_overlap_tokens,
            )
        else:
            raise RuntimeError(f"{self.chunking_type=} is not valid.")

@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
            default=default_output_bbox_column_name,
            help="Column name to store the bbox of the chunk",
        )
        parser.add_argument(
            f"--{chunk_size_tokens_cli_param}",
            default=default_chunk_size_tokens,
            type=int,
            help="Size of each chunk, in tokens, for the fixed-size chunker.",
        )
        parser.add_argument(
            f"--{chunk_overlap_tokens_cli_param}",
            default=default_chunk_overlap_tokens,
            type=int,
            help="Number of tokens overlapping between chunks for the fixed-size chunker.",
        )

    def apply_input_params(self, args: Namespace) -> bool:
        """
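For orientation (not part of the commit), the new flags can be exercised through the simulated command line used by this repo's local runners; the script name below is illustrative, and the flag names follow from the `cli_prefix` definitions above:

```python
# Hypothetical simulated CLI invocation; the script name is illustrative,
# the flag names are derived from the cli_prefix constants in this file.
import sys

sys.argv = [
    "doc_chunk_transform_python.py",
    "--doc_chunk_chunking_type", "fixed_size",
    "--doc_chunk_chunk_size_tokens", "64",
    "--doc_chunk_chunk_overlap_tokens", "16",
]
```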
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,11 @@
from data_processing.test_support.launch.transform_test import (
    AbstractTransformLauncherTest,
)
from doc_chunk_transform import chunking_type_cli_param, chunking_types
from doc_chunk_transform import (
    chunking_type_cli_param,
    output_chunk_column_name_cli_param,
    chunking_types,
)
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration


@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir + "/expected_md",
)
)

# Run with fixed size token chunker
fixtures.append(
(
launcher,
{
chunking_type_cli_param: chunking_types.FIXED_SIZE,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_fixed_size",
basedir + "/expected_fixed_size",
)
)
return fixtures
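For context, a sketch of the row shape the fixed-size fixture should produce; the values are made up, `chunk_text` follows the column override above, and the propagated source-id column is an assumption based on the transform's defaults:

```python
# Hypothetical expected-output row for the fixed_size fixture; values are
# invented, and source_document_id is assumed from the transform defaults.
expected_row = {
    "chunk_id": 0,
    "chunk_text": "first 128-token window of the source document ...",
    "source_document_id": "doc-0001",
}
```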
