Skip to content

Commit

Permalink
fix: change naming
Browse files Browse the repository at this point in the history
to better reflect that the new chunker also leverages a Llama Index chunker

Signed-off-by: Juan Cappi <[email protected]>
IBM#641
  • Loading branch information
juancappi committed Oct 3, 2024
1 parent 242fad1 commit c481c5c
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,13 @@ def chunk(self, content: str) -> Iterator[dict]:
}


class FixedTokenSizeChunker(ChunkingExecutor):
class LITokenTextSplitter(ChunkingExecutor):
"""
Chunks input text into fixed-window lengths, measured in tokens, with an overlap also measured in tokens.
A text chunker that leverages Llama Index's token-based text splitter. This splitter breaks input text into
fixed-window chunks, with each chunk measured in tokens rather than characters.
The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between
chunks (also measured in tokens) can be specified to preserve context between the chunks.
Args:
output_chunk_column_name (str): Name of the output column containing the text of each chunk.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration

from doc_chunk_transform import chunking_types

# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_fixed_size"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
Expand All @@ -40,7 +40,7 @@
# doc_chunk params
# "doc_chunk_chunking_type": "li_markdown",
"doc_chunk_chunking_type": "dl_json",
# "doc_chunk_chunking_type": "fixed_size",
"doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
# li_token_text params
# "doc_chunk_output_chunk_column_name": "chunk_text",
# "doc_chunk_chunk_size_tokens": 128,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, FixedTokenSizeChunker
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter


short_name = "doc_chunk"
Expand Down Expand Up @@ -50,7 +50,7 @@
class chunking_types(str, enum.Enum):
LI_MARKDOWN = "li_markdown"
DL_JSON = "dl_json"
FIXED_SIZE = "fixed_size"
LI_TOKEN_TEXT = "li_token_text"

def __str__(self):
return str(self.value)
Expand Down Expand Up @@ -123,8 +123,8 @@ def __init__(self, config: dict[str, Any]):
self.chunker = LIMarkdown(
output_chunk_column_name=self.output_chunk_column_name,
)
elif self.chunking_type == chunking_types.FIXED_SIZE:
self.chunker = FixedTokenSizeChunker(
elif self.chunking_type == chunking_types.LI_TOKEN_TEXT:
self.chunker = LITokenTextSplitter(
output_chunk_column_name=self.output_chunk_column_name,
output_chunk_column_id=self.output_chunk_column_id,
chunk_size_tokens=self.chunk_size_tokens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ def get_test_transform_fixtures(self) -> list[tuple]:
(
launcher,
{
chunking_type_cli_param: chunking_types.FIXED_SIZE,
chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_fixed_size",
basedir + "/expected_fixed_size",
basedir + "/input_token_text",
basedir + "/expected_token_text",
)
)
return fixtures

0 comments on commit c481c5c

Please sign in to comment.