From 3cb3a488e2d2143dda063270aa28349ba9e2626e Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 3 Dec 2024 13:58:40 +0100
Subject: [PATCH] factored chunker out to core, simplified test data

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docs/examples/advanced_chunking.ipynb         | 211 -------
 .../advanced_chunking_with_merging.ipynb      | 571 ------------------
 docs/examples/token_aware_chunking.ipynb      | 194 ++++++
 poetry.lock                                   |  83 ++-
 pyproject.toml                                |   2 +-
 tests/data/md/wiki.md                         |  25 +
 6 files changed, 288 insertions(+), 798 deletions(-)
 delete mode 100644 docs/examples/advanced_chunking.ipynb
 delete mode 100644 docs/examples/advanced_chunking_with_merging.ipynb
 create mode 100644 docs/examples/token_aware_chunking.ipynb
 create mode 100644 tests/data/md/wiki.md

diff --git a/docs/examples/advanced_chunking.ipynb b/docs/examples/advanced_chunking.ipynb
deleted file mode 100644
index 809d3e9cd..000000000
--- a/docs/examples/advanced_chunking.ipynb
+++ /dev/null
@@ -1,211 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Advanced Chunking"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this notebook, we demonstrate an advanced chunking example, showcasing how a user can:\n",
-    "- serialize and include some parts of the metadata (as per application logic) into the final chunk text, and\n",
-    "- leverage a tokenizer to build specialized chunking logic, e.g. to impose a maximum token length and further split any chunks exceeding it."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We first convert an example document:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from docling.document_converter import DocumentConverter\n",
-    "\n",
-    "source = \"https://arxiv.org/pdf/2408.09869\"\n",
-    "converter = DocumentConverter()\n",
-    "doc = converter.convert(source=source).document"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Below we define the metadata serialization logic, as well as how the tokenizer is used to apply the token limit.\n",
-    "\n",
-    "The whole process is wrapped as a `BaseChunker` implementation that internally uses a `HierarchicalChunker` and applies the logic on top of the latter's results.",
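-    "\n",
-    "To make the token-window idea concrete before the full implementation below, here is a minimal standalone sketch (the helper name and example text are illustrative assumptions, not part of the notebook's API): a fast tokenizer's `offset_mapping` maps each token back to its character span in the input, so a window of at most `max_tokens` tokens can be cut back out of the original text at exact character boundaries.\n",
-    "\n",
-    "```python\n",
-    "from transformers import AutoTokenizer\n",
-    "\n",
-    "\n",
-    "def split_by_token_windows(text: str, tokenizer, max_tokens: int) -> list[str]:\n",
-    "    # offsets[i] is the (char_start, char_end) span of token i within `text`\n",
-    "    offsets = tokenizer(\n",
-    "        text, return_offsets_mapping=True, add_special_tokens=False\n",
-    "    )[\"offset_mapping\"]\n",
-    "    # slice the text so that each piece covers at most `max_tokens` tokens\n",
-    "    return [\n",
-    "        text[offsets[base][0] : offsets[min(base + max_tokens, len(offsets)) - 1][1]]\n",
-    "        for base in range(0, len(offsets), max_tokens)\n",
-    "    ]\n",
-    "\n",
-    "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-small-en-v1.5\")\n",
-    "split_by_token_windows(\"Tokenizers map text to tokens and back.\", tokenizer, max_tokens=4)\n",
-    "```"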
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from copy import deepcopy\n", - "from typing import Iterable, Iterator\n", - "\n", - "from docling_core.transforms.chunker import (\n", - " BaseChunk,\n", - " BaseChunker,\n", - " DocMeta,\n", - " HierarchicalChunker,\n", - ")\n", - "from docling_core.types.doc import DoclingDocument as DLDocument\n", - "from pydantic import ConfigDict, PositiveInt\n", - "from transformers import AutoTokenizer\n", - "\n", - "\n", - "class MaxTokenLimitingChunker(BaseChunker):\n", - " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - "\n", - " inner_chunker: BaseChunker = HierarchicalChunker()\n", - " tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-small-en-v1.5\")\n", - " max_tokens: PositiveInt = 512\n", - " delim: str = \"\\n\"\n", - "\n", - " def _serialize_meta_to_include(self, meta: DocMeta) -> str:\n", - " meta_parts = []\n", - " headings_part = self.delim.join(meta.headings or [])\n", - " if headings_part:\n", - " meta_parts.append(headings_part)\n", - " captions_part = self.delim.join(meta.captions or [])\n", - " if captions_part:\n", - " meta_parts.append(captions_part)\n", - " return self.delim.join(meta_parts)\n", - "\n", - " def _split_above_max_tokens(self, chunk_iter: Iterable[BaseChunk]):\n", - " for chunk in chunk_iter:\n", - " meta = DocMeta.model_validate(chunk.meta)\n", - " meta_text = self._serialize_meta_to_include(meta=meta)\n", - " meta_list = [meta_text] if meta_text else []\n", - " full_ser = self.delim.join(meta_list + ([chunk.text] if chunk.text else []))\n", - "\n", - " meta_tokens = self.tokenizer(\n", - " meta_text, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " delim_tokens = (\n", - " self.tokenizer(\n", - " self.delim, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " if meta_text\n", - " else []\n", - " )\n", - " num_tokens_avail_for_text = self.max_tokens - (\n", - " len(meta_tokens) + len(delim_tokens)\n", - " )\n", - "\n", - " text_tokens = self.tokenizer(\n", - " chunk.text, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " num_text_tokens = len(text_tokens)\n", - "\n", - " if (\n", - " num_text_tokens <= num_tokens_avail_for_text\n", - " ): # chunk already within token limit\n", - " c = deepcopy(chunk)\n", - " c.text = full_ser\n", - " yield c\n", - " else: # chunk requires further splitting to meet token limit\n", - " fitting_texts = [\n", - " chunk.text[\n", - " text_tokens[base][0] : text_tokens[\n", - " min(base + num_tokens_avail_for_text, num_text_tokens) - 1\n", - " ][1]\n", - " ]\n", - " for base in range(0, num_text_tokens, num_tokens_avail_for_text)\n", - " ]\n", - " for text in fitting_texts:\n", - " c = deepcopy(chunk)\n", - " c.text = self.delim.join(meta_list + [text])\n", - " yield c\n", - "\n", - " def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:\n", - " chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", - " yield from self._split_above_max_tokens(chunk_iter=chunk_iter)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the example invocation shown below, one can see how a single original chunk (`self_ref == \"#/texts/8\"`) is split into multiple ones:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'len=64 text=1 Introduction\\nConverting 
PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation ('" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'len=64 text=1 Introduction\\nRAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'len=26 text=1 Introduction\\n, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.'" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chunker = MaxTokenLimitingChunker(max_tokens=64)\n", - "chunk_iter = chunker.chunk(dl_doc=doc)\n", - "\n", - "for chunk in chunk_iter:\n", - " meta = DocMeta.model_validate(chunk.meta)\n", - " if meta.doc_items[0].self_ref == \"#/texts/8\":\n", - " display(\n", - " f\"len={len(chunker.tokenizer(chunk.text, return_offsets_mapping=True, add_special_tokens=False)['offset_mapping'])} text={chunk.text}\"\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb deleted file mode 100644 index 043f02b6b..000000000 --- a/docs/examples/advanced_chunking_with_merging.ipynb +++ /dev/null @@ -1,571 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Chunking" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# %pip install -qU docling docling-core sentence-transformers transformers semchunk lancedb pydantic\n", - "\n", - "# FIXME temp install line\n", - "%pip install -qU \"docling-core @ git+https://github.com/DS4SD/docling-core.git@expand-chunking\" sentence-transformers transformers semchunk lancedb pydantic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "from dataclasses import dataclass\n", - "from pathlib import Path\n", - "from tempfile import mkdtemp\n", - "from typing import Iterator, Optional, Self, Union\n", - "\n", - "import lancedb\n", - "import semchunk\n", - "from docling_core.transforms.chunker import (\n", - " BaseChunk,\n", - " BaseChunker,\n", - " DocChunk,\n", - " DocMeta,\n", - " HierarchicalChunker,\n", - ")\n", - "from docling_core.types import 
DoclingDocument\n",
-    "from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator\n",
-    "from sentence_transformers import SentenceTransformer\n",
-    "from transformers import AutoTokenizer, PreTrainedTokenizerBase\n",
-    "\n",
-    "from docling.document_converter import DocumentConverter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
-    "MAX_TOKENS = 64\n",
-    "DOC_SOURCE = \"http://bill.murdocks.org/iccbr2011murdock_web.pdf\"\n",
-    "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
-    "embed_model = SentenceTransformer(EMBED_MODEL_ID)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Chunker Definition"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class DocChunker(BaseChunker):\n",
-    "\n",
-    "    model_config: ConfigDict = ConfigDict(arbitrary_types_allowed=True)\n",
-    "\n",
-    "    tokenizer: PreTrainedTokenizerBase\n",
-    "\n",
-    "    inner_chunker: BaseChunker = HierarchicalChunker()\n",
-    "    max_tokens: Optional[PositiveInt] = None  # actual default value resolved in validator based on tokenizer\n",
-    "    delim: str = \"\\n\"\n",
-    "\n",
-    "    @model_validator(mode=\"after\")\n",
-    "    def patch_max_tokens(self) -> Self:\n",
-    "        if self.max_tokens is None:\n",
-    "            print(f\"{self.tokenizer.model_max_length=}\")\n",
-    "            self.max_tokens = TypeAdapter(PositiveInt).validate_python(\n",
-    "                self.tokenizer.model_max_length\n",
-    "            )\n",
-    "        return self\n",
-    "\n",
-    "    def _count_tokens(self, text: Optional[Union[str, list[str]]]):\n",
-    "        if text is None:\n",
-    "            return 0\n",
-    "        elif isinstance(text, list):\n",
-    "            total = 0\n",
-    "            for t in text:\n",
-    "                total += self._count_tokens(t)\n",
-    "            return total\n",
-    "        return len(self.tokenizer.tokenize(text, max_length=None))\n",
-    "\n",
-    "    @dataclass\n",
-    "    class _ChunkLengthInfo:\n",
-    "        total_len: int\n",
-    "        text_len: int\n",
-    "        other_len: int\n",
-    "\n",
-    "    def _doc_chunk_length(self, doc_chunk: DocChunk):\n",
-    "        text_length = self._count_tokens(doc_chunk.text)\n",
-    "        # Note that count_tokens handles None and lists, making this code simpler:\n",
-    "        # TODO check if delim properly considered\n",
-    "        headings_length = self._count_tokens(doc_chunk.meta.headings)\n",
-    "        captions_length = self._count_tokens(doc_chunk.meta.captions)\n",
-    "        total = text_length + headings_length + captions_length\n",
-    "        return self._ChunkLengthInfo(\n",
-    "            total_len=total,\n",
-    "            text_len=text_length,\n",
-    "            other_len=total - text_length,\n",
-    "        )\n",
-    "\n",
-    "    def _make_chunk_from_doc_items(\n",
-    "        self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int\n",
-    "    ):\n",
-    "        meta = DocMeta(\n",
-    "            doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],\n",
-    "            headings=doc_chunk.meta.headings,\n",
-    "            captions=doc_chunk.meta.captions,\n",
-    "        )\n",
-    "        new_chunk = DocChunk.from_data(text=window_text, meta=meta, delim=self.delim)\n",
-    "        return new_chunk\n",
-    "\n",
-    "    def _merge_text(self, t1, t2):\n",
-    "        if t1 == \"\":\n",
-    "            return t2\n",
-    "        elif t2 == \"\":\n",
-    "            return t1\n",
-    "        else:\n",
-    "            return f\"{t1}{self.delim}{t2}\"\n",
-    "\n",
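-    "    # Walk-through of the doc-item windowing below (hypothetical numbers, added\n",
-    "    # for illustration): with max_tokens=64, headings/captions taking 10 tokens,\n",
-    "    # and doc items of 20, 25 and 30 tokens, the window grows to the first two\n",
-    "    # items (10+20+25 < 64), the third no longer fits (10+45+30 >= 64), so items\n",
-    "    # 1-2 are emitted as one chunk and a new window starts at item 3.\n",
-    "\n",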
-    "    def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:\n",
-    "        if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:\n",
-    "            return [doc_chunk]\n",
-    "        length = self._doc_chunk_length(doc_chunk)\n",
-    "        if length.total_len <= self.max_tokens:\n",
-    "            return [doc_chunk]\n",
-    "        else:\n",
-    "            chunks = []\n",
-    "            window_start = 0\n",
-    "            window_end = 0\n",
-    "            window_text = \"\"\n",
-    "            window_text_length = 0\n",
-    "            other_length = length.other_len\n",
-    "            l = len(doc_chunk.meta.doc_items)\n",
-    "            while window_end < l:\n",
-    "                doc_item = doc_chunk.meta.doc_items[window_end]\n",
-    "                text = doc_item.text\n",
-    "                text_length = self._count_tokens(text)\n",
-    "                if (\n",
-    "                    text_length + window_text_length + other_length < self.max_tokens\n",
-    "                    and window_end < l - 1\n",
-    "                ):\n",
-    "                    # Still room left to add more to this chunk AND still at least one item left\n",
-    "                    window_end += 1\n",
-    "                    window_text_length += text_length\n",
-    "                    window_text = self._merge_text(window_text, text)\n",
-    "                elif text_length + window_text_length + other_length < self.max_tokens:\n",
-    "                    # All the items in the window fit into the chunk and there are no other items left\n",
-    "                    window_text = self._merge_text(window_text, text)\n",
-    "                    new_chunk = self._make_chunk_from_doc_items(\n",
-    "                        doc_chunk, window_text, window_start, window_end\n",
-    "                    )\n",
-    "                    chunks.append(new_chunk)\n",
-    "                    window_end = l\n",
-    "                elif window_start == window_end:\n",
-    "                    # Only one item in the window and it doesn't fit into the chunk. So we'll just make it a chunk for now and it will get split in the plain text splitter.\n",
-    "                    window_text = self._merge_text(window_text, text)\n",
-    "                    new_chunk = self._make_chunk_from_doc_items(\n",
-    "                        doc_chunk, window_text, window_start, window_end\n",
-    "                    )\n",
-    "                    chunks.append(new_chunk)\n",
-    "                    window_start = window_end + 1\n",
-    "                    window_end = window_start\n",
-    "                    window_text = \"\"\n",
-    "                    window_text_length = 0\n",
-    "                else:\n",
-    "                    # Multiple items in the window but they don't fit into the chunk. However, the existing items must have fit or we wouldn't have gotten here.\n",
-    "                    # So we put everything but the last item into the chunk and then start a new window INCLUDING the current window end.\n",
-    "                    new_chunk = self._make_chunk_from_doc_items(\n",
-    "                        doc_chunk, window_text, window_start, window_end - 1\n",
-    "                    )\n",
-    "                    chunks.append(new_chunk)\n",
-    "                    window_start = window_end\n",
-    "                    window_text = \"\"\n",
-    "                    window_text_length = 0\n",
-    "            return chunks\n",
-    "\n",
-    "    def _split_using_plain_text(\n",
-    "        self,\n",
-    "        doc_chunk: DocChunk,\n",
-    "    ) -> list[DocChunk]:\n",
-    "        lengths = self._doc_chunk_length(doc_chunk)\n",
-    "        if lengths.total_len <= self.max_tokens:\n",
-    "            return [\n",
-    "                DocChunk.from_data(\n",
-    "                    delim=self.delim,\n",
-    "                    **doc_chunk.export_json_dict(),\n",
-    "                )\n",
-    "            ]\n",
-    "        else:\n",
-    "            # How much room is there for text after subtracting out the headers and captions:\n",
-    "            available_length = self.max_tokens - lengths.other_len\n",
-    "            if available_length <= 0:\n",
-    "                warnings.warn(\n",
-    "                    \"Headers and captions for this chunk are already longer than the maximum number of tokens allowed for the chunk. Chunk will be ignored.\"\n",
-    "                )\n",
-    "                return []\n",
-    "            sem_chunker = semchunk.chunkerify(\n",
-    "                self.tokenizer, chunk_size=available_length\n",
-    "            )\n",
-    "            text = doc_chunk.text\n",
-    "            segments = sem_chunker.chunk(text)\n",
-    "            chunks = [\n",
-    "                DocChunk.from_data(text=s, meta=doc_chunk.meta, delim=self.delim)\n",
-    "                for s in segments\n",
-    "            ]\n",
-    "            return chunks\n",
-    "\n",
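-    "    # Sketch of the merge pass below (hypothetical numbers, for illustration):\n",
-    "    # chunks under heading \"Intro\" with text lengths 20 and 25 tokens (plus 10\n",
-    "    # tokens of headings) merge into one 55-token chunk, while a following\n",
-    "    # chunk under heading \"Methods\" is never merged into it, since only chunks\n",
-    "    # with identical headings+captions are candidates for merging.\n",
-    "\n",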
-    "    def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):\n",
-    "        output_chunks = []\n",
-    "        window_start = 0\n",
-    "        window_end = 0\n",
-    "        l = len(chunks)\n",
-    "        while window_end < l:\n",
-    "            chunk = chunks[window_end]\n",
-    "            lengths = self._doc_chunk_length(chunk)\n",
-    "            headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n",
-    "            if window_start == window_end:\n",
-    "                # starting a new block of chunks to potentially merge\n",
-    "                current_headings_and_captions = headings_and_captions\n",
-    "                window_text = chunk.text\n",
-    "                window_other_length = lengths.other_len\n",
-    "                window_text_length = lengths.text_len\n",
-    "                window_items = chunk.meta.doc_items\n",
-    "                window_end += 1\n",
-    "                first_chunk_of_window = chunk\n",
-    "            elif (\n",
-    "                headings_and_captions == current_headings_and_captions\n",
-    "                and window_text_length + window_other_length + lengths.text_len\n",
-    "                <= self.max_tokens\n",
-    "            ):\n",
-    "                # there is room to include the new chunk so add it to the window and continue\n",
-    "                window_text = self._merge_text(window_text, chunk.text)\n",
-    "                window_text_length += lengths.text_len\n",
-    "                window_items = window_items + chunk.meta.doc_items\n",
-    "                window_end += 1\n",
-    "            else:\n",
-    "                # no more room OR the start of new metadata. Either way, end the block and use the current window_end as the start of a new block\n",
-    "                if window_start + 1 == window_end:\n",
-    "                    # just one chunk so use it as is\n",
-    "                    output_chunks.append(first_chunk_of_window)\n",
-    "                else:\n",
-    "                    # note: use the window's own headings+captions, not those of the chunk that triggered the flush\n",
-    "                    new_meta = DocMeta(\n",
-    "                        doc_items=window_items,\n",
-    "                        headings=current_headings_and_captions[0],\n",
-    "                        captions=current_headings_and_captions[1],\n",
-    "                    )\n",
-    "                    new_chunk = DocChunk.from_data(\n",
-    "                        text=window_text,\n",
-    "                        meta=new_meta,\n",
-    "                        delim=self.delim,\n",
-    "                    )\n",
-    "                    output_chunks.append(new_chunk)\n",
-    "                window_start = window_end  # no need to reset window_text, etc. because that will be reset in the next iteration in the if window_start == window_end block\n",
-    "        if window_start < window_end:\n",
-    "            # flush the final window, which the loop above never emits\n",
-    "            if window_start + 1 == window_end:\n",
-    "                output_chunks.append(first_chunk_of_window)\n",
-    "            else:\n",
-    "                new_meta = DocMeta(\n",
-    "                    doc_items=window_items,\n",
-    "                    headings=current_headings_and_captions[0],\n",
-    "                    captions=current_headings_and_captions[1],\n",
-    "                )\n",
-    "                new_chunk = DocChunk.from_data(\n",
-    "                    text=window_text,\n",
-    "                    meta=new_meta,\n",
-    "                    delim=self.delim,\n",
-    "                )\n",
-    "                output_chunks.append(new_chunk)\n",
-    "        return output_chunks\n",
-    "\n",
-    "    def _merge_chunks(self, chunks: list[DocChunk]) -> list[DocChunk]:\n",
-    "        res = chunks\n",
-    "        # merges as many chunks as possible that have the same headings+captions.\n",
-    "        res = self._merge_chunks_with_matching_metadata(res)\n",
-    "        # merges chunks with different headings+captions. 
This is later so that merges within a section or other grouping are preferred.\n", - " # res = self._merge_chunks_with_mismatching_metadata(res)\n", - " return res\n", - "\n", - " def _adjust_chunks_for_fixed_size(self, chunks: list[DocChunk]):\n", - " res = chunks\n", - " res = [x for c in res for x in self._split_by_doc_items(c)]\n", - " res = [x for c in res for x in self._split_using_plain_text(c)]\n", - " res = self._merge_chunks(res)\n", - " return res\n", - "\n", - " def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n", - " preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", - " output_chunks = self._adjust_chunks_for_fixed_size(preliminary_chunks)\n", - " return iter(output_chunks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "conv_res = DocumentConverter().convert(source=DOC_SOURCE)\n", - "doc = conv_res.document" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chunk.text (33 tokens):\n", - "'murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "chunk.get_text_for_embedding() (39 tokens):\n", - "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "chunk.get_text_for_generation() (39 tokens):\n", - "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "\n", - "chunk.text (58 tokens):\n", - "'Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "chunk.get_text_for_embedding() (64 tokens):\n", - "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "chunk.get_text_for_generation() (64 tokens):\n", - "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "\n", - "chunk.text (58 tokens):\n", - "'answer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "chunk.get_text_for_embedding() (64 tokens):\n", - "'J. William Murdock\\nanswer to the question. 
Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "chunk.get_text_for_generation() (64 tokens):\n", - "'J. William Murdock\\nanswer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "\n", - "chunk.text (38 tokens):\n", - "\"to determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "chunk.get_text_for_embedding() (44 tokens):\n", - "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "chunk.get_text_for_generation() (44 tokens):\n", - "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "\n", - "chunk.text (60 tokens):\n", - "'Watson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "chunk.get_text_for_embedding() (62 tokens):\n", - "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "chunk.get_text_for_generation() (62 tokens):\n", - "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. 
Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "\n" - ] - } - ], - "source": [ - "chunker = DocChunker(\n", - " tokenizer=tokenizer,\n", - " max_tokens=MAX_TOKENS, # optional, derived from `tokenizer` if not provided\n", - ")\n", - "chunks = list(chunker.chunk(dl_doc=doc))\n", - "\n", - "for chunk in chunks[:5]:\n", - " txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n", - " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", - " emb_txt = chunk.get_text_for_embedding()\n", - " emb_tokens = len(tokenizer.tokenize(emb_txt, max_length=None))\n", - " print(f\"chunk.get_text_for_embedding() ({emb_tokens} tokens):\\n{repr(emb_txt)}\")\n", - " gen_txt = chunk.get_text_for_generation()\n", - " gen_tokens = len(tokenizer.tokenize(gen_txt, max_length=None))\n", - " print(f\"chunk.get_text_for_generation() ({gen_tokens} tokens):\\n{repr(gen_txt)}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vector Retrieval" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | vector | \n", - "text | \n", - "headings | \n", - "captions | \n", - "_distance | \n", - "
---|---|---|---|---|---|
0 | \n", - "[-0.025746439, 0.03888134, 0.0033668755, -0.03... | \n", - "3. Forbus, K. and Oblinger, D. (1990). Making ... | \n", - "[References] | \n", - "None | \n", - "0.332435 | \n", - "
1 | \n", - "[0.04400234, -0.034766007, -0.00025527124, 0.0... | \n", - "4. McCord, M. C. (1990). Slot Grammar: A Syste... | \n", - "[References] | \n", - "None | \n", - "1.525625 | \n", - "
2 | \n", - "[0.10043394, 0.00652478, 0.011601829, -0.06390... | \n", - "passage using semantic and/or syntactic edges:... | \n", - "[3 Syntactic-Semantic Graphs] | \n", - "None | \n", - "1.569923 | \n", - "
3 | \n", - "[0.025994677, 0.08402823, 0.03268827, -0.03727... | \n", - "In using this algorithm, we have encountered a... | \n", - "[4 Algorithm] | \n", - "None | \n", - "1.576838 | \n", - "
4 | \n", - "[0.050165094, 0.08015387, 0.035965856, 0.00846... | \n", - "word order) are more aggressive in what they c... | \n", - "[5 Evaluation and Conclusions] | \n", - "None | \n", - "1.580265 | \n", - "