From 3cb3a488e2d2143dda063270aa28349ba9e2626e Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:58:40 +0100 Subject: [PATCH] factored chunker out to core, simplified test data Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/examples/advanced_chunking.ipynb | 211 ------- .../advanced_chunking_with_merging.ipynb | 571 ------------------ docs/examples/token_aware_chunking.ipynb | 194 ++++++ poetry.lock | 83 ++- pyproject.toml | 2 +- tests/data/md/wiki.md | 25 + 6 files changed, 288 insertions(+), 798 deletions(-) delete mode 100644 docs/examples/advanced_chunking.ipynb delete mode 100644 docs/examples/advanced_chunking_with_merging.ipynb create mode 100644 docs/examples/token_aware_chunking.ipynb create mode 100644 tests/data/md/wiki.md diff --git a/docs/examples/advanced_chunking.ipynb b/docs/examples/advanced_chunking.ipynb deleted file mode 100644 index 809d3e9cd..000000000 --- a/docs/examples/advanced_chunking.ipynb +++ /dev/null @@ -1,211 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Chunking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we demonstrate an advanced chunking example, showcasing how a user can:\n", - "- serialize and include some parts of the metadata (as per application logic) into the final chunk text, and\n", - "- leverage a tokenizer to build specialized chunking logic, e.g. to impose a maximum token length and further split chunks beyond that." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We first convert an example document:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from docling.document_converter import DocumentConverter\n", - "\n", - "source = \"https://arxiv.org/pdf/2408.09869\"\n", - "converter = DocumentConverter()\n", - "doc = converter.convert(source=source).document" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we define the metadata serialization logic and the specific usage of the tokenizer for applying the token limits.\n", - "\n", - "The whole process is wrapped as a `BaseChunker` implementation internally using a `HierarchicalChunker` and applying the logic on top of the results of the latter."
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from copy import deepcopy\n", - "from typing import Iterable, Iterator\n", - "\n", - "from docling_core.transforms.chunker import (\n", - " BaseChunk,\n", - " BaseChunker,\n", - " DocMeta,\n", - " HierarchicalChunker,\n", - ")\n", - "from docling_core.types.doc import DoclingDocument as DLDocument\n", - "from pydantic import ConfigDict, PositiveInt\n", - "from transformers import AutoTokenizer\n", - "\n", - "\n", - "class MaxTokenLimitingChunker(BaseChunker):\n", - " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - "\n", - " inner_chunker: BaseChunker = HierarchicalChunker()\n", - " tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-small-en-v1.5\")\n", - " max_tokens: PositiveInt = 512\n", - " delim: str = \"\\n\"\n", - "\n", - " def _serialize_meta_to_include(self, meta: DocMeta) -> str:\n", - " meta_parts = []\n", - " headings_part = self.delim.join(meta.headings or [])\n", - " if headings_part:\n", - " meta_parts.append(headings_part)\n", - " captions_part = self.delim.join(meta.captions or [])\n", - " if captions_part:\n", - " meta_parts.append(captions_part)\n", - " return self.delim.join(meta_parts)\n", - "\n", - " def _split_above_max_tokens(self, chunk_iter: Iterable[BaseChunk]):\n", - " for chunk in chunk_iter:\n", - " meta = DocMeta.model_validate(chunk.meta)\n", - " meta_text = self._serialize_meta_to_include(meta=meta)\n", - " meta_list = [meta_text] if meta_text else []\n", - " full_ser = self.delim.join(meta_list + ([chunk.text] if chunk.text else []))\n", - "\n", - " meta_tokens = self.tokenizer(\n", - " meta_text, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " delim_tokens = (\n", - " self.tokenizer(\n", - " self.delim, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " if meta_text\n", - " else []\n", - " )\n", - " num_tokens_avail_for_text = self.max_tokens - (\n", - " len(meta_tokens) + len(delim_tokens)\n", - " )\n", - "\n", - " text_tokens = self.tokenizer(\n", - " chunk.text, return_offsets_mapping=True, add_special_tokens=False\n", - " )[\"offset_mapping\"]\n", - " num_text_tokens = len(text_tokens)\n", - "\n", - " if (\n", - " num_text_tokens <= num_tokens_avail_for_text\n", - " ): # chunk already within token limit\n", - " c = deepcopy(chunk)\n", - " c.text = full_ser\n", - " yield c\n", - " else: # chunk requires further splitting to meet token limit\n", - " fitting_texts = [\n", - " chunk.text[\n", - " text_tokens[base][0] : text_tokens[\n", - " min(base + num_tokens_avail_for_text, num_text_tokens) - 1\n", - " ][1]\n", - " ]\n", - " for base in range(0, num_text_tokens, num_tokens_avail_for_text)\n", - " ]\n", - " for text in fitting_texts:\n", - " c = deepcopy(chunk)\n", - " c.text = self.delim.join(meta_list + [text])\n", - " yield c\n", - "\n", - " def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:\n", - " chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", - " yield from self._split_above_max_tokens(chunk_iter=chunk_iter)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the example invocation shown below, one can see how a single original chunk (`self_ref == \"#/texts/8\"`) is split into multiple ones:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'len=64 text=1 Introduction\\nConverting 
PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation ('" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'len=64 text=1 Introduction\\nRAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'len=26 text=1 Introduction\\n, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.'" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chunker = MaxTokenLimitingChunker(max_tokens=64)\n", - "chunk_iter = chunker.chunk(dl_doc=doc)\n", - "\n", - "for chunk in chunk_iter:\n", - " meta = DocMeta.model_validate(chunk.meta)\n", - " if meta.doc_items[0].self_ref == \"#/texts/8\":\n", - " display(\n", - " f\"len={len(chunker.tokenizer(chunk.text, return_offsets_mapping=True, add_special_tokens=False)['offset_mapping'])} text={chunk.text}\"\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/examples/advanced_chunking_with_merging.ipynb b/docs/examples/advanced_chunking_with_merging.ipynb deleted file mode 100644 index 043f02b6b..000000000 --- a/docs/examples/advanced_chunking_with_merging.ipynb +++ /dev/null @@ -1,571 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Chunking" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# %pip install -qU docling docling-core sentence-transformers transformers semchunk lancedb pydantic\n", - "\n", - "# FIXME temp install line\n", - "%pip install -qU \"docling-core @ git+https://github.com/DS4SD/docling-core.git@expand-chunking\" sentence-transformers transformers semchunk lancedb pydantic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "from dataclasses import dataclass\n", - "from pathlib import Path\n", - "from tempfile import mkdtemp\n", - "from typing import Iterator, Optional, Self, Union\n", - "\n", - "import lancedb\n", - "import semchunk\n", - "from docling_core.transforms.chunker import (\n", - " BaseChunk,\n", - " BaseChunker,\n", - " DocChunk,\n", - " DocMeta,\n", - " HierarchicalChunker,\n", - ")\n", - "from docling_core.types import 
DoclingDocument\n", - "from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator\n", - "from sentence_transformers import SentenceTransformer\n", - "from transformers import AutoTokenizer, PreTrainedTokenizerBase\n", - "\n", - "from docling.document_converter import DocumentConverter" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MAX_TOKENS = 64\n", - "DOC_SOURCE = \"http://bill.murdocks.org/iccbr2011murdock_web.pdf\"\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n", - "embed_model = SentenceTransformer(EMBED_MODEL_ID)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Chunker Definition" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "class DocChunker(BaseChunker):\n", - "\n", - " model_config: ConfigDict = ConfigDict(arbitrary_types_allowed=True)\n", - "\n", - " tokenizer: PreTrainedTokenizerBase\n", - "\n", - " inner_chunker: BaseChunker = HierarchicalChunker()\n", - " max_tokens: int = None # actual dflt value resolved in validator based on tokenizer\n", - " delim: str = \"\\n\"\n", - "\n", - " @model_validator(mode=\"after\")\n", - " def patch_max_tokens(self) -> Self:\n", - " if self.max_tokens is None:\n", - " print(f\"{self.tokenizer.model_max_length=}\")\n", - " self.max_tokens = TypeAdapter(PositiveInt).validate_python(\n", - " self.tokenizer.model_max_length\n", - " )\n", - " return self\n", - "\n", - " def _count_tokens(self, text: Optional[Union[str, list[str]]]):\n", - " if text is None:\n", - " return 0\n", - " elif isinstance(text, list):\n", - " total = 0\n", - " for t in text:\n", - " total += self._count_tokens(t)\n", - " return total\n", - " return len(self.tokenizer.tokenize(text, max_length=None))\n", - "\n", - " @dataclass\n", - " class _ChunkLengthInfo:\n", - " total_len: int\n", - " text_len: int\n", - " other_len: int\n", - "\n", - " def _doc_chunk_length(self, doc_chunk: DocChunk):\n", - " text_length = self._count_tokens(doc_chunk.text)\n", - " # Note that count_tokens handles None and lists, making this code simpler:\n", - " # TODO check if delim properly considered\n", - " headings_length = self._count_tokens(doc_chunk.meta.headings)\n", - " captions_length = self._count_tokens(doc_chunk.meta.captions)\n", - " total = text_length + headings_length + captions_length\n", - " return self._ChunkLengthInfo(\n", - " total_len=total,\n", - " text_len=text_length,\n", - " other_len=total - text_length,\n", - " )\n", - "\n", - " def _make_chunk_from_doc_items(\n", - " self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int\n", - " ):\n", - " meta = DocMeta(\n", - " doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],\n", - " headings=doc_chunk.meta.headings,\n", - " captions=doc_chunk.meta.captions,\n", - " )\n", - " new_chunk = DocChunk.from_data(text=window_text, meta=meta, delim=self.delim)\n", - " return new_chunk\n", - "\n", - " def _merge_text(self, t1, t2):\n", - " if t1 == \"\":\n", - " return t2\n", - " elif t2 == \"\":\n", - " return t1\n", - " else:\n", - " return f\"{t1}{self.delim}{t2}\"\n", - "\n", - " def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:\n", - " if doc_chunk.meta.doc_items == None or len(doc_chunk.meta.doc_items) <= 1:\n", - " return [doc_chunk]\n", - " length = self._doc_chunk_length(doc_chunk)\n", - " if 
length.total_len <= self.max_tokens:\n", - " return [doc_chunk]\n", - " else:\n", - " chunks = []\n", - " window_start = 0\n", - " window_end = 0\n", - " window_text = \"\"\n", - " window_text_length = 0\n", - " other_length = length.other_len\n", - " l = len(doc_chunk.meta.doc_items)\n", - " while window_end < l:\n", - " doc_item = doc_chunk.meta.doc_items[window_end]\n", - " text = doc_item.text\n", - " text_length = self._count_tokens(text)\n", - " if (\n", - " text_length + window_text_length + other_length < self.max_tokens\n", - " and window_end < l - 1\n", - " ):\n", - " # Still room left to add more to this chunk AND still at least one item left\n", - " window_end += 1\n", - " window_text_length += text_length\n", - " window_text = self._merge_text(window_text, text)\n", - " elif text_length + window_text_length + other_length < self.max_tokens:\n", - " # All the items in the window fit into the chunk and there are no other items left\n", - " window_text = self._merge_text(window_text, text)\n", - " new_chunk = self._make_chunk_from_doc_items(\n", - " doc_chunk, window_text, window_start, window_end\n", - " )\n", - " chunks.append(new_chunk)\n", - " window_end = l\n", - " elif window_start == window_end:\n", - " # Only one item in the window and it doesn't fit into the chunk. So we'll just make it a chunk for now and it will get split in the plain text splitter.\n", - " window_text = self._merge_text(window_text, text)\n", - " new_chunk = self._make_chunk_from_doc_items(\n", - " doc_chunk, window_text, window_start, window_end\n", - " )\n", - " chunks.append(new_chunk)\n", - " window_start = window_end + 1\n", - " window_end = window_start\n", - " window_text = \"\"\n", - " window_text_length = 0\n", - " else:\n", - " # Multiple items in the window but they don't fit into the chunk. However, the existing items must have fit or we wouldn't have gotten here.\n", - " # So we put everything but the last item into the chunk and then start a new window INCLUDING the current window end.\n", - " new_chunk = self._make_chunk_from_doc_items(\n", - " doc_chunk, window_text, window_start, window_end - 1\n", - " )\n", - " chunks.append(new_chunk)\n", - " window_start = window_end\n", - " window_text = \"\"\n", - " window_text_length = 0\n", - " return chunks\n", - "\n", - " def _split_using_plain_text(\n", - " self,\n", - " doc_chunk: DocChunk,\n", - " ) -> list[DocChunk]:\n", - " lengths = self._doc_chunk_length(doc_chunk)\n", - " if lengths.total_len <= self.max_tokens:\n", - " return [\n", - " DocChunk.from_data(\n", - " delim=self.delim,\n", - " **doc_chunk.export_json_dict(),\n", - " )\n", - " ]\n", - " else:\n", - "\n", - " # How much room is there for text after subtracting out the headers and captions:\n", - " available_length = self.max_tokens - lengths.other_len\n", - " sem_chunker = semchunk.chunkerify(\n", - " self.tokenizer, chunk_size=available_length\n", - " )\n", - " if available_length <= 0:\n", - " warnings.warn(\n", - " f\"Headers and captions for this chunk are longer than the total amount of size for the chunk. 
Chunk will be ignored.\"\n", - " )\n", - " return []\n", - " text = doc_chunk.text\n", - " segments = sem_chunker.chunk(text)\n", - " chunks = [\n", - " DocChunk.from_data(text=s, meta=doc_chunk.meta, delim=self.delim)\n", - " for s in segments\n", - " ]\n", - " return chunks\n", - "\n", - " def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):\n", - " output_chunks = []\n", - " window_start = 0\n", - " window_end = 0\n", - " l = len(chunks)\n", - " while window_end < l:\n", - " chunk = chunks[window_end]\n", - " lengths = self._doc_chunk_length(chunk)\n", - " headings_and_captions = (chunk.meta.headings, chunk.meta.captions)\n", - " if window_start == window_end:\n", - " # starting a new block of chunks to potentially merge\n", - " current_headings_and_captions = headings_and_captions\n", - " window_text = chunk.text\n", - " window_other_length = lengths.other_len\n", - " window_text_length = lengths.text_len\n", - " window_items = chunk.meta.doc_items\n", - " window_end += 1\n", - " first_chunk_of_window = chunk\n", - " elif (\n", - " headings_and_captions == current_headings_and_captions\n", - " and window_text_length + window_other_length + lengths.text_len\n", - " <= self.max_tokens\n", - " ):\n", - " # there is room to include the new chunk so add it to the window and continue\n", - " window_text = self._merge_text(window_text, chunk.text)\n", - " window_text_length += lengths.text_len\n", - " window_items = window_items + chunk.meta.doc_items\n", - " window_end += 1\n", - " else:\n", - " # no more room OR the start of new metadata. Either way, end the block and use the current window_end as the start of a new block\n", - " if window_start + 1 == window_end:\n", - " # just one chunk so use it as is\n", - " output_chunks.append(first_chunk_of_window)\n", - " else:\n", - " new_meta = DocMeta(\n", - " doc_items=window_items,\n", - " headings=headings_and_captions[0],\n", - " captions=headings_and_captions[1],\n", - " )\n", - " new_chunk = DocChunk.from_data(\n", - " text=window_text,\n", - " meta=new_meta,\n", - " delim=self.delim,\n", - " )\n", - " output_chunks.append(new_chunk)\n", - " window_start = window_end # no need to reset window_text, etc. because that will be reset in the next iteration in the if window_start == window_end block\n", - "\n", - " return output_chunks\n", - "\n", - " def _merge_chunks(self, chunks: list[DocChunk]) -> list[DocChunk]:\n", - " res = chunks\n", - " # merges as many chunks as possible that have the same headings+captions.\n", - " res = self._merge_chunks_with_matching_metadata(res)\n", - " # merges chunks with different headings+captions. 
This is later so that merges within a section or other grouping are preferred.\n", - " # res = self._merge_chunks_with_mismatching_metadata(res)\n", - " return res\n", - "\n", - " def _adjust_chunks_for_fixed_size(self, chunks: list[DocChunk]):\n", - " res = chunks\n", - " res = [x for c in res for x in self._split_by_doc_items(c)]\n", - " res = [x for c in res for x in self._split_using_plain_text(c)]\n", - " res = self._merge_chunks(res)\n", - " return res\n", - "\n", - " def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:\n", - " preliminary_chunks = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", - " output_chunks = self._adjust_chunks_for_fixed_size(preliminary_chunks)\n", - " return iter(output_chunks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "conv_res = DocumentConverter().convert(source=DOC_SOURCE)\n", - "doc = conv_res.document" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chunk.text (33 tokens):\n", - "'murdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "chunk.get_text_for_embedding() (39 tokens):\n", - "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "chunk.get_text_for_generation() (39 tokens):\n", - "'J. William Murdock\\nmurdockj@us.ibm.com IBM T.J. Watson Research Center P.O. Box 704 Yorktown Heights, NY 10598'\n", - "\n", - "chunk.text (58 tokens):\n", - "'Abstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "chunk.get_text_for_embedding() (64 tokens):\n", - "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "chunk.get_text_for_generation() (64 tokens):\n", - "'J. William Murdock\\nAbstract. The Jeopardy! television quiz show asks natural-language questions and requires natural-language answers. One useful source of information for answering Jeopardy! questions is text from written sources such as encyclopedias or news articles. A text passage may partially or fully indicate that some candidate answer is the correct'\n", - "\n", - "chunk.text (58 tokens):\n", - "'answer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "chunk.get_text_for_embedding() (64 tokens):\n", - "'J. William Murdock\\nanswer to the question. 
Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "chunk.get_text_for_generation() (64 tokens):\n", - "'J. William Murdock\\nanswer to the question. Recognizing whether it does requires determining the extent to which what the passage is saying about the candidate answer is similar to what the question is saying about the desired answer. This paper describes how structure mapping [1] (an algorithm originally developed for analogical reasoning) is applied'\n", - "\n", - "chunk.text (38 tokens):\n", - "\"to determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "chunk.get_text_for_embedding() (44 tokens):\n", - "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "chunk.get_text_for_generation() (44 tokens):\n", - "\"J. William Murdock\\nto determine similarity between content in questions and passages. That algorithm is one of many used in the Watson question answering system [2]. It contributes a significant amount to Watson's effectiveness.\"\n", - "\n", - "chunk.text (60 tokens):\n", - "'Watson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "chunk.get_text_for_embedding() (62 tokens):\n", - "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "chunk.get_text_for_generation() (62 tokens):\n", - "'1 Introduction\\nWatson is a question answering system built on a set of technologies known as DeepQA [2]. Watson has been customized and configured to compete at Jeopardy!, an American television quiz show. 
Watson takes in a question and produces a ranked list of answers with confidence scores attached to each of these answers.'\n", - "\n" - ] - } - ], - "source": [ - "chunker = DocChunker(\n", - " tokenizer=tokenizer,\n", - " max_tokens=MAX_TOKENS, # optional, derived from `tokenizer` if not provided\n", - ")\n", - "chunks = list(chunker.chunk(dl_doc=doc))\n", - "\n", - "for chunk in chunks[:5]:\n", - " txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n", - " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", - " emb_txt = chunk.get_text_for_embedding()\n", - " emb_tokens = len(tokenizer.tokenize(emb_txt, max_length=None))\n", - " print(f\"chunk.get_text_for_embedding() ({emb_tokens} tokens):\\n{repr(emb_txt)}\")\n", - " gen_txt = chunk.get_text_for_generation()\n", - " gen_tokens = len(tokenizer.tokenize(gen_txt, max_length=None))\n", - " print(f\"chunk.get_text_for_generation() ({gen_tokens} tokens):\\n{repr(gen_txt)}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vector Retrieval" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectortextheadingscaptions_distance
0[-0.025746439, 0.03888134, 0.0033668755, -0.03...3. Forbus, K. and Oblinger, D. (1990). Making ...[References]None0.332435
1[0.04400234, -0.034766007, -0.00025527124, 0.0...4. McCord, M. C. (1990). Slot Grammar: A Syste...[References]None1.525625
2[0.10043394, 0.00652478, 0.011601829, -0.06390...passage using semantic and/or syntactic edges:...[3 Syntactic-Semantic Graphs]None1.569923
3[0.025994677, 0.08402823, 0.03268827, -0.03727...In using this algorithm, we have encountered a...[4 Algorithm]None1.576838
4[0.050165094, 0.08015387, 0.035965856, 0.00846...word order) are more aggressive in what they c...[5 Evaluation and Conclusions]None1.580265
\n", - "
" - ], - "text/plain": [ - " vector \\\n", - "0 [-0.025746439, 0.03888134, 0.0033668755, -0.03... \n", - "1 [0.04400234, -0.034766007, -0.00025527124, 0.0... \n", - "2 [0.10043394, 0.00652478, 0.011601829, -0.06390... \n", - "3 [0.025994677, 0.08402823, 0.03268827, -0.03727... \n", - "4 [0.050165094, 0.08015387, 0.035965856, 0.00846... \n", - "\n", - " text \\\n", - "0 3. Forbus, K. and Oblinger, D. (1990). Making ... \n", - "1 4. McCord, M. C. (1990). Slot Grammar: A Syste... \n", - "2 passage using semantic and/or syntactic edges:... \n", - "3 In using this algorithm, we have encountered a... \n", - "4 word order) are more aggressive in what they c... \n", - "\n", - " headings captions _distance \n", - "0 [References] None 0.332435 \n", - "1 [References] None 1.525625 \n", - "2 [3 Syntactic-Semantic Graphs] None 1.569923 \n", - "3 [4 Algorithm] None 1.576838 \n", - "4 [5 Evaluation and Conclusions] None 1.580265 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def make_lancedb_index(db_uri, index_name, chunks: list[DocChunk], embedding_model):\n", - " db = lancedb.connect(db_uri)\n", - " data = []\n", - " for chunk in chunks:\n", - " embeddings = embedding_model.encode(chunk.get_text_for_embedding())\n", - " data_item = {\n", - " \"vector\": embeddings,\n", - " \"text\": chunk.text,\n", - " \"headings\": chunk.meta.headings,\n", - " \"captions\": chunk.meta.captions,\n", - " }\n", - " data.append(data_item)\n", - " tbl = db.create_table(index_name, data=data, exist_ok=True)\n", - " return tbl\n", - "\n", - "\n", - "db_uri = str(Path(mkdtemp()) / \"docling.db\") # or set as needed\n", - "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", - "\n", - "sample_query = \"Making SME greedy and pragmatic\"\n", - "sample_embedding = embed_model.encode(sample_query)\n", - "results = index.search(sample_embedding).limit(5)\n", - "\n", - "results.to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/examples/token_aware_chunking.ipynb b/docs/examples/token_aware_chunking.ipynb new file mode 100644 index 000000000..2e1f77b6d --- /dev/null +++ b/docs/examples/token_aware_chunking.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Token-aware Chunking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO fix temp install line\n", + "%pip install -qU sentence-transformers transformers semchunk lancedb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "\n", + "DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n", + "\n", + "doc = DocumentConverter().convert(source=DOC_SOURCE).document" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chunking" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how `tokenizer` and `embed_model` further below are single-sourced from `EMBED_MODEL_ID`.\n", + "\n", + "This is important for making sure the chunker and the embedding model are using the same tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from docling_core.transforms.chunker import DocChunk, TokenAwareChunker\n", + "from transformers import AutoTokenizer\n", + "\n", + "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", + "MAX_TOKENS = 64\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n", + "\n", + "chunker = TokenAwareChunker(\n", + " tokenizer=tokenizer,\n", + " max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer`\n", + " # merge_peers=True, # optional, defaults to True\n", + ")\n", + "chunk_iter = chunker.chunk(dl_doc=doc)\n", + "chunks = list(chunk_iter)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Points to notice:\n", + "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n", + "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n", + "- Where possible, we merge undersized peer chunks (see chunk 0)\n", + "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)\n", + "- The last input paragraph is not yielded in the output, which is a known issue (to be remediated)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i, chunk in enumerate(chunks[:]):\n", + " print(f\"=== {i} ===\")\n", + " txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n", + " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", + "\n", + " ser_txt = chunker.serialize(chunk=chunk)\n", + " ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n", + " print(f\"chunker.serialize() ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n", + "\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vector Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "embed_model = SentenceTransformer(EMBED_MODEL_ID)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "\n", + "import lancedb\n", + "\n", + "\n", + "def make_lancedb_index(db_uri, index_name, chunks: list[DocChunk], embedding_model):\n", + " db = lancedb.connect(db_uri)\n", + " data = []\n", + " for chunk in chunks:\n", + " embeddings = embedding_model.encode(chunker.serialize(chunk=chunk))\n", + " data_item = {\n", + " \"vector\": embeddings,\n", + " \"text\": chunk.text,\n", + " \"headings\": chunk.meta.headings,\n", + " \"captions\": chunk.meta.captions,\n", + " }\n", + " data.append(data_item)\n", + " tbl = db.create_table(index_name, data=data, exist_ok=True)\n", + " return tbl\n", + "\n", + "\n", + "db_uri = str(Path(mkdtemp()) / \"docling.db\")\n", + "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", + "\n", + "sample_query = \"invent\"\n", + "sample_embedding = embed_model.encode(sample_query)\n", + "results = index.search(sample_embedding).limit(5)\n", + 
"\n", + "results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 184658ea3..49f416686 100644 --- a/poetry.lock +++ b/poetry.lock @@ -893,21 +893,27 @@ name = "docling-core" version = "2.6.1" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"}, - {file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -jsonref = ">=1.1.0,<2.0.0" -jsonschema = ">=4.16.0,<5.0.0" -pandas = ">=2.1.4,<3.0.0" -pillow = ">=10.3.0,<11.0.0" +jsonref = "^1.1.0" +jsonschema = "^4.16.0" +pandas = "^2.1.4" +pillow = "^10.3.0" pydantic = ">=2.6.0,<2.10" pyyaml = ">=5.1,<7.0.0" -tabulate = ">=0.9.0,<0.10.0" -typing-extensions = ">=4.12.2,<5.0.0" +semchunk = "^2.2.0" +tabulate = "^0.9.0" +transformers = "^4.46.3" +typing-extensions = "^4.12.2" + +[package.source] +type = "git" +url = "git@github.com:DS4SD/docling-core.git" +reference = "expand-chunking" +resolved_reference = "816c779887b5d29adabbef206b9756afe0ae4036" [[package]] name = "docling-ibm-models" @@ -2823,6 +2829,32 @@ files = [ {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"}, ] +[[package]] +name = "mpire" +version = "2.10.2" +description = "A Python package for easy multiprocessing, but faster than multiprocessing" +optional = false +python-versions = "*" +files = [ + {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"}, + {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"}, +] + +[package.dependencies] +multiprocess = [ + {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, +] +pygments = ">=2.0" +pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} +tqdm = ">=4.27" + +[package.extras] +dashboard = ["flask"] +dill = ["multiprocess", "multiprocess (>=0.70.15)"] +docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"] +testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"] + [[package]] name = "mpmath" version = "1.3.0" @@ -3192,6 +3224,7 @@ files = [ {file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"}, {file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"}, {file = 
"nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"}, + {file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"}, ] [[package]] @@ -3765,10 +3798,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3791,10 +3824,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3975,8 +4008,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -6029,6 +6062,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = 
"sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -6109,6 +6147,21 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "semchunk" +version = "2.2.0" +description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks." +optional = false +python-versions = ">=3.9" +files = [ + {file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"}, + {file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"}, +] + +[package.dependencies] +mpire = {version = "*", extras = ["dill"]} +tqdm = "*" + [[package]] name = "semver" version = "2.13.0" @@ -7647,4 +7700,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b" +content-hash = "1429896e59f5c703299287579219b73855eb42f4aef6d72dd1410ae236b2f4bc" diff --git a/pyproject.toml b/pyproject.toml index b6b832c85..729c05bba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ packages = [{include = "docling"}] ###################### python = "^3.9" pydantic = ">=2.0.0,<2.10" -docling-core = "^2.6.1" +docling-core = { git = "git@github.com:DS4SD/docling-core.git", branch = "expand-chunking" } docling-ibm-models = "^2.0.6" deepsearch-glm = "^0.26.1" filetype = "^1.2.0" diff --git a/tests/data/md/wiki.md b/tests/data/md/wiki.md new file mode 100644 index 000000000..ab14af752 --- /dev/null +++ b/tests/data/md/wiki.md @@ -0,0 +1,25 @@ +# IBM + +International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. + +It is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average. + +IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021. + +IBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed "International Business Machines" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. 
During the 1960s and 1970s, the IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11] + +IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer (its DOS software provided by Microsoft), which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s, IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century. + +As one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing Awards.[16] + +## History + +### 1910s–1950s + +IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the Computing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington, D.C.; and Toronto, Canada.[22] + +Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. 
Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]: 105  He implemented sales conventions, "generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker".[25][26] His favorite slogan, "THINK", became a mantra for each company's employees.[25] During Watson's first four years, revenues reached $9 million ($158 million today) and the company's operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the clumsy hyphenated name "Computing-Tabulating-Recording Company" and chose to replace it with the more expansive title "International Business Machines" which had previously been used as the name of CTR's Canadian Division;[27] the name was changed on February 14, 1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM. + +### 1960s–1980s + +In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.