huggingface · tic-top · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml
@@ -65,8 +65,8 @@ jobs:
         fail-fast: false
         matrix:
           folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }}
-          machine_type: [single-gpu, multi-gpu]
-      runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci]
+          machine_type: [single-gpu]
+      runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, a10, ci]
       container:
         image: huggingface/transformers-all-latest-gpu
         options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

diff --git a/docs/source/en/index.md b/docs/source/en/index.md
@@ -171,6 +171,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                        [JetMoe](model_doc/jetmoe)                        |       ✅        |         ❌         |      ❌      |
 |                       [Jukebox](model_doc/jukebox)                       |       ✅        |         ❌         |      ❌      |
 |                      [KOSMOS-2](model_doc/kosmos-2)                      |       ✅        |         ❌         |      ❌      |
+|                    [KOSMOS-2.5](model_doc/kosmos-2.5)                    |       ✅        |         ❌         |      ❌      |
 |                      [LayoutLM](model_doc/layoutlm)                      |       ✅        |         ✅         |      ❌      |
 |                    [LayoutLMv2](model_doc/layoutlmv2)                    |       ✅        |         ❌         |      ❌      |
 |                    [LayoutLMv3](model_doc/layoutlmv3)                    |       ✅        |         ✅         |      ❌      |

diff --git a/docs/source/en/model_doc/kosmos-2.5.md b/docs/source/en/model_doc/kosmos-2.5.md
@@ -0,0 +1,126 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# KOSMOS-2.5
+
+## Overview
+
+Kosmos-2.5 was proposed in [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419). It is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, it handles two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image (the `<ocr>` prompt), and (2) producing structured text output that captures styles and structures in markdown format (the `<md>` prompt). Both tasks are served by a shared decoder-only auto-regressive Transformer and are selected through task-specific prompts. The model can also be adapted to other text-intensive image understanding tasks with different prompts through supervised fine-tuning.
+
+The abstract from the paper is the following:
+
+*We present Kosmos-2.5, a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_ocr.png"
+alt="drawing" width="600"/>
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_md.png"
+alt="drawing" width="600"/>
+
+<small> Overview of tasks that KOSMOS-2.5 can handle. Taken from the <a href="https://arxiv.org/abs/2309.11419">original paper</a>. </small>
+
+## Example
+
+```python
+from PIL import Image
+import requests
+import torch
+from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+import re
+repo = "microsoft/kosmos-2.5"
+device = "cuda:0"  # the example assumes a CUDA GPU is available
+dtype = torch.bfloat16
+model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
+processor = AutoProcessor.from_pretrained(repo)
+url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
+image = Image.open(requests.get(url, stream=True).raw)
+prompt = "<ocr>"  # task prompt; use "<md>" instead to get markdown output
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+height, width = inputs.pop("height"), inputs.pop("width")  # popped so they are not forwarded to generate()
+raw_width, raw_height = image.size
+scale_height = raw_height / height  # box rescaling factors (presumably back to original image pixels)
+scale_width = raw_width / width
+inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
+inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)  # cast to the dtype the model was loaded in
+generated_ids = model.generate(
+    **inputs,
+    max_new_tokens=1024,
+)
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+def postprocess(y, scale_height, scale_width):
+    """Drop the prompt; for `<ocr>` output, emit one `x0,y0,x1,y0,x1,y1,x0,y1,text` entry per valid box."""
+    text = y.replace(prompt, "")
+    if "<md>" in prompt:
+        return text
+    bbox_tag = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
+    # Each tag precedes the text it locates, so discard whatever comes before the first tag.
+    segments = re.split(bbox_tag, text)[1:]
+    tags = re.findall(bbox_tag, text)
+    boxes = [[int(n) for n in re.findall(r"\d+", tag)] for tag in tags]
+    entries = []
+    for (x0, y0, x1, y1), segment in zip(boxes, segments):
+        # Ignore empty or inverted boxes.
+        if x0 >= x1 or y0 >= y1:
+            continue
+        # Rescale with the caller-supplied factors.
+        x0, x1 = int(x0 * scale_width), int(x1 * scale_width)
+        y0, y1 = int(y0 * scale_height), int(y1 * scale_height)
+        entries.append(f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{segment}")
+    return "".join(entries)
+output_text = postprocess(generated_text[0], scale_height, scale_width)
+print(output_text)
+```
+```text
+55,595,71,595,71,629,55,629,1
+82,595,481,595,481,635,82,635,[REG] BLACK SAKURA
+716,590,841,590,841,629,716,629,45,455
+55,637,71,637,71,672,55,672,1
+82,637,486,637,486,675,82,675,COOKIE DOH SAUCES
+818,632,843,632,843,668,818,668,0
+51,683,71,683,71,719,51,719,1
+82,683,371,683,371,719,82,719,NATA DE COCO
+820,677,845,677,845,713,820,713,0
+32,770,851,770,851,811,32,811,Sub Total 45,455
+28,811,853,811,853,858,28,858,PB1 (10%) 4,545
+28,857,855,857,855,905,28,905,Rounding 0
+24,905,858,905,858,956,24,956,Total 50,000
+17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
+```
+
+
+
+## Kosmos2_5Config
+
+[[autodoc]] Kosmos2_5Config
+
+## Kosmos2_5ImageProcessor
+
+[[autodoc]] Kosmos2_5ImageProcessor
+
+## Kosmos2_5Processor
+
+[[autodoc]] Kosmos2_5Processor
+    - __call__
+
+## Kosmos2_5Model
+
+[[autodoc]] Kosmos2_5Model
+    - forward
+
+## Kosmos2_5ForConditionalGeneration
+
+[[autodoc]] Kosmos2_5ForConditionalGeneration
+    - forward
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
@@ -53,6 +53,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
 * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
+* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
 * [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
 * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
@@ -209,6 +210,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
 * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
+* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
 * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -486,6 +486,11 @@
         "Kosmos2Config",
         "Kosmos2Processor",
     ],
+    "models.kosmos2_5": [
+        "Kosmos2_5Config",
+        "Kosmos2_5ImageProcessor",
+        "Kosmos2_5Processor",
+    ],
     "models.layoutlm": [
         "LayoutLMConfig",
         "LayoutLMTokenizer",
@@ -1149,6 +1154,7 @@
     _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
     _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
     _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
+    _import_structure["models.kosmos2_5"].extend(["Kosmos2_5ImageProcessor", "Kosmos2_5Processor"])
     _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
     _import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
     _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
@@ -2372,6 +2378,13 @@
             "Kosmos2PreTrainedModel",
         ]
     )
+    _import_structure["models.kosmos2_5"].extend(
+        [
+            "Kosmos2_5ForConditionalGeneration",
+            "Kosmos2_5Model",
+            "Kosmos2_5PreTrainedModel",
+        ]
+    )
     _import_structure["models.layoutlm"].extend(
         [
             "LayoutLMForMaskedLM",
@@ -5129,6 +5142,11 @@
         Kosmos2Config,
         Kosmos2Processor,
     )
+    from .models.kosmos2_5 import (
+        Kosmos2_5Config,
+        Kosmos2_5ImageProcessor,
+        Kosmos2_5Processor,
+    )
     from .models.layoutlm import (
         LayoutLMConfig,
         LayoutLMTokenizer,
@@ -5821,6 +5839,7 @@
         from .models.idefics2 import Idefics2ImageProcessor
         from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
         from .models.instructblipvideo import InstructBlipVideoImageProcessor
+        from .models.kosmos2_5 import Kosmos2_5ImageProcessor, Kosmos2_5Processor
         from .models.layoutlmv2 import (
             LayoutLMv2FeatureExtractor,
             LayoutLMv2ImageProcessor,
@@ -6852,6 +6871,11 @@
             Kosmos2Model,
             Kosmos2PreTrainedModel,
         )
+        from .models.kosmos2_5 import (
+            Kosmos2_5ForConditionalGeneration,
+            Kosmos2_5Model,
+            Kosmos2_5PreTrainedModel,
+        )
         from .models.layoutlm import (
             LayoutLMForMaskedLM,
             LayoutLMForQuestionAnswering,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -116,6 +116,7 @@
     jamba,
     jetmoe,
     kosmos2,
+    kosmos2_5,
     layoutlm,
     layoutlmv2,
     layoutlmv3,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -134,6 +134,7 @@
         ("jetmoe", "JetMoeConfig"),
         ("jukebox", "JukeboxConfig"),
         ("kosmos-2", "Kosmos2Config"),
+        ("kosmos-2.5", "Kosmos2_5Config"),
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
         ("layoutlmv3", "LayoutLMv3Config"),
@@ -413,6 +414,7 @@
         ("jetmoe", "JetMoe"),
         ("jukebox", "Jukebox"),
         ("kosmos-2", "KOSMOS-2"),
+        ("kosmos-2.5", "KOSMOS-2.5"),
         ("layoutlm", "LayoutLM"),
         ("layoutlmv2", "LayoutLMv2"),
         ("layoutlmv3", "LayoutLMv3"),
@@ -628,6 +630,7 @@
         ("data2vec-vision", "data2vec"),
         ("donut-swin", "donut"),
         ("kosmos-2", "kosmos2"),
+        ("kosmos-2.5", "kosmos2_5"),
         ("maskformer-swin", "maskformer"),
         ("xclip", "x_clip"),
         ("clip_vision_model", "clip"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -91,6 +91,7 @@
             ("instructblip", ("BlipImageProcessor",)),
             ("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
             ("kosmos-2", ("CLIPImageProcessor",)),
+            ("kosmos-2.5", ("Kosmos2_5ImageProcessor",)),
             ("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
             ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
             ("levit", ("LevitImageProcessor",)),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -129,6 +129,7 @@
         ("jetmoe", "JetMoeModel"),
         ("jukebox", "JukeboxModel"),
         ("kosmos-2", "Kosmos2Model"),
+        ("kosmos-2.5", "Kosmos2_5Model"),
         ("layoutlm", "LayoutLMModel"),
         ("layoutlmv2", "LayoutLMv2Model"),
         ("layoutlmv3", "LayoutLMv3Model"),
@@ -702,6 +703,7 @@
         ("instructblip", "InstructBlipForConditionalGeneration"),
         ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -66,6 +66,7 @@
         ("instructblip", "InstructBlipProcessor"),
         ("instructblipvideo", "InstructBlipVideoProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
+        ("kosmos-2.5", "Kosmos2_5Processor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
         ("llava", "LlavaProcessor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -235,6 +235,10 @@
                     "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            (
+                "kosmos-2.5",
+                ("PreTrainedTokenizerFast", None),
+            ),
             ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
             ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
             ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),

diff --git a/src/transformers/models/kosmos2_5/__init__.py b/src/transformers/models/kosmos2_5/__init__.py
@@ -0,0 +1,83 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+    is_vision_available,
+)
+
+
+# Maps each submodule to the public names it provides; passed to `_LazyModule` below.
+_import_structure = {
+    "configuration_kosmos2_5": ["Kosmos2_5Config"],
+    "processing_kosmos2_5": ["Kosmos2_5Processor"],
+}
+
+# The image processor is only registered when `is_vision_available()` is true.
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_kosmos2_5"] = ["Kosmos2_5ImageProcessor"]
+
+# The modeling classes are only registered when `is_torch_available()` is true.
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_kosmos2_5"] = [
+        "Kosmos2_5ForConditionalGeneration",
+        "Kosmos2_5Model",
+        "Kosmos2_5PreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    # Eager imports for static type checkers; keep in sync with `_import_structure` above.
+    from .configuration_kosmos2_5 import Kosmos2_5Config
+    from .processing_kosmos2_5 import Kosmos2_5Processor
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_kosmos2_5 import (
+            Kosmos2_5ForConditionalGeneration,
+            Kosmos2_5Model,
+            Kosmos2_5PreTrainedModel,
+        )
+
+else:
+    import sys
+
+    # Swap this module for a `_LazyModule` built from `_import_structure`.
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)