propagated changes for new CodeItem class

DS4SD · Jan 15, 2025 · 412e4c9 · 412e4c9
1 parent 57fc28d
commit 412e4c9
Show file tree

Hide file tree

Showing 5 changed files with 178 additions and 27 deletions.
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
@@ -40,7 +40,7 @@ class LayoutModel(BasePageModel):
         DocItemLabel.PAGE_FOOTER,
         DocItemLabel.CODE,
         DocItemLabel.LIST_ITEM,
-        # "Formula",
+        DocItemLabel.FORMULA,
     ]
     PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
 

diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py
@@ -135,31 +135,6 @@ def __call__(
                                 )
                             elements.append(fig)
                             body.append(fig)
-                        elif cluster.label == LayoutModel.FORMULA_LABEL:
-                            equation = None
-                            if page.predictions.equations_prediction:
-                                equation = page.predictions.equations_prediction.equation_map.get(
-                                    cluster.id, None
-                                )
-                            if (
-                                not equation
-                            ):  # fallback: add empty formula, if it isn't present
-                                text = self.sanitize_text(
-                                    [
-                                        cell.text.replace("\x02", "-").strip()
-                                        for cell in cluster.cells
-                                        if len(cell.text.strip()) > 0
-                                    ]
-                                )
-                                equation = TextElement(
-                                    label=cluster.label,
-                                    id=cluster.id,
-                                    cluster=cluster,
-                                    page_no=page.page_no,
-                                    text=text,
-                                )
-                            elements.append(equation)
-                            body.append(equation)
                         elif cluster.label in LayoutModel.CONTAINER_LABELS:
                             container_el = ContainerElement(
                                 label=cluster.label,

diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py
@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             container_el = doc.add_group(label=group_label)
 
             _add_child_elements(container_el, doc, obj, pelem)
-
         elif "text" in obj:
             text = obj["text"][span_i:span_j]
 
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
                 current_list = None
 
                 doc.add_heading(text=text, prov=prov)
+            elif label == DocItemLabel.CODE:
+                current_list = None
+
+                doc.add_code(text=text, prov=prov)
             else:
                 current_list = None
 

diff --git a/docs/examples/develop_code_equation_enrichment.py b/docs/examples/develop_code_equation_enrichment.py
@@ -0,0 +1,173 @@
+import logging
+from pathlib import Path
+from typing import Any, Iterable, Literal
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    NodeItem,
+    TextItem,
+)
+from enum import Enum
+
+from pydantic import BaseModel
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import AcceleratorOptions, PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.models.base_model import BaseEnrichmentModel
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+from docling_ibm_models.code_formula_model.code_formula_predictor import (
+    CodeFormulaPredictor,
+)
+
+from docling.datamodel.settings import settings
+
+# TODO: remove this. Imported so that the models are registered
+from docling_ibm_models.code_formula_model.models.vary_opt import *
+from docling_ibm_models.code_formula_model.models.vary_opt_image_processor import *
+
+
+class CodeFormulaMode(str, Enum):
+    """Modes for the CodeFormula model.
+
+    Each member names the kind(s) of document element the model is meant
+    to enrich. Members are also plain strings because of the ``str`` mixin.
+    """
+
+    # Target elements labelled "code".
+    CODE = "code"
+    # Target elements labelled "formula".
+    FORMULA = "formula"
+    # Target both code and formula elements.
+    CODE_FORMULA = "code_formula"
+
+
+class CodeFormulaModelOptions(BaseModel):
+    """Configuration object consumed by ``CodeFormulaModel``."""
+
+    # Fixed tag for this options type; "code_formula" is the only allowed value.
+    kind: Literal["code_formula"] = "code_formula"
+
+    # Which element kinds to enrich; defaults to both code and formulas.
+    mode: CodeFormulaMode = CodeFormulaMode.CODE_FORMULA
+
+
+class CodeFormulaModel(BaseEnrichmentModel):
+    """Enrichment model that rewrites the text of code and formula items.
+
+    Items are selected by ``is_processable`` according to the configured
+    ``CodeFormulaMode``; their images are sent to a ``CodeFormulaPredictor``
+    and the predictions replace each item's ``text``.
+    """
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Path,
+        accelerator_options: AcceleratorOptions,
+        code_formula_options: CodeFormulaModelOptions,
+    ):
+        """Init the CodeFormulaModel.
+
+        Args:
+            enabled (bool): True if the model is enabled, False otherwise.
+            artifacts_path (Path): Location of the model artifacts, forwarded
+                unchanged to ``CodeFormulaPredictor``.
+            accelerator_options (AcceleratorOptions): Supplies the ``device``
+                and ``num_threads`` handed to the predictor.
+            code_formula_options (CodeFormulaModelOptions): Supplies the
+                ``mode`` that decides which element labels are processed.
+        """
+        self.enabled = enabled
+        self.mode = code_formula_options.mode
+
+        # NOTE(review): the predictor is constructed even when ``enabled`` is
+        # False - confirm whether loading should be skipped in that case.
+        self.code_formula_model = CodeFormulaPredictor(
+            artifacts_path=artifacts_path,
+            device=accelerator_options.device,
+            num_threads=accelerator_options.num_threads,
+        )
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        """Tell whether ``element`` should be enriched by this model.
+
+        Args:
+            doc (DoclingDocument): Document the element belongs to (unused).
+            element (NodeItem): Candidate element.
+
+        Returns:
+            bool: True only when the model is enabled, the element is a
+            ``TextItem``, and its label ("code" or "formula") is covered by
+            the configured mode.
+        """
+        if not self.enabled or not isinstance(element, TextItem):
+            return False
+
+        # The mode must be compared explicitly: a bare enum member is always
+        # truthy and would accept code items in every mode.
+        if element.label == "code":
+            return self.mode in (
+                CodeFormulaMode.CODE,
+                CodeFormulaMode.CODE_FORMULA,
+            )
+        if element.label == "formula":
+            return self.mode in (
+                CodeFormulaMode.FORMULA,
+                CodeFormulaMode.CODE_FORMULA,
+            )
+        return False
+
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        """Run the predictor on a batch and write results into each element.
+
+        This is a generator: nothing runs until it is iterated.
+
+        Args:
+            doc (DoclingDocument): Document used to obtain each element's image.
+            element_batch (Iterable[NodeItem]): Elements to enrich.
+
+        Yields:
+            The whole batch, once, as a list, after every element's ``text``
+            has been replaced by the model output. Nothing is yielded when
+            the model is disabled.
+        """
+        if not self.enabled:
+            return
+
+        # Materialize once: the batch is traversed three times below, and a
+        # one-shot iterator would be exhausted after the first pass.
+        elements = list(element_batch)
+
+        # ! TODO: batch size missing
+        images = [el.get_image(doc) for el in elements]
+        labels = [el.label for el in elements]
+
+        outputs = self.code_formula_model.predict(images, labels)
+
+        for element, output in zip(elements, outputs):
+            element.text = output
+
+        yield elements
+
+
+class CodeFormulaPipelineOptions(PdfPipelineOptions):
+    """PDF pipeline options extended with a switch for code/formula enrichment."""
+
+    # Passed as ``enabled`` to CodeFormulaModel by CodeFormulaPipeline; on by default.
+    do_code_formula_enrichment: bool = True
+
+class CodeFormulaPipeline(StandardPdfPipeline):
+    """Standard PDF pipeline whose enrichment stage is one ``CodeFormulaModel``."""
+
+    def __init__(self, pipeline_options: CodeFormulaPipelineOptions):
+        """Build the standard pipeline, then install the enrichment model.
+
+        Args:
+            pipeline_options (CodeFormulaPipelineOptions): Pipeline settings;
+                ``do_code_formula_enrichment`` enables or disables the model.
+        """
+        super().__init__(pipeline_options)
+        # Bare annotation: declares the attribute's type, assigns nothing.
+        self.pipeline_options: CodeFormulaPipelineOptions
+
+        code_formula_model = CodeFormulaModel(
+            enabled=pipeline_options.do_code_formula_enrichment,
+            # NOTE(review): hard-coded, machine-specific checkpoint location.
+            artifacts_path="/dccstor/doc_fig_class/DocFM-Vision-Pretrainer/Vary-master/checkpoints_code_equation_model/best_run",
+            accelerator_options=AcceleratorOptions(device="cpu"),
+            code_formula_options=CodeFormulaModelOptions(),
+        )
+        self.enrichment_pipe = [code_formula_model]
+
+    @classmethod
+    def get_default_options(cls) -> CodeFormulaPipelineOptions:
+        """Return a ``CodeFormulaPipelineOptions`` built from its defaults."""
+        return CodeFormulaPipelineOptions()
+
+
+def main():
+    """Convert the sample PDF and print the enriched code/formula text."""
+    logging.basicConfig(level=logging.INFO)
+
+    # Repository-relative alternative: Path("./tests/data/code_and_formulas.pdf")
+    input_doc_path = Path(
+        "/dccstor/doc_fig_class/docling-ibm/test/data/pdf/code_and_formulas.pdf"
+    )
+
+    # Switch on every debug visualization.
+    for flag_name in (
+        "visualize_raw_layout",
+        "visualize_layout",
+        "visualize_ocr",
+        "visualize_tables",
+    ):
+        setattr(settings.debug, flag_name, True)
+
+    pipeline_options = CodeFormulaPipelineOptions()
+    pipeline_options.images_scale = 2.0
+    pipeline_options.generate_page_images = True
+    pipeline_options.generate_picture_images = True
+
+    pdf_format_option = PdfFormatOption(
+        pipeline_cls=CodeFormulaPipeline,
+        pipeline_options=pipeline_options,
+    )
+    doc_converter = DocumentConverter(
+        format_options={InputFormat.PDF: pdf_format_option}
+    )
+    result = doc_converter.convert(input_doc_path)
+
+    enriched_labels = ("code", "formula")
+    for element, _level in result.document.iterate_items():
+        if not isinstance(element, TextItem):
+            continue
+        if element.label not in enriched_labels:
+            continue
+        print(
+            f"The model populated the `text` portion of the TextElement {element.self_ref}:\n{element.text}\n\n\n\n\n"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/data/code_and_formulas.pdf b/tests/data/code_and_formulas.pdf