Skip to content

Commit

Permalink
propagated changes for new CodeItem class
Browse files Browse the repository at this point in the history
  • Loading branch information
Matteo Omenetti [email protected] authored and Matteo Omenetti [email protected] committed Jan 15, 2025
1 parent 57fc28d commit 412e4c9
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 27 deletions.
2 changes: 1 addition & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class LayoutModel(BasePageModel):
DocItemLabel.PAGE_FOOTER,
DocItemLabel.CODE,
DocItemLabel.LIST_ITEM,
# "Formula",
DocItemLabel.FORMULA,
]
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]

Expand Down
25 changes: 0 additions & 25 deletions docling/models/page_assemble_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,31 +135,6 @@ def __call__(
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
if (
not equation
): # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
elif cluster.label in LayoutModel.CONTAINER_LABELS:
container_el = ContainerElement(
label=cluster.label,
Expand Down
5 changes: 4 additions & 1 deletion docling/utils/glm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
container_el = doc.add_group(label=group_label)

_add_child_elements(container_el, doc, obj, pelem)

elif "text" in obj:
text = obj["text"][span_i:span_j]

Expand Down Expand Up @@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
current_list = None

doc.add_heading(text=text, prov=prov)
elif label == DocItemLabel.CODE:
current_list = None

doc.add_code(text=text, prov=prov)
else:
current_list = None

Expand Down
173 changes: 173 additions & 0 deletions docs/examples/develop_code_equation_enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import logging
from pathlib import Path
from typing import Any, Iterable, Literal

from docling_core.types.doc import (
DoclingDocument,
NodeItem,
TextItem,
)
from enum import Enum

from pydantic import BaseModel

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AcceleratorOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

from docling_ibm_models.code_formula_model.code_formula_predictor import (
CodeFormulaPredictor,
)

from docling.datamodel.settings import settings

# TODO: remove these wildcard imports. They are needed only for their import-time side effect of registering the models.
from docling_ibm_models.code_formula_model.models.vary_opt import *
from docling_ibm_models.code_formula_model.models.vary_opt_image_processor import *


class CodeFormulaMode(str, Enum):
    """Operating modes of the CodeFormula model.

    The class mixes in ``str``, so each member compares equal to its
    literal string value.
    """

    # Intended to restrict enrichment to items labelled "code".
    CODE = "code"
    # Intended to restrict enrichment to items labelled "formula".
    FORMULA = "formula"
    # Enrich both "code" and "formula" items.
    CODE_FORMULA = "code_formula"


class CodeFormulaModelOptions(BaseModel):
    """Options controlling the CodeFormula enrichment model."""

    # Constant tag for this options type; only "code_formula" is accepted.
    kind: Literal["code_formula"] = "code_formula"

    # Which element kinds to enrich; defaults to both code and formulas.
    mode: CodeFormulaMode = CodeFormulaMode.CODE_FORMULA


class CodeFormulaModel(BaseEnrichmentModel):
    """Enrichment model that replaces the text of code and formula items.

    For every processable item, the item's image is passed to a
    ``CodeFormulaPredictor`` and the prediction is written to the item's
    ``text`` attribute.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Path,
        accelerator_options: AcceleratorOptions,
        code_formula_options: CodeFormulaModelOptions,
    ):
        """Init the CodeFormulaModel.

        Args:
            enabled (bool): True if the model is enabled, False otherwise.
            artifacts_path (Path): Location of the model artifacts, forwarded
                to ``CodeFormulaPredictor``.
            accelerator_options (AcceleratorOptions): Supplies the ``device``
                and ``num_threads`` forwarded to the predictor.
            code_formula_options (CodeFormulaModelOptions): Supplies the
                ``mode`` that selects which item labels are enriched.
        """
        self.enabled = enabled
        self.mode = code_formula_options.mode

        self.code_formula_model = CodeFormulaPredictor(
            artifacts_path=artifacts_path,
            device=accelerator_options.device,
            num_threads=accelerator_options.num_threads,
        )

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return whether ``element`` should be enriched by this model.

        An element qualifies when the model is enabled, the element is a
        ``TextItem``, and its label matches the configured mode: "code" items
        in CODE or CODE_FORMULA mode, "formula" items in FORMULA or
        CODE_FORMULA mode.

        Args:
            doc (DoclingDocument): The document containing the element
                (not inspected here).
            element (NodeItem): The candidate element.

        Returns:
            bool: True if the element should be processed.
        """
        if not self.enabled or not isinstance(element, TextItem):
            return False

        if element.label == "code":
            # The mode must be compared explicitly: a bare enum member is
            # always truthy and would accept code items in every mode.
            return self.mode in (
                CodeFormulaMode.CODE,
                CodeFormulaMode.CODE_FORMULA,
            )
        if element.label == "formula":
            return self.mode in (
                CodeFormulaMode.FORMULA,
                CodeFormulaMode.CODE_FORMULA,
            )
        return False

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Predict and store the text of every element in the batch.

        Args:
            doc (DoclingDocument): The document the elements belong to; used
                to obtain each element's image.
            element_batch (Iterable[NodeItem]): Elements to enrich. Any
                iterable is accepted, including one-shot iterators.

        Yields:
            Each element of the batch, after the ``text`` of all elements has
            been replaced by the model output. Nothing is yielded when the
            model is disabled or the batch is empty.
        """
        if not self.enabled:
            return

        # Materialize once: the batch is traversed several times below, and a
        # one-shot iterator would be exhausted after the first pass.
        elements = list(element_batch)
        if not elements:
            return

        # TODO: batch size missing
        images = [el.get_image(doc) for el in elements]
        labels = [el.label for el in elements]

        outputs = self.code_formula_model.predict(images, labels)

        for element, output in zip(elements, outputs):
            element.text = output

        # Yield the elements one by one (not the batch as a single item).
        yield from elements


class CodeFormulaPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a code/formula enrichment switch."""

    # Enables the code/formula enrichment step; on by default.
    do_code_formula_enrichment: bool = True

class CodeFormulaPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage is a CodeFormulaModel."""

    def __init__(self, pipeline_options: CodeFormulaPipelineOptions):
        """Set up the standard pipeline and install the enrichment model.

        Args:
            pipeline_options (CodeFormulaPipelineOptions): Pipeline settings;
                ``do_code_formula_enrichment`` decides whether the
                CodeFormula model is enabled.
        """
        super().__init__(pipeline_options)
        # Annotation only (no assignment): declares the narrower options type.
        self.pipeline_options: CodeFormulaPipelineOptions

        # Hard-coded checkpoint location used by this development example.
        checkpoint_dir = "/dccstor/doc_fig_class/DocFM-Vision-Pretrainer/Vary-master/checkpoints_code_equation_model/best_run"

        code_formula_model = CodeFormulaModel(
            enabled=pipeline_options.do_code_formula_enrichment,
            artifacts_path=checkpoint_dir,
            accelerator_options=AcceleratorOptions(device="cpu"),
            code_formula_options=CodeFormulaModelOptions(),
        )
        self.enrichment_pipe = [code_formula_model]

    @classmethod
    def get_default_options(cls) -> CodeFormulaPipelineOptions:
        """Return a ``CodeFormulaPipelineOptions`` built with its defaults."""
        return CodeFormulaPipelineOptions()


def main():
    """Convert the sample PDF and print the enriched code/formula texts.

    Turns on the layout, OCR and table debug visualizations, runs the
    document through ``CodeFormulaPipeline``, and prints the text of every
    ``TextItem`` labelled "code" or "formula".
    """
    logging.basicConfig(level=logging.INFO)

    # Repository-relative alternative: Path("./tests/data/code_and_formulas.pdf")
    input_doc_path = Path(
        "/dccstor/doc_fig_class/docling-ibm/test/data/pdf/code_and_formulas.pdf"
    )

    # Switch on every debug visualization used by this example.
    debug_flags = (
        "visualize_raw_layout",
        "visualize_layout",
        "visualize_ocr",
        "visualize_tables",
    )
    for flag in debug_flags:
        setattr(settings.debug, flag, True)

    pipeline_options = CodeFormulaPipelineOptions()
    pipeline_options.images_scale = 2.0

    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    pdf_format_option = PdfFormatOption(
        pipeline_cls=CodeFormulaPipeline,
        pipeline_options=pipeline_options,
    )
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )
    result = doc_converter.convert(input_doc_path)

    for element, _level in result.document.iterate_items():
        # Only text items labelled as code or formula are reported.
        if not isinstance(element, TextItem):
            continue
        if element.label != "code" and element.label != "formula":
            continue
        print(
            f"The model populated the `text` portion of the TextElement {element.self_ref}:\n{element.text}\n\n\n\n\n"
        )


if __name__ == "__main__":
    main()
Binary file added tests/data/code_and_formulas.pdf
Binary file not shown.

0 comments on commit 412e4c9

Please sign in to comment.