DS4SD · cau-git · Dec 9, 2024 · Dec 4, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
@@ -6,7 +6,7 @@
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 

diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
@@ -6,7 +6,7 @@
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
@@ -210,12 +210,14 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
         self.parser = pdf_parser_v2("fatal")
 
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                self.document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(self.document_hash, str(path_or_stream))
 
         if not success:
             raise RuntimeError(

diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -208,7 +208,7 @@ def convert(
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.DLPARSE_V1,
+    ] = PdfBackend.DLPARSE_V2,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),

diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -9,7 +9,7 @@
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
 
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
 def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
         InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
         InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
     }
     if (options := format_to_default_options.get(format)) is not None:

diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py
@@ -4,7 +4,6 @@
 from typing import List, Union
 
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -29,6 +28,7 @@
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
 
@@ -232,7 +232,7 @@ def make_spans(cell):
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
 
             glm_doc = self.model.apply_on_doc(ds_doc_dict)