Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: docling-parse v2 as default PDF backend #549

Merged
merged 4 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

Expand Down
12 changes: 7 additions & 5 deletions docling/backend/docling_parse_v2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

Expand Down Expand Up @@ -210,12 +210,14 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.parser = pdf_parser_v2("fatal")

success = False
-if isinstance(path_or_stream, BytesIO):
+if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
-self.document_hash, path_or_stream
+self.document_hash, self.path_or_stream
)
+elif isinstance(self.path_or_stream, Path):
+success = self.parser.load_document(
+self.document_hash, str(self.path_or_stream)
)
-elif isinstance(path_or_stream, Path):
-success = self.parser.load_document(self.document_hash, str(path_or_stream))

if not success:
raise RuntimeError(
Expand Down
2 changes: 1 addition & 1 deletion docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def convert(
] = None,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
-] = PdfBackend.DLPARSE_V1,
+] = PdfBackend.DLPARSE_V2,
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
Expand Down
10 changes: 5 additions & 5 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
Expand Down Expand Up @@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):

class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
-backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
-backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


def _get_default_option(format: InputFormat) -> FormatOption:
Expand All @@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
-pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.PDF: FormatOption(
-pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
Expand Down
4 changes: 2 additions & 2 deletions docling/models/ds_glm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import List, Union

from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_docling_document
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
Expand All @@ -29,6 +28,7 @@
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash

Expand Down Expand Up @@ -232,7 +232,7 @@ def make_spans(cell):
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
-ds_doc_dict = ds_doc.model_dump(by_alias=True)
+ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)

glm_doc = self.model.apply_on_doc(ds_doc_dict)

Expand Down
Loading
Loading