DS4SD · cau-git · Dec 10, 2024 · Dec 2, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -26,6 +26,8 @@
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrMacOptions,
     OcrOptions,
@@ -264,6 +266,10 @@ def convert(
             help="Show version information.",
         ),
     ] = None,
+    num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
+    device: Annotated[
+        AcceleratorDevice, typer.Option(..., help="Accelerator device")
+    ] = AcceleratorDevice.AUTO,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -343,7 +349,9 @@ def convert(
         if ocr_lang_list is not None:
             ocr_options.lang = ocr_lang_list
 
+        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
         pipeline_options = PdfPipelineOptions(
+            accelerator_options=accelerator_options,
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -1,8 +1,64 @@
+import logging
+import os
 from enum import Enum
 from pathlib import Path
-from typing import List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic_settings import (
+    BaseSettings,
+    PydanticBaseSettingsSource,
+    SettingsConfigDict,
+)
+
+_log = logging.getLogger(__name__)
+
+
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+
+
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+
+    num_threads: int = 4
+    device: AcceleratorDevice = AcceleratorDevice.AUTO
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the evvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data
 
 
 class TableFormerMode(str, Enum):
@@ -78,9 +134,17 @@ class EasyOcrOptions(OcrOptions):
 
     kind: Literal["easyocr"] = "easyocr"
     lang: List[str] = ["fr", "de", "es", "en"]
-    use_gpu: bool = True  # same default as easyocr.Reader
+    use_gpu: Annotated[
+        int,
+        Field(
+            deprecated="Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
+            "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
+            "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
+        ),
+    ] = True
+
     model_storage_directory: Optional[str] = None
-    download_enabled: bool = True  # same default as easyocr.Reader
+    download_enabled: bool = True
 
     model_config = ConfigDict(
         extra="forbid",
@@ -132,6 +196,7 @@ class PipelineOptions(BaseModel):
     create_legacy_output: bool = (
         True  # This defautl will be set to False on a future version of docling
     )
+    accelerator_options: AcceleratorOptions = AcceleratorOptions()
 
 
 class PdfPipelineOptions(PipelineOptions):

diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
@@ -7,16 +7,26 @@
 
 from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import EasyOcrOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    EasyOcrOptions,
+)
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: EasyOcrOptions):
+    def __init__(
+        self,
+        enabled: bool,
+        options: EasyOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
         super().__init__(enabled=enabled, options=options)
         self.options: EasyOcrOptions
 
@@ -31,11 +41,23 @@ def __init__(self, enabled: bool, options: EasyOcrOptions):
                     "Alternatively, Docling has support for other OCR engines. See the documentation."
                 )
 
+            use_gpu = False
+            if self.options.use_gpu:
+                device = decide_device(accelerator_options.device)
+                # Enable easyocr GPU if running on CUDA, MPS
+                use_gpu = any(
+                    filter(
+                        lambda x: str(x).lower() in device,
+                        [AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value],
+                    )
+                )
+
             self.reader = easyocr.Reader(
                 lang_list=self.options.lang,
-                gpu=self.options.use_gpu,
+                gpu=use_gpu,
                 model_storage_directory=self.options.model_storage_directory,
                 download_enabled=self.options.download_enabled,
+                verbose=False,
             )
 
     def __call__(

diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
@@ -17,8 +17,10 @@
     Page,
 )
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
 
@@ -45,11 +47,13 @@ class LayoutModel(BasePageModel):
     TABLE_LABEL = DocItemLabel.TABLE
     FIGURE_LABEL = DocItemLabel.PICTURE
     FORMULA_LABEL = DocItemLabel.FORMULA
-
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
 
-    def __init__(self, artifacts_path: Path):
-        self.layout_predictor = LayoutPredictor(artifacts_path)  # TODO temporary
+    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
+        device = decide_device(accelerator_options.device)
+        self.layout_predictor = LayoutPredictor(
+            artifacts_path, device, accelerator_options.num_threads
+        )
 
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False

diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py
@@ -6,16 +6,26 @@
 
 from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import RapidOcrOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    RapidOcrOptions,
+)
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
 class RapidOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: RapidOcrOptions):
+    def __init__(
+        self,
+        enabled: bool,
+        options: RapidOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
         super().__init__(enabled=enabled, options=options)
         self.options: RapidOcrOptions
 
@@ -30,52 +40,21 @@ def __init__(self, enabled: bool, options: RapidOcrOptions):
                     "Alternatively, Docling has support for other OCR engines. See the documentation."
                 )
 
-            # This configuration option will be revamped while introducing device settings for all models.
-            # For the moment we will default to auto and let onnx-runtime pick the best.
-            cls_use_cuda = True
-            rec_use_cuda = True
-            det_use_cuda = True
-            det_use_dml = True
-            cls_use_dml = True
-            rec_use_dml = True
-
-            # # Same as Defaults in RapidOCR
-            # cls_use_cuda = False
-            # rec_use_cuda = False
-            # det_use_cuda = False
-            # det_use_dml = False
-            # cls_use_dml = False
-            # rec_use_dml = False
-
-            # # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
-            # if self.options.device == self.options.Device.AUTO:
-            #     cls_use_cuda = True
-            #     rec_use_cuda = True
-            #     det_use_cuda = True
-            #     det_use_dml = True
-            #     cls_use_dml = True
-            #     rec_use_dml = True
-
-            # # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
-            # elif self.options.device == self.options.Device.CUDA:
-            #     cls_use_cuda = True
-            #     rec_use_cuda = True
-            #     det_use_cuda = True
-
-            # # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
-            # elif self.options.device == self.options.Device.DIRECTML:
-            #     det_use_dml = True
-            #     cls_use_dml = True
-            #     rec_use_dml = True
+            # Decide the accelerator devices
+            device = decide_device(accelerator_options.device)
+            use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
+            use_dml = accelerator_options.device == AcceleratorDevice.AUTO
+            intra_op_num_threads = accelerator_options.num_threads
 
             self.reader = RapidOCR(
                 text_score=self.options.text_score,
-                cls_use_cuda=cls_use_cuda,
-                rec_use_cuda=rec_use_cuda,
-                det_use_cuda=det_use_cuda,
-                det_use_dml=det_use_dml,
-                cls_use_dml=cls_use_dml,
-                rec_use_dml=rec_use_dml,
+                cls_use_cuda=use_cuda,
+                rec_use_cuda=use_cuda,
+                det_use_cuda=use_cuda,
+                det_use_dml=use_dml,
+                cls_use_dml=use_dml,
+                rec_use_dml=use_dml,
+                intra_op_num_threads=intra_op_num_threads,
                 print_verbose=self.options.print_verbose,
                 det_model_path=self.options.det_model_path,
                 cls_model_path=self.options.cls_model_path,

diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
@@ -9,15 +9,24 @@
 
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    TableFormerMode,
+    TableStructureOptions,
+)
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 
 class TableStructureModel(BasePageModel):
     def __init__(
-        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+        self,
+        enabled: bool,
+        artifacts_path: Path,
+        options: TableStructureOptions,
+        accelerator_options: AcceleratorOptions,
     ):
         self.options = options
         self.do_cell_matching = self.options.do_cell_matching
@@ -26,16 +35,22 @@ def __init__(
         self.enabled = enabled
         if self.enabled:
             if self.mode == TableFormerMode.ACCURATE:
-                artifacts_path = artifacts_path / "fat"
+                artifacts_path = artifacts_path / "accurate"
+            else:
+                artifacts_path = artifacts_path / "fast"
 
             # Third Party
             import docling_ibm_models.tableformer.common as c
 
+            device = decide_device(accelerator_options.device)
+
             self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
             self.tm_config["model"]["save_dir"] = artifacts_path
             self.tm_model_type = self.tm_config["model"]["type"]
 
-            self.tf_predictor = TFPredictor(self.tm_config)
+            self.tf_predictor = TFPredictor(
+                self.tm_config, device, accelerator_options.num_threads
+            )
             self.scale = 2.0  # Scale up table input images to 144 dpi
 
     def draw_table_and_cells(

diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
@@ -75,14 +75,16 @@ def __init__(self, pipeline_options: PdfPipelineOptions):
             # Layout model
             LayoutModel(
                 artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._layout_model_path
+                / StandardPdfPipeline._layout_model_path,
+                accelerator_options=pipeline_options.accelerator_options,
             ),
             # Table structure model
             TableStructureModel(
                 enabled=pipeline_options.do_table_structure,
                 artifacts_path=self.artifacts_path
                 / StandardPdfPipeline._table_model_path,
                 options=pipeline_options.table_structure_options,
+                accelerator_options=pipeline_options.accelerator_options,
             ),
             # Page assemble
             PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
@@ -114,6 +116,7 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
             return EasyOcrModel(
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
+                accelerator_options=self.pipeline_options.accelerator_options,
             )
         elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
             return TesseractOcrCliModel(
@@ -129,6 +132,7 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
             return RapidOcrModel(
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
+                accelerator_options=self.pipeline_options.accelerator_options,
             )
         elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
             if "darwin" != sys.platform: