Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Accelerator): Introduce AI runtime configuration scheme #514

Merged
merged 10 commits into from
Dec 10, 2024
8 changes: 8 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
Expand Down Expand Up @@ -264,6 +266,10 @@ def convert(
help="Show version information.",
),
] = None,
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
device: Annotated[
AcceleratorDevice, typer.Option(..., help="Accelerator device")
] = AcceleratorDevice.AUTO,
):
if verbose == 0:
logging.basicConfig(level=logging.WARNING)
Expand Down Expand Up @@ -343,7 +349,9 @@ def convert(
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options = PdfPipelineOptions(
accelerator_options=accelerator_options,
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
Expand Down
73 changes: 69 additions & 4 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,64 @@
import logging
import os
from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)

_log = logging.getLogger(__name__)


class AcceleratorDevice(str, Enum):
    """Devices to run model inference.

    The enum mixes in ``str``, so every member is itself a string and compares
    equal to its lowercase value (e.g. ``AcceleratorDevice.CUDA == "cuda"``).
    A member can be looked up from its value: ``AcceleratorDevice("cpu")``.
    """

    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"


class AcceleratorOptions(BaseSettings):
    """Runtime configuration for model inference: thread count and device.

    This is a pydantic ``BaseSettings`` model configured with the
    ``DOCLING_`` environment prefix, so fields may be supplied either as
    keyword arguments or through prefixed environment variables (for example
    ``DOCLING_NUM_THREADS``).
    """

    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )

    num_threads: int = 4
    device: AcceleratorDevice = AcceleratorDevice.AUTO

    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.

        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the envvar value for that parameter.

        Args:
            data: Raw input handed to the model before field validation.
                Anything that is not a ``dict`` is returned untouched.

        Returns:
            ``data`` itself when nothing needs to change, otherwise a shallow
            copy of it with ``num_threads`` filled in from OMP_NUM_THREADS.
        """
        # An explicit (non-None) num_threads always wins over the fallback.
        if not isinstance(data, dict) or data.get("num_threads") is not None:
            return data

        # The regular envvar takes precedence over the alternative one.
        if os.getenv("DOCLING_NUM_THREADS") is not None:
            return data

        omp_num_threads = os.getenv("OMP_NUM_THREADS")
        if omp_num_threads is None:
            return data

        try:
            num_threads = int(omp_num_threads)
        except ValueError:
            # Best effort: a malformed value is reported and ignored so the
            # field default still applies.
            _log.error(
                "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                omp_num_threads,
            )
            return data

        # Return a copy rather than writing into the incoming mapping, which
        # may be owned by the caller (e.g. when passed to model_validate()).
        return {**data, "num_threads": num_threads}


class TableFormerMode(str, Enum):
Expand Down Expand Up @@ -78,9 +134,17 @@ class EasyOcrOptions(OcrOptions):

kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader
use_gpu: Annotated[
int,
Field(
deprecated="Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
),
] = True

model_storage_directory: Optional[str] = None
download_enabled: bool = True # same default as easyocr.Reader
download_enabled: bool = True

model_config = ConfigDict(
extra="forbid",
Expand Down Expand Up @@ -132,6 +196,7 @@ class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This default will be set to False on a future version of docling
)
accelerator_options: AcceleratorOptions = AcceleratorOptions()


class PdfPipelineOptions(PipelineOptions):
Expand Down
28 changes: 25 additions & 3 deletions docling/models/easyocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,26 @@

from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class EasyOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: EasyOcrOptions):
def __init__(
self,
enabled: bool,
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: EasyOcrOptions

Expand All @@ -31,11 +41,23 @@ def __init__(self, enabled: bool, options: EasyOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

use_gpu = False
if self.options.use_gpu:
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
filter(
lambda x: str(x).lower() in device,
[AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value],
)
)

self.reader = easyocr.Reader(
lang_list=self.options.lang,
gpu=self.options.use_gpu,
gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory,
download_enabled=self.options.download_enabled,
verbose=False,
)

def __call__(
Expand Down
10 changes: 7 additions & 3 deletions docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder

Expand All @@ -45,11 +47,13 @@ class LayoutModel(BasePageModel):
TABLE_LABEL = DocItemLabel.TABLE
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA

CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
device = decide_device(accelerator_options.device)
self.layout_predictor = LayoutPredictor(
artifacts_path, device, accelerator_options.num_threads
)

def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
Expand Down
69 changes: 24 additions & 45 deletions docling/models/rapid_ocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,26 @@

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
def __init__(
self,
enabled: bool,
options: RapidOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions

Expand All @@ -30,52 +40,21 @@ def __init__(self, enabled: bool, options: RapidOcrOptions):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True

# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False

# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True

# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True
# Decide the accelerator devices
device = decide_device(accelerator_options.device)
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
intra_op_num_threads = accelerator_options.num_threads

self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
cls_use_cuda=use_cuda,
rec_use_cuda=use_cuda,
det_use_cuda=use_cuda,
det_use_dml=use_dml,
cls_use_dml=use_dml,
rec_use_dml=use_dml,
intra_op_num_threads=intra_op_num_threads,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
Expand Down
23 changes: 19 additions & 4 deletions docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,24 @@

from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder


class TableStructureModel(BasePageModel):
def __init__(
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
self,
enabled: bool,
artifacts_path: Path,
options: TableStructureOptions,
accelerator_options: AcceleratorOptions,
):
self.options = options
self.do_cell_matching = self.options.do_cell_matching
Expand All @@ -26,16 +35,22 @@ def __init__(
self.enabled = enabled
if self.enabled:
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
artifacts_path = artifacts_path / "accurate"
else:
artifacts_path = artifacts_path / "fast"

# Third Party
import docling_ibm_models.tableformer.common as c

device = decide_device(accelerator_options.device)

self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]

self.tf_predictor = TFPredictor(self.tm_config)
self.tf_predictor = TFPredictor(
self.tm_config, device, accelerator_options.num_threads
)
self.scale = 2.0 # Scale up table input images to 144 dpi

def draw_table_and_cells(
Expand Down
6 changes: 5 additions & 1 deletion docling/pipeline/standard_pdf_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,16 @@ def __init__(self, pipeline_options: PdfPipelineOptions):
# Layout model
LayoutModel(
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._layout_model_path
/ StandardPdfPipeline._layout_model_path,
accelerator_options=pipeline_options.accelerator_options,
),
# Table structure model
TableStructureModel(
enabled=pipeline_options.do_table_structure,
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._table_model_path,
options=pipeline_options.table_structure_options,
accelerator_options=pipeline_options.accelerator_options,
),
# Page assemble
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
Expand Down Expand Up @@ -114,6 +116,7 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
return EasyOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
accelerator_options=self.pipeline_options.accelerator_options,
)
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
return TesseractOcrCliModel(
Expand All @@ -129,6 +132,7 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
return RapidOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
accelerator_options=self.pipeline_options.accelerator_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
Expand Down
Loading
Loading