Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ocr): added support for RapidOCR engine #415

Merged
merged 10 commits into from
Nov 27, 2024
4 changes: 4 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
Expand Down Expand Up @@ -76,6 +77,7 @@ class OcrEngine(str, Enum):
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"


def export_documents(
Expand Down Expand Up @@ -262,6 +264,8 @@ def convert(
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

Expand Down
36 changes: 36 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,42 @@ class OcrOptions(BaseModel):
)


class RapidOcrOptions(OcrOptions):
kind: Literal["rapidocr"] = "rapidocr"

# English and chinese are the most commly used models and have been tested with RapidOCR.
lang: List[str] = [
"english",
"chinese",
] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/

# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
text_score: float = 0.5 # same default as rapidocr

use_det: Optional[bool] = None # same default as rapidocr
use_cls: Optional[bool] = None # same default as rapidocr
use_rec: Optional[bool] = None # same default as rapidocr

# class Device(Enum):
# CPU = "CPU"
# CUDA = "CUDA"
# DIRECTML = "DIRECTML"
# AUTO = "AUTO"

# device: Device = Device.AUTO # Default value is AUTO

print_verbose: bool = False # same default as rapidocr

det_model_path: Optional[str] = None # same default as rapidocr
cls_model_path: Optional[str] = None # same default as rapidocr
rec_model_path: Optional[str] = None # same default as rapidocr

model_config = ConfigDict(
extra="forbid",
)


class EasyOcrOptions(OcrOptions):
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
Expand Down
147 changes: 147 additions & 0 deletions docling/models/rapid_ocr_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import logging
from typing import Iterable

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions

self.scale = 3 # multiplier for 72 dpi == 216 dpi.

if self.enabled:
try:
from rapidocr_onnxruntime import RapidOCR # type: ignore
except ImportError:
raise ImportError(
"RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True

# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False

# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True

# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
rec_model_path=self.options.rec_model_path,
)

def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:

if not self.enabled:
yield from page_batch
return

for page in page_batch:

assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)

all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
result, _ = self.reader(
im,
use_det=self.options.use_det,
use_cls=self.options.use_cls,
use_rec=self.options.use_rec,
)

del high_res_image
del im

cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)

# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

# DEBUG code:
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

yield page
7 changes: 7 additions & 0 deletions docling/pipeline/standard_pdf_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
Expand All @@ -26,6 +27,7 @@
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
Expand Down Expand Up @@ -121,6 +123,11 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
return RapidOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(
Expand Down
4 changes: 3 additions & 1 deletion docs/examples/full_page_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
Expand All @@ -20,10 +21,11 @@ def main():
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only)
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
# ocr_options = OcrMacOptions(force_full_page_ocr=True)
# ocr_options = RapidOcrOptions(force_full_page_ocr=True)
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options

Expand Down
1 change: 1 addition & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
| OcrMac | System dependency. See description below. | `OcrMacOptions` |
| [RapidOCR](https://github.com/RapidAI/RapidOCR) | Extra feature not included in Default Docling installation can be installed via `pip install rapidocr_onnxruntime` | `RapidOcrOptions` |

The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example

Expand Down
Loading