Add HOCRConverter (fixes #650) (#651)

* Add HOCRConverter * Add line to README.md * Test cicd * Test cicd 2 * Changes based on review comments * Remove whitespace changes to CHANGELOG.md * Remove duplicated html output * Add link to hocr wiki * Add tests for extracting hocr and html Co-authored-by: Pieter Marsman <[email protected]>
pdfminer · Aug 14, 2022 · 77df431 · 77df431
1 parent f79ad56
commit 77df431
Show file tree

Hide file tree

Showing 5 changed files with 200 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
 # Changelog
-
 All notable changes in pdfminer.six will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
+
 ### Fixed
 
 - 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))

diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ Features
 
 * Written entirely in Python.
 * Parse, analyze, and convert PDF documents.
+* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
 * PDF-1.7 specification support. (well, almost).
 * CJK languages and vertical writing scripts support.
 * Various font types (Type1, TrueType, Type3, and CID) support.

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -19,6 +19,7 @@
 from . import utils
 from .image import ImageWriter
 from .layout import LAParams, LTComponent, TextGroupElement
+from .layout import LTAnno
 from .layout import LTChar
 from .layout import LTContainer
 from .layout import LTCurve
@@ -821,3 +822,179 @@ def render(item: LTItem) -> None:
     def close(self) -> None:
         self.write_footer()
         return
+
+
+class HOCRConverter(PDFConverter[AnyIO]):
+    """Extract an hOCR representation from explicit text information within a PDF."""
+
+    #   Where text is being extracted from a variety of types of PDF within a
+    #   business process, those PDFs where the text is only present in image
+    #   form will need to be analysed using an OCR tool which will typically
+    #   output hOCR. This converter extracts the explicit text information from
+    #   those PDFs that do have it and uses it to generate a basic hOCR
+    #   representation that is designed to be used in conjunction with the image
+    #   of the PDF in the same way as genuine OCR output would be, but without the
+    #   inevitable OCR errors.
+
+    #   The converter does not handle images, diagrams or text colors.
+
+    #   In the examples processed by the contributor it was necessary to set
+    #   LAParams.all_texts to True.
+
+    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
+
+    def __init__(
+        self,
+        rsrcmgr: PDFResourceManager,
+        outfp: AnyIO,
+        codec: str = "utf8",
+        pageno: int = 1,
+        laparams: Optional[LAParams] = None,
+        stripcontrol: bool = False,
+    ):
+        PDFConverter.__init__(
+            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
+        )
+        self.stripcontrol = stripcontrol
+        self.within_chars = False
+        self.write_header()
+
+    def bbox_repr(self, bbox: Rect) -> str:
+        (in_x0, in_y0, in_x1, in_y1) = bbox
+        # PDF y-coordinates are the other way round from hOCR coordinates
+        out_x0 = int(in_x0)
+        out_y0 = int(self.page_bbox[3] - in_y1)
+        out_x1 = int(in_x1)
+        out_y1 = int(self.page_bbox[3] - in_y0)
+        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
+
+    def write(self, text: str) -> None:
+        if self.codec:
+            encoded_text = text.encode(self.codec)
+            cast(BinaryIO, self.outfp).write(encoded_text)
+        else:
+            cast(TextIO, self.outfp).write(text)
+
+    def write_header(self) -> None:
+        if self.codec:
+            self.write(
+                "<html xmlns='http://www.w3.org/1999/xhtml' "
+                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
+            )
+        else:
+            self.write(
+                "<html xmlns='http://www.w3.org/1999/xhtml' "
+                "xml:lang='en' lang='en'>\n"
+            )
+        self.write("<head>\n")
+        self.write("<title></title>\n")
+        self.write(
+            "<meta http-equiv='Content-Type' " "content='text/html;charset=utf-8' />\n"
+        )
+        self.write(
+            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
+        )
+        self.write(
+            "  <meta name='ocr-capabilities'"
+            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
+        )
+        self.write("</head>\n")
+        self.write("<body>\n")
+
+    def write_footer(self) -> None:
+        self.write("<!-- comment in the following line to debug -->\n")
+        self.write(
+            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
+        )
+
+    def write_text(self, text: str) -> None:
+        if self.stripcontrol:
+            text = self.CONTROL.sub("", text)
+        self.write(text)
+
+    def write_word(self) -> None:
+        if len(self.working_text) > 0:
+            bold_and_italic_styles = ""
+            if "Italic" in self.working_font:
+                bold_and_italic_styles = "font-style: italic; "
+            if "Bold" in self.working_font:
+                bold_and_italic_styles += "font-weight: bold; "
+            self.write(
+                "<span style='font:\"%s\"; font-size:%d; %s' "
+                "class='ocrx_word' title='%s; x_font %s; "
+                "x_fsize %d'>%s</span>"
+                % (
+                    (
+                        self.working_font,
+                        self.working_size,
+                        bold_and_italic_styles,
+                        self.bbox_repr(self.working_bbox),
+                        self.working_font,
+                        self.working_size,
+                        self.working_text.strip(),
+                    )
+                )
+            )
+        self.within_chars = False
+
+    def receive_layout(self, ltpage: LTPage) -> None:
+        def render(item: LTItem) -> None:
+            if self.within_chars and isinstance(item, LTAnno):
+                self.write_word()
+            if isinstance(item, LTPage):
+                self.page_bbox = item.bbox
+                self.write(
+                    "<div class='ocr_page' id='%s' title='%s'>\n"
+                    % (item.pageid, self.bbox_repr(item.bbox))
+                )
+                for child in item:
+                    render(child)
+                self.write("</div>\n")
+            elif isinstance(item, LTTextLine):
+                self.write(
+                    "<span class='ocr_line' title='%s'>" % ((self.bbox_repr(item.bbox)))
+                )
+                for child_line in item:
+                    render(child_line)
+                self.write("</span>\n")
+            elif isinstance(item, LTTextBox):
+                self.write(
+                    "<div class='ocr_block' id='%d' title='%s'>\n"
+                    % (item.index, self.bbox_repr(item.bbox))
+                )
+                for child in item:
+                    render(child)
+                self.write("</div>\n")
+            elif isinstance(item, LTChar):
+                if not self.within_chars:
+                    self.within_chars = True
+                    self.working_text = item.get_text()
+                    self.working_bbox = item.bbox
+                    self.working_font = item.fontname
+                    self.working_size = item.size
+                else:
+                    if len(item.get_text().strip()) == 0:
+                        self.write_word()
+                        self.write(item.get_text())
+                    else:
+                        if (
+                            self.working_bbox[1] != item.bbox[1]
+                            or self.working_font != item.fontname
+                            or self.working_size != item.size
+                        ):
+                            self.write_word()
+                            self.working_bbox = item.bbox
+                            self.working_font = item.fontname
+                            self.working_size = item.size
+                        self.working_text += item.get_text()
+                        self.working_bbox = (
+                            self.working_bbox[0],
+                            self.working_bbox[1],
+                            item.bbox[2],
+                            self.working_bbox[3],
+                        )
+
+        render(ltpage)
+
+    def close(self) -> None:
+        self.write_footer()
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -5,7 +5,13 @@
 from io import StringIO
 from typing import Any, BinaryIO, Container, Iterator, Optional, cast
 
-from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
+from .converter import (
+    XMLConverter,
+    HTMLConverter,
+    TextConverter,
+    PDFPageAggregator,
+    HOCRConverter,
+)
 from .image import ImageWriter
 from .layout import LAParams, LTPage
 from .pdfdevice import PDFDevice, TagExtractor
@@ -41,8 +47,8 @@ def extract_text_to_fp(
     :param inf: a file-like object to read PDF structure from, such as a
         file handler (using the builtin `open()` function) or a `BytesIO`.
     :param outfp: a file-like object to write the text to.
-    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
-        properly.
+    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
+        Only 'text' works properly.
     :param codec: Text decoding codec
     :param laparams: An LAParams object from pdfminer.layout. Default is None
         but may not layout correctly.
@@ -100,6 +106,11 @@ def extract_text_to_fp(
             imagewriter=imagewriter,
         )
 
+    elif output_type == "hocr":
+        device = HOCRConverter(
+            rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
+        )
+
     elif output_type == "tag":
         # Binary I/O is required, but we have no good way to test it here.
         device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
@@ -111,6 +111,12 @@ def test_encryption_rc4_40(self):
     def test_encryption_rc4_128(self):
         run("encryption/rc4-128.pdf", "-P foo")
 
+    def test_html_simple1(self):
+        run("simple1.pdf", "-t html")
+
+    def test_hocr_simple1(self):
+        run("simple1.pdf", "-t hocr")
+
 
 class TestDumpImages:
     @staticmethod