pdfminer · pietermarsman · Aug 14, 2022 · Jul 29, 2021 · Jan 25, 2022 · Jan 25, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
 # Changelog
-All notable changes in pdfminer.six will be documented in this file. 
+All notable changes in pdfminer.six will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
 ### Added
+- Converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
 - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
 
 ### Fixed
@@ -34,7 +35,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Using `io.TextIOBase` as the file to write to ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
 - Parsing \r\n after the escape character in a literal string ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
 
-## Removed
+### Removed
 - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
 - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525))
 - Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523))
@@ -76,7 +77,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
 - Raise a warning instead of an error when extracting text from a non-extractable PDF ([#453](https://github.com/pdfminer/pdfminer.six/pull/453))
 - Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
-  
+
 ## [20200517]
 
 ### Added
@@ -133,18 +134,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Deprecated
 - The argument `_py2_no_more_posargs` because Python2 is removed on January
-, 2020 ([#328](https://github.com/pdfminer/pdfminer.six/pull/328) and 
+, 2020 ([#328](https://github.com/pdfminer/pdfminer.six/pull/328) and
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 
 ### Added
 - Simple wrapper to easily extract text from a PDF file [#330](https://github.com/pdfminer/pdfminer.six/pull/330)
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
-- Sphinx documentation that is published on 
+- Sphinx documentation that is published on
   [Read the Docs](https://pdfminersix.readthedocs.io/)
   ([#329](https://github.com/pdfminer/pdfminer.six/pull/329))
 
 ### Fixed
-- Unhandled AssertionError when dumping pdf containing reference to object id 0 
+- Unhandled AssertionError when dumping pdf containing reference to object id 0
  ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
 - Debug flag actually changes logging level to debug for pdf2txt.py and
  dumppdf.py ([#325](https://github.com/pdfminer/pdfminer.six/pull/325))

diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ Features
 
 * Written entirely in Python.
 * Parse, analyze, and convert PDF documents.
+* Extract content as text, images, html or hOCR.
 * PDF-1.7 specification support. (well, almost).
 * CJK languages and vertical writing scripts support.
 * Various font types (Type1, TrueType, Type3, and CID) support.

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -723,3 +723,198 @@ def render(item: LTItem) -> None:
     def close(self) -> None:
         self.write_footer()
         return
+
+
+class HOCRConverter(PDFConverter):
+    """
+    Where text is being extracted from a variety of types of PDF within a
+    business process, those PDFs where the text is only present in image
+    form will need to be analysed using an OCR tool which will typically
+    output hOCR. This converter extracts the explicit text information from
+    those PDFs that do have it and uses it to generate a basic hOCR
+    representation that is designed to be used in conjunction with the image
+    of the PDF in the same way as genuine OCR output would be, but without the
+    inevitable OCR errors.
+
+    The converter does not handle images, diagrams or text colors.
+
+    In the examples processed by the contributor it was necessary to set
+    LAParams.all_texts to True.
+    """
+
+    # Control characters removed from word text when ``stripcontrol`` is set
+    # (tab, line feed and carriage return are not in the pattern).
+    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
+
+    def __init__(self, rsrcmgr, outfp, codec='utf8', pageno: int = 1,
+                 laparams=None, stripcontrol: bool = False) -> None:
+        """Set up the converter and write the hOCR header straight away.
+
+        :param rsrcmgr: forwarded to ``PDFConverter.__init__``.
+        :param outfp: file-like object that receives the hOCR output;
+            also forwarded to ``PDFConverter.__init__``.
+        :param codec: encoding applied to everything written; when falsy
+            the text is written unencoded.
+        :param pageno: forwarded to ``PDFConverter.__init__``.
+        :param laparams: forwarded to ``PDFConverter.__init__``.
+        :param stripcontrol: remove control characters from word text.
+        """
+        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
+                              laparams=laparams)
+        self.stripcontrol = stripcontrol
+        # True while characters are being collected into the working word.
+        self.within_chars = False
+        self.write_header()
+
+    def bbox_repr(self, bbox) -> str:
+        """Format a bounding box as an hOCR ``bbox`` property.
+
+        :param bbox: ``(x0, y0, x1, y1)`` in PDF coordinates.
+        :return: ``"bbox x0 y0 x1 y1"`` with y measured down from the top
+            edge of the current page (``self.page_bbox[3]``).
+        """
+        (x0, y0, x1, y1) = bbox
+        # pdf y-coordinates are the other way round from hOCR coordinates
+        page_top = self.page_bbox[3]
+        coords = (x0, page_top - y1, x1, page_top - y0)
+        # Every coordinate gets the same rounding, including x0.
+        return " ".join(["bbox"] + [str(round(c, 0)) for c in coords])
+
+    def write(self, text: str) -> None:
+        """Write ``text`` to ``outfp``, encoded first if a codec is set."""
+        if self.codec:
+            self.outfp.write(text.encode(self.codec))
+        else:
+            self.outfp.write(text)
+
+    def write_header(self) -> None:
+        """Write the opening ``<html>`` tag, the head and ``<body>``."""
+        if self.codec:
+            self.write(
+                "<html xmlns='http://www.w3.org/1999/xhtml' "
+                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec)
+        else:
+            self.write(
+                "<html xmlns='http://www.w3.org/1999/xhtml' "
+                "xml:lang='en' lang='en'>\n")
+        self.write("<head>\n")
+        self.write("<title></title>\n")
+        # NOTE(review): this charset stays utf-8 whatever ``codec`` is --
+        # confirm whether it should follow the codec instead.
+        self.write("<meta http-equiv='Content-Type' "
+                   "content='text/html;charset=utf-8' />\n")
+        self.write("<meta name='ocr-system' "
+                   "content='PDFMiner HOCR Converter' />\n")
+        self.write("  <meta name='ocr-capabilities'"
+                   " content='ocr_page ocr_block ocr_line ocrx_word'/>\n")
+        self.write("</head>\n")
+        self.write("<body>\n")
+
+    def write_footer(self) -> None:
+        """Write the hocrjs debugging hint and close the document."""
+        self.write("<!-- comment in the following line to debug -->\n")
+        # Close <html> as well as <body>: write_header opens both.
+        self.write("<!--script src='https://unpkg.com/hocrjs'>"
+                   "</script--></body></html>\n")
+
+    def write_text(self, text: str) -> None:
+        """Write ``text``, minus control characters if ``stripcontrol``."""
+        if self.stripcontrol:
+            text = self.CONTROL.sub("", text)
+        self.write(text)
+
+    def write_word(self) -> None:
+        """Write the working word as an ``ocrx_word`` span and end it.
+
+        Nothing is written when the working text is empty. The text is
+        stripped of surrounding whitespace and HTML-escaped, and goes
+        through ``write_text`` so that ``stripcontrol`` applies to it.
+        """
+        if len(self.working_text) > 0:
+            # Function-scope import keeps this change local to the class.
+            from html import escape
+
+            bold_and_italic_styles = ''
+            if 'Italic' in self.working_font:
+                bold_and_italic_styles = 'font-style: italic; '
+            if 'Bold' in self.working_font:
+                bold_and_italic_styles += 'font-weight: bold; '
+            # The font name lands inside quoted attributes, so escape it.
+            font = escape(self.working_font)
+            # The space before ``class`` separates it from ``style``.
+            self.write("<span style='font:\"%s\"; font-size:%d; %s' "
+                       "class='ocrx_word' title='%s; x_font %s; "
+                       "x_fsize %d'>" %
+                       (font, self.working_size, bold_and_italic_styles,
+                        self.bbox_repr(self.working_bbox),
+                        font, self.working_size))
+            self.write_text(escape(self.working_text.strip(), quote=False))
+            self.write("</span>")
+        self.within_chars = False
+
+    def receive_layout(self, ltpage: LTPage) -> None:
+        """Write ``ltpage`` as nested hOCR page, block, line and word tags."""
+
+        def start_word(item: LTChar) -> None:
+            # Make ``item`` the first character of a new working word.
+            self.within_chars = True
+            self.working_text = item.get_text()
+            self.working_bbox = item.bbox
+            self.working_font = item.fontname
+            self.working_size = item.size
+
+        def render(item: LTItem) -> None:
+            # Any non-character item ends the word being collected.
+            if self.within_chars and not isinstance(item, LTChar):
+                self.write_word()
+            if isinstance(item, LTPage):
+                self.page_bbox = item.bbox
+                self.write(
+                    "<div class='ocr_page' id='%s' title='%s'>\n" %
+                    (item.pageid, self.bbox_repr(item.bbox)))
+                for child in item:
+                    render(child)
+                self.write("</div>\n")
+            elif isinstance(item, LTTextLine):
+                self.write("<span class='ocr_line' title='%s'>" %
+                           ((self.bbox_repr(item.bbox))))
+                for child in item:
+                    render(child)
+                self.write("</span>\n")
+            elif isinstance(item, LTTextBox):
+                self.write(
+                    "<div class='ocr_block' id='%d' title='%s'>\n" %
+                    (item.index, self.bbox_repr(item.bbox)))
+                for child in item:
+                    render(child)
+                self.write("</div>\n")
+            elif isinstance(item, LTChar):
+                text = item.get_text()
+                if not self.within_chars:
+                    start_word(item)
+                elif len(text.strip()) == 0:
+                    # Whitespace ends the word and is written through as is.
+                    self.write_word()
+                    self.write(text)
+                elif (self.working_bbox[1] != item.bbox[1]
+                      or self.working_font != item.fontname
+                      or self.working_size != item.size):
+                    # A different y0, font or size ends the word. write_word
+                    # clears within_chars, so restart with this character;
+                    # otherwise it would be dropped from the output.
+                    self.write_word()
+                    start_word(item)
+                else:
+                    # Same word: append and stretch the box's right edge.
+                    self.working_text += text
+                    self.working_bbox = (
+                        self.working_bbox[0],
+                        self.working_bbox[1],
+                        item.bbox[2],
+                        self.working_bbox[3])
+
+        render(ltpage)
+
+    def close(self) -> None:
+        """Write the footer that ends the hOCR document."""
+        self.write_footer()
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -6,7 +6,7 @@
 from typing import Any, BinaryIO, Container, Iterator, Optional, cast
 
 from .converter import XMLConverter, HTMLConverter, TextConverter, \
-    PDFPageAggregator
+    HOCRConverter, PDFPageAggregator
 from .image import ImageWriter
 from .layout import LAParams, LTPage
 from .pdfdevice import PDFDevice, TagExtractor
@@ -42,8 +42,8 @@ def extract_text_to_fp(
     :param inf: a file-like object to read PDF structure from, such as a
         file handler (using the builtin `open()` function) or a `BytesIO`.
     :param outfp: a file-like object to write the text to.
-    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
-        properly.
+    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
+        Only 'text' works properly.
     :param codec: Text decoding codec
     :param laparams: An LAParams object from pdfminer.layout. Default is None
         but may not layout correctly.
@@ -88,6 +88,10 @@ def extract_text_to_fp(
         device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                layoutmode=layoutmode, laparams=laparams,
                                imagewriter=imagewriter)
+
+    elif output_type == 'hocr':
+        device = HOCRConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                               stripcontrol=strip_control)
 
     elif output_type == 'tag':
         # Binary I/O is required, but we have no good way to test it here.