Skip to content

Commit

Permalink
Add HOCRConverter (fixes #650) (#651)
Browse files Browse the repository at this point in the history
* Add HOCRConverter

* Add line to README.md

* Test cicd

* Test cicd 2

* Changes based on review comments

* Remove whitespace changes to CHANGELOG.md

* Remove duplicated html output

* Add link to hocr wiki

* Add tests for extracting hocr and html

Co-authored-by: Pieter Marsman <[email protected]>
  • Loading branch information
richardpaulhudson and pietermarsman authored Aug 14, 2022
1 parent f79ad56 commit 77df431
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Changelog

All notable changes in pdfminer.six will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))

### Fixed

- `ValueError` when BMP images with a 1-bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Features

* Written entirely in Python.
* Parse, analyze, and convert PDF documents.
* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
* PDF-1.7 specification support. (well, almost).
* CJK languages and vertical writing scripts support.
* Various font types (Type1, TrueType, Type3, and CID) support.
Expand Down
177 changes: 177 additions & 0 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from . import utils
from .image import ImageWriter
from .layout import LAParams, LTComponent, TextGroupElement
from .layout import LTAnno
from .layout import LTChar
from .layout import LTContainer
from .layout import LTCurve
Expand Down Expand Up @@ -821,3 +822,179 @@ def render(item: LTItem) -> None:
def close(self) -> None:
    """Finish the output by emitting the document footer."""
    self.write_footer()


class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF.

    The converter walks the analysed page layout and emits one ``ocr_page``
    element per page, ``ocr_block`` per text box, ``ocr_line`` per text line
    and ``ocrx_word`` per run of consecutive non-whitespace characters that
    share the same baseline, font name and font size.
    """

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from the output when ``stripcontrol`` is set
    # (tab, line feed and carriage return are kept).
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the hOCR header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; written as bytes when ``codec`` is
            truthy, as text otherwise.
        :param codec: encoding applied to every written string; a falsy value
            disables encoding.
        :param pageno: forwarded to ``PDFConverter``.
        :param laparams: forwarded to ``PDFConverter``.
        :param stripcontrol: when True, characters matching ``CONTROL`` are
            removed from the extracted text before it is written.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
        )
        self.stripcontrol = stripcontrol
        # True while a word is being accumulated in the ``working_*`` fields.
        self.within_chars = False
        # State of the word currently being accumulated. Initialised here so
        # that every method can rely on the attributes existing.
        self.working_text = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font = ""
        self.working_size: float = 0
        # Bounding box of the page being rendered; set in receive_layout().
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property string.

        Coordinates are truncated to integers and the y-axis is flipped
        against the top edge of the current page (``self.page_bbox[3]``).
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write ``text`` to the output, encoding it first if a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``/``<head>`` markup and open ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n"
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # Declare the encoding the bytes are actually written in rather than
        # always claiming utf-8; fall back to utf-8 when no codec is set.
        self.write(
            "<meta http-equiv='Content-Type' "
            "content='text/html;charset=%s' />\n" % (self.codec or "utf-8")
        )
        self.write(
            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing markup, including a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, removing control characters if ``stripcontrol`` is set."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and end the word.

        Nothing is written when no text has been accumulated. The word text
        and the font name are HTML-escaped, and control characters are
        removed from the text when ``stripcontrol`` is set.
        """
        # Local import: keeps this block self-contained without touching the
        # module's import section.
        from html import escape

        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            # The attributes are single-quoted, so quotes must be escaped too.
            font = escape(self.working_font, quote=True)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    font,
                    self.working_size,
                    escape(word.strip(), quote=True),
                )
            )
        self.within_chars = False

    def _start_word(self, item: LTChar) -> None:
        """Begin accumulating a new word whose first character is ``item``."""
        self.within_chars = True
        self.working_text = item.get_text()
        self.working_bbox = item.bbox
        self.working_font = item.fontname
        self.working_size = item.size

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as hOCR markup."""

        def render(item: LTItem) -> None:
            # An annotation (virtual character inserted by layout analysis)
            # terminates the word currently being accumulated.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % self.bbox_repr(item.bbox)
                )
                for child_line in item:
                    render(child_line)
                # Defensive flush: never let a pending word leak out of the
                # line element if the line did not end with an LTAnno.
                if self.within_chars:
                    self.write_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                text = item.get_text()
                if not self.within_chars:
                    self._start_word(item)
                elif len(text.strip()) == 0:
                    # Whitespace ends the word and is passed through as-is
                    # (subject to control-character stripping).
                    self.write_word()
                    self.write_text(text)
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # Baseline, font or size changed: flush the old word and
                    # start a fresh one with this character, so the old text
                    # is not carried over and this character is not lost.
                    self.write_word()
                    self._start_word(item)
                else:
                    self.working_text += text
                    # Extend the word's box to the right edge of this char.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
17 changes: 14 additions & 3 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast

from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from .converter import (
XMLConverter,
HTMLConverter,
TextConverter,
PDFPageAggregator,
HOCRConverter,
)
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
Expand Down Expand Up @@ -41,8 +47,8 @@ def extract_text_to_fp(
:param inf: a file-like object to read PDF structure from, such as a
file handler (using the builtin `open()` function) or a `BytesIO`.
:param outfp: a file-like object to write the text to.
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
properly.
:param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
Only 'text' works properly.
:param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. Default is None
but may not layout correctly.
Expand Down Expand Up @@ -100,6 +106,11 @@ def extract_text_to_fp(
imagewriter=imagewriter,
)

elif output_type == "hocr":
device = HOCRConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
)

elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def test_encryption_rc4_40(self):
def test_encryption_rc4_128(self):
    """Smoke test: process encryption/rc4-128.pdf with the `-P foo` option."""
    sample = "encryption/rc4-128.pdf"
    options = "-P foo"
    run(sample, options)

def test_html_simple1(self):
    """Smoke test: process simple1.pdf with the `-t html` option."""
    sample = "simple1.pdf"
    options = "-t html"
    run(sample, options)

def test_hocr_simple1(self):
    """Smoke test: process simple1.pdf with the `-t hocr` option."""
    sample = "simple1.pdf"
    options = "-t hocr"
    run(sample, options)


class TestDumpImages:
@staticmethod
Expand Down

0 comments on commit 77df431

Please sign in to comment.