[loaders] Include text from figures when all_texts=True

Closes #98
jstockwin · Jun 23, 2020 · 14dbd7d · 14dbd7d
1 parent a723235
commit 14dbd7d
Show file tree

Hide file tree

Showing 8 changed files with 103 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Changed
+- When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99))
 
 ## [0.4.0]
 ### Added

diff --git a/docs/source/example_files/figure.pdf b/docs/source/example_files/figure.pdf
diff --git a/docs/source/examples/extracting_text_from_figures.rst b/docs/source/examples/extracting_text_from_figures.rst
@@ -0,0 +1,45 @@
+.. _extracting-text-from-figures:
+
+Extracting Text From Figures
+----------------------------
+PDFs are structured documents, and can contain figures. By default, PDFMiner.six (and
+hence py-pdf-parser) does not extract text from figures.
+
+You can :download:`download an example here </example_files/figure.pdf>`. In the
+example, there is a figure which contains a red square, and some text. Below the figure
+there is some more text.
+
+By default, the text in the figure will not be included:
+
+.. code-block:: python
+
+   from py_pdf_parser import load_file
+   document = load_file("figure.pdf")
+   print([element.text() for element in document.elements])
+
+which results in:
+
+::
+
+   ["Here is some text outside of an image"]
+
+To include the text inside the figure, we must pass the ``all_texts`` layout parameter.
+This is documented in the PDFMiner.six documentation, `here
+<https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_.
+
+The layout parameters can be passed to both :meth:`~py_pdf_parser.loaders.load` and
+:meth:`~py_pdf_parser.loaders.load_file` as a dictionary to the ``la_params`` argument.
+
+In our case:
+
+.. code-block:: python
+
+   from py_pdf_parser import load_file
+   document = load_file("figure.pdf", la_params={"all_texts": True})
+   print([element.text() for element in document.elements])
+
+which results in:
+
+::
+
+   ["This is some text in an image", "Here is some text outside of an image"]
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
@@ -7,11 +7,13 @@ Below you can find links to the following examples:
 - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables.
 - The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables.
 - The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page.
+- The :ref:`extracting-text-from-figures` example shows how to extract text from figures.
 
 .. toctree::
 
    simple_memo
    order_summary
    more_tables
    element_ordering
+   extracting_text_from_figures
 
diff --git a/py_pdf_parser/loaders.py b/py_pdf_parser/loaders.py
@@ -3,7 +3,7 @@
 import logging
 
 from pdfminer.high_level import extract_pages
-from pdfminer.layout import LTTextContainer, LAParams
+from pdfminer.layout import LTTextContainer, LAParams, LTFigure
 
 from .components import PDFDocument
 
@@ -74,6 +74,17 @@ def load(
     pages: Dict[int, Page] = {}
     for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
         elements = [element for element in page if isinstance(element, LTTextContainer)]
+
+        # If all_texts=True then we may get some text from inside figures
+        if la_params.get("all_texts"):
+            figures = (element for element in page if isinstance(element, LTFigure))
+            for figure in figures:
+                elements += [
+                    element
+                    for element in figure
+                    if isinstance(element, LTTextContainer)
+                ]
+
         if not elements:
             logger.warning(
                 f"No elements detected on page {page.pageid}, skipping this page."

diff --git a/tests/data/image.pdf b/tests/data/image.pdf
diff --git a/tests/test_doc_examples/test_extracting_text_from_figures.py b/tests/test_doc_examples/test_extracting_text_from_figures.py
@@ -0,0 +1,24 @@
+import os
+
+from py_pdf_parser.loaders import load_file
+from tests.base import BaseTestCase
+
+
+class TestExtractingTextFromFigures(BaseTestCase):
+    def test_output_is_correct(self):
+        file_path = os.path.join(
+            os.path.dirname(__file__), "../../docs/source/example_files/figure.pdf"
+        )
+
+        # Without all_texts
+        document = load_file(file_path)
+        self.assertListEqual(
+            [element.text() for element in document.elements],
+            ["Here is some text outside of an image"],
+        )
+
+        document = load_file(file_path, la_params={"all_texts": True})
+        self.assertListEqual(
+            [element.text() for element in document.elements],
+            ["This is some text in an image", "Here is some text outside of an image"],
+        )
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -17,3 +17,21 @@ def test_load(self):
         with open(file_path, "rb") as in_file:
             document = load(in_file)
         self.assertIsInstance(document, PDFDocument)
+
+    def test_load_with_text_in_image(self):
+        file_path = os.path.join(os.path.dirname(__file__), "data", "image.pdf")
+        with open(file_path, "rb") as in_file:
+            document = load(in_file)
+        self.assertIsInstance(document, PDFDocument)
+        self.assertEqual(len(document.elements), 1)
+
+        with open(file_path, "rb") as in_file:
+            document = load(in_file, la_params={"all_texts": True})
+        self.assertIsInstance(document, PDFDocument)
+        self.assertEqual(len(document.elements), 2)
+
+    def test_load_file_with_text_in_image(self):
+        file_path = os.path.join(os.path.dirname(__file__), "data", "image.pdf")
+        document = load_file(file_path, la_params={"all_texts": True})
+        self.assertIsInstance(document, PDFDocument)
+        self.assertEqual(len(document.elements), 2)