diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e0c466b..c08b7bf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Changed +- When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99)) ## [0.4.0] ### Added diff --git a/docs/source/example_files/figure.pdf b/docs/source/example_files/figure.pdf new file mode 100644 index 00000000..bd5a95ec Binary files /dev/null and b/docs/source/example_files/figure.pdf differ diff --git a/docs/source/examples/extracting_text_from_figures.rst b/docs/source/examples/extracting_text_from_figures.rst new file mode 100644 index 00000000..7a7e138c --- /dev/null +++ b/docs/source/examples/extracting_text_from_figures.rst @@ -0,0 +1,45 @@ +.. _extracting-text-from-figures: + +Extracting Text From Figures +---------------------------- +PDFs are structured documents, and can contain Figures. By default, PDFMiner.six, and +hence py-pdf-parser, does not extract text from figures. + +You can :download:`download an example here <../example_files/figure.pdf>`. In the +example, there is a figure which contains a red square, and some text. Below the figure +there is some more text. + +By default, the text in the figure will not be included: + +.. code-block:: python + + from py_pdf_parser import load_file + document = load_file("figure.pdf") + print([element.text() for element in document.elements]) + +which results in: + +:: + + ["Here is some text outside of an image"] + +To include the text inside the figure, we must pass the ``all_texts`` layout parameter. +This is documented in the PDFMiner.six documentation, `here +<https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_. 
+ +The layout parameters can be passed to both :meth:`~py_pdf_parser.loaders.load` and +:meth:`~py_pdf_parser.loaders.load_file` as a dictionary to the ``la_params`` argument. + +In our case: + +.. code-block:: python + + from py_pdf_parser import load_file + document = load_file("figure.pdf", la_params={"all_texts": True}) + print([element.text() for element in document.elements]) + +which results in: + +:: + + ["This is some text in an image", "Here is some text outside of an image"] diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 31e0fce3..f69b2e7f 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -7,6 +7,7 @@ Below you can find links to the following examples: - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables. - The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables. - The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page. +- The :ref:`extracting-text-from-figures` example shows how to extract text from figures. .. 
toctree:: @@ -14,4 +15,5 @@ Below you can find links to the following examples: order_summary more_tables element_ordering + extracting_text_from_figures diff --git a/py_pdf_parser/loaders.py b/py_pdf_parser/loaders.py index 3e3e26b5..d0b93f5b 100644 --- a/py_pdf_parser/loaders.py +++ b/py_pdf_parser/loaders.py @@ -3,7 +3,7 @@ import logging from pdfminer.high_level import extract_pages -from pdfminer.layout import LTTextContainer, LAParams +from pdfminer.layout import LTTextContainer, LAParams, LTFigure from .components import PDFDocument @@ -74,6 +74,17 @@ def load( pages: Dict[int, Page] = {} for page in extract_pages(pdf_file, laparams=LAParams(**la_params)): elements = [element for element in page if isinstance(element, LTTextContainer)] + + # If all_texts=True then we may get some text from inside figures + if la_params.get("all_texts"): + figures = (element for element in page if isinstance(element, LTFigure)) + for figure in figures: + elements += [ + element + for element in figure + if isinstance(element, LTTextContainer) + ] + if not elements: logger.warning( f"No elements detected on page {page.pageid}, skipping this page." 
diff --git a/tests/data/image.pdf b/tests/data/image.pdf new file mode 100644 index 00000000..bd5a95ec Binary files /dev/null and b/tests/data/image.pdf differ diff --git a/tests/test_doc_examples/test_extracting_text_from_figures.py b/tests/test_doc_examples/test_extracting_text_from_figures.py new file mode 100644 index 00000000..9a308bfa --- /dev/null +++ b/tests/test_doc_examples/test_extracting_text_from_figures.py @@ -0,0 +1,24 @@ +import os + +from py_pdf_parser.loaders import load_file +from tests.base import BaseTestCase + + +class TestExtractingTextFromFigures(BaseTestCase): + def test_output_is_correct(self): + file_path = os.path.join( + os.path.dirname(__file__), "../../docs/source/example_files/figure.pdf" + ) + + # Without all_texts + document = load_file(file_path) + self.assertListEqual( + [element.text() for element in document.elements], + ["Here is some text outside of an image"], + ) + + document = load_file(file_path, la_params={"all_texts": True}) + self.assertListEqual( + [element.text() for element in document.elements], + ["This is some text in an image", "Here is some text outside of an image"], + ) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 3ef6e90f..81637183 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -17,3 +17,21 @@ def test_load(self): with open(file_path, "rb") as in_file: document = load(in_file) self.assertIsInstance(document, PDFDocument) + + def test_load_with_text_in_image(self): + file_path = os.path.join(os.path.dirname(__file__), "data", "image.pdf") + with open(file_path, "rb") as in_file: + document = load(in_file) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 1) + + with open(file_path, "rb") as in_file: + document = load(in_file, la_params={"all_texts": True}) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 2) + + def test_load_file_with_text_in_image(self): + file_path = 
os.path.join(os.path.dirname(__file__), "data", "image.pdf") + document = load_file(file_path, la_params={"all_texts": True}) + self.assertIsInstance(document, PDFDocument) + self.assertEqual(len(document.elements), 2)