diff --git a/CHANGELOG.md b/CHANGELOG.md index 46f4fdfb..4b7e923d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added `__len__` and `__repr__` functions to the Section class. ([#90](https://github.com/jstockwin/py-pdf-parser/pull/90)) - Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89)) +- You can now specify `element_ordering` when instantiating a PDFDocument. This defaults to the old behaviour of left to right, top to bottom. ([#95](https://github.com/jstockwin/py-pdf-parser/pull/95)) + ### Changed - Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88)) diff --git a/docs/source/example_files/columns.pdf b/docs/source/example_files/columns.pdf new file mode 100644 index 00000000..de4db28a Binary files /dev/null and b/docs/source/example_files/columns.pdf differ diff --git a/docs/source/example_files/grid.pdf b/docs/source/example_files/grid.pdf new file mode 100644 index 00000000..a0107f00 Binary files /dev/null and b/docs/source/example_files/grid.pdf differ diff --git a/docs/source/examples/element_ordering.rst b/docs/source/examples/element_ordering.rst new file mode 100644 index 00000000..d28ed2c5 --- /dev/null +++ b/docs/source/examples/element_ordering.rst @@ -0,0 +1,163 @@ +.. _element-ordering: + +Element Ordering +---------------- + +In this example, we see how to specify a custom ordering for the elements. + +For this we will use a simple pdf, which has a single element in each corner of the +page. You can :download:`download the example here </example_files/grid.pdf>`. + + +Default +....... + +The default element ordering is left to right, top to bottom. + +.. 
code-block:: python + + from py_pdf_parser.loaders import load_file + + file_path = "grid.pdf" + + # Default - left to right, top to bottom + document = load_file(file_path) + print([element.text() for element in document.elements]) + +This results in +:: + + ['Top Left', 'Top Right', 'Bottom Left', 'Bottom Right'] + +Presets +....... + +There are also preset orderings for ``right to left, top to bottom``, +``top to bottom, left to right``, and ``top to bottom, right to left``. You can use +these by importing the :class:`~py_pdf_parser.components.ElementOrdering` class from +:py:mod:`py_pdf_parser.components` and passing these as the ``element_ordering`` +argument to :class:`~py_pdf_parser.components.PDFDocument`. Note that keyword arguments +to :meth:`~py_pdf_parser.loaders.load` and :meth:`~py_pdf_parser.loaders.load_file` get +passed through to the :class:`~py_pdf_parser.components.PDFDocument`. + +.. code-block:: python + + from py_pdf_parser.loaders import load_file + from py_pdf_parser.components import ElementOrdering + + # Preset - right to left, top to bottom + document = load_file( + file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM + ) + print([element.text() for element in document.elements]) + + # Preset - top to bottom, left to right + document = load_file( + file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT + ) + print([element.text() for element in document.elements]) + + # Preset - top to bottom, right to left + document = load_file( + file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT + ) + print([element.text() for element in document.elements]) + +which results in + +:: + + ['Top Right', 'Top Left', 'Bottom Right', 'Bottom Left'] + ['Bottom Left', 'Top Left', 'Bottom Right', 'Top Right'] + ['Top Right', 'Bottom Right', 'Top Left', 'Bottom Left'] + +Custom Ordering +............... 
+ +If none of the presets give an ordering you are looking for, you can also pass a +callable as the ``element_ordering`` argument of +:class:`~py_pdf_parser.components.PDFDocument`. This callable will be given a list of +elements for each page, and should return a list of the same elements, in the desired +order. + +.. important:: + + The elements which get passed to your function will be PDFMiner.six elements, and NOT + class :class:`~py_pdf_parser.components.PDFElement`. You can access the ``x0``, + ``x1``, ``y0``, ``y1`` directly, and extract the text using ``get_text()``. Other + options are available: please familiarise yourself with the PDFMiner.six + documentation. + +.. note:: + + Your function will be called multiple times, once for each page of the document. + Elements will always be considered in order of increasing page number, your function + only controls the ordering within each page. + +For example, if we wanted to implement an ordering which is bottom to top, left to right +then we can do this as follows: + +.. code-block:: python + + from py_pdf_parser.loaders import load_file + + # Custom - bottom to top, left to right + def ordering_function(elements): + """ + Note: Elements will be PDFMiner.six elements. The x axis is positive as you go left + to right, and the y axis is positive as you go bottom to top, and hence we can + simply sort according to this. + """ + return sorted(elements, key=lambda elem: (elem.x0, elem.y0)) + + + document = load_file(file_path, element_ordering=ordering_function) + print([element.text() for element in document.elements]) + +which results in + +:: + + ['Bottom Left', 'Top Left', 'Bottom Right', 'Top Right'] + +Multiple Columns +................ + +Finally, suppose our PDF has multiple columns, like +:download:`this example </example_files/columns.pdf>`. 
+ +If we don't specify an ``element_ordering``, the elements will be extracted in the +following order: + +:: + + ['Column 1 Title', 'Column 2 Title', 'Here is some column 1 text.', 'Here is some column 2 text.', 'Col 1 left', 'Col 1 right', 'Col 2 left', 'Col 2 right'] + +If we visualise this document +(see the :ref:`simple-memo` example if you don't know how to do this), then we can see +that the column divider is at an ``x`` value of about 300. Using this information, we +can specify a custom ordering function which will order the elements left to right, +top to bottom, but in each column individually. + +.. code-block:: python + + from py_pdf_parser.loaders import load_file + + document = load_file("columns.pdf") + + def column_ordering_function(elements): + """ + The first entry in the key is False for column 1, and True for column 2. The second + and third keys just give left to right, top to bottom. + """ + return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0)) + + + document = load_file("columns.pdf", element_ordering=column_ordering_function) + print([element.text() for element in document.elements]) + +which returns the elements in the correct order: + +:: + + ['Column 1 Title', 'Here is some column 1 text.', 'Col 1 left', 'Col 1 right', 'Column 2 Title', 'Here is some column 2 text.', 'Col 2 left', 'Col 2 right'] diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 873761a8..31e0fce3 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -6,10 +6,12 @@ Below you can find links to the following examples: - The :ref:`simple-memo` example shows the very basics of using py-pdf-parser. You will see how to load a pdf document, start filtering the elements, and extract text from certain elements in the document. - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables. 
- The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables. +- The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page. .. toctree:: simple_memo order_summary more_tables + element_ordering diff --git a/py_pdf_parser/components.py b/py_pdf_parser/components.py index 233ea355..9b40e9c8 100644 --- a/py_pdf_parser/components.py +++ b/py_pdf_parser/components.py @@ -1,7 +1,8 @@ -from typing import Dict, List, Set, Optional, Union, TYPE_CHECKING +from typing import Callable, Dict, List, Set, Optional, Union, TYPE_CHECKING import re from collections import Counter, defaultdict +from enum import Enum, auto from itertools import chain from .common import BoundingBox @@ -14,6 +15,33 @@ from pdfminer.layout import LTComponent +class ElementOrdering(Enum): + """ + A class enumerating the available presets for element_ordering. + """ + + LEFT_TO_RIGHT_TOP_TO_BOTTOM = auto() + RIGHT_TO_LEFT_TOP_TO_BOTTOM = auto() + TOP_TO_BOTTOM_LEFT_TO_RIGHT = auto() + TOP_TO_BOTTOM_RIGHT_TO_LEFT = auto() + + +_ELEMENT_ORDERING_FUNCTIONS: Dict[ElementOrdering, Callable[[List], List]] = { + ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM: lambda elements: sorted( + elements, key=lambda elem: (-elem.y0, elem.x0) + ), + ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM: lambda elements: sorted( + elements, key=lambda elem: (-elem.y0, -elem.x0) + ), + ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT: lambda elements: sorted( + elements, key=lambda elem: (elem.x0, -elem.y0) + ), + ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT: lambda elements: sorted( + elements, key=lambda elem: (-elem.x0, -elem.y0) + ), +} + + class PDFPage: """ A representation of a page within the `PDFDocument`. @@ -325,6 +353,11 @@ class PDFDocument: Default: 0. font_size_precision (int): How much rounding to apply to the font size. The font size will be rounded to this many decimal places. 
+ element_ordering (ElementOrdering or callable, optional): An ordering function + for the elements. Either a member of the ElementOrdering Enum, or a callable + which takes a list of elements and returns an ordered list of elements. This + will be called separately for each page. Note that the elements in this case + will be PDFMiner elements, and not PDFElements from this package. Attributes: pages (list): A list of all `PDFPages` in the document. @@ -337,7 +370,8 @@ class PDFDocument: number_of_pages: int page_numbers: List[int] sectioning: "Sectioning" - # _element_list will contain all elements, sorted from top to bottom, left to right. + # _element_list will contain all elements, sorted according to element_ordering + # (default left to right, top to bottom). _element_list: List[PDFElement] # _element_indexes_by_font will be a caching of fonts to elements indexes but it # will be built as needed (while filtering by fonts), not on document load. @@ -357,6 +391,9 @@ def __init__( font_mapping_is_regex: bool = False, regex_flags: Union[int, re.RegexFlag] = 0, font_size_precision: int = 1, + element_ordering: Union[ + ElementOrdering, Callable[[List], List] + ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM, ): self.sectioning = Sectioning(self) self._element_list = [] @@ -369,7 +406,11 @@ def __init__( idx = 0 for page_number, page in sorted(pages.items()): first_element = None - for element in sorted(page.elements, key=lambda elem: (-elem.y0, elem.x0)): + if isinstance(element_ordering, ElementOrdering): + sort_func = _ELEMENT_ORDERING_FUNCTIONS[element_ordering] + else: + sort_func = element_ordering + for element in sort_func(page.elements): pdf_element = PDFElement( document=self, element=element, diff --git a/py_pdf_parser/filtering.py b/py_pdf_parser/filtering.py index 248a7279..51fc5554 100644 --- a/py_pdf_parser/filtering.py +++ b/py_pdf_parser/filtering.py @@ -684,7 +684,8 @@ def before(self, element: "PDFElement", inclusive: bool = False) -> 
"ElementList Returns all elements before the specified element. By before, we mean preceding elements according to their index. The PDFDocument - will order elements left to right, top to bottom (as you would normally read). + will order elements according to the specified element_ordering (which defaults + to left to right, top to bottom). Args: element (PDFElement): The element in question. @@ -704,7 +705,8 @@ def after(self, element: "PDFElement", inclusive: bool = False) -> "ElementList" Returns all elements after the specified element. By after, we mean succeeding elements according to their index. The PDFDocument - will order elements left to right, top to bottom (as you would normally read). + will order elements according to the specified element_ordering (which defaults + to left to right, top to bottom). Args: element (PDFElement): The element in question. @@ -729,7 +731,8 @@ def between( Returns all elements between the start and end elements. This is done according to the element indexes. The PDFDocument will order - elements left to right, top to bottom (as you would normally read). + elements according to the specified element_ordering (which defaults + to left to right, top to bottom). This is the same as applying `before` with `start_element` and `after` with `end_element`. 
diff --git a/tests/base.py b/tests/base.py index 12a7e5ce..fa2f7b3d 100644 --- a/tests/base.py +++ b/tests/base.py @@ -37,7 +37,7 @@ def assert_original_element_list_list_equal( def assert_original_element_list_equal( self, original_element_list: List[Optional["LTComponent"]], - element_list: List[Optional["PDFElement"]], + element_list: Union[List[Optional["PDFElement"]], "ElementList"], ): self.assertEqual(len(original_element_list), len(element_list)) for original_element, element in zip(original_element_list, element_list): diff --git a/tests/test_components.py b/tests/test_components.py index ddb7aed9..891314a8 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -3,13 +3,13 @@ from ddt import ddt, data from py_pdf_parser.common import BoundingBox -from py_pdf_parser.components import PDFDocument +from py_pdf_parser.components import PDFDocument, ElementOrdering from py_pdf_parser.filtering import ElementList from py_pdf_parser.loaders import Page from py_pdf_parser.exceptions import NoElementsOnPageError, PageNotFoundError from .base import BaseTestCase -from .utils import create_pdf_element, FakePDFMinerTextElement +from .utils import create_pdf_element, create_pdf_document, FakePDFMinerTextElement @ddt @@ -286,3 +286,56 @@ def test_document(self): def test_document_with_blank_page(self): with self.assertRaises(NoElementsOnPageError): PDFDocument(pages={1: Page(elements=[], width=100, height=100)}) + + def test_element_ordering(self): + # elem_1 elem_2 + # elem_3 elem_4 + elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)) + elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)) + elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)) + elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)) + + # Check default: left to right, top to bottom + document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4]) + self.assert_original_element_list_equal( + [elem_1, 
elem_2, elem_3, elem_4], document.elements + ) + + # Check other presets + document = create_pdf_document( + elements=[elem_1, elem_2, elem_3, elem_4], + element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM, + ) + self.assert_original_element_list_equal( + [elem_2, elem_1, elem_4, elem_3], document.elements + ) + + document = create_pdf_document( + elements=[elem_1, elem_2, elem_3, elem_4], + element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT, + ) + self.assert_original_element_list_equal( + [elem_1, elem_3, elem_2, elem_4], document.elements + ) + + document = create_pdf_document( + elements=[elem_1, elem_2, elem_3, elem_4], + element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT, + ) + self.assert_original_element_list_equal( + [elem_2, elem_4, elem_1, elem_3], document.elements + ) + + # Check custom function + document = create_pdf_document( + elements=[elem_1, elem_2, elem_3, elem_4], + element_ordering=lambda elements: [ + elements[0], + elements[3], + elements[1], + elements[2], + ], + ) + self.assert_original_element_list_equal( + [elem_1, elem_4, elem_2, elem_3], document.elements + ) diff --git a/tests/test_doc_examples/test_element_ordering.py b/tests/test_doc_examples/test_element_ordering.py new file mode 100644 index 00000000..ec7939aa --- /dev/null +++ b/tests/test_doc_examples/test_element_ordering.py @@ -0,0 +1,100 @@ +import os + +from py_pdf_parser.components import ElementOrdering +from py_pdf_parser.loaders import load_file + +from tests.base import BaseTestCase + + +class TestSimpleMemo(BaseTestCase): + def test_output_is_correct(self): + file_path = os.path.join( + os.path.dirname(__file__), "../../docs/source/example_files/grid.pdf" + ) + + # Default - left to right, top to bottom + document = load_file(file_path) + self.assertListEqual( + [element.text() for element in document.elements], + ["Top Left", "Top Right", "Bottom Left", "Bottom Right"], + ) + + # Preset - right to left, top to bottom + document = 
load_file( + file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM + ) + self.assertListEqual( + [element.text() for element in document.elements], + ["Top Right", "Top Left", "Bottom Right", "Bottom Left"], + ) + + # Preset - top to bottom, left to right + document = load_file( + file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT + ) + self.assertListEqual( + [element.text() for element in document.elements], + ["Bottom Left", "Top Left", "Bottom Right", "Top Right"], + ) + + # Preset - top to bottom, right to left + document = load_file( + file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT + ) + self.assertListEqual( + [element.text() for element in document.elements], + ["Top Right", "Bottom Right", "Top Left", "Bottom Left"], + ) + + # Custom - bottom to top, left to right + def ordering_function(elements): + return sorted(elements, key=lambda elem: (elem.x0, elem.y0)) + + document = load_file(file_path, element_ordering=ordering_function) + self.assertListEqual( + [element.text() for element in document.elements], + ["Bottom Left", "Top Left", "Bottom Right", "Top Right"], + ) + + # Custom - This PDF has columns! + # TODO: CHANGE PATH! + file_path = os.path.join( + os.path.dirname(__file__), "../../docs/source/example_files/columns.pdf" + ) + + # Default - left to right, top to bottom + document = load_file(file_path) + self.assertListEqual( + [element.text() for element in document.elements], + [ + "Column 1 Title", + "Column 2 Title", + "Here is some column 1 text.", + "Here is some column 2 text.", + "Col 1 left", + "Col 1 right", + "Col 2 left", + "Col 2 right", + ], + ) + + # Visualise, and we can see that the middle is at around x = 300. 
+ # visualise(document) + + def column_ordering_function(elements): + return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0)) + + document = load_file(file_path, element_ordering=column_ordering_function) + self.assertListEqual( + [element.text() for element in document.elements], + [ + "Column 1 Title", + "Here is some column 1 text.", + "Col 1 left", + "Col 1 right", + "Column 2 Title", + "Here is some column 2 text.", + "Col 2 left", + "Col 2 right", + ], + ) diff --git a/tests/utils.py b/tests/utils.py index 8bb2fe85..cf204572 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 +1,8 @@ import re -from typing import NamedTuple, Dict, List, Optional, Union +from typing import NamedTuple, Callable, Dict, List, Optional, Union -from py_pdf_parser.components import PDFElement, PDFDocument +from py_pdf_parser.components import PDFElement, PDFDocument, ElementOrdering from py_pdf_parser.sectioning import Section from pdfminer.layout import LTComponent @@ -94,6 +94,9 @@ def create_pdf_document( font_mapping_is_regex: bool = False, regex_flags: Union[int, re.RegexFlag] = 0, font_size_precision: int = 1, + element_ordering: Union[ + ElementOrdering, Callable[[List], List] + ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM, ) -> "PDFDocument": """ Creates a PDF document with the given elements. @@ -114,6 +117,7 @@ def create_pdf_document( font_mapping_is_regex=font_mapping_is_regex, regex_flags=regex_flags, font_size_precision=font_size_precision, + element_ordering=element_ordering, )