[components] Add element_ordering argument to PDFDocument

Closes #94
jstockwin · Jun 22, 2020 · 9832d2c · 9832d2c
1 parent 4b75426
commit 9832d2c
Show file tree

Hide file tree

Showing 5 changed files with 106 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Added `__len__` and `__repr__` functions to the Section class. ([#90](https://github.com/jstockwin/py-pdf-parser/pull/90))
 - Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89))
+- You can now specify `element_ordering` when instantiating a PDFDocument. This defaults to the old behaviour of left to right, top to bottom. ([#95](https://github.com/jstockwin/py-pdf-parser/pull/95))
+
 ### Changed
 - Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88))
 

diff --git a/py_pdf_parser/components.py b/py_pdf_parser/components.py
@@ -1,7 +1,8 @@
-from typing import Dict, List, Set, Optional, Union, TYPE_CHECKING
+from typing import Callable, Dict, List, Set, Optional, Union, TYPE_CHECKING
 
 import re
 from collections import Counter, defaultdict
+from enum import Enum, auto
 from itertools import chain
 
 from .common import BoundingBox
@@ -14,6 +15,29 @@
     from pdfminer.layout import LTComponent
 
 
+class ElementOrdering(Enum):
+    LEFT_TO_RIGHT_TOP_TO_BOTTOM = auto()
+    RIGHT_TO_LEFT_TOP_TO_BOTTOM = auto()
+    TOP_TO_BOTTOM_LEFT_TO_RIGHT = auto()
+    TOP_TO_BOTTOM_RIGHT_TO_LEFT = auto()
+
+
+_ELEMENT_ORDERING_FUNCTIONS: Dict["ElementOrdering", Callable[[List], List]] = {
+    ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM: lambda elements: sorted(
+        elements, key=lambda elem: (-elem.y0, elem.x0)
+    ),
+    ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM: lambda elements: sorted(
+        elements, key=lambda elem: (-elem.y0, -elem.x0)
+    ),
+    ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT: lambda elements: sorted(
+        elements, key=lambda elem: (elem.x0, -elem.y0)
+    ),
+    ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT: lambda elements: sorted(
+        elements, key=lambda elem: (-elem.x0, -elem.y0)
+    ),
+}
+
+
 class PDFPage:
     """
     A representation of a page within the `PDFDocument`.
@@ -325,6 +349,11 @@ class PDFDocument:
                 Default: 0.
         font_size_precision (int): How much rounding to apply to the font size. The font
             size will be rounded to this many decimal places.
+        element_ordering (ElementOrdering or callable, optional): An ordering function
+            for the elements. Either a member of the ElementOrdering Enum, or a callable
+            which takes a list of elements and returns an ordered list of elements. This
+            will be called separately for each page. Note that the elements in this case
+            will be PDFMiner elements, and not PDFElements from this package.
 
     Attributes:
         pages (list): A list of all `PDFPages` in the document.
@@ -337,7 +366,8 @@ class PDFDocument:
     number_of_pages: int
     page_numbers: List[int]
     sectioning: "Sectioning"
-    # _element_list will contain all elements, sorted from top to bottom, left to right.
+    # _element_list will contain all elements, sorted according to element_ordering
+    # (default left to right, top to bottom).
     _element_list: List[PDFElement]
     # _element_indexes_by_font will be a caching of fonts to elements indexes but it
     # will be built as needed (while filtering by fonts), not on document load.
@@ -357,6 +387,9 @@ def __init__(
         font_mapping_is_regex: bool = False,
         regex_flags: Union[int, re.RegexFlag] = 0,
         font_size_precision: int = 1,
+        element_ordering: Union[
+            "ElementOrdering", Callable[[List], List]
+        ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
     ):
         self.sectioning = Sectioning(self)
         self._element_list = []
@@ -369,7 +402,11 @@ def __init__(
         idx = 0
         for page_number, page in sorted(pages.items()):
             first_element = None
-            for element in sorted(page.elements, key=lambda elem: (-elem.y0, elem.x0)):
+            if isinstance(element_ordering, ElementOrdering):
+                sort_func = _ELEMENT_ORDERING_FUNCTIONS[element_ordering]
+            else:
+                sort_func = element_ordering
+            for element in sort_func(page.elements):
                 pdf_element = PDFElement(
                     document=self,
                     element=element,

diff --git a/py_pdf_parser/filtering.py b/py_pdf_parser/filtering.py
@@ -684,7 +684,8 @@ def before(self, element: "PDFElement", inclusive: bool = False) -> "ElementList
         Returns all elements before the specified element.
 
         By before, we mean preceding elements according to their index. The PDFDocument
-        will order elements left to right, top to bottom (as you would normally read).
+        will order elements according to the specified element_ordering (which defaults
+        to left to right, top to bottom).
 
         Args:
             element (PDFElement): The element in question.
@@ -704,7 +705,8 @@ def after(self, element: "PDFElement", inclusive: bool = False) -> "ElementList"
         Returns all elements after the specified element.
 
         By after, we mean succeeding elements according to their index. The PDFDocument
-        will order elements left to right, top to bottom (as you would normally read).
+        will order elements according to the specified element_ordering (which defaults
+        to left to right, top to bottom).
 
         Args:
             element (PDFElement): The element in question.
@@ -729,7 +731,8 @@ def between(
         Returns all elements between the start and end elements.
 
         This is done according to the element indexes. The PDFDocument will order
-        elements left to right, top to bottom (as you would normally read).
+        elements according to the specified element_ordering (which defaults
+        to left to right, top to bottom).
 
         This is the same as applying `before` with `start_element` and `after` with
         `end_element`.

diff --git a/tests/test_components.py b/tests/test_components.py
@@ -3,13 +3,13 @@
 from ddt import ddt, data
 
 from py_pdf_parser.common import BoundingBox
-from py_pdf_parser.components import PDFDocument
+from py_pdf_parser.components import PDFDocument, ElementOrdering
 from py_pdf_parser.filtering import ElementList
 from py_pdf_parser.loaders import Page
 from py_pdf_parser.exceptions import NoElementsOnPageError, PageNotFoundError
 
 from .base import BaseTestCase
-from .utils import create_pdf_element, FakePDFMinerTextElement
+from .utils import create_pdf_element, create_pdf_document, FakePDFMinerTextElement
 
 
 @ddt
@@ -286,3 +286,56 @@ def test_document(self):
     def test_document_with_blank_page(self):
         with self.assertRaises(NoElementsOnPageError):
             PDFDocument(pages={1: Page(elements=[], width=100, height=100)})
+
+    def test_element_ordering(self):
+        #       elem_1      elem_2
+        #       elem_3      elem_4
+        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+
+        # Check default: left to right, top to bottom
+        document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
+        self.assert_original_element_list_equal(
+            [elem_1, elem_2, elem_3, elem_4], document.elements
+        )
+
+        # Check other presets
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4],
+            element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
+        )
+        self.assert_original_element_list_equal(
+            [elem_2, elem_1, elem_4, elem_3], document.elements
+        )
+
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4],
+            element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
+        )
+        self.assert_original_element_list_equal(
+            [elem_1, elem_3, elem_2, elem_4], document.elements
+        )
+
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4],
+            element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
+        )
+        self.assert_original_element_list_equal(
+            [elem_2, elem_4, elem_1, elem_3], document.elements
+        )
+
+        # Check custom function
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4],
+            element_ordering=lambda elements: [
+                elements[0],
+                elements[3],
+                elements[1],
+                elements[2],
+            ],
+        )
+        self.assert_original_element_list_equal(
+            [elem_1, elem_4, elem_2, elem_3], document.elements
+        )
diff --git a/tests/utils.py b/tests/utils.py
@@ -2,7 +2,7 @@
 
 from typing import NamedTuple, Dict, List, Optional, Union
 
-from py_pdf_parser.components import PDFElement, PDFDocument
+from py_pdf_parser.components import PDFElement, PDFDocument, ElementOrdering
 from py_pdf_parser.sectioning import Section
 from pdfminer.layout import LTComponent
 
@@ -94,6 +94,7 @@ def create_pdf_document(
     font_mapping_is_regex: bool = False,
     regex_flags: Union[int, re.RegexFlag] = 0,
     font_size_precision: int = 1,
+    element_ordering: ElementOrdering = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
 ) -> "PDFDocument":
     """
     Creates a PDF document with the given elements.
@@ -114,6 +115,7 @@ def create_pdf_document(
         font_mapping_is_regex=font_mapping_is_regex,
         regex_flags=regex_flags,
         font_size_precision=font_size_precision,
+        element_ordering=element_ordering,
     )