[filtering] Add new move_forwards_from and move_backwards_from functions

jstockwin · Jul 9, 2020 · 568c458 · 568c458
1 parent d5448d2
commit 568c458
Show file tree

Hide file tree

Showing 4 changed files with 144 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- New functions on `ElementList`, `move_forwards_from` and `move_backwards_from`, to allow moving forwards and backwards from a certain element in the list easily. ([#113](https://github.com/jstockwin/py-pdf-parser/pull/113))
+
 ### Changed
 - When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99))
 

diff --git a/py_pdf_parser/exceptions.py b/py_pdf_parser/exceptions.py
@@ -20,6 +20,10 @@ class MultipleElementsFoundError(PDFParserError):
     pass
 
 
+class ElementOutOfRangeError(PDFParserError):
+    """
+    Raised when moving through an `ElementList` (see `move_forwards_from` and
+    `move_backwards_from`) would go past the start or end of the list and the
+    move was not requested with `capped=True`.
+    """
+
+    pass
+
+
 # Sectioning
 class InvalidSectionError(PDFParserError):
     pass

diff --git a/py_pdf_parser/filtering.py b/py_pdf_parser/filtering.py
@@ -13,6 +13,7 @@
 
 from .common import BoundingBox
 from .exceptions import (
+    ElementOutOfRangeError,
     NoElementFoundError,
     MultipleElementsFoundError,
     SectionNotFoundError,
@@ -851,6 +852,71 @@ def remove_elements(self, *elements: "PDFElement") -> "ElementList":
             self.document, self.indexes - set(element._index for element in elements)
         )
 
+    def move_forwards_from(
+        self, element: "PDFElement", count: int = 1, capped: bool = False
+    ) -> "PDFElement":
+        """
+        Returns the element in the element list obtained by moving forwards from
+        `element` by `count`.
+
+        Args:
+            element (PDFElement): The element to start at.
+            count (int, optional): How many elements to move from `element`. The default
+                of 1 will move forwards by one element. Passing 0 will simply return the
+                element itself. You can also pass negative integers to move backwards.
+            capped (bool, optional): By default (False), if the count is high enough
+                that we try to move out of range of the list, an exception will be
+                raised. Passing `capped=True` will change this behaviour to instead
+                return the element at the start or end of the list.
+
+        Returns:
+            PDFElement: The element `count` positions after `element` in this list
+                (or the first/last element of the list if the move was capped).
+
+        Raises:
+            ElementOutOfRangeError: If the count is large (or large-negative) enough
+                that we move past the end (or start) of the list. Only happens when
+                capped=False.
+            ValueError: If `element` is not in this list (propagated from the
+                `list.index` lookup).
+        """
+        indexes = sorted(self.indexes)
+        new_index = indexes.index(element._index) + count
+        last_index = len(indexes) - 1
+
+        # Negative positions must be checked explicitly: Python would otherwise
+        # treat them as valid "from the end" indexes rather than raising. We handle
+        # the too-large case in the same place for symmetry.
+        if new_index < 0 or new_index > last_index:
+            if not capped:
+                # Report the direction we actually fell off the list in.
+                raise ElementOutOfRangeError(
+                    "Requested element is "
+                    f"{'before the start' if new_index < 0 else 'after the end'} "
+                    "of the ElementList"
+                )
+            # Clamp to the nearest valid position (first or last element).
+            new_index = max(min(new_index, last_index), 0)
+
+        # We avoid just returning self[new_index] here since getitem will do an
+        # additional sorted(self.indexes), which we have already computed here.
+        element_index = indexes[new_index]
+        return self.document._element_list[element_index]
+
+
+    def move_backwards_from(
+        self, element: "PDFElement", count: int = 1, capped: bool = False
+    ) -> "PDFElement":
+        """
+        Returns the element reached by stepping backwards through the element list,
+        starting at `element` and moving `count` positions.
+
+        Args:
+            element (PDFElement): The element to start at.
+            count (int, optional): How many elements to move from `element`. The
+                default of 1 moves backwards by one element, 0 returns the element
+                itself, and a negative value moves forwards instead.
+            capped (bool, optional): When False (the default), moving out of range
+                of the list raises an exception. When True, the element at the
+                start or end of the list is returned instead.
+
+        Raises:
+            ElementOutOfRangeError: If the count is large (or large-negative) enough
+                that we move past the start (or end) of the list. Only happens when
+                capped=False.
+        """
+        # Moving backwards by `count` is the same as moving forwards by `-count`.
+        forwards_count = -count
+        return self.move_forwards_from(element, count=forwards_count, capped=capped)
+
+
     def __intersect_indexes_with_self(self, new_indexes: Set[int]) -> "ElementList":
         return self & ElementList(self.document, new_indexes)
 

diff --git a/tests/test_filtering.py b/tests/test_filtering.py
@@ -4,7 +4,11 @@
 
 from py_pdf_parser.components import PDFDocument, PDFElement
 from py_pdf_parser.common import BoundingBox
-from py_pdf_parser.exceptions import NoElementFoundError, MultipleElementsFoundError
+from py_pdf_parser.exceptions import (
+    NoElementFoundError,
+    MultipleElementsFoundError,
+    ElementOutOfRangeError,
+)
 from py_pdf_parser.filtering import ElementList
 from py_pdf_parser.loaders import Page
 
@@ -1141,6 +1145,72 @@ def test_remove_elements(self):
         self.assertNotIn(self.elem_list[0], result)
         self.assertNotIn(self.elem_list[1], result)
 
+    def test_move_forwards_from(self):
+        # All moves in this test start from the third element of the fixture list.
+        elements = self.elem_list
+        start = elements[2]
+
+        # With no count given we step forwards exactly one element
+        result = elements.move_forwards_from(start)
+        self.assertEqual(result, elements[3])
+
+        # An explicit count moves that many elements forwards
+        result = elements.move_forwards_from(start, count=2)
+        self.assertEqual(result, elements[4])
+
+        # A negative count moves backwards instead
+        result = elements.move_forwards_from(start, count=-1)
+        self.assertEqual(result, elements[1])
+
+        # Leaving the list at either end raises when not capped
+        with self.assertRaises(ElementOutOfRangeError):
+            elements.move_forwards_from(start, count=10)
+        with self.assertRaises(ElementOutOfRangeError):
+            elements.move_forwards_from(start, count=-10)
+
+        # With capped=True we get the last/first element rather than an error
+        result = elements.move_forwards_from(start, count=10, capped=True)
+        self.assertEqual(result, elements[-1])
+        result = elements.move_forwards_from(start, count=-10, capped=True)
+        self.assertEqual(result, elements[0])
+
+
+    def test_move_backwards_from(self):
+        # All moves in this test start from the fourth element of the fixture list.
+        elements = self.elem_list
+        start = elements[3]
+
+        # With no count given we step backwards exactly one element
+        result = elements.move_backwards_from(start)
+        self.assertEqual(result, elements[2])
+
+        # An explicit count moves that many elements backwards
+        result = elements.move_backwards_from(start, count=2)
+        self.assertEqual(result, elements[1])
+
+        # A negative count moves forwards instead
+        result = elements.move_backwards_from(start, count=-1)
+        self.assertEqual(result, elements[4])
+
+        # Leaving the list at either end raises when not capped
+        with self.assertRaises(ElementOutOfRangeError):
+            elements.move_backwards_from(start, count=10)
+        with self.assertRaises(ElementOutOfRangeError):
+            elements.move_backwards_from(start, count=-10)
+
+        # With capped=True we get the first/last element rather than an error
+        result = elements.move_backwards_from(start, count=10, capped=True)
+        self.assertEqual(result, elements[0])
+        result = elements.move_backwards_from(start, count=-10, capped=True)
+        self.assertEqual(result, elements[-1])
+
+
     def test_repr(self):
         self.assertEqual(repr(self.elem_list), "<ElementList of 6 elements>")
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,10 @@ class MultipleElementsFoundError(PDFParserError): @@
         pass
+    class ElementOutOfRangeError(PDFParserError):
+        pass
     # Sectioning
     class InvalidSectionError(PDFParserError):
         pass
@@ Expand Down @@