Skip to content

Commit

Permalink
[filtering] Add new move_forwards_from and move_backwards_from functions
Browse files Browse the repository at this point in the history
  • Loading branch information
jstockwin committed Jul 9, 2020
1 parent d5448d2 commit 568c458
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- New functions on `ElementList`, `move_forwards_from` and `move_backwards_from`, to allow moving forwards and backwards from a certain element in the list easily. ([#113](https://github.com/jstockwin/py-pdf-parser/pull/113))

### Changed
- When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99))

Expand Down
4 changes: 4 additions & 0 deletions py_pdf_parser/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ class MultipleElementsFoundError(PDFParserError):
pass


class ElementOutOfRangeError(PDFParserError):
    """
    Signals that a requested position lies outside the bounds of an element list,
    e.g. when stepping past the first or last element without capping.
    """


# Sectioning
class InvalidSectionError(PDFParserError):
    """
    Sectioning-related error derived from PDFParserError.

    NOTE(review): the exact conditions under which this is raised are not visible
    from this part of the file - confirm against the sectioning code.
    """
Expand Down
66 changes: 66 additions & 0 deletions py_pdf_parser/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from .common import BoundingBox
from .exceptions import (
ElementOutOfRangeError,
NoElementFoundError,
MultipleElementsFoundError,
SectionNotFoundError,
Expand Down Expand Up @@ -851,6 +852,71 @@ def remove_elements(self, *elements: "PDFElement") -> "ElementList":
self.document, self.indexes - set(element._index for element in elements)
)

def move_forwards_from(
    self, element: "PDFElement", count: int = 1, capped: bool = False
) -> "PDFElement":
    """
    Returns the element in the element list obtained by moving forwards from
    `element` by `count`.

    Positions are taken within this list's element indexes in ascending order, so
    elements which are not part of the list are stepped over.

    Args:
        element (PDFElement): The element to start at.
        count (int, optional): How many elements to move from `element`. The default
            of 1 will move forwards by one element. Passing 0 will simply return the
            element itself. You can also pass negative integers to move backwards.
        capped (bool, optional): By default (False), if the count is high enough
            that we try to move out of range of the list, an exception will be
            raised. Passing `capped=True` will change this behaviour to instead
            return the element at the start or end of the list.

    Returns:
        PDFElement: The element `count` places after `element` in the list.

    Raises:
        ElementOutOfRangeError: If the count is large (or large-negative) enough
            that we move past the end (or start) of the list. Only happens when
            capped=False.
        ValueError: If `element` is not in this list (propagated from the position
            lookup).
    """
    indexes = sorted(self.indexes)
    new_index = indexes.index(element._index) + count

    if 0 <= new_index < len(indexes):
        # We avoid just returning self[new_index] here since getitem will do an
        # additional sorted(self.indexes), which we have already computed here.
        return self.document._element_list[indexes[new_index]]

    # Out of range. We could simply catch the IndexError for large new_index, but a
    # negative new_index would silently wrap around, so both cases are handled
    # explicitly.
    if capped:
        new_index = max(min(new_index, len(indexes) - 1), 0)
        return self.document._element_list[indexes[new_index]]

    # Report which end of the list was overshot, based on the computed position.
    position = "before the start" if new_index < 0 else "after the end"
    raise ElementOutOfRangeError(
        f"Requested element is {position} of the ElementList"
    )

def move_backwards_from(
    self, element: "PDFElement", count: int = 1, capped: bool = False
) -> "PDFElement":
    """
    Returns the element in the element list obtained by moving backwards from
    `element` by `count`.

    This is the mirror image of `move_forwards_from`: the work is delegated to it
    with the sign of `count` flipped.

    Args:
        element (PDFElement): The element to start at.
        count (int, optional): How many elements to move back from `element`.
            Defaults to 1, i.e. the previous element. 0 returns `element` itself,
            and a negative value moves forwards instead.
        capped (bool, optional): If False (the default), moving out of range of the
            list raises an exception. If True, the element at the start or end of
            the list is returned instead.

    Raises:
        ElementOutOfRangeError: If the count is large (or large-negative) enough
            that we move past the start (or end) of the list. Only happens when
            capped=False.
    """
    forwards_count = -count
    return self.move_forwards_from(element, count=forwards_count, capped=capped)

def __intersect_indexes_with_self(self, new_indexes: Set[int]) -> "ElementList":
    """
    Wraps `new_indexes` in an ElementList on the same document and combines it
    with this list using the `&` operator.
    """
    candidates = ElementList(self.document, new_indexes)
    return self & candidates

Expand Down
72 changes: 71 additions & 1 deletion tests/test_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

from py_pdf_parser.components import PDFDocument, PDFElement
from py_pdf_parser.common import BoundingBox
from py_pdf_parser.exceptions import NoElementFoundError, MultipleElementsFoundError
from py_pdf_parser.exceptions import (
NoElementFoundError,
MultipleElementsFoundError,
ElementOutOfRangeError,
)
from py_pdf_parser.filtering import ElementList
from py_pdf_parser.loaders import Page

Expand Down Expand Up @@ -1141,6 +1145,72 @@ def test_remove_elements(self):
self.assertNotIn(self.elem_list[0], result)
self.assertNotIn(self.elem_list[1], result)

def test_move_forwards_from(self):
    """
    Checks move_forwards_from for the default step, explicit and negative counts,
    the out-of-range error, and the capped behaviour.
    """
    elements = self.elem_list
    start = elements[2]
    move = elements.move_forwards_from

    # With no count given we step forwards by exactly one element
    self.assertEqual(move(start), elements[3])
    # An explicit count moves that many elements
    self.assertEqual(move(start, count=2), elements[4])
    # A negative count walks backwards instead
    self.assertEqual(move(start, count=-1), elements[1])

    # Overshooting either end of the list is an error by default
    self.assertRaises(ElementOutOfRangeError, move, start, count=10)
    self.assertRaises(ElementOutOfRangeError, move, start, count=-10)

    # With capped=True the overshoot is clamped to the last/first element
    for count, expected_position in ((10, -1), (-10, 0)):
        self.assertEqual(
            move(start, count=count, capped=True), elements[expected_position]
        )

def test_move_backwards_from(self):
    """
    Checks move_backwards_from for the default step, explicit and negative counts,
    the out-of-range error, and the capped behaviour.
    """
    elements = self.elem_list
    start = elements[3]
    move = elements.move_backwards_from

    # With no count given we step backwards by exactly one element
    self.assertEqual(move(start), elements[2])
    # An explicit count moves that many elements
    self.assertEqual(move(start, count=2), elements[1])
    # A negative count walks forwards instead
    self.assertEqual(move(start, count=-1), elements[4])

    # Overshooting either end of the list is an error by default
    self.assertRaises(ElementOutOfRangeError, move, start, count=10)
    self.assertRaises(ElementOutOfRangeError, move, start, count=-10)

    # With capped=True the overshoot is clamped to the first/last element
    for count, expected_position in ((10, 0), (-10, -1)):
        self.assertEqual(
            move(start, count=count, capped=True), elements[expected_position]
        )

def test_repr(self):
self.assertEqual(repr(self.elem_list), "<ElementList of 6 elements>")

Expand Down

0 comments on commit 568c458

Please sign in to comment.