Skip to content

Commit

Permalink
[components] Add element_ordering argument to PDFDocument
Browse files Browse the repository at this point in the history
Closes #94
  • Loading branch information
jstockwin committed Jun 22, 2020
1 parent 4b75426 commit 9832d2c
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `__len__` and `__repr__` functions to the Section class. ([#90](https://github.com/jstockwin/py-pdf-parser/pull/90))
- Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89))
- You can now specify `element_ordering` when instantiating a PDFDocument. This defaults to the old behaviour of left to right, top to bottom. ([#95](https://github.com/jstockwin/py-pdf-parser/pull/95))

### Changed
- Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88))

Expand Down
43 changes: 40 additions & 3 deletions py_pdf_parser/components.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Dict, List, Set, Optional, Union, TYPE_CHECKING
from typing import Callable, Dict, List, Set, Optional, Union, TYPE_CHECKING

import re
from collections import Counter, defaultdict
from enum import Enum, auto
from itertools import chain

from .common import BoundingBox
Expand All @@ -14,6 +15,29 @@
from pdfminer.layout import LTComponent


class ElementOrdering(Enum):
    """
    Preset orderings that can be passed as `element_ordering` to `PDFDocument`.

    Each member selects one of the built-in sort functions that is applied to
    the elements of a page. The sort keys are built from each element's `x0`
    and `y0` attributes.
    """

    # Sorted by descending y0, ties broken by ascending x0 (the default).
    LEFT_TO_RIGHT_TOP_TO_BOTTOM = auto()
    # Sorted by descending y0, ties broken by descending x0.
    RIGHT_TO_LEFT_TOP_TO_BOTTOM = auto()
    # Sorted by ascending x0, ties broken by descending y0.
    TOP_TO_BOTTOM_LEFT_TO_RIGHT = auto()
    # Sorted by descending x0, ties broken by descending y0.
    TOP_TO_BOTTOM_RIGHT_TO_LEFT = auto()


def _ordered_by(key: Callable) -> Callable[[List], List]:
    """Build a function which returns a new list of elements sorted by `key`."""

    def order(elements: List) -> List:
        return sorted(elements, key=key)

    return order


# Maps each ElementOrdering preset to the function used to sort the elements.
# Every function takes a list of elements and returns a new, sorted list. In
# each key, y0 is negated so that elements with a larger y0 are placed first;
# x0 is negated only for the right-to-left presets.
_ELEMENT_ORDERING_FUNCTIONS: Dict["ElementOrdering", Callable[[List], List]] = {
    ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM: _ordered_by(
        lambda elem: (-elem.y0, elem.x0)
    ),
    ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM: _ordered_by(
        lambda elem: (-elem.y0, -elem.x0)
    ),
    ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT: _ordered_by(
        lambda elem: (elem.x0, -elem.y0)
    ),
    ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT: _ordered_by(
        lambda elem: (-elem.x0, -elem.y0)
    ),
}


class PDFPage:
"""
A representation of a page within the `PDFDocument`.
Expand Down Expand Up @@ -325,6 +349,11 @@ class PDFDocument:
Default: 0.
font_size_precision (int): How much rounding to apply to the font size. The font
size will be rounded to this many decimal places.
element_ordering (ElementOrdering or callable, optional): An ordering function
for the elements. Either a member of the ElementOrdering Enum, or a callable
which takes a list of elements and returns an ordered list of elements. This
will be called separately for each page. Note that the elements in this case
will be PDFMiner elements, and not PDFElements from this package.
Attributes:
pages (list): A list of all `PDFPages` in the document.
Expand All @@ -337,7 +366,8 @@ class PDFDocument:
number_of_pages: int
page_numbers: List[int]
sectioning: "Sectioning"
# _element_list will contain all elements, sorted from top to bottom, left to right.
# _element_list will contain all elements, sorted according to element_ordering
# (default left to right, top to bottom).
_element_list: List[PDFElement]
# _element_indexes_by_font will be a caching of fonts to elements indexes but it
# will be built as needed (while filtering by fonts), not on document load.
Expand All @@ -357,6 +387,9 @@ def __init__(
font_mapping_is_regex: bool = False,
regex_flags: Union[int, re.RegexFlag] = 0,
font_size_precision: int = 1,
element_ordering: Union[
"ElementOrdering", Callable[[List], List]
] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
):
self.sectioning = Sectioning(self)
self._element_list = []
Expand All @@ -369,7 +402,11 @@ def __init__(
idx = 0
for page_number, page in sorted(pages.items()):
first_element = None
for element in sorted(page.elements, key=lambda elem: (-elem.y0, elem.x0)):
if isinstance(element_ordering, ElementOrdering):
sort_func = _ELEMENT_ORDERING_FUNCTIONS[element_ordering]
else:
sort_func = element_ordering
for element in sort_func(page.elements):
pdf_element = PDFElement(
document=self,
element=element,
Expand Down
9 changes: 6 additions & 3 deletions py_pdf_parser/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,8 @@ def before(self, element: "PDFElement", inclusive: bool = False) -> "ElementList
Returns all elements before the specified element.
By before, we mean preceding elements according to their index. The PDFDocument
will order elements left to right, top to bottom (as you would normally read).
will order elements according to the specified element_ordering (which defaults
to left to right, top to bottom).
Args:
element (PDFElement): The element in question.
Expand All @@ -704,7 +705,8 @@ def after(self, element: "PDFElement", inclusive: bool = False) -> "ElementList"
Returns all elements after the specified element.
By after, we mean succeeding elements according to their index. The PDFDocument
will order elements left to right, top to bottom (as you would normally read).
will order elements according to the specified element_ordering (which defaults
to left to right, top to bottom).
Args:
element (PDFElement): The element in question.
Expand All @@ -729,7 +731,8 @@ def between(
Returns all elements between the start and end elements.
This is done according to the element indexes. The PDFDocument will order
elements left to right, top to bottom (as you would normally read).
elements according to the specified element_ordering (which defaults
to left to right, top to bottom).
This is the same as applying `before` with `start_element` and `after` with
`end_element`.
Expand Down
57 changes: 55 additions & 2 deletions tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
from ddt import ddt, data

from py_pdf_parser.common import BoundingBox
from py_pdf_parser.components import PDFDocument
from py_pdf_parser.components import PDFDocument, ElementOrdering
from py_pdf_parser.filtering import ElementList
from py_pdf_parser.loaders import Page
from py_pdf_parser.exceptions import NoElementsOnPageError, PageNotFoundError

from .base import BaseTestCase
from .utils import create_pdf_element, FakePDFMinerTextElement
from .utils import create_pdf_element, create_pdf_document, FakePDFMinerTextElement


@ddt
Expand Down Expand Up @@ -286,3 +286,56 @@ def test_document(self):
def test_document_with_blank_page(self):
    # A document whose only page has no elements must be rejected on load.
    with self.assertRaises(NoElementsOnPageError):
        blank_page = Page(elements=[], width=100, height=100)
        PDFDocument(pages={1: blank_page})

def test_element_ordering(self):
    # Four elements laid out in a 2x2 grid:
    #   elem_1 elem_2
    #   elem_3 elem_4
    elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
    elements = [elem_1, elem_2, elem_3, elem_4]

    # Without an explicit element_ordering we expect the default:
    # left to right, top to bottom.
    document = create_pdf_document(elements=list(elements))
    self.assert_original_element_list_equal(list(elements), document.elements)

    # Each remaining preset, paired with the order it should produce.
    expected_by_preset = [
        (
            ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
            [elem_2, elem_1, elem_4, elem_3],
        ),
        (
            ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
            [elem_1, elem_3, elem_2, elem_4],
        ),
        (
            ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
            [elem_2, elem_4, elem_1, elem_3],
        ),
    ]
    for preset, expected in expected_by_preset:
        document = create_pdf_document(
            elements=list(elements), element_ordering=preset
        )
        self.assert_original_element_list_equal(expected, document.elements)

    # A user-supplied callable is applied as-is to the page's elements.
    def custom_order(page_elements):
        return [
            page_elements[0],
            page_elements[3],
            page_elements[1],
            page_elements[2],
        ]

    document = create_pdf_document(
        elements=list(elements), element_ordering=custom_order
    )
    self.assert_original_element_list_equal(
        [elem_1, elem_4, elem_2, elem_3], document.elements
    )
4 changes: 3 additions & 1 deletion tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import NamedTuple, Dict, List, Optional, Union

from py_pdf_parser.components import PDFElement, PDFDocument
from py_pdf_parser.components import PDFElement, PDFDocument, ElementOrdering
from py_pdf_parser.sectioning import Section
from pdfminer.layout import LTComponent

Expand Down Expand Up @@ -94,6 +94,7 @@ def create_pdf_document(
font_mapping_is_regex: bool = False,
regex_flags: Union[int, re.RegexFlag] = 0,
font_size_precision: int = 1,
element_ordering: ElementOrdering = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
) -> "PDFDocument":
"""
Creates a PDF document with the given elements.
Expand All @@ -114,6 +115,7 @@ def create_pdf_document(
font_mapping_is_regex=font_mapping_is_regex,
regex_flags=regex_flags,
font_size_precision=font_size_precision,
element_ordering=element_ordering,
)


Expand Down

0 comments on commit 9832d2c

Please sign in to comment.