Skip to content

Commit

Permalink
Merge pull request #57 from optimor/simple-table-update
Browse files Browse the repository at this point in the history
[tables] Update extract simple table with option to allow gaps
  • Loading branch information
jstockwin authored Apr 14, 2020
2 parents a6539e2 + 65950fd commit ce27e6b
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 22 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- `extract_simple_table` now allows extracting tables with gaps, provided there is at least one full row and one full column. This is only the case if you pass `allow_gaps=True`; otherwise the original behaviour of raising an exception when there is a gap remains. You can optionally pass a `reference_element`, which must be in both a full row and a full column; this defaults to the first (top-left) element. ([#57](https://github.com/optimor/py-pdf-parser/pull/57))

## [0.1.0] - 2019-04-08
### Added
Expand Down
85 changes: 63 additions & 22 deletions py_pdf_parser/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,50 @@ def extract_simple_table(
elements: "ElementList",
as_text: bool = False,
strip_text: bool = True,
allow_gaps: bool = False,
reference_element: Optional["PDFElement"] = None,
tolerance: float = 0.0,
) -> List[List]:
"""
Returns elements structured as a table.
Given an ElementList, tries to extract a structured table by examining which
elements are aligned. To use this function, the table must contain no gaps, i.e.
should be a full N x M table with an element in each cell. There must be a clear
gap between each row and between each column which contains no elements, and
a single cell cannot contain multiple elements.
If your table has empty cells, you can use `extract_table` instead. If you fail
to satisfy any of the other conditions listed above, that case is not yet supported.
elements are aligned.
To use this function, there must be at least one full row and one full column (which
we call the reference row and column), i.e. the reference row must have an element
in every column, and the reference column must have an element in every row. The
reference row and column can be specified by passing the single element in both the
reference row and the reference column. By default, this is the top left element,
which means we use the first row and column as the references. Note if you need to
change the reference_element, that means you have gaps in your table, and as such
you will need to pass `allow_gaps=True`.
Important: This function uses the elements in the reference row and column to scan
horizontally and vertically to find the rest of the table. If there are gaps in your
reference row and column, this could result in rows and columns being missed by
this function.
There must be a clear gap between each row and between each column which contains no
elements, and a single cell cannot contain multiple elements.
If there are no valid reference rows or columns, try extract_table() instead. If you
have elements spanning multiple rows or columns, it may be possible to fix this by
using extract_table(). If you fail to satisfy any of the other conditions listed
above, that case is not yet supported.
Args:
elements (ElementList): A list of elements to extract into a table.
as_text (bool, optional): Whether to extract the text from each element instead
of the PDFElement itself. Default: False.
strip_text (bool, optional): Whether to strip the text for each element of the
table (Only relevant if as_text is True). Default: True.
allow_gaps (bool, optional): Whether to allow empty spaces in the table.
reference_element (PDFElement, optional): An element in a full row and a full
column. Will be used to specify the reference row and column. If None, the
top left element will be used, meaning the top row and left column will be
used. If there are gaps in these, you should specify a different reference.
Default: None.
tolerance (int, optional): For elements to be counted as in the same row or
column, they must overlap by at least `tolerance`. Default: 0.
Expand All @@ -49,38 +73,52 @@ def extract_simple_table(
list[list]: a list of rows, which are lists of PDFElements or strings
(depending on the value of as_text).
"""
first_row = elements.to_the_right_of(
elements[0], inclusive=True, tolerance=tolerance
if reference_element is None:
reference_element = elements[0]
reference_row = elements.horizontally_in_line_with(
reference_element, inclusive=True, tolerance=tolerance
)
reference_column = elements.vertically_in_line_with(
reference_element, inclusive=True, tolerance=tolerance, all_pages=True
)
first_column = elements.below(elements[0], inclusive=True, tolerance=tolerance)

table: List[List] = []
for left_hand_element in first_column:
for reference_column_element in reference_column:
row: List = []
for top_element in first_row:
element = elements.to_the_right_of(
left_hand_element, inclusive=True, tolerance=tolerance
).below(top_element, inclusive=True, tolerance=tolerance)
for reference_row_element in reference_row:
element = elements.horizontally_in_line_with(
reference_column_element, inclusive=True, tolerance=tolerance
).vertically_in_line_with(
reference_row_element,
inclusive=True,
tolerance=tolerance,
all_pages=True,
)
try:
row.append(element.extract_single_element())
except NoElementFoundError as err:
raise TableExtractionError(
"Element not found, there appears to be a gap in the table. "
"Please try extract_table() instead."
) from err
if allow_gaps:
row.append(None)
else:
raise TableExtractionError(
"Element not found, there appears to be a gap in the table. "
"If this is expected, pass allow_gaps=True."
) from err
except MultipleElementsFoundError as err:
raise TableExtractionError(
"Multiple elements appear to be in the place of one cell in the "
"table. Please try extract_table() instead."
) from err
table.append(row)

table_size = sum(len(row) for row in table)
table_size = sum(
len([element for element in row if element is not None]) for row in table
)
if table_size != len(elements):
raise TableExtractionError(
f"Number of elements in table ({table_size}) does not match number of "
f"elements passed {len(elements)}. Perhaps try extract_table instead of "
"extract_simple_table."
f"elements passed ({len(elements)}). Perhaps try extract_table instead of "
"extract_simple_table, or change you reference element."
)

if as_text:
Expand Down Expand Up @@ -109,6 +147,9 @@ def extract_table(
If you fail to satisfy any of the other conditions listed above, that case is not
yet supported.
Note: If you satisfy the conditions to use extract_simple_table, then that should be
used instead, as it's much more efficient.
Args:
elements (ElementList): A list of elements to extract into a table.
as_text (bool, optional): Whether to extract the text from each element instead
Expand Down
106 changes: 106 additions & 0 deletions py_pdf_parser/tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,112 @@ def test_extract_simple_table(self):
with self.assertRaises(TableExtractionError):
extract_simple_table(elem_list)

def test_extract_simple_table_with_gaps(self):
    """
    Check that a table with an empty cell is extracted when gaps are allowed.

    The default reference element (the first one, elem_1) is used, and the empty
    bottom-right cell is expected to come back as None.
    """
    # Layout under test (the bottom-right cell is empty):
    #   elem_1 elem_2 elem_3
    #   elem_4 elem_5
    boxes = [
        (0, 5, 6, 10),
        (6, 10, 6, 10),
        (11, 15, 6, 10),
        (0, 5, 0, 5),
        (6, 10, 0, 5),
    ]
    elem_1, elem_2, elem_3, elem_4, elem_5 = (
        FakePDFMinerTextElement(bounding_box=BoundingBox(*box)) for box in boxes
    )
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
    )

    table = extract_simple_table(document.elements, allow_gaps=True)

    # Two rows of three cells each, with None standing in for the gap.
    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 3)
    expected = [[elem_1, elem_2, elem_3], [elem_4, elem_5, None]]
    self.assert_original_element_list_list_equal(expected, table)

def test_extract_simple_table_with_gaps_and_different_reference(self):
    """
    Check that an explicit, non-default reference element gives the same table.

    elem_2 is passed as the reference: its row (the top row) and its column
    (the middle column) both have an element in every position.
    """
    # Layout under test (the bottom-right cell is empty):
    #   elem_1 elem_2 elem_3
    #   elem_4 elem_5
    boxes = [
        (0, 5, 6, 10),
        (6, 10, 6, 10),
        (11, 15, 6, 10),
        (0, 5, 0, 5),
        (6, 10, 0, 5),
    ]
    elem_1, elem_2, elem_3, elem_4, elem_5 = (
        FakePDFMinerTextElement(bounding_box=BoundingBox(*box)) for box in boxes
    )
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
    )
    elements = document.elements

    # Look up the element in the list that corresponds to the fake elem_2.
    reference = self.extract_element_from_list(elem_2, elements)
    table = extract_simple_table(
        elements, allow_gaps=True, reference_element=reference
    )

    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 3)
    expected = [[elem_1, elem_2, elem_3], [elem_4, elem_5, None]]
    self.assert_original_element_list_list_equal(expected, table)

def test_extract_simple_table_with_gaps_and_wrong_reference(self):
    """
    Check that an unsuitable reference element makes the extraction fail.

    elem_3 is passed as the reference, but the cell below it is empty, so its
    column is not a full column. A TableExtractionError is expected even though
    allow_gaps=True.
    """
    # Layout under test (the bottom-right cell is empty):
    #   elem_1 elem_2 elem_3
    #   elem_4 elem_5
    boxes = [
        (0, 5, 6, 10),
        (6, 10, 6, 10),
        (11, 15, 6, 10),
        (0, 5, 0, 5),
        (6, 10, 0, 5),
    ]
    elem_1, elem_2, elem_3, elem_4, elem_5 = (
        FakePDFMinerTextElement(bounding_box=BoundingBox(*box)) for box in boxes
    )
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
    )
    elements = document.elements

    # Look up the element in the list that corresponds to the fake elem_3.
    reference = self.extract_element_from_list(elem_3, elements)
    with self.assertRaises(TableExtractionError):
        extract_simple_table(
            elements, allow_gaps=True, reference_element=reference
        )

def test_extract_simple_table_from_different_pages(self):
    """
    Check that 2x2 tables on two pages are extracted as one four-row table.

    Both pages use the same four bounding boxes. The expected result lists the
    rows of page 1 followed by the rows of page 2.
    """
    # Page 1:
    #   elem_p1_1 elem_p1_2
    #   elem_p1_3 elem_p1_4
    #
    # Page 2:
    #   elem_p2_1 elem_p2_2
    #   elem_p2_3 elem_p2_4
    boxes = [(0, 5, 6, 10), (6, 10, 6, 10), (0, 5, 0, 5), (6, 10, 0, 5)]

    elem_p1_1, elem_p1_2, elem_p1_3, elem_p1_4 = (
        FakePDFMinerTextElement(bounding_box=BoundingBox(*box)) for box in boxes
    )
    elem_p2_1, elem_p2_2, elem_p2_3, elem_p2_4 = (
        FakePDFMinerTextElement(bounding_box=BoundingBox(*box)) for box in boxes
    )

    # The keys of the elements dict are the page numbers.
    document = create_pdf_document(
        elements={
            1: [elem_p1_1, elem_p1_2, elem_p1_3, elem_p1_4],
            2: [elem_p2_1, elem_p2_2, elem_p2_3, elem_p2_4],
        }
    )

    table = extract_simple_table(document.elements)

    self.assertEqual(len(table), 4)
    for row in table:
        self.assertEqual(len(row), 2)
    expected = [
        [elem_p1_1, elem_p1_2],
        [elem_p1_3, elem_p1_4],
        [elem_p2_1, elem_p2_2],
        [elem_p2_3, elem_p2_4],
    ]
    self.assert_original_element_list_list_equal(expected, table)

def test_extract_simple_table_with_tolerance(self):
# Checks that simple 2*2 table is correctly extracted
#
Expand Down

0 comments on commit ce27e6b

Please sign in to comment.