Skip to content

Commit

Permalink
[tables] Add flag to remove duplicate header rows
Browse files Browse the repository at this point in the history
  • Loading branch information
paulopaixaoamaral committed May 22, 2020
1 parent 2750650 commit 6247868
Show file tree
Hide file tree
Showing 2 changed files with 234 additions and 0 deletions.
54 changes: 54 additions & 0 deletions py_pdf_parser/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def extract_simple_table(
allow_gaps: bool = False,
reference_element: Optional["PDFElement"] = None,
tolerance: float = 0.0,
remove_duplicate_header_rows: bool = False,
) -> List[List]:
"""
Returns elements structured as a table.
Expand Down Expand Up @@ -65,6 +66,8 @@ def extract_simple_table(
Default: None.
tolerance (int, optional): For elements to be counted as in the same row or
column, they must overlap by at least `tolerance`. Default: 0.
remove_duplicate_header_rows (bool, optional): Remove duplicates of the header
row (the first row) if they exist. Default: False.
Raises:
TableExtractionError: If something goes wrong.
Expand Down Expand Up @@ -125,6 +128,9 @@ def extract_simple_table(
"extract_simple_table, or change you reference element."
)

if remove_duplicate_header_rows:
table = _remove_duplicate_header_rows(table)

if as_text:
return get_text_from_table(table, strip_text=strip_text)

Expand All @@ -139,6 +145,7 @@ def extract_table(
fix_element_in_multiple_rows: bool = False,
fix_element_in_multiple_cols: bool = False,
tolerance: float = 0.0,
remove_duplicate_header_rows: bool = False,
) -> List[List]:
"""
Returns elements structured as a table.
Expand Down Expand Up @@ -172,6 +179,8 @@ def extract_table(
expect this to be the case in your table. Default: False.
tolerance (int, optional): For elements to be counted as in the same row or
column, they must overlap by at least `tolerance`. Default: 0.
remove_duplicate_header_rows (bool, optional): Remove duplicates of the header
row (the first row) if they exist. Default: False.
Raises:
TableExtractionError: If something goes wrong.
Expand Down Expand Up @@ -235,6 +244,9 @@ def extract_table(
table_row.append(element)
table.append(table_row)

if remove_duplicate_header_rows:
table = _remove_duplicate_header_rows(table)

if as_text:
return get_text_from_table(table, strip_text=strip_text)

Expand Down Expand Up @@ -431,3 +443,45 @@ def _fix_cols(cols: Set["ElementList"], elements: "ElementList") -> None:
else:
sorted_columns.remove(col)
return


def _remove_duplicate_header_rows(table: List[List[Any]]) -> List[List[Any]]:
    """
    Removes rows which are duplicates of the header (i.e., the first) row.

    A row is considered duplicate if it has the same number of cells as the header
    row and all of its elements have the same text and font of their correspondent
    elements (i.e., same index) in the header row. An empty cell (None) only matches
    another empty cell.

    Args:
        table (List[List[Any]]): The table to remove the duplicate headers from.

    Returns:
        List[List[Any]]: The table without the duplicate header rows. Tables with at
            most one row are returned unchanged, as there is nothing to compare
            against.
    """
    if len(table) <= 1:
        return table

    def elements_equal(
        elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"]
    ) -> bool:
        if elem_1 is None or elem_2 is None:
            # An empty cell can only match another empty cell.
            return elem_1 is None and elem_2 is None
        return elem_1.text() == elem_2.text() and elem_1.font == elem_2.font

    header = table[0]

    def is_duplicate_of_header(row: List[Any]) -> bool:
        # A row of a different length cannot be a copy of the header. Checking the
        # length first also stops zip() from silently ignoring trailing cells.
        return len(row) == len(header) and all(
            elements_equal(element, header_element)
            for element, header_element in zip(row, header)
        )

    return [header] + [row for row in table[1:] if not is_duplicate_of_header(row)]
180 changes: 180 additions & 0 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,92 @@ def test_extract_simple_table_with_tolerance(self):
[[elem_1, elem_2], [elem_3, elem_4]], result
)

def test_extract_simple_table_removing_duplicate_header_rows(self):
    """
    Checks that extract_simple_table drops every row repeating the header row
    when remove_duplicate_header_rows is True.
    """
    # First two BoundingBox arguments used for the left and the right cell of a row.
    left_span, right_span = (0, 5), (6, 10)

    def header_row(row_span):
        # Two cells carrying the header text and font, placed at `row_span`.
        return [
            FakePDFMinerTextElement(
                text=text,
                font_name="header font",
                font_size=10,
                bounding_box=BoundingBox(*col_span, *row_span),
            )
            for text, col_span in (("header 1", left_span), ("header 2", right_span))
        ]

    def data_row(row_span):
        # Two cells built with the fake element's default text and font.
        return [
            FakePDFMinerTextElement(bounding_box=BoundingBox(*col_span, *row_span))
            for col_span in (left_span, right_span)
        ]

    # Case 1: the table is only the header row.
    #
    # header header
    first_header = header_row((21, 25))
    document = create_pdf_document(elements=[*first_header])

    result = extract_simple_table(
        document.elements, remove_duplicate_header_rows=True
    )
    # Nothing can be removed: a one-row table cannot contain a duplicate of its
    # own header.
    self.assertEqual(len(result), 1)
    self.assertEqual(len(result[0]), 2)
    self.assert_original_element_list_list_equal([first_header], result)

    # Case 2: the header is repeated after each data row.
    #
    # header header   (first_header)
    # data   data     (first_data)
    # header header   (second_header)
    # data   data     (second_data)
    # header header   (third_header)
    first_data = data_row((16, 20))
    second_header = header_row((11, 15))
    second_data = data_row((6, 10))
    third_header = header_row((0, 5))

    document = create_pdf_document(
        elements=[
            *first_header,
            *first_data,
            *second_header,
            *second_data,
            *third_header,
        ]
    )

    result = extract_simple_table(
        document.elements, remove_duplicate_header_rows=True
    )
    self.assertEqual(len(result), 3)
    for row_index in range(3):
        self.assertEqual(len(result[row_index]), 2)
    self.assert_original_element_list_list_equal(
        [first_header, first_data, second_data], result
    )

def test_extract_table(self):
# Checks that simple 2*2 table is correctly extracted
#
Expand Down Expand Up @@ -311,6 +397,100 @@ def test_extract_table_with_tolerance(self):
[[elem_1, elem_2], [elem_3, elem_4]], result
)

def test_extract_table_removing_duplicate_header_rows(self):
    """
    Checks that extract_table drops rows repeating the header row when
    remove_duplicate_header_rows is True, and keeps rows whose cells sit in
    different columns than the header cells.
    """
    # header_elem_1                 header_elem_2
    header_elem_1 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 21, 25),
    )
    header_elem_2 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 21, 25),
    )
    document = create_pdf_document(elements=[header_elem_1, header_elem_2])
    elem_list = document.elements

    # This test targets extract_table, so the single-row case must go through
    # extract_table as well (extract_simple_table has its own test).
    result = extract_table(elem_list, remove_duplicate_header_rows=True)
    # Extraction here should just return the whole table as it is not possible to
    # have duplicates of a single lined table.
    self.assertEqual(len(result), 1)
    self.assertEqual(len(result[0]), 2)
    self.assert_original_element_list_list_equal(
        [[header_elem_1, header_elem_2]], result
    )

    # header_elem_1                 header_elem_2
    # elem_1         elem_2
    # header_elem_3                 header_elem_4
    # elem_3                        elem_4
    # header_elem_5  header_elem_6
    #
    elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20))
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20))
    header_elem_3 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 11, 15),
    )
    header_elem_4 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 11, 15),
    )
    elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    header_elem_5 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 0, 5),
    )
    # Deliberately uses a different box than header_elem_2, so the last row has
    # the header text but not the header layout.
    header_elem_6 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(6, 10, 0, 5),
    )

    document = create_pdf_document(
        elements=[
            header_elem_1,
            header_elem_2,
            elem_1,
            elem_2,
            header_elem_3,
            header_elem_4,
            elem_3,
            elem_4,
            header_elem_5,
            header_elem_6,
        ]
    )
    elem_list = document.elements

    result = extract_table(elem_list, remove_duplicate_header_rows=True)
    # The last row will not be removed as the gaps do not match the header row
    self.assertEqual(len(result), 4)
    self.assertEqual(len(result[0]), 3)
    self.assertEqual(len(result[1]), 3)
    self.assertEqual(len(result[2]), 3)
    self.assertEqual(len(result[3]), 3)
    self.assert_original_element_list_list_equal(
        [
            [header_elem_1, None, header_elem_2],
            [elem_1, elem_2, None],
            [elem_3, None, elem_4],
            [header_elem_5, header_elem_6, None],
        ],
        result,
    )

def test_extract_text_from_simple_table(self):
# Checks that text from simple 2*2 table is correctly extracted
#
Expand Down

0 comments on commit 6247868

Please sign in to comment.