Skip to content

Commit

Permalink
Changed according to CR
Browse files Browse the repository at this point in the history
  • Loading branch information
paulopaixaoamaral committed May 22, 2020
1 parent d75001f commit 0bda4c2
Show file tree
Hide file tree
Showing 3 changed files with 208 additions and 18 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Changed
### Added
- Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89))
### Changed
- Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88))

## [0.3.0] - 2020-05-14
Expand Down
44 changes: 28 additions & 16 deletions py_pdf_parser/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,32 +456,44 @@ def _remove_duplicate_header_rows(table: List[List[Any]]) -> List[List[Any]]:
Returns:
List[List[Any]]: The table without the duplicate header rows.
"""
if len(table) <= 1:
return table

def elements_equal(elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"]):
if elem_1 is None and elem_2 is None:
return True

if (elem_1 is None or elem_2 is None) or (
elem_2 is None and elem_1 is not None
):
return False

if elem_1.text() != elem_2.text() or elem_1.font != elem_2.font:
return False

return True

header = table[0]
rows_without_duplicate_header = [
row
for row in table[1:]
if any(
not elements_equal(element, header[index])
not _are_elements_equal(element, header[index])
for index, element in enumerate(row)
)
]
return [header] + rows_without_duplicate_header


def _are_elements_equal(
    elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"]
) -> bool:
    """
    Checks if two elements are equal.

    Two elements are considered equal if they are both None, or if neither is None
    and they have the same text and font.

    Args:
        elem_1 (PDFElement, optional): The first element to compare.
        elem_2 (PDFElement, optional): The second element to compare.

    Returns:
        bool: True if elements are equal, False otherwise.
    """
    if elem_1 is None or elem_2 is None:
        # An empty cell only matches another empty cell.
        return elem_1 is None and elem_2 is None

    return elem_1.text() == elem_2.text() and elem_1.font == elem_2.font
179 changes: 178 additions & 1 deletion tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
get_text_from_table,
_validate_table_shape,
add_header_to_table,
_are_elements_equal,
)

from .base import BaseTestCase
Expand Down Expand Up @@ -270,6 +271,76 @@ def test_extract_simple_table_removing_duplicate_header_rows(self):
[[header_elem_1, header_elem_2], [elem_1, elem_2], [elem_3, elem_4]], result
)

def test_extract_simple_table_removing_duplicate_header_different_fonts_or_text(
    self,
):
    # Checks that rows which only resemble the header are kept: a row is a
    # duplicate header only if every cell matches the header in text and font.
    #
    #       header_elem_1                   header_elem_2
    #       header_elem_3_different_font    header_elem_4
    #       header_elem_5_different_text    header_elem_6
    #
    header_elem_1 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 21, 25),
    )
    header_elem_2 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 21, 25),
    )
    # Same text as header_elem_1; only the font size differs.
    header_elem_3_different_font = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=12,
        bounding_box=BoundingBox(0, 5, 16, 20),
    )
    # Identical to header_elem_2 in text and font, so the second row differs
    # from the header only through the font of its first cell.
    header_elem_4 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 16, 20),
    )
    # Same font as header_elem_1; only the text differs.
    header_elem_5_different_text = FakePDFMinerTextElement(
        text="header with a different name",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 11, 15),
    )
    header_elem_6 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 11, 15),
    )

    document = create_pdf_document(
        elements=[
            header_elem_1,
            header_elem_2,
            header_elem_3_different_font,
            header_elem_4,
            header_elem_5_different_text,
            header_elem_6,
        ]
    )
    elem_list = document.elements

    result = extract_simple_table(elem_list, remove_duplicate_header_rows=True)
    # No row is an exact duplicate of the header, so all three must survive.
    self.assertEqual(len(result), 3)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    self.assertEqual(len(result[2]), 2)
    self.assert_original_element_list_list_equal(
        [
            [header_elem_1, header_elem_2],
            [header_elem_3_different_font, header_elem_4],
            [header_elem_5_different_text, header_elem_6],
        ],
        result,
    )

def test_extract_table(self):
# Checks that simple 2*2 table is correctly extracted
#
Expand Down Expand Up @@ -414,7 +485,7 @@ def test_extract_table_removing_duplicate_header_rows(self):
document = create_pdf_document(elements=[header_elem_1, header_elem_2])
elem_list = document.elements

result = extract_simple_table(elem_list, remove_duplicate_header_rows=True)
result = extract_table(elem_list, remove_duplicate_header_rows=True)
# Extraction here should just return the whole table, as a single-row table cannot
# contain duplicate header rows.
self.assertEqual(len(result), 1)
Expand Down Expand Up @@ -491,6 +562,74 @@ def test_extract_table_removing_duplicate_header_rows(self):
result,
)

def test_extract_table_removing_duplicate_header_different_fonts_or_text(self):
    # Checks that rows which only resemble the header are kept: a row is a
    # duplicate header only if every cell matches the header in text and font.
    #
    #       header_elem_1                   header_elem_2
    #       header_elem_3_different_font    header_elem_4
    #       header_elem_5_different_text    header_elem_6
    #
    header_elem_1 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 21, 25),
    )
    header_elem_2 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 21, 25),
    )
    # Same text as header_elem_1; only the font size differs.
    header_elem_3_different_font = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=12,
        bounding_box=BoundingBox(0, 5, 16, 20),
    )
    # Identical to header_elem_2 in text and font, so the second row differs
    # from the header only through the font of its first cell.
    header_elem_4 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 16, 20),
    )
    # Same font as header_elem_1; only the text differs.
    header_elem_5_different_text = FakePDFMinerTextElement(
        text="header with a different name",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 11, 15),
    )
    header_elem_6 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 11, 15),
    )

    document = create_pdf_document(
        elements=[
            header_elem_1,
            header_elem_2,
            header_elem_3_different_font,
            header_elem_4,
            header_elem_5_different_text,
            header_elem_6,
        ]
    )
    elem_list = document.elements

    result = extract_table(elem_list, remove_duplicate_header_rows=True)
    # No row is an exact duplicate of the header, so all three must survive.
    self.assertEqual(len(result), 3)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    self.assertEqual(len(result[2]), 2)
    self.assert_original_element_list_list_equal(
        [
            [header_elem_1, header_elem_2],
            [header_elem_3_different_font, header_elem_4],
            [header_elem_5_different_text, header_elem_6],
        ],
        result,
    )

def test_extract_text_from_simple_table(self):
# Checks that text from simple 2*2 table is correctly extracted
#
Expand Down Expand Up @@ -764,3 +903,41 @@ def test_validate_table_shape(self):
table = [[""], ["", ""]]
with self.assertRaises(InvalidTableError):
_validate_table_shape(table)

def test_are_elements_equal(self):
    # An element is never equal to a missing one, whichever side is missing.
    self.assertFalse(_are_elements_equal(create_pdf_element(), None))
    self.assertFalse(_are_elements_equal(None, create_pdf_element()))

    # Same font name and size, different text.
    text_a = create_pdf_element(
        text="fake_text_1", font_name="fake_font", font_size=10
    )
    text_b = create_pdf_element(
        text="fake_text_2", font_name="fake_font", font_size=10
    )
    self.assertFalse(_are_elements_equal(text_a, text_b))

    # Same text and font size, different font name.
    font_name_a = create_pdf_element(
        text="fake_text", font_name="fake_font_1", font_size=10
    )
    font_name_b = create_pdf_element(
        text="fake_text", font_name="fake_font_2", font_size=10
    )
    self.assertFalse(_are_elements_equal(font_name_a, font_name_b))

    # Same text and font name, different font size.
    font_size_a = create_pdf_element(
        text="fake_text", font_name="fake_font", font_size=10
    )
    font_size_b = create_pdf_element(
        text="fake_text", font_name="fake_font", font_size=12
    )
    self.assertFalse(_are_elements_equal(font_size_a, font_size_b))

    # Two missing elements are equal.
    self.assertTrue(_are_elements_equal(None, None))

    # Distinct elements with identical text, font name and font size are equal.
    twin_a = create_pdf_element(
        text="fake_text", font_name="fake_font", font_size=10
    )
    twin_b = create_pdf_element(
        text="fake_text", font_name="fake_font", font_size=10
    )
    self.assertTrue(_are_elements_equal(twin_a, twin_b))

0 comments on commit 0bda4c2

Please sign in to comment.