diff --git a/CHANGELOG.md b/CHANGELOG.md index f8fb341d..dd5dbecb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89)) ### Changed - Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88)) diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py index cbbeaf5a..fca54b01 100644 --- a/py_pdf_parser/tables.py +++ b/py_pdf_parser/tables.py @@ -22,6 +22,7 @@ def extract_simple_table( allow_gaps: bool = False, reference_element: Optional["PDFElement"] = None, tolerance: float = 0.0, + remove_duplicate_header_rows: bool = False, ) -> List[List]: """ Returns elements structured as a table. @@ -65,6 +66,8 @@ def extract_simple_table( Default: None. tolerance (int, optional): For elements to be counted as in the same row or column, they must overlap by at least `tolerance`. Default: 0. + remove_duplicate_header_rows (bool, optional): Remove duplicates of the header + row (the first row) if they exist. Default: False. Raises: TableExtractionError: If something goes wrong. @@ -125,6 +128,9 @@ def extract_simple_table( "extract_simple_table, or change you reference element." ) + if remove_duplicate_header_rows: + table = _remove_duplicate_header_rows(table) + if as_text: return get_text_from_table(table, strip_text=strip_text) @@ -139,6 +145,7 @@ def extract_table( fix_element_in_multiple_rows: bool = False, fix_element_in_multiple_cols: bool = False, tolerance: float = 0.0, + remove_duplicate_header_rows: bool = False, ) -> List[List]: """ Returns elements structured as a table. @@ -172,6 +179,8 @@ def extract_table( expect this to be the case in your table. Default: False. tolerance (int, optional): For elements to be counted as in the same row or column, they must overlap by at least `tolerance`. Default: 0. + remove_duplicate_header_rows (bool, optional): Remove duplicates of the header + row (the first row) if they exist. Default: False. Raises: TableExtractionError: If something goes wrong. @@ -235,6 +244,9 @@ def extract_table( table_row.append(element) table.append(table_row) + if remove_duplicate_header_rows: + table = _remove_duplicate_header_rows(table) + if as_text: return get_text_from_table(table, strip_text=strip_text) @@ -431,3 +443,57 @@ def _fix_cols(cols: Set["ElementList"], elements: "ElementList") -> None: else: sorted_columns.remove(col) return + + +def _remove_duplicate_header_rows(table: List[List[Any]]) -> List[List[Any]]: + """ + Removes rows which are duplicates of the header (i.e., the first) row. + A row is considered duplicate if all of its elements have the same text and font of + their correspondent elements (i.e., same index) in the header row. + + Args: + table (List[List[Any]]): The table to remove the duplicate headers from. + + Returns: + List[List[Any]]: The table without the duplicate header rows. + """ + if len(table) <= 1: + return table + + header = table[0] + rows_without_duplicate_header = [ + row + for row in table[1:] + if any( + not _are_elements_equal(element, header[index]) + for index, element in enumerate(row) + ) + ] + return [header] + rows_without_duplicate_header + + +def _are_elements_equal( + elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"] +) -> bool: + """ + Checks if two elements are equal. + Two elements are considered equal if they are both None or they have the same text + and font. + + Args: + elem_1 (PDFElement, optional): The first element to compare. + elem_2 (PDFElement, optional): The second element to compare. + + Returns: + bool: True if elements are equal, False otherwise. + """ + if elem_1 is None and elem_2 is None: + return True + + if elem_1 is None or elem_2 is None: + return False + + if elem_1.text() != elem_2.text() or elem_1.font != elem_2.font: + return False + + return True diff --git a/tests/test_tables.py b/tests/test_tables.py index 82491719..1f00fcb2 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -10,6 +10,7 @@ get_text_from_table, _validate_table_shape, add_header_to_table, + _are_elements_equal, ) from .base import BaseTestCase @@ -184,6 +185,162 @@ def test_extract_simple_table_with_tolerance(self): [[elem_1, elem_2], [elem_3, elem_4]], result ) + def test_extract_simple_table_removing_duplicate_header_rows(self): + # header_elem_1 header_elem_2 + header_elem_1 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 21, 25), + ) + header_elem_2 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 21, 25), + ) + document = create_pdf_document(elements=[header_elem_1, header_elem_2]) + elem_list = document.elements + + result = extract_simple_table(elem_list, remove_duplicate_header_rows=True) + # Extraction here should just return the whole table as it is not possible to + # have duplicates of a single lined table. + self.assertEqual(len(result), 1) + self.assertEqual(len(result[0]), 2) + self.assert_original_element_list_list_equal( + [[header_elem_1, header_elem_2]], result + ) + + # header_elem_1 header_elem_2 + # elem_1 elem_2 + # header_elem_3 header_elem_4 + # elem_3 elem_4 + # header_elem_5 header_elem_6 + # + elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20)) + elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20)) + header_elem_3 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 11, 15), + ) + header_elem_4 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 11, 15), + ) + elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)) + elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)) + header_elem_5 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 0, 5), + ) + header_elem_6 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 0, 5), + ) + + document = create_pdf_document( + elements=[ + header_elem_1, + header_elem_2, + elem_1, + elem_2, + header_elem_3, + header_elem_4, + elem_3, + elem_4, + header_elem_5, + header_elem_6, + ] + ) + elem_list = document.elements + + result = extract_simple_table(elem_list, remove_duplicate_header_rows=True) + self.assertEqual(len(result), 3) + self.assertEqual(len(result[0]), 2) + self.assertEqual(len(result[1]), 2) + self.assertEqual(len(result[2]), 2) + self.assert_original_element_list_list_equal( + [[header_elem_1, header_elem_2], [elem_1, elem_2], [elem_3, elem_4]], result + ) + + def test_extract_simple_table_removing_duplicate_header_different_fonts_or_text( + self, + ): + # header_elem_1 header_elem_2 + # header_elem_3_different_font header_elem_4 + # header_elem_5_different_text header_elem_6 + # + header_elem_1 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 21, 25), + ) + header_elem_2 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 21, 25), + ) + header_elem_3_different_font = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=12, + bounding_box=BoundingBox(0, 5, 16, 20), + ) + header_elem_4 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 16, 20), + ) + header_elem_5_different_text = FakePDFMinerTextElement( + text="header with a different name", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 11, 15), + ) + header_elem_6 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 11, 15), + ) + + document = create_pdf_document( + elements=[ + header_elem_1, + header_elem_2, + header_elem_3_different_font, + header_elem_4, + header_elem_5_different_text, + header_elem_6, + ] + ) + elem_list = document.elements + + result = extract_simple_table(elem_list, remove_duplicate_header_rows=True) + self.assertEqual(len(result), 3) + self.assertEqual(len(result[0]), 2) + self.assertEqual(len(result[1]), 2) + self.assertEqual(len(result[2]), 2) + self.assert_original_element_list_list_equal( + [ + [header_elem_1, header_elem_2], + [header_elem_3_different_font, header_elem_4], + [header_elem_5_different_text, header_elem_6], + ], + result, + ) + def test_extract_table(self): # Checks that simple 2*2 table is correctly extracted # @@ -311,6 +468,168 @@ def test_extract_table_with_tolerance(self): [[elem_1, elem_2], [elem_3, elem_4]], result ) + def test_extract_table_removing_duplicate_header_rows(self): + # header_elem_1 header_elem_2 + header_elem_1 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 21, 25), + ) + header_elem_2 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 21, 25), + ) + document = create_pdf_document(elements=[header_elem_1, header_elem_2]) + elem_list = document.elements + + result = extract_table(elem_list, remove_duplicate_header_rows=True) + # Extraction here should just return the whole table as it is not possible to + # have duplicates of a single lined table. + self.assertEqual(len(result), 1) + self.assertEqual(len(result[0]), 2) + self.assert_original_element_list_list_equal( + [[header_elem_1, header_elem_2]], result + ) + + # header_elem_1 header_elem_2 + # elem_1 elem_2 + # header_elem_3 header_elem_4 + # elem_3 elem_4 + # header_elem_5 header_elem_6 + # + elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20)) + elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20)) + header_elem_3 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 11, 15), + ) + header_elem_4 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 11, 15), + ) + elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)) + elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10)) + header_elem_5 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 0, 5), + ) + header_elem_6 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 0, 5), + ) + + document = create_pdf_document( + elements=[ + header_elem_1, + header_elem_2, + elem_1, + elem_2, + header_elem_3, + header_elem_4, + elem_3, + elem_4, + header_elem_5, + header_elem_6, + ] + ) + elem_list = document.elements + + result = extract_table(elem_list, remove_duplicate_header_rows=True) + # The last row will not be removed as the gaps do not match the header row + self.assertEqual(len(result), 4) + self.assertEqual(len(result[0]), 3) + self.assertEqual(len(result[1]), 3) + self.assertEqual(len(result[2]), 3) + self.assertEqual(len(result[3]), 3) + self.assert_original_element_list_list_equal( + [ + [header_elem_1, None, header_elem_2], + [elem_1, elem_2, None], + [elem_3, None, elem_4], + [header_elem_5, header_elem_6, None], + ], + result, + ) + + def test_extract_table_removing_duplicate_header_different_fonts_or_text(self): + # header_elem_1 header_elem_2 + # header_elem_3_different_font header_elem_4 + # header_elem_5_different_text header_elem_6 + # + header_elem_1 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 21, 25), + ) + header_elem_2 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 21, 25), + ) + header_elem_3_different_font = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=12, + bounding_box=BoundingBox(0, 5, 16, 20), + ) + header_elem_4 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 16, 20), + ) + header_elem_5_different_text = FakePDFMinerTextElement( + text="header with a different name", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 11, 15), + ) + header_elem_6 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(11, 15, 11, 15), + ) + + document = create_pdf_document( + elements=[ + header_elem_1, + header_elem_2, + header_elem_3_different_font, + header_elem_4, + header_elem_5_different_text, + header_elem_6, + ] + ) + elem_list = document.elements + + result = extract_table(elem_list, remove_duplicate_header_rows=True) + self.assertEqual(len(result), 3) + self.assertEqual(len(result[0]), 2) + self.assertEqual(len(result[1]), 2) + self.assertEqual(len(result[2]), 2) + self.assert_original_element_list_list_equal( + [ + [header_elem_1, header_elem_2], + [header_elem_3_different_font, header_elem_4], + [header_elem_5_different_text, header_elem_6], + ], + result, + ) + def test_extract_text_from_simple_table(self): # Checks that text from simple 2*2 table is correctly extracted # @@ -584,3 +903,41 @@ def test_validate_table_shape(self): table = [[""], ["", ""]] with self.assertRaises(InvalidTableError): _validate_table_shape(table) + + def test_are_elements_equal(self): + self.assertFalse((_are_elements_equal(create_pdf_element(), None))) + self.assertFalse((_are_elements_equal(None, create_pdf_element()))) + + element_1 = create_pdf_element( + text="fake_text_1", font_name="fake_font", font_size=10 + ) + element_2 = create_pdf_element( + text="fake_text_2", font_name="fake_font", font_size=10 + ) + self.assertFalse(_are_elements_equal(element_1, element_2)) + + element_1 = create_pdf_element( + text="fake_text", font_name="fake_font_1", font_size=10 + ) + element_2 = create_pdf_element( + text="fake_text", font_name="fake_font_2", font_size=10 + ) + self.assertFalse(_are_elements_equal(element_1, element_2)) + + element_1 = create_pdf_element( + text="fake_text", font_name="fake_font", font_size=10 + ) + element_2 = create_pdf_element( + text="fake_text", font_name="fake_font", font_size=12 + ) + self.assertFalse(_are_elements_equal(element_1, element_2)) + + self.assertTrue(_are_elements_equal(None, None)) + + element_1 = create_pdf_element( + text="fake_text", font_name="fake_font", font_size=10 + ) + element_2 = create_pdf_element( + text="fake_text", font_name="fake_font", font_size=10 + ) + self.assertTrue(_are_elements_equal(element_1, element_2))