From 6247868d281ab5f1f1580c1a524aa294e70219b9 Mon Sep 17 00:00:00 2001 From: Paulo Amaral Date: Fri, 22 May 2020 11:53:22 +0100 Subject: [PATCH] [tables] Add flag to remove duplicate header rows --- py_pdf_parser/tables.py | 54 ++++++++++++ tests/test_tables.py | 180 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+) diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py index cbbeaf5a..4fbacd6d 100644 --- a/py_pdf_parser/tables.py +++ b/py_pdf_parser/tables.py @@ -22,6 +22,7 @@ def extract_simple_table( allow_gaps: bool = False, reference_element: Optional["PDFElement"] = None, tolerance: float = 0.0, + remove_duplicate_header_rows: bool = False, ) -> List[List]: """ Returns elements structured as a table. @@ -65,6 +66,8 @@ def extract_simple_table( Default: None. tolerance (int, optional): For elements to be counted as in the same row or column, they must overlap by at least `tolerance`. Default: 0. + remove_duplicate_header_rows (bool, optional): Remove duplicates of the header + row (the first row) if they exist. Default: False. Raises: TableExtractionError: If something goes wrong. @@ -125,6 +128,9 @@ def extract_simple_table( "extract_simple_table, or change you reference element." ) + if remove_duplicate_header_rows: + table = _remove_duplicate_header_rows(table) + if as_text: return get_text_from_table(table, strip_text=strip_text) @@ -139,6 +145,7 @@ def extract_table( fix_element_in_multiple_rows: bool = False, fix_element_in_multiple_cols: bool = False, tolerance: float = 0.0, + remove_duplicate_header_rows: bool = False, ) -> List[List]: """ Returns elements structured as a table. @@ -172,6 +179,8 @@ def extract_table( expect this to be the case in your table. Default: False. tolerance (int, optional): For elements to be counted as in the same row or column, they must overlap by at least `tolerance`. Default: 0. 
def _remove_duplicate_header_rows(table: List[List[Any]]) -> List[List[Any]]:
    """
    Removes rows which are duplicates of the header (i.e., the first) row.

    A row is considered a duplicate if it has the same number of cells as the header
    row and every cell matches the header cell at the same index: either both cells
    are None (a gap), or both are elements with the same text and the same font.

    Rows whose length differs from the header are never treated as duplicates, so
    ragged or empty rows are kept rather than being dropped or raising IndexError.

    Args:
        table (List[List[Any]]): The table to remove the duplicate headers from.

    Returns:
        List[List[Any]]: The table without the duplicate header rows. The header row
            itself is always kept. A table with at most one row is returned
            unchanged, since it cannot contain a duplicate of its own header.
    """
    if len(table) <= 1:
        return table

    def elements_equal(
        elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"]
    ) -> bool:
        if elem_1 is None or elem_2 is None:
            # A gap only matches another gap.
            return elem_1 is elem_2
        return elem_1.text() == elem_2.text() and elem_1.font == elem_2.font

    header = table[0]

    def is_duplicate_of_header(row: List[Any]) -> bool:
        # The length check must come first: zip() silently truncates to the shorter
        # sequence, which would otherwise make a short (or empty) row look like a
        # perfect match.
        return len(row) == len(header) and all(
            elements_equal(element, header_element)
            for element, header_element in zip(row, header)
        )

    return [header] + [row for row in table[1:] if not is_duplicate_of_header(row)]
+ self.assertEqual(len(result), 1) + self.assertEqual(len(result[0]), 2) + self.assert_original_element_list_list_equal( + [[header_elem_1, header_elem_2]], result + ) + + # header_elem_1 header_elem_2 + # elem_1 elem_2 + # header_elem_3 header_elem_4 + # elem_3 elem_4 + # header_elem_5 header_elem_6 + # + elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20)) + elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20)) + header_elem_3 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 11, 15), + ) + header_elem_4 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 11, 15), + ) + elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)) + elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)) + header_elem_5 = FakePDFMinerTextElement( + text="header 1", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(0, 5, 0, 5), + ) + header_elem_6 = FakePDFMinerTextElement( + text="header 2", + font_name="header font", + font_size=10, + bounding_box=BoundingBox(6, 10, 0, 5), + ) + + document = create_pdf_document( + elements=[ + header_elem_1, + header_elem_2, + elem_1, + elem_2, + header_elem_3, + header_elem_4, + elem_3, + elem_4, + header_elem_5, + header_elem_6, + ] + ) + elem_list = document.elements + + result = extract_simple_table(elem_list, remove_duplicate_header_rows=True) + self.assertEqual(len(result), 3) + self.assertEqual(len(result[0]), 2) + self.assertEqual(len(result[1]), 2) + self.assertEqual(len(result[2]), 2) + self.assert_original_element_list_list_equal( + [[header_elem_1, header_elem_2], [elem_1, elem_2], [elem_3, elem_4]], result + ) + def test_extract_table(self): # Checks that simple 2*2 table is correctly extracted # @@ -311,6 +397,100 @@ def test_extract_table_with_tolerance(self): [[elem_1, elem_2], [elem_3, elem_4]], result 
def test_extract_table_removing_duplicate_header_rows(self):
    # Checks that extract_table drops rows duplicating the header row when
    # remove_duplicate_header_rows=True, and keeps rows whose gaps differ from it.
    #
    # Single-row table:
    #
    #       header_elem_1                header_elem_2
    #
    header_elem_1 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 21, 25),
    )
    header_elem_2 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 21, 25),
    )
    document = create_pdf_document(elements=[header_elem_1, header_elem_2])
    elem_list = document.elements

    # This must exercise extract_table (not extract_simple_table), since that is
    # the function under test here.
    result = extract_table(elem_list, remove_duplicate_header_rows=True)
    # Extraction here should just return the whole table as it is not possible to
    # have duplicates of a single lined table.
    self.assertEqual(len(result), 1)
    self.assertEqual(len(result[0]), 2)
    self.assert_original_element_list_list_equal(
        [[header_elem_1, header_elem_2]], result
    )

    # Multi-row table with gaps (three columns once all rows are combined):
    #
    #       header_elem_1                header_elem_2
    #       elem_1         elem_2
    #       header_elem_3                header_elem_4
    #       elem_3                       elem_4
    #       header_elem_5  header_elem_6
    #
    elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20))
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 16, 20))
    header_elem_3 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 11, 15),
    )
    header_elem_4 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(11, 15, 11, 15),
    )
    elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    header_elem_5 = FakePDFMinerTextElement(
        text="header 1",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(0, 5, 0, 5),
    )
    header_elem_6 = FakePDFMinerTextElement(
        text="header 2",
        font_name="header font",
        font_size=10,
        bounding_box=BoundingBox(6, 10, 0, 5),
    )

    document = create_pdf_document(
        elements=[
            header_elem_1,
            header_elem_2,
            elem_1,
            elem_2,
            header_elem_3,
            header_elem_4,
            elem_3,
            elem_4,
            header_elem_5,
            header_elem_6,
        ]
    )
    elem_list = document.elements

    result = extract_table(elem_list, remove_duplicate_header_rows=True)
    # The last row will not be removed as the gaps do not match the header row
    self.assertEqual(len(result), 4)
    self.assertEqual(len(result[0]), 3)
    self.assertEqual(len(result[1]), 3)
    self.assertEqual(len(result[2]), 3)
    self.assertEqual(len(result[3]), 3)
    self.assert_original_element_list_list_equal(
        [
            [header_elem_1, None, header_elem_2],
            [elem_1, elem_2, None],
            [elem_3, None, elem_4],
            [header_elem_5, header_elem_6, None],
        ],
        result,
    )