Merge pull request #57 from optimor/simple-table-update

[tables] Update extract simple table with option to allow gaps
jstockwin · Apr 14, 2020 · ce27e6b · ce27e6b
2 parents a6539e2 + 65950fd
commit ce27e6b
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- `extract_simple_table` now allows extracting tables with gaps, provided there is at least one full row and one full column. This is only the case if you pass `allow_gaps=True`; otherwise, the original behaviour of raising an exception when there is a gap remains. You can optionally pass a `reference_element`, which must be in both a full row and a full column; this defaults to the first (top-left) element. ([#57](https://github.com/optimor/py-pdf-parser/pull/57))
 
 ## [0.1.0] - 2019-04-08
 ### Added

diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py
@@ -19,26 +19,50 @@ def extract_simple_table(
     elements: "ElementList",
     as_text: bool = False,
     strip_text: bool = True,
+    allow_gaps: bool = False,
+    reference_element: Optional["PDFElement"] = None,
     tolerance: float = 0.0,
 ) -> List[List]:
     """
     Returns elements structured as a table.
 
     Given an ElementList, tries to extract a structured table by examining which
-    elements are aligned. To use this function, the table must contain no gaps, i.e.
-    should be a full N x M table with an element in each cell. There must be a clear
-    gap between each row and between each column which contains no elements, and
-    a single cell cannot contain multiple elements.
-
-    If your table has empty cells, you can use `extract_table` instead. If you fail
-    to satisfy any of the other conditions listed above, that case is not yet supported.
+    elements are aligned.
+
+    To use this function, there must be at least one full row and one full column (which
+    we call the reference row and column), i.e. the reference row must have an element
+    in every column, and the reference column must have an element in every row. The
+    reference row and column can be specified by passing the single element in both the
+    reference row and the reference column. By default, this is the top left element,
+    which means we use the first row and column as the references. Note if you need to
+    change the reference_element, that means you have gaps in your table, and as such
+    you will need to pass `allow_gaps=True`.
+
+    Important: This function uses the elements in the reference row and column to scan
+    horizontally and vertically to find the rest of the table. If there are gaps in your
+    reference row and column, this could result in rows and columns being missed by
+    this function.
+
+    There must be a clear gap between each row and between each column which contains no
+    elements, and a single cell cannot contain multiple elements.
+
+    If there are no valid reference rows or columns, try extract_table() instead. If you
+    have elements spanning multiple rows or columns, it may be possible to fix this by
+    using extract_table(). If you fail to satisfy any of the other conditions listed
+    above, that case is not yet supported.
 
     Args:
         elements (ElementList): A list of elements to extract into a table.
         as_text (bool, optional): Whether to extract the text from each element instead
             of the PDFElement itself. Default: False.
         strip_text (bool, optional): Whether to strip the text for each element of the
                 table (Only relevant if as_text is True). Default: True.
+        allow_gaps (bool, optional): Whether to allow empty spaces in the table.
+        reference_element (PDFElement, optional): An element in a full row and a full
+            column. Will be used to specify the reference row and column. If None, the
+            top left element will be used, meaning the top row and left column will be
+            used. If there are gaps in these, you should specify a different reference.
+            Default: None.
         tolerance (float, optional): For elements to be counted as in the same row or
             column, they must overlap by at least `tolerance`. Default: 0.
 
@@ -49,38 +73,52 @@ def extract_simple_table(
         list[list]: a list of rows, which are lists of PDFElements or strings
             (depending on the value of as_text).
     """
-    first_row = elements.to_the_right_of(
-        elements[0], inclusive=True, tolerance=tolerance
+    if reference_element is None:
+        reference_element = elements[0]
+    reference_row = elements.horizontally_in_line_with(
+        reference_element, inclusive=True, tolerance=tolerance
+    )
+    reference_column = elements.vertically_in_line_with(
+        reference_element, inclusive=True, tolerance=tolerance, all_pages=True
     )
-    first_column = elements.below(elements[0], inclusive=True, tolerance=tolerance)
 
     table: List[List] = []
-    for left_hand_element in first_column:
+    for reference_column_element in reference_column:
         row: List = []
-        for top_element in first_row:
-            element = elements.to_the_right_of(
-                left_hand_element, inclusive=True, tolerance=tolerance
-            ).below(top_element, inclusive=True, tolerance=tolerance)
+        for reference_row_element in reference_row:
+            element = elements.horizontally_in_line_with(
+                reference_column_element, inclusive=True, tolerance=tolerance
+            ).vertically_in_line_with(
+                reference_row_element,
+                inclusive=True,
+                tolerance=tolerance,
+                all_pages=True,
+            )
             try:
                 row.append(element.extract_single_element())
             except NoElementFoundError as err:
-                raise TableExtractionError(
-                    "Element not found, there appears to be a gap in the table. "
-                    "Please try extract_table() instead."
-                ) from err
+                if allow_gaps:
+                    row.append(None)
+                else:
+                    raise TableExtractionError(
+                        "Element not found, there appears to be a gap in the table. "
+                        "If this is expected, pass allow_gaps=True."
+                    ) from err
             except MultipleElementsFoundError as err:
                 raise TableExtractionError(
                     "Multiple elements appear to be in the place of one cell in the "
                     "table. Please try extract_table() instead."
                 ) from err
         table.append(row)
 
-    table_size = sum(len(row) for row in table)
+    table_size = sum(
+        len([element for element in row if element is not None]) for row in table
+    )
     if table_size != len(elements):
         raise TableExtractionError(
             f"Number of elements in table ({table_size}) does not match number of "
-            f"elements passed {len(elements)}. Perhaps try extract_table instead of "
-            "extract_simple_table."
+            f"elements passed ({len(elements)}). Perhaps try extract_table instead of "
+            "extract_simple_table, or change your reference element."
         )
 
     if as_text:
@@ -109,6 +147,9 @@ def extract_table(
     If you fail to satisfy any of the other conditions listed above, that case is not
     yet supported.
 
+    Note: If you satisfy the conditions to use extract_simple_table, then that should be
+    used instead, as it's much more efficient.
+
     Args:
         elements (ElementList): A list of elements to extract into a table.
         as_text (bool, optional): Whether to extract the text from each element instead

diff --git a/py_pdf_parser/tests/test_tables.py b/py_pdf_parser/tests/test_tables.py
@@ -52,6 +52,112 @@ def test_extract_simple_table(self):
         with self.assertRaises(TableExtractionError):
             extract_simple_table(elem_list)
 
+    def test_extract_simple_table_with_gaps(self):
+        #       elem_1      elem_2      elem_3
+        #       elem_4      elem_5
+        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
+        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
+        )
+        elem_list = document.elements
+        result = extract_simple_table(elem_list, allow_gaps=True)
+        self.assertEqual(len(result), 2)
+        self.assertEqual(len(result[0]), 3)
+        self.assertEqual(len(result[1]), 3)
+        self.assert_original_element_list_list_equal(
+            [[elem_1, elem_2, elem_3], [elem_4, elem_5, None]], result
+        )
+
+    def test_extract_simple_table_with_gaps_and_different_reference(self):
+        #       elem_1      elem_2      elem_3
+        #       elem_4      elem_5
+        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
+        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
+        )
+        elem_list = document.elements
+        reference_element = self.extract_element_from_list(elem_2, elem_list)
+        result = extract_simple_table(
+            elem_list, allow_gaps=True, reference_element=reference_element
+        )
+        self.assertEqual(len(result), 2)
+        self.assertEqual(len(result[0]), 3)
+        self.assertEqual(len(result[1]), 3)
+        self.assert_original_element_list_list_equal(
+            [[elem_1, elem_2, elem_3], [elem_4, elem_5, None]], result
+        )
+
+    def test_extract_simple_table_with_gaps_and_wrong_reference(self):
+        #       elem_1      elem_2      elem_3
+        #       elem_4      elem_5
+        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
+        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+        document = create_pdf_document(
+            elements=[elem_1, elem_2, elem_3, elem_4, elem_5]
+        )
+        elem_list = document.elements
+        reference_element = self.extract_element_from_list(elem_3, elem_list)
+        with self.assertRaises(TableExtractionError):
+            extract_simple_table(
+                elem_list, allow_gaps=True, reference_element=reference_element
+            )
+
+    def test_extract_simple_table_from_different_pages(self):
+        # Checks that simple 2*2 tables are correctly extracted from different pages
+        #
+        # Page 1:
+        #       elem_p1_1      elem_p1_2
+        #       elem_p1_3      elem_p1_4
+        #
+        # Page 2:
+        #       elem_p2_1      elem_p2_2
+        #       elem_p2_3      elem_p2_4
+        #
+        elem_p1_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_p1_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_p1_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_p1_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+
+        elem_p2_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
+        elem_p2_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
+        elem_p2_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
+        elem_p2_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
+
+        document = create_pdf_document(
+            elements={
+                1: [elem_p1_1, elem_p1_2, elem_p1_3, elem_p1_4],
+                2: [elem_p2_1, elem_p2_2, elem_p2_3, elem_p2_4],
+            }
+        )
+        elem_list = document.elements
+
+        result = extract_simple_table(elem_list)
+        self.assertEqual(len(result), 4)
+        self.assertEqual(len(result[0]), 2)
+        self.assertEqual(len(result[1]), 2)
+        self.assertEqual(len(result[2]), 2)
+        self.assertEqual(len(result[3]), 2)
+        self.assert_original_element_list_list_equal(
+            [
+                [elem_p1_1, elem_p1_2],
+                [elem_p1_3, elem_p1_4],
+                [elem_p2_1, elem_p2_2],
+                [elem_p2_3, elem_p2_4],
+            ],
+            result,
+        )
+
     def test_extract_simple_table_with_tolerance(self):
         # Checks that simple 2*2 table is correctly extracted
         #