Skip to content

Commit

Permalink
Merge pull request #89 from jstockwin/remove-duplicate-header-rows
Browse files Browse the repository at this point in the history
[tables] Add flag to remove duplicate header rows
  • Loading branch information
paulopaixaoamaral authored May 22, 2020
2 parents 2750650 + 0bda4c2 commit b4a61a8
Show file tree
Hide file tree
Showing 3 changed files with 425 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89))
### Changed
- Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88))

Expand Down
66 changes: 66 additions & 0 deletions py_pdf_parser/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def extract_simple_table(
allow_gaps: bool = False,
reference_element: Optional["PDFElement"] = None,
tolerance: float = 0.0,
remove_duplicate_header_rows: bool = False,
) -> List[List]:
"""
Returns elements structured as a table.
Expand Down Expand Up @@ -65,6 +66,8 @@ def extract_simple_table(
Default: None.
tolerance (int, optional): For elements to be counted as in the same row or
column, they must overlap by at least `tolerance`. Default: 0.
remove_duplicate_header_rows (bool, optional): Remove duplicates of the header
row (the first row) if they exist. Default: False.
Raises:
TableExtractionError: If something goes wrong.
Expand Down Expand Up @@ -125,6 +128,9 @@ def extract_simple_table(
"extract_simple_table, or change you reference element."
)

if remove_duplicate_header_rows:
table = _remove_duplicate_header_rows(table)

if as_text:
return get_text_from_table(table, strip_text=strip_text)

Expand All @@ -139,6 +145,7 @@ def extract_table(
fix_element_in_multiple_rows: bool = False,
fix_element_in_multiple_cols: bool = False,
tolerance: float = 0.0,
remove_duplicate_header_rows: bool = False,
) -> List[List]:
"""
Returns elements structured as a table.
Expand Down Expand Up @@ -172,6 +179,8 @@ def extract_table(
expect this to be the case in your table. Default: False.
tolerance (int, optional): For elements to be counted as in the same row or
column, they must overlap by at least `tolerance`. Default: 0.
remove_duplicate_header_rows (bool, optional): Remove duplicates of the header
row (the first row) if they exist. Default: False.
Raises:
TableExtractionError: If something goes wrong.
Expand Down Expand Up @@ -235,6 +244,9 @@ def extract_table(
table_row.append(element)
table.append(table_row)

if remove_duplicate_header_rows:
table = _remove_duplicate_header_rows(table)

if as_text:
return get_text_from_table(table, strip_text=strip_text)

Expand Down Expand Up @@ -431,3 +443,57 @@ def _fix_cols(cols: Set["ElementList"], elements: "ElementList") -> None:
else:
sorted_columns.remove(col)
return


def _remove_duplicate_header_rows(table: List[List[Any]]) -> List[List[Any]]:
"""
Removes rows which are duplicates of the header (i.e., the first) row.
A row is considered duplicate if all of its elements have the same text and font of
their correspondent elements (i.e., same index) in the header row.
Args:
table (List[List[Any]]): The table to remove the duplicate headers from.
Returns:
List[List[Any]]: The table without the duplicate header rows.
"""
if len(table) <= 1:
return table

header = table[0]
rows_without_duplicate_header = [
row
for row in table[1:]
if any(
not _are_elements_equal(element, header[index])
for index, element in enumerate(row)
)
]
return [header] + rows_without_duplicate_header


def _are_elements_equal(
elem_1: Optional["PDFElement"], elem_2: Optional["PDFElement"]
) -> bool:
"""
Checks if two elements are equal.
Two elements are considered equal if they are both None or they have the same text
and font.
Args:
elem_1 (PDFElement, optional): The first element to compare.
elem_2 (PDFElement, optional): The second element to compare.
Returns:
bool: True if elements are equal, False otherwise.
"""
if elem_1 is None and elem_2 is None:
return True

if elem_1 is None or elem_2 is None:
return False

if elem_1.text() != elem_2.text() or elem_1.font != elem_2.font:
return False

return True
Loading

0 comments on commit b4a61a8

Please sign in to comment.