Skip to content

Commit

Permalink
Merge pull request #73 from jstockwin/cache-filtering-by-font
Browse files Browse the repository at this point in the history
[filtering] Cache filtering by fonts
  • Loading branch information
paulopaixaoamaral authored May 6, 2020
2 parents db527fa + 68e5590 commit 797f8b6
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Documentation is now hosted [here](https://py-pdf-parser.readthedocs.io/en/latest/).
- Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73))

### Changed
- This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`.
Expand Down
34 changes: 33 additions & 1 deletion py_pdf_parser/components.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Dict, List, Set, Optional, Union, TYPE_CHECKING

import re
from collections import Counter
from collections import Counter, defaultdict
from itertools import chain

from .common import BoundingBox
from .exceptions import PageNotFoundError, NoElementsOnPageError
Expand Down Expand Up @@ -338,6 +339,9 @@ class PDFDocument:
sectioning: "Sectioning"
# _element_list will contain all elements, sorted from top to bottom, left to right.
_element_list: List[PDFElement]
# _element_indexes_by_font caches, for each font, the indexes of the elements that
# use it. It is built lazily (while filtering by fonts), not on document load.
_element_indexes_by_font: Dict[str, Set[int]]
_ignored_indexes: Set[int]
_font_mapping: Dict[str, str]
_font_mapping_is_regex: bool
Expand All @@ -356,6 +360,7 @@ def __init__(
):
self.sectioning = Sectioning(self)
self._element_list = []
self._element_indexes_by_font = defaultdict(set)
self._font_mapping = font_mapping if font_mapping is not None else {}
self._font_mapping_is_regex = font_mapping_is_regex
self._regex_flags = regex_flags
Expand Down Expand Up @@ -442,3 +447,30 @@ def get_page(self, page_number: int) -> "PDFPage":
return self.__pages[page_number]
except KeyError as err:
raise PageNotFoundError(f"Could not find page {page_number}") from err

def _element_indexes_with_fonts(self, *fonts: str) -> Set[int]:
    """
    Returns the indexes of all elements whose font is one of the given fonts.

    For internal use only, used to cache fonts. If you want to filter by fonts you
    should use elements.filter_by_fonts instead.

    The font cache is filled lazily: the element list is only scanned when at
    least one of the requested fonts has no cache entry yet. A font that no
    element uses never gets an entry, so asking for it does not add an empty set
    to the cache.

    Args:
        *fonts (str): The fonts to filter for.

    Returns:
        Set[int]: The indexes of the elements using any of the given fonts.
    """
    cache = self._element_indexes_by_font

    # A set gives constant-time membership checks inside the element loop.
    non_cached_fonts = {font for font in fonts if font not in cache}
    if non_cached_fonts:
        # At least one requested font is missing from the cache: one pass over
        # the elements fills in the entries for all the missing fonts.
        for element in self._element_list:
            element_font = element.font
            if element_font in non_cached_fonts:
                cache[element_font].add(element._index)

    # Only combine the entries of the *requested* fonts. The cache may also hold
    # fonts from earlier calls, and those must not leak into this result.
    # `.get` is used instead of indexing so that a font without elements does not
    # create an empty entry in the defaultdict.
    return set(chain.from_iterable(cache.get(font, ()) for font in fonts))
5 changes: 2 additions & 3 deletions py_pdf_parser/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,7 @@ def filter_by_font(self, font: str) -> "ElementList":
Returns:
ElementList: The filtered list.
"""
new_indexes = set(element._index for element in self if element.font == font)
return ElementList(self.document, new_indexes)
return self.filter_by_fonts(font)

def filter_by_fonts(self, *fonts: str) -> "ElementList":
"""
Expand All @@ -213,7 +212,7 @@ def filter_by_fonts(self, *fonts: str) -> "ElementList":
Returns:
ElementList: The filtered list.
"""
new_indexes = set(element._index for element in self if element.font in fonts)
new_indexes = self.indexes & self.document._element_indexes_with_fonts(*fonts)
return ElementList(self.document, new_indexes)

def filter_by_page(self, page_number: int) -> "ElementList":
Expand Down
12 changes: 12 additions & 0 deletions py_pdf_parser/tests/test_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,17 @@ def test_filter_by_font(self):
self.assertEqual(len(doc.elements.filter_by_font("hello,1")), 0)

self.assertEqual(len(doc.elements.filter_by_font("foo,2")), 1)
# Check if "foo,2" has been added to cache
self.assertEqual(doc._element_indexes_by_font, {"foo,2": set([0])})
self.assert_original_element_in(elem1, doc.elements.filter_by_font("foo,2"))

doc = create_pdf_document([elem1, elem2], font_mapping={"foo,2": "font_a"})
self.assertEqual(len(doc.elements.filter_by_font("hello,1")), 0)
self.assertEqual(len(doc.elements.filter_by_font("foo,2")), 0)

self.assertEqual(len(doc.elements.filter_by_font("font_a")), 1)
# Check if "font_a" has been added to cache
self.assertEqual(doc._element_indexes_by_font, {"font_a": set([0])})
self.assert_original_element_in(elem1, doc.elements.filter_by_font("font_a"))

def test_filter_by_fonts(self):
Expand All @@ -175,6 +179,10 @@ def test_filter_by_fonts(self):
self.assertEqual(len(doc.elements.filter_by_fonts("hello,1")), 0)

self.assertEqual(len(doc.elements.filter_by_fonts("foo,2", "bar,3")), 2)
# Check if "foo,2" and "bar,3" have been added to cache
self.assertEqual(
doc._element_indexes_by_font, {"foo,2": set([0]), "bar,3": set([1])}
)
self.assert_original_element_in(
elem1, doc.elements.filter_by_fonts("foo,2", "bar,3")
)
Expand All @@ -190,6 +198,10 @@ def test_filter_by_fonts(self):
self.assertEqual(len(doc.elements.filter_by_fonts("foo,2", "bar,3")), 0)

self.assertEqual(len(doc.elements.filter_by_fonts("font_a", "font_b")), 2)
# Check if "font_a" and "font_b" have been added to cache
self.assertEqual(
doc._element_indexes_by_font, {"font_a": set([0]), "font_b": set([1])}
)
self.assert_original_element_in(
elem1, doc.elements.filter_by_fonts("font_a", "font_b")
)
Expand Down

0 comments on commit 797f8b6

Please sign in to comment.