diff --git a/CHANGELOG.md b/CHANGELOG.md index ba5ba5eb..ecb740c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`. ### Added -- The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69)) +- The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69)) (updated in [#80](https://github.com/jstockwin/py-pdf-parser/pull/80)) ## [0.2.0] - 2020-04-17 ### Added diff --git a/py_pdf_parser/visualise/sections.py b/py_pdf_parser/visualise/sections.py index 6bb3a72d..9b084422 100644 --- a/py_pdf_parser/visualise/sections.py +++ b/py_pdf_parser/visualise/sections.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, List, Tuple, Dict +from typing import TYPE_CHECKING, Optional, List, Tuple, Dict from pyvoronoi import Pyvoronoi from shapely import geometry, ops @@ -10,10 +10,21 @@ from matplotlib.axes import Axes +# The simple boundary margins are used when trying to draw simple rectangles around +# sections - we see if each of them work in turn. A higher margin means more space +# between the elements and the section boundary line. +SIMPLE_BOUNDARY_MARGINS = [10, 5, 2, 0] + + class SectionVisualiser: """ Used internally to draw outlines of sections on the visualise plot. + We first try to draw a simple rectangle around the section with a fixed margin, for + increasingly small margins. 
If this doesn't work (because an element that is not + in the section would be within the section outline rectangle) then we instead + construct the boundary as follows: + We create a Voronoi diagram around all of the elements on the page, and the page boundaries (actually we get a diagram around each side of the bounding box of each element). Then for each line in the diagram we check if it was generated between one @@ -26,14 +37,18 @@ class SectionVisualiser: It can still produce some slightly interesting shapes, but does work fairly well. Importantly, every element in the section will be within the outline, and no boxes - which are not in the section will be. Simply drawing a rectangle would not achieve - this. + which are not in the section will be (which cannot always be achieved by simply + drawing a rectangle around all the points in the section). It does add some time when changing page on the visualise tool, but the whole process is done in <0.5 seconds which is acceptable for a development tool. """ + all_elements: List["PDFElement"] document: "PDFDocument" + page: "PDFPage" + pv: Optional["Pyvoronoi"] + pv_segments: Optional[List] __ax: "Axes" __sections_by_page_number: Dict[int, List["Section"]] @@ -75,8 +90,19 @@ def __get_segments_for_elements(self, elements: List["PDFElement"]) -> List: for start, end in self.__get_segment_for_element(element) ] + def __get_element_boxes(self, elements: List["PDFElement"]): + return [ + geometry.box( + element.bounding_box.x0, + element.bounding_box.y0, + element.bounding_box.x1, + element.bounding_box.y1, + ) + for element in elements + ] + def __simplify_outlines( - self, line: geometry.LineString, all_elements: List["PDFElement"] + self, line: geometry.LineString ) -> Tuple[List[int], List[int]]: """ Simplified the outline by considering set of 3 consecutive vertices, and if @@ -84,22 +110,13 @@ def __simplify_outlines( shape. This is done iteratively around the shape until no further changes are made. 
""" - # return line.xy xs, ys = line.xy # The last point is the same as the first point, which makes things a bit more # complicated. We simply remove the last point and add it back at the end. xs.pop(-1) ys.pop(-1) - boxes = [ - geometry.box( - element.bounding_box.x0, - element.bounding_box.y0, - element.bounding_box.x1, - element.bounding_box.y1, - ) - for element in all_elements - ] + boxes = self.__get_element_boxes(self.all_elements) idx = 0 since_last_changed = 0 while since_last_changed <= len(xs) + 1: @@ -132,15 +149,7 @@ def __simplify_outlines( ys.append(ys[0]) return xs, ys - def __plot_edges( - self, - to_plot: List, - edges: List, - vertices: List, - pv: Pyvoronoi, - all_elements: List["PDFElement"], - label: str, - ): + def __plot_edges(self, to_plot: List, edges: List, vertices: List, label: str): lines = [] for edge_idx in to_plot: edge = edges[edge_idx] @@ -162,32 +171,27 @@ def __plot_edges( # lines, or it is a LineString which means we only need to draw one. if isinstance(merged_line, geometry.MultiLineString): for line in merged_line: - xs, ys = self.__simplify_outlines(line, all_elements) + xs, ys = self.__simplify_outlines(line) self.__ax.plot(xs, ys, **kwargs) kwargs.pop( "label", None ) # Only pass label once for single legend entry else: - xs, ys = self.__simplify_outlines(merged_line, all_elements) + xs, ys = self.__simplify_outlines(merged_line) self.__ax.plot(xs, ys, **kwargs) - def __plot_section( - self, - pv: Pyvoronoi, - all_segments: List, - all_elements: List["PDFElement"], - section: "Section", - page: "PDFPage", - ): - edges = pv.GetEdges() - vertices = pv.GetVertices() - cells = pv.GetCells() + def __plot_section(self, section: "Section"): + if self.pv is None or self.pv_segments is None: + self.pv, self.pv_segments = self.__get_voronoi() + edges = self.pv.GetEdges() + vertices = self.pv.GetVertices() + cells = self.pv.GetCells() # If an ignored element is within the section, we need to draw lines around it. 
# The following code gets the first and last non-ignored elements in the section # on the page, and then gets all elements between (inclusive) these elements, # even if they are ignored. - section_elements_on_page = section.elements & page.elements + section_elements_on_page = section.elements & self.page.elements section_elements = [ section.document._element_list[index] for index in range( @@ -195,9 +199,8 @@ def __plot_section( section_elements_on_page[-1]._index + 1, ) ] - # section_elements = section.elements section_segments = self.__get_segments_for_elements(section_elements) - in_section = [point in section_segments for point in all_segments] + in_section = [point in section_segments for point in self.pv_segments] to_plot = [] for idx, edge in enumerate(edges): @@ -207,20 +210,16 @@ def __plot_section( if in_section[first_segment] and not in_section[second_segment]: to_plot.append(idx) - self.__plot_edges( - to_plot, edges, vertices, pv, all_elements, label=section.unique_name - ) + self.__plot_edges(to_plot, edges, vertices, label=section.unique_name) - def __get_voronoi( - self, all_elements: List["PDFElement"], page: "PDFPage" - ) -> Tuple[Pyvoronoi, List]: - all_segments = self.__get_segments_for_elements(all_elements) + def __get_voronoi(self) -> Tuple[Pyvoronoi, List]: + all_segments = self.__get_segments_for_elements(self.all_elements) # Add the page boundary as segments: all_segments += [ - [(0, 0), (0, page.height)], - [(0, 0), (page.width, 0)], - [(0, page.height), (page.width, page.height)], - [(page.width, 0), (page.width, page.height)], + [(0, 0), (0, self.page.height)], + [(0, 0), (self.page.width, 0)], + [(0, self.page.height), (self.page.width, self.page.height)], + [(self.page.width, 0), (self.page.width, self.page.height)], ] pv = Pyvoronoi(10) @@ -230,7 +229,54 @@ def __get_voronoi( pv.Construct() return pv, all_segments + def __get_boundary_for_elements( + self, elements: List["PDFElement"], margin: int + ) -> Tuple[int, int, int, int]: 
+ x0s = [element.bounding_box.x0 for element in elements] + x1s = [element.bounding_box.x1 for element in elements] + y0s = [element.bounding_box.y0 for element in elements] + y1s = [element.bounding_box.y1 for element in elements] + + x0 = min(x0s) - margin + x1 = max(x1s) + margin + y0 = min(y0s) - margin + y1 = max(y1s) + margin + + return x0, x1, y0, y1 + + def __plot_section_simple(self, section) -> bool: + section_elements_on_page = section.elements & self.page.elements + non_section_elements = self.page.elements - section_elements_on_page + boxes = self.__get_element_boxes(non_section_elements) + + for margin in SIMPLE_BOUNDARY_MARGINS: + x0, x1, y0, y1 = self.__get_boundary_for_elements( + section_elements_on_page, margin=margin + ) + + boundary = geometry.box(x0, y0, x1, y1) + + if not any(box.intersects(boundary) for box in boxes): + # No elements outside of the section are within this boundary, and as + # such we can simply draw this boundary as the section outline. Break. + break + else: + # None of the margins gave us a box which did not contain any non-section + # elements. We cannot use the simple method. 
+ return False + + label = section.unique_name + + kwargs = {"label": label, "alpha": 0.5, "color": self.__colour_mapping[label]} + self.__ax.plot([x0, x1, x1, x0, x0], [y0, y0, y1, y1, y0], **kwargs) + + return True + def plot_sections_for_page(self, page: "PDFPage"): + self.pv = None + self.pv_segments = None + self.page = page + sections = self.__get_sections_for_page(page) if not sections: @@ -242,15 +288,14 @@ def plot_sections_for_page(self, page: "PDFPage"): range(page.start_element._index, page.end_element._index + 1) ) ignored_indexes_on_page = page_indexes & self.document._ignored_indexes - all_elements = list(page.elements) + [ + self.all_elements = list(page.elements) + [ self.document._element_list[index] for index in ignored_indexes_on_page ] - # Get the voronoi diagram - pv, all_segments = self.__get_voronoi(all_elements, page) - for section in sections: - self.__plot_section(pv, all_segments, all_elements, section, page) + plotted = self.__plot_section_simple(section) + if not plotted: + self.__plot_section(section) # Show the legend self.__ax.legend()