Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[visualise] Draw rectangular section outlines when possible #80

Merged
merged 2 commits into from
May 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`.

### Added
- The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69))
- The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69)) (updated in [#80](https://github.com/jstockwin/py-pdf-parser/pull/80))

## [0.2.0] - 2020-04-17
### Added
Expand Down
155 changes: 100 additions & 55 deletions py_pdf_parser/visualise/sections.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, List, Tuple, Dict
from typing import TYPE_CHECKING, Optional, List, Tuple, Dict

from pyvoronoi import Pyvoronoi
from shapely import geometry, ops
Expand All @@ -10,10 +10,21 @@
from matplotlib.axes import Axes


# The simple boundary margins are used when trying to draw simple rectangles around
# sections - we try each of them in turn and use the first one that works. A higher
# margin means more space between the elements and the section boundary line.
SIMPLE_BOUNDARY_MARGINS = [10, 5, 2, 0]


class SectionVisualiser:
"""
Used internally to draw outlines of sections on the visualise plot.

We first try to draw a simple rectangle around the section with a fixed margin, for
increasingly small margins. If this doesn't work (because an element that is not
in the section would be within the section outline rectangle) then we instead
construct the boundary as follows:

We create a Voronoi diagram around all of the elements on the page, and the page
boundaries (actually we get a diagram around each side of the bounding box of each
element). Then for each line in the diagram we check if it was generated between one
Expand All @@ -26,14 +37,18 @@ class SectionVisualiser:

It can still produce some slightly interesting shapes, but does work fairly well.
Importantly, every element in the section will be within the outline, and no boxes
which are not in the section will be. Simply drawing a rectangle would not achieve
this.
which are not in the section will be (which cannot always be achieved by simply
drawing a rectangle around all the points in the section).

It does add some time when changing page on the visualise tool, but the whole
process is done in <0.5 seconds which is acceptable for a development tool.
"""

all_elements: List["PDFElement"]
document: "PDFDocument"
page: "PDFPage"
pv: Optional["Pyvoronoi"]
pv_segments: Optional[List]

__ax: "Axes"
__sections_by_page_number: Dict[int, List["Section"]]
Expand Down Expand Up @@ -75,31 +90,33 @@ def __get_segments_for_elements(self, elements: List["PDFElement"]) -> List:
for start, end in self.__get_segment_for_element(element)
]

def __get_element_boxes(self, elements: List["PDFElement"]):
    """
    Builds one shapely box per element, in the same order as `elements`, using the
    (x0, y0, x1, y1) corners of each element's bounding box.
    """
    boxes = []
    for element in elements:
        bbox = element.bounding_box
        boxes.append(geometry.box(bbox.x0, bbox.y0, bbox.x1, bbox.y1))
    return boxes

def __simplify_outlines(
self, line: geometry.LineString, all_elements: List["PDFElement"]
self, line: geometry.LineString
) -> Tuple[List[int], List[int]]:
"""
Simplifies the outline by considering each set of 3 consecutive vertices: if
there are no elements in the triangle they form, the middle vertex is removed from
the shape. This is done iteratively around the shape until no further changes are
made.
"""
# return line.xy
xs, ys = line.xy

# The last point is the same as the first point, which makes things a bit more
# complicated. We simply remove the last point and add it back at the end.
xs.pop(-1)
ys.pop(-1)
boxes = [
geometry.box(
element.bounding_box.x0,
element.bounding_box.y0,
element.bounding_box.x1,
element.bounding_box.y1,
)
for element in all_elements
]
boxes = self.__get_element_boxes(self.all_elements)
idx = 0
since_last_changed = 0
while since_last_changed <= len(xs) + 1:
Expand Down Expand Up @@ -132,15 +149,7 @@ def __simplify_outlines(
ys.append(ys[0])
return xs, ys

def __plot_edges(
self,
to_plot: List,
edges: List,
vertices: List,
pv: Pyvoronoi,
all_elements: List["PDFElement"],
label: str,
):
def __plot_edges(self, to_plot: List, edges: List, vertices: List, label: str):
lines = []
for edge_idx in to_plot:
edge = edges[edge_idx]
Expand All @@ -162,42 +171,36 @@ def __plot_edges(
# lines, or it is a LineString which means we only need to draw one.
if isinstance(merged_line, geometry.MultiLineString):
for line in merged_line:
xs, ys = self.__simplify_outlines(line, all_elements)
xs, ys = self.__simplify_outlines(line)
self.__ax.plot(xs, ys, **kwargs)
kwargs.pop(
"label", None
) # Only pass label once for single legend entry
else:
xs, ys = self.__simplify_outlines(merged_line, all_elements)
xs, ys = self.__simplify_outlines(merged_line)
self.__ax.plot(xs, ys, **kwargs)

def __plot_section(
self,
pv: Pyvoronoi,
all_segments: List,
all_elements: List["PDFElement"],
section: "Section",
page: "PDFPage",
):
edges = pv.GetEdges()
vertices = pv.GetVertices()
cells = pv.GetCells()
def __plot_section(self, section: "Section"):
if self.pv is None or self.pv_segments is None:
self.pv, self.pv_segments = self.__get_voronoi()
edges = self.pv.GetEdges()
vertices = self.pv.GetVertices()
cells = self.pv.GetCells()

# If an ignored element is within the section, we need to draw lines around it.
# The following code gets the first and last non-ignored elements in the section
# on the page, and then gets all elements between (inclusive) these elements,
# even if they are ignored.
section_elements_on_page = section.elements & page.elements
section_elements_on_page = section.elements & self.page.elements
section_elements = [
section.document._element_list[index]
for index in range(
section_elements_on_page[0]._index,
section_elements_on_page[-1]._index + 1,
)
]
# section_elements = section.elements
section_segments = self.__get_segments_for_elements(section_elements)
in_section = [point in section_segments for point in all_segments]
in_section = [point in section_segments for point in self.pv_segments]

to_plot = []
for idx, edge in enumerate(edges):
Expand All @@ -207,20 +210,16 @@ def __plot_section(
if in_section[first_segment] and not in_section[second_segment]:
to_plot.append(idx)

self.__plot_edges(
to_plot, edges, vertices, pv, all_elements, label=section.unique_name
)
self.__plot_edges(to_plot, edges, vertices, label=section.unique_name)

def __get_voronoi(
self, all_elements: List["PDFElement"], page: "PDFPage"
) -> Tuple[Pyvoronoi, List]:
all_segments = self.__get_segments_for_elements(all_elements)
def __get_voronoi(self) -> Tuple[Pyvoronoi, List]:
all_segments = self.__get_segments_for_elements(self.all_elements)
# Add the page boundary as segments:
all_segments += [
[(0, 0), (0, page.height)],
[(0, 0), (page.width, 0)],
[(0, page.height), (page.width, page.height)],
[(page.width, 0), (page.width, page.height)],
[(0, 0), (0, self.page.height)],
[(0, 0), (self.page.width, 0)],
[(0, self.page.height), (self.page.width, self.page.height)],
[(self.page.width, 0), (self.page.width, self.page.height)],
]

pv = Pyvoronoi(10)
Expand All @@ -230,7 +229,54 @@ def __get_voronoi(
pv.Construct()
return pv, all_segments

def __get_boundary_for_elements(
    self, elements: List["PDFElement"], margin: int
) -> Tuple[int, int, int, int]:
    """
    Returns the extent of the axis-aligned rectangle that covers the bounding boxes
    of all the given elements, grown by `margin` on every side.

    The result is ordered (x0, x1, y0, y1) - both x values first, then both y
    values - which differs from the (x0, y0, x1, y1) order used by geometry.box.
    """
    bounding_boxes = [element.bounding_box for element in elements]

    low_x = min(bbox.x0 for bbox in bounding_boxes) - margin
    high_x = max(bbox.x1 for bbox in bounding_boxes) + margin
    low_y = min(bbox.y0 for bbox in bounding_boxes) - margin
    high_y = max(bbox.y1 for bbox in bounding_boxes) + margin

    return low_x, high_x, low_y, high_y

def __plot_section_simple(self, section) -> bool:
    """
    Tries to outline the section with a plain rectangle.

    Each margin in SIMPLE_BOUNDARY_MARGINS is tried in order. The first rectangle
    which does not intersect the box of any element on the page that is outside the
    section is drawn on the axes. Returns True if a rectangle was drawn, or False if
    no margin produced a usable rectangle (nothing is drawn in that case).
    """
    elements_on_page = section.elements & self.page.elements
    other_elements = self.page.elements - elements_on_page
    other_boxes = self.__get_element_boxes(other_elements)

    chosen = None
    for margin in SIMPLE_BOUNDARY_MARGINS:
        x0, x1, y0, y1 = self.__get_boundary_for_elements(
            elements_on_page, margin=margin
        )
        candidate = geometry.box(x0, y0, x1, y1)

        # The candidate is only usable if it stays clear of every non-section box.
        if all(not box.intersects(candidate) for box in other_boxes):
            chosen = (x0, x1, y0, y1)
            break

    if chosen is None:
        # Every margin gave a rectangle touching a non-section element, so the
        # simple method cannot be used for this section.
        return False

    x0, x1, y0, y1 = chosen
    label = section.unique_name
    colour = self.__colour_mapping[label]

    # Trace the four corners and return to the first one to close the rectangle.
    outline_xs = [x0, x1, x1, x0, x0]
    outline_ys = [y0, y0, y1, y1, y0]
    self.__ax.plot(outline_xs, outline_ys, label=label, alpha=0.5, color=colour)

    return True

def plot_sections_for_page(self, page: "PDFPage"):
self.pv = None
self.pv_segments = None
self.page = page

sections = self.__get_sections_for_page(page)

if not sections:
Expand All @@ -242,15 +288,14 @@ def plot_sections_for_page(self, page: "PDFPage"):
range(page.start_element._index, page.end_element._index + 1)
)
ignored_indexes_on_page = page_indexes & self.document._ignored_indexes
all_elements = list(page.elements) + [
self.all_elements = list(page.elements) + [
self.document._element_list[index] for index in ignored_indexes_on_page
]

# Get the voronoi diagram
pv, all_segments = self.__get_voronoi(all_elements, page)

for section in sections:
self.__plot_section(pv, all_segments, all_elements, section, page)
plotted = self.__plot_section_simple(section)
if not plotted:
self.__plot_section(section)

# Show the legend
self.__ax.legend()