Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

[MRG + 1] Create a new figure and test each plot type #127 #179

Merged
merged 9 commits into from
Nov 2, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion camelot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from .__version__ import __version__
from .io import read_pdf

from .plotting import plot_pdf

def _write_usage(self, prog, args='', prefix='Usage: '):
return self._write_usage('camelot', args, prefix=prefix)
Expand Down
5 changes: 3 additions & 2 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from . import __version__
from .io import read_pdf
from .plotting import plot_pdf


logger = logging.getLogger('camelot')
Expand Down Expand Up @@ -106,7 +107,7 @@ def lattice(c, *args, **kwargs):
click.echo('Found {} tables'.format(tables.n))
if plot_type is not None:
for table in tables:
table.plot(plot_type)
plot_pdf(table, plot_type)
else:
if output is None:
raise click.UsageError('Please specify output file path using --output')
Expand Down Expand Up @@ -152,7 +153,7 @@ def stream(c, *args, **kwargs):
click.echo('Found {} tables'.format(tables.n))
if plot_type is not None:
for table in tables:
table.plot(plot_type)
plot_pdf(table, plot_type)
else:
if output is None:
raise click.UsageError('Please specify output file path using --output')
Expand Down
29 changes: 0 additions & 29 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pandas as pd

from .plotting import *


class Cell(object):
"""Defines a cell in a table with coordinates relative to a
Expand Down Expand Up @@ -321,33 +319,6 @@ def set_span(self):
cell.hspan = True
return self

def plot(self, geometry_type):
    """Plot geometry found on PDF page based on geometry_type
    specified, useful for debugging and playing with different
    parameters to get the best output.

    Parameters
    ----------
    geometry_type : str
        The geometry type for which a plot should be generated.
        Can be 'text', 'table', 'contour', 'joint', 'line'

    Raises
    ------
    ValueError
        If geometry_type is not one of the geometry types listed above.
    NotImplementedError
        If geometry_type is 'contour', 'joint' or 'line' and this
        table's flavor is 'stream'.

    """
    # Reject unknown names up front; previously they fell through every
    # branch below and the call silently did nothing.
    if geometry_type not in ('text', 'table', 'contour', 'joint', 'line'):
        raise ValueError(
            "Unknown geometry_type {!r}, it should be one of 'text', 'table', "
            "'contour', 'joint', 'line'".format(geometry_type))

    if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
        raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
            geometry_type))

    if geometry_type == 'text':
        plot_text(self._text)
    elif geometry_type == 'table':
        plot_table(self)
    elif geometry_type == 'contour':
        plot_contour(self._image)
    elif geometry_type == 'joint':
        plot_joint(self._image)
    elif geometry_type == 'line':
        plot_line(self._segments)

def to_csv(self, path, **kwargs):
"""Writes Table to a comma-separated values (csv) file.

Expand Down
83 changes: 66 additions & 17 deletions camelot/plotting.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,42 @@
import cv2
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import matplotlib.patches as patches


def plot_pdf(table, geometry_type, filename=None):
    """Plot geometry found on PDF page based on geometry_type
    specified, useful for debugging and playing with different
    parameters to get the best output.

    Parameters
    ----------
    table : Table
        The table object to plot data from.
    geometry_type : str
        The geometry type for which a plot should be generated.
        Can be 'text', 'table', 'contour', 'joint', 'line'.
    filename : str, optional (default: None)
        If specified, saves the plot to a file with the given name
        before the plot is shown.

    Raises
    ------
    ValueError
        If geometry_type is not one of the geometry types listed above.
    NotImplementedError
        If geometry_type is 'contour', 'joint' or 'line' and the
        table's flavor is 'stream'.

    """
    # Reject unknown names before touching pyplot; otherwise nothing is
    # drawn but savefig/show below would still run on whatever figure
    # pyplot considers current.
    if geometry_type not in ('text', 'table', 'contour', 'joint', 'line'):
        raise ValueError(
            "Unknown geometry_type {!r}, it should be one of 'text', 'table', "
            "'contour', 'joint', 'line'".format(geometry_type))

    if table.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
        raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
            geometry_type))

    if geometry_type == 'text':
        plot_text(table._text)
    elif geometry_type == 'table':
        plot_table(table)
    elif geometry_type == 'contour':
        plot_contour(table._image)
    elif geometry_type == 'joint':
        plot_joint(table._image)
    elif geometry_type == 'line':
        plot_line(table._segments)

    # Save before showing, as in the original statement order.
    if filename:
        plt.savefig(filename)
    plt.show()


def plot_text(text):
"""Generates a plot for all text present on the PDF page.

Expand All @@ -26,7 +60,7 @@ def plot_text(text):
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
return fig


def plot_table(table):
Expand All @@ -37,21 +71,23 @@ def plot_table(table):
table : camelot.core.Table

"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
for row in table.cells:
for cell in row:
if cell.left:
plt.plot([cell.lb[0], cell.lt[0]],
ax.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]])
if cell.right:
plt.plot([cell.rb[0], cell.rt[0]],
ax.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]])
if cell.top:
plt.plot([cell.lt[0], cell.rt[0]],
ax.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]])
if cell.bottom:
plt.plot([cell.lb[0], cell.rb[0]],
ax.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]])
plt.show()
return fig
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@suyash458 Are you returning fig from the plot_* functions only for those additional asserts in the tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest-mpl's image comparison decorator requires the plot functions to return a matplotlib figure



def plot_contour(image):
Expand All @@ -64,11 +100,20 @@ def plot_contour(image):

"""
img, table_bbox = image
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 20)
plt.imshow(img)
plt.show()
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
fill=None,
edgecolor='red'
)
)
ax.imshow(img)
return fig


def plot_joint(image):
Expand All @@ -81,15 +126,17 @@ def plot_joint(image):

"""
img, table_bbox = image
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
plt.plot(x_coord, y_coord, 'ro')
plt.imshow(img)
plt.show()
ax.plot(x_coord, y_coord, 'ro')
ax.imshow(img)
return fig


def plot_line(segments):
Expand All @@ -100,9 +147,11 @@ def plot_line(segments):
segments : tuple

"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
vertical, horizontal = segments
for v in vertical:
plt.plot([v[0], v[2]], [v[1], v[3]])
ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
ax.plot([h[0], h[2]], [h[1], h[3]])
return fig
15 changes: 7 additions & 8 deletions docs/user/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ To process background lines, you can pass ``process_background=True``.
Plot geometry
-------------

You can use a :class:`table <camelot.core.Table>` object's :meth:`plot() <camelot.core.TableList.plot>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.

The following geometries are available for plotting. You can pass them to the :meth:`plot() <camelot.core.TableList.plot>` method, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry.
You can use camelot's :func:`plot_pdf() <camelot.plotting.plot_pdf>` function to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
The following geometries are available for plotting. You can pass them to the :func:`plot_pdf() <camelot.plotting.plot_pdf>` function, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry. The plot can be saved to a file by passing a ``filename`` argument to :func:`plot_pdf() <camelot.plotting.plot_pdf>`.

- 'text'
- 'table'
Expand All @@ -59,7 +58,7 @@ Let's plot all the text present on the table's PDF page.

::

>>> tables[0].plot('text')
>>> camelot.plot_pdf(tables[0], 'text')

.. figure:: ../_static/png/geometry_text.png
:height: 674
Expand All @@ -81,7 +80,7 @@ Let's plot the table (to see if it was detected correctly or not). This geometry

::

>>> tables[0].plot('table')
>>> camelot.plot_pdf(tables[0], 'table')

.. figure:: ../_static/png/geometry_table.png
:height: 674
Expand All @@ -101,7 +100,7 @@ Now, let's plot all table boundaries present on the table's PDF page.

::

>>> tables[0].plot('contour')
>>> camelot.plot_pdf(tables[0], 'contour')

.. figure:: ../_static/png/geometry_contour.png
:height: 674
Expand All @@ -119,7 +118,7 @@ Cool, let's plot all line segments present on the table's PDF page.

::

>>> tables[0].plot('line')
>>> camelot.plot_pdf(tables[0], 'line')

.. figure:: ../_static/png/geometry_line.png
:height: 674
Expand All @@ -137,7 +136,7 @@ Finally, let's plot all line intersections present on the table's PDF page.

::

>>> tables[0].plot('joint')
>>> camelot.plot_pdf(tables[0], 'joint')

.. figure:: ../_static/png/geometry_joint.png
:height: 674
Expand Down
3 changes: 3 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import matplotlib

matplotlib.use('Agg')
Binary file added tests/files/baseline_plots/test_contour_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/files/baseline_plots/test_joint_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/files/baseline_plots/test_line_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/files/baseline_plots/test_table_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/files/baseline_plots/test_text_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
75 changes: 75 additions & 0 deletions tests/test_plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-

import camelot
import os
import pytest

import matplotlib.pyplot as plt

from camelot.plotting import *


testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")


@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots")
def test_text_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
text = tables[0]._text
fig = plot_text(text)
ax = fig.axes[0]
xs, ys = [], []
for t in text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
assert ax.get_xlim() == (min(xs) - 10, max(xs) + 10)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@suyash458 Why do we need additional asserts here? Doesn't pytest compare the whole image which is a sure shot way of knowing whether the images are the same or not?

Copy link
Contributor Author

@suyashb95 suyashb95 Oct 31, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the image comparison tests are skipped because the --mpl option is not specified, these asserts would still run. Ultimately though, image comparison is a sure shot test.

Should the --mpl option be added to addopts in setup.cfg ?

Copy link
Contributor

@vinayak-mehta vinayak-mehta Oct 31, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and in the Makefile too. After that we shouldn't need to worry about additional asserts.

assert ax.get_ylim() == (min(ys) - 10, max(ys) + 10)
assert len(ax.patches) == len(text)
return fig


@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots")
def test_contour_plot():
    """Contour plot of foo.pdf has one patch per entry of table_bbox.

    The figure is returned so that the mpl_image_compare marker can
    use it.
    """
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    _, bboxes = first_table._image
    figure = plot_contour(first_table._image)
    drawn_patches = figure.axes[0].patches
    assert len(drawn_patches) == len(bboxes.keys())
    return figure


@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots")
def test_table_plot():
    """Table plot of foo.pdf has one line per set cell edge flag.

    The expected count is the total of the left/right/top/bottom flags
    over every cell of the first table. The figure is returned so that
    the mpl_image_compare marker can use it.
    """
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    expected_lines = 0
    for row in first_table.cells:
        for cell in row:
            expected_lines += sum([cell.left, cell.right, cell.top, cell.bottom])
    figure = plot_table(first_table)
    drawn_lines = figure.axes[0].lines
    assert len(drawn_lines) == expected_lines
    return figure


@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots")
def test_line_plot():
    """Line plot of foo.pdf has one line per vertical or horizontal segment.

    The figure is returned so that the mpl_image_compare marker can
    use it.
    """
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    vertical_segments, horizontal_segments = first_table._segments
    figure = plot_line(first_table._segments)
    drawn_lines = figure.axes[0].lines
    assert len(drawn_lines) == len(vertical_segments + horizontal_segments)
    return figure


@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots")
def test_joint_plot():
    """Joint plot of foo.pdf draws every joint in a single line artist.

    The figure is returned so that the mpl_image_compare marker can
    use it.
    """
    filename = os.path.join(testdir, "foo.pdf")
    tables = camelot.read_pdf(filename)
    _, table_bbox = tables[0]._image
    fig = plot_joint(tables[0]._image)
    ax = fig.axes[0]
    # plot_joint gathers the joints of every table area and draws them
    # with one plot() call, so there is exactly one line artist however
    # many tables were found. Comparing the artist count with the number
    # of tables only passed when a single table was detected.
    num_joints = sum(len(joints) for joints in table_bbox.values())
    assert len(ax.lines) == 1
    assert len(ax.lines[0].get_xdata()) == num_joints
    return fig