diff --git a/camelot/__init__.py b/camelot/__init__.py index fd0e437b..d8a41b9f 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -6,7 +6,8 @@ from .__version__ import __version__ from .io import read_pdf -from .plotting import plot_pdf +from .plotting import plot + def _write_usage(self, prog, args='', prefix='Usage: '): return self._write_usage('camelot', args, prefix=prefix) diff --git a/camelot/cli.py b/camelot/cli.py index e5553f85..66657413 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -3,10 +3,11 @@ import logging import click +import matplotlib.pyplot as plt from . import __version__ from .io import read_pdf -from .plotting import plot_pdf +from .plotting import plot logger = logging.getLogger('camelot') @@ -81,7 +82,7 @@ def cli(ctx, *args, **kwargs): help='Number of times for erosion/dilation will be applied.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), - help='Plot geometry found on PDF page, for debugging.') + help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config def lattice(c, *args, **kwargs): @@ -107,7 +108,8 @@ def lattice(c, *args, **kwargs): click.echo('Found {} tables'.format(tables.n)) if plot_type is not None: for table in tables: - plot_pdf(table, plot_type) + plot(table, plot_type=plot_type) + plt.show() else: if output is None: raise click.UsageError('Please specify output file path using --output') @@ -128,7 +130,7 @@ def lattice(c, *args, **kwargs): ' used to combine text horizontally, to generate columns.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'table']), - help='Plot geometry found on PDF page for debugging.') + help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config def stream(c, *args, **kwargs): @@ -153,7 +155,8 @@ def stream(c, *args, **kwargs): click.echo('Found {} 
tables'.format(tables.n)) if plot_type is not None: for table in tables: - plot_pdf(table, plot_type) + plot(table, plot_type=plot_type) + plt.show() else: if output is None: raise click.UsageError('Please specify output file path using --output') diff --git a/camelot/handlers.py b/camelot/handlers.py index 6820cc7e..b35fa70c 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -130,9 +130,6 @@ def parse(self, flavor='lattice', **kwargs): ------- tables : camelot.core.TableList List of tables found in PDF. - geometry : camelot.core.GeometryList - List of geometry objects (contours, lines, joints) found - in PDF. """ tables = [] diff --git a/camelot/plotting.py b/camelot/plotting.py index 9f834f2b..73d5b37f 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -4,46 +4,56 @@ import matplotlib.patches as patches -def plot_pdf(table, geometry_type, filename=None): - """Plot geometry found on PDF page based on geometry_type - specified, useful for debugging and playing with different - parameters to get the best output. - - Parameters - ---------- - table: Table - The table object to plot data from - geometry_type : str - The geometry type for which a plot should be generated. 
- Can be 'text', 'table', 'contour', 'joint', 'line' - filename: str - If specified, saves the plot to a file with the given name - """ - if table.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']: - raise NotImplementedError("{} cannot be plotted with flavor='stream'".format( - geometry_type)) - if geometry_type == 'text': - plot_text(table._text) - elif geometry_type == 'table': - plot_table(table) - elif geometry_type == 'contour': - plot_contour(table._image) - elif geometry_type == 'joint': - plot_joint(table._image) - elif geometry_type == 'line': - plot_line(table._segments) - if filename: - plt.savefig(filename) - plt.show() +def plot(table, plot_type='text', filepath=None): + """Plot elements found on PDF page based on plot_type + specified, useful for debugging and playing with different + parameters to get the best output. + + Parameters + ---------- + table: Table + A Camelot Table. + plot_type : str, optional (default: 'text') + {'text', 'table', 'contour', 'joint', 'line'} + The element type for which a plot should be generated. + filepath: str, optional (default: None) + Absolute path for saving the generated plot. + + Returns + ------- + fig : matplotlib.figure.Figure + + """ + if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']: + raise NotImplementedError("{} cannot be plotted with flavor='stream'".format( + plot_type)) + if plot_type == 'text': + fig = plot_text(table._text) + elif plot_type == 'table': + fig = plot_table(table) + elif plot_type == 'contour': + fig = plot_contour(table._image) + elif plot_type == 'joint': + fig = plot_joint(table._image) + elif plot_type == 'line': + fig = plot_line(table._segments) + if filepath: + plt.savefig(filepath) + return fig def plot_text(text): - """Generates a plot for all text present on the PDF page. + """Generates a plot for all text elements present + on the PDF page. 
Parameters ---------- text : list + Returns + ------- + fig : matplotlib.figure.Figure + """ fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') @@ -64,12 +74,17 @@ def plot_text(text): def plot_table(table): - """Generates a plot for the table. + """Generates a plot for the detected tables + on the PDF page. Parameters ---------- table : camelot.core.Table + Returns + ------- + fig : matplotlib.figure.Figure + """ fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') @@ -91,13 +106,17 @@ def plot_table(table): def plot_contour(image): - """Generates a plot for all table boundaries present on the - PDF page. + """Generates a plot for all table boundaries present + on the PDF page. Parameters ---------- image : tuple + Returns + ------- + fig : matplotlib.figure.Figure + """ img, table_bbox = image fig = plt.figure() @@ -117,13 +136,17 @@ def plot_contour(image): def plot_joint(image): - """Generates a plot for all line intersections present on the - PDF page. + """Generates a plot for all line intersections present + on the PDF page. Parameters ---------- image : tuple + Returns + ------- + fig : matplotlib.figure.Figure + """ img, table_bbox = image fig = plt.figure() @@ -140,12 +163,17 @@ def plot_joint(image): def plot_line(segments): - """Generates a plot for all line segments present on the PDF page. + """Generates a plot for all line segments present + on the PDF page. Parameters ---------- segments : tuple + Returns + ------- + fig : matplotlib.figure.Figure + """ fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index f3c97321..e0a1d142 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -27,11 +27,12 @@ To process background lines, you can pass ``process_background=True``. .. 
csv-table:: :file: ../_static/csv/background_lines.csv -Plot geometry -------------- +Visual debugging +---------------- -You can use camelot's :meth:`plot_pdf() ` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. -The following geometries are available for plotting. You can pass them to the :meth:`plot_pdf() ` method, which will then generate a `matplotlib `_ plot for the passed geometry. The plot can be saved to a file by passing a ``filename`` param to :meth:`plot_pdf() ` +You can use the :meth:`plot() ` method to generate a `matplotlib `_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. + +You can specify the type of element you want to plot using the ``plot_type`` keyword argument. The generated plot can be saved to a file by passing a ``filepath`` keyword argument. The following plot types are supported: - 'text' - 'table' @@ -39,9 +40,9 @@ The following geometries are available for plotting. You can pass them to the :m - 'line' - 'joint' -.. note:: The last three geometries can only be used with :ref:`Lattice `, i.e. when ``flavor='lattice'``. +.. note:: The last three plot types can only be used with :ref:`Lattice `, i.e. when ``flavor='lattice'``. -Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. +Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. :: @@ -76,7 +77,7 @@ This, as we shall later see, is very helpful with :ref:`Stream ` for not table ^^^^^ -Let's plot the table (to see if it was detected correctly or not). 
This geometry type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) +Let's plot the table (to see if it was detected correctly or not). This plot type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) :: diff --git a/tests/__init__.py b/tests/__init__.py index 3e0f72c3..a946ff71 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,2 @@ -import matplotlib - -matplotlib.use('Agg') +import matplotlib +matplotlib.use('agg') diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 3ec4f093..e01cac6f 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- -import camelot import os + import pytest -from camelot.plotting import * +import camelot testdir = os.path.dirname(os.path.abspath(__file__)) @@ -16,23 +16,23 @@ def test_text_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) - return plot_text(tables[0]._text) + return camelot.plot(tables[0], plot_type='text') @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) -def test_contour_plot(): +def test_table_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) - return plot_contour(tables[0]._image) + return camelot.plot(tables[0], plot_type='table') @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) -def test_table_plot(): +def test_contour_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) - return plot_table(tables[0]) + return camelot.plot(tables[0], plot_type='contour') @pytest.mark.mpl_image_compare( @@ -40,7 +40,7 @@ def test_table_plot(): def test_line_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) - 
return plot_line(tables[0]._segments) + return camelot.plot(tables[0], plot_type='line') @pytest.mark.mpl_image_compare( @@ -48,4 +48,4 @@ def test_line_plot(): def test_joint_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) - return plot_joint(tables[0]._image) + return camelot.plot(tables[0], plot_type='joint')