Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

Commit

Permalink
[MRG] Add error/warning tests (#111)
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayak-mehta committed Oct 2, 2018
1 parent 09ed772 commit 69f6c25
Show file tree
Hide file tree
Showing 13 changed files with 94 additions and 143 deletions.
14 changes: 14 additions & 0 deletions camelot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# -*- coding: utf-8 -*-

import logging


# set up logging
# Package-wide logger: other modules obtain this same object by calling
# logging.getLogger('camelot'), so the handler attached below formats their
# records as well.
logger = logging.getLogger('camelot')

# Record layout is "<timestamp> - <level> - <message>"; datefmt renders the
# timestamp as YYYY-MM-DDTHH:MM:SS.
format_string = '%(asctime)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
# StreamHandler() with no stream argument writes to sys.stderr.
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# No level is set on the logger or the handler here, so which records are
# emitted depends on whoever calls logger.setLevel(...) later.
# NOTE(review): this attaches a real handler at import time; the Python
# logging HOWTO suggests libraries attach only a NullHandler and leave
# output configuration to the application - confirm this is intended.
logger.addHandler(handler)


from .__version__ import __version__

from .io import read_pdf
7 changes: 5 additions & 2 deletions camelot/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-

from pprint import pprint
import logging

logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)

import click

Expand Down Expand Up @@ -38,7 +41,7 @@ def set_config(self, key, value):
def cli(ctx, *args, **kwargs):
"""Camelot: PDF Table Extraction for Humans"""
ctx.obj = Config()
for key, value in kwargs.iteritems():
for key, value in kwargs.items():
ctx.obj.set_config(key, value)


Expand Down
12 changes: 0 additions & 12 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,18 +447,6 @@ def __len__(self):
def __getitem__(self, idx):
return self._tables[idx]

def __iter__(self):
self._n = 0
return self

def next(self):
if self._n < len(self):
r = self._tables[self._n]
self._n += 1
return r
else:
raise StopIteration

@staticmethod
def _format_func(table, f):
return getattr(table, 'to_{}'.format(f))
Expand Down
2 changes: 1 addition & 1 deletion camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class PDFHandler(object):
def __init__(self, filename, pages='1'):
self.filename = filename
if not self.filename.endswith('.pdf'):
raise TypeError("File format not supported.")
raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages)

def _get_pages(self, filename, pages):
Expand Down
9 changes: 5 additions & 4 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import copy
import logging
import warnings
import subprocess

import numpy as np
Expand All @@ -13,12 +14,12 @@
from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace, setup_logging)
compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints)


logger = setup_logging(__name__)
logger = logging.getLogger('camelot')


class Lattice(BaseParser):
Expand Down Expand Up @@ -305,11 +306,11 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
return table

def extract_tables(self, filename):
logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename)
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

if not self.horizontal_text:
logger.info("No tables found on {}".format(
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
return []

Expand Down
11 changes: 6 additions & 5 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@
from __future__ import division
import os
import logging
import warnings

import numpy as np
import pandas as pd

from .base import BaseParser
from ..core import Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace, setup_logging)
compute_whitespace)


logger = setup_logging(__name__)
logger = logging.getLogger('camelot')


class Stream(BaseParser):
Expand Down Expand Up @@ -287,7 +288,7 @@ def _generate_columns_and_rows(self, table_idx, tk):
else:
ncols = max(set(elements), key=elements.count)
if ncols == 1:
logger.info("No tables found on {}".format(
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
Expand Down Expand Up @@ -344,11 +345,11 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
return table

def extract_tables(self, filename):
logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename)
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

if not self.horizontal_text:
logger.info("No tables found on {}".format(
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
return []

Expand Down
125 changes: 6 additions & 119 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import division
import os
import shutil
import logging
import tempfile
import warnings
from itertools import groupby
from operator import itemgetter

Expand Down Expand Up @@ -38,7 +38,7 @@
]


def validate_input(kwargs, flavor='lattice', geometry_type=False):
def validate_input(kwargs, flavor='lattice'):
def check_intersection(parser_kwargs, input_kwargs):
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
if isec:
Expand All @@ -49,10 +49,6 @@ def check_intersection(parser_kwargs, input_kwargs):
check_intersection(stream_kwargs, kwargs)
else:
check_intersection(lattice_kwargs, kwargs)
if geometry_type:
if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
geometry_type))


def remove_extra(kwargs, flavor='lattice'):
Expand All @@ -77,35 +73,6 @@ def __exit__(self, exc_type, exc_value, traceback):
shutil.rmtree(self.name)


def setup_logging(name):
    """Return the logger called ``name`` with a formatted StreamHandler.

    The handler is installed at most once per logger: calling this function
    again with the same ``name`` returns the same logger without stacking a
    second handler (which would print every record twice).

    Parameters
    ----------
    name : str
        Logger name, passed straight to ``logging.getLogger``.

    Returns
    -------
    logger : logging.Logger

    """
    logger = logging.getLogger(name)

    # logging.getLogger returns the same object for the same name, so a
    # handler added by an earlier call is still attached. Detect it through
    # the marker attribute set below rather than by "any handler present",
    # so handlers installed by the application do not suppress ours.
    if any(getattr(h, '_camelot_handler', False) for h in logger.handlers):
        return logger

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    handler = logging.StreamHandler()
    # Only the handler threshold is set; the logger's own level is left
    # untouched for the caller to configure.
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    handler._camelot_handler = True

    logger.addHandler(handler)

    return logger


logger = setup_logging(__name__)


def translate(x1, x2):
"""Translates x2 by x1.
Expand Down Expand Up @@ -140,35 +107,6 @@ def scale(x, s):
return x


def rotate(x1, y1, x2, y2, angle):
    """Rotate the point (x2, y2) about the pivot (x1, y1) by ``angle``.

    The point is shifted so the pivot sits at the origin, multiplied by the
    2-D rotation matrix ``[[cos, -sin], [sin, cos]]``, then shifted back.
    All shifts go through the module's ``translate`` helper.

    Parameters
    ----------
    x1 : float
        x coordinate of the pivot.
    y1 : float
        y coordinate of the pivot.
    x2 : float
        x coordinate of the point to rotate.
    y2 : float
        y coordinate of the point to rotate.
    angle : float
        Angle in radians.

    Returns
    -------
    xnew : float
    ynew : float

    """
    sin_a = np.sin(angle)
    cos_a = np.cos(angle)

    # Express the point relative to the pivot.
    dx = translate(-x1, x2)
    dy = translate(-y1, y2)

    # Rotate about the origin.
    rx = cos_a * dx - sin_a * dy
    ry = sin_a * dx + cos_a * dy

    # Shift back to the pivot's frame.
    xnew = translate(x1, rx)
    ynew = translate(y1, ry)
    return xnew, ynew


def scale_pdf(k, factors):
"""Translates and scales pdf coordinate space to image
coordinate space.
Expand Down Expand Up @@ -345,33 +283,6 @@ def text_in_bbox(bbox, text):
return t_bbox


def remove_close_lines(ar, line_close_tol=2):
    """Drop values that lie within a tolerance of the previously kept value.

    Values are x or y axis projections of lines. Each value is compared with
    the last value that was *kept* (not the previous input value); it is
    dropped when ``np.isclose`` reports the two as equal within
    ``line_close_tol``.

    Parameters
    ----------
    ar : list
        Line projections, visited in the order given.
        NOTE(review): only neighbouring survivors are compared, so this
        presumably expects ``ar`` to be sorted - confirm against callers.
    line_close_tol : int, optional (default: 2)
        Absolute tolerance passed to ``np.isclose`` as ``atol`` (its default
        relative tolerance still applies on top).

    Returns
    -------
    ret : list
        The kept values, in their original order.

    """
    ret = []
    for a in ar:
        # The first value is always kept; later ones only when they are not
        # close to the most recent survivor.
        if ret and np.isclose(ret[-1], a, atol=line_close_tol):
            continue
        ret.append(a)
    return ret


def merge_close_lines(ar, line_close_tol=2):
"""Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections.
Expand Down Expand Up @@ -564,7 +475,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
text = t.get_text().strip('\n')
text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1])
logger.info("{} {} does not lie in column range {}".format(
warnings.warn("{} {} does not lie in column range {}".format(
text, text_range, col_range))
r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap))
Expand Down Expand Up @@ -648,27 +559,6 @@ def compute_whitespace(d):
return whitespace


def remove_empty(d):
    """Remove empty rows and columns from a two-dimensional list.

    A row is empty when every cell equals ``''``. A column is empty when no
    cell in it is truthy. Rows are filtered first, then columns.

    The input is not modified. The previous implementation popped rows from
    ``d`` while enumerating it, which skipped the row after every removed
    one (so consecutive empty rows survived) and mutated the caller's list.
    It also returned a lazy ``zip`` object on Python 3.

    Parameters
    ----------
    d : list
        Two-dimensional list (list of rows).

    Returns
    -------
    d : list
        New list of rows, each row a list, with empty rows and columns
        removed. Empty when nothing remains. As with ``zip``, ragged input
        is truncated to the shortest remaining row.

    """
    # Build a new list instead of popping in place, so no row is skipped.
    rows = [row for row in d if not all(cell == '' for cell in row)]
    # Transpose, keep columns with at least one truthy cell, transpose back.
    columns = [col for col in zip(*rows) if any(col)]
    return [list(row) for row in zip(*columns)]


def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True):
"""Returns a PDFMiner LTPage object and page dimension of a single
Expand Down Expand Up @@ -755,17 +645,14 @@ def get_text_objects(layout, ltype="char", t=None):

def merge_tuples(tuples):
"""Merges a list of overlapping tuples.
Parameters
Parameters
----------
tuples : list
List of tuples where a tuple is a single axis coordinate pair.
Yields
Yields
------
tuple
"""
"""
merged = list(tuples[0])
for s, e in tuples:
if s <= merged[1]:
Expand Down
Binary file added tests/files/blank.pdf
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/files/foo.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"a","b"
"1","2"
Binary file added tests/files/foo.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
Loading

0 comments on commit 69f6c25

Please sign in to comment.