Skip to content

Commit

Permalink
Merge pull request #4 from camelot-dev/fix-strip-text-arg
Browse files Browse the repository at this point in the history
[MRG] Fixed strip_text argument getting ignored
  • Loading branch information
dimitern authored Jul 4, 2019
2 parents d5df936 + 240ea6c commit e81e818
Show file tree
Hide file tree
Showing 3 changed files with 2,587 additions and 284 deletions.
34 changes: 29 additions & 5 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import division

import re
import os
import sys
import random
Expand Down Expand Up @@ -405,6 +406,27 @@ def merge_close_lines(ar, line_tol=2):
return ret


def text_strip(text, strip=""):
    """Remove every occurrence of the characters in `strip` from `text`.

    Unlike ``str.strip``, characters are removed wherever they appear in
    `text`, not only at its ends.

    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.

    Returns
    -------
    stripped : str
        `text` with all characters listed in `strip` removed. `text` is
        returned unchanged when `strip` is empty.

    """
    if not strip:
        return text

    # Escape each character individually so that regex metacharacters
    # (closing bracket, caret, hyphen, backslash, ...) are matched literally
    # inside the character class.
    pattern = r"[{}]".format("".join(map(re.escape, strip)))
    # `flags` must be passed by keyword: the fourth positional parameter of
    # `re.sub` is `count`, so a positional `re.UNICODE` would cap the number
    # of replacements at the flag's integer value instead of setting a flag.
    return re.sub(pattern, "", text, flags=re.UNICODE)


# TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired by sklearn.pipeline.Pipeline)
Expand Down Expand Up @@ -456,10 +478,10 @@ def flag_font_size(textline, direction, strip_text=""):
fchars = [t[0] for t in chars]
if "".join(fchars).strip():
flist.append("".join(fchars))
fstring = "".join(flist).strip(strip_text)
fstring = "".join(flist)
else:
fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
return fstring
fstring = "".join([t.get_text() for t in textline])
return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=""):
Expand Down Expand Up @@ -574,7 +596,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
)
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
return grouped_chars


Expand Down Expand Up @@ -678,7 +702,7 @@ def get_table_index(
error,
)
else:
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error


def compute_accuracy(error_weights):
Expand Down
Loading

0 comments on commit e81e818

Please sign in to comment.