From 7fbdcecaa50df9fac83323ad11a92d14661518fa Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 26 Feb 2023 00:35:04 -0500 Subject: [PATCH 1/2] modified extract_page_tokens() in cli.pawls.preprocessors.tesseract to properly handle parsing empty pdfs (no tokens at all in the entire pdf). --- cli/pawls/preprocessors/tesseract.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cli/pawls/preprocessors/tesseract.py b/cli/pawls/preprocessors/tesseract.py index d0bd49b..7c48907 100644 --- a/cli/pawls/preprocessors/tesseract.py +++ b/cli/pawls/preprocessors/tesseract.py @@ -28,10 +28,17 @@ def extract_page_tokens( res = pd.read_csv( io.StringIO(_data), quoting=csv.QUOTE_NONE, encoding="utf-8", sep="\t" ) - # An implementation adopted from https://github.com/Layout-Parser/layout-parser/blob/20de8e7adb0a7d7740aed23484fa8b943126f881/src/layoutparser/ocr.py#L475 + + # An implementation adopted from https://github.com/Layout-Parser/layout-parser/blob + # /20de8e7adb0a7d7740aed23484fa8b943126f881/src/layoutparser/ocr.py#L475 + res_without_na_text_rows = res[~res.text.isna()] + + if res_without_na_text_rows.empty: + return [] + tokens = ( res[~res.text.isna()] - .groupby(["page_num", "block_num", "par_num", "line_num", "word_num"]) + .groupby(["page_num", "block_num", "par_num", "line_num", "word_num"], group_keys=False) .apply( lambda gp: pd.Series( [ From cc262fe5779857abcb6558b50b34aa3bcf049058 Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 26 Feb 2023 00:56:13 -0500 Subject: [PATCH 2/2] Accidentally created two copies of scrubbed df. Cleaned up. --- cli/pawls/preprocessors/tesseract.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cli/pawls/preprocessors/tesseract.py b/cli/pawls/preprocessors/tesseract.py index d0bd49b..28f1a9d 100644 --- a/cli/pawls/preprocessors/tesseract.py +++ b/cli/pawls/preprocessors/tesseract.py @@ -28,10 +28,17 @@ def extract_page_tokens( res = pd.read_csv( io.StringIO(_data), quoting=csv.QUOTE_NONE, encoding="utf-8", sep="\t" ) - # An implementation adopted from https://github.com/Layout-Parser/layout-parser/blob/20de8e7adb0a7d7740aed23484fa8b943126f881/src/layoutparser/ocr.py#L475 + + # An implementation adopted from https://github.com/Layout-Parser/layout-parser/blob + # /20de8e7adb0a7d7740aed23484fa8b943126f881/src/layoutparser/ocr.py#L475 + res_without_na_text_rows = res[~res.text.isna()] + + if res_without_na_text_rows.empty: + return [] + tokens = ( - res[~res.text.isna()] - .groupby(["page_num", "block_num", "par_num", "line_num", "word_num"]) + res_without_na_text_rows + .groupby(["page_num", "block_num", "par_num", "line_num", "word_num"], group_keys=False) .apply( lambda gp: pd.Series( [