-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MC-754][MC-838][MC-839][MC-840] PDF Reader plugin - Fix Extract Text action (#1038)
* [MC-754] Fix Extract Text action
* [MC-754] Update plugin spec
* [MC-754][MC-838][MC-839][MC-840] Update Dockerfile and Extract Text action
* [MC-754] Update Extract Text action
* [MC-754][MC-838][MC-839][MC-840] Update error messaging in Extract Text action
* Update plugins/pdf_reader/help.md
Co-authored-by: Max Berezin <[email protected]>
Co-authored-by: Mike Rinehart <[email protected]>
- Loading branch information
1 parent
4af4d59
commit 3bfddc6
Showing 13 changed files with 149 additions and 78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,15 @@ | ||
{ | ||
"spec": "032f61826d1ac192a09f96cd5edb3dda", | ||
"manifest": "f0248c3c049406b59ed105c138c74f0a", | ||
"setup": "3cafeb8492493912376dca6c2e3a18d8", | ||
"spec": "a081c10928cf560b109b01f0c6e3e201", | ||
"manifest": "afe7a3a3d929385fa0013ee9b605714b", | ||
"setup": "b3ddc65f7ceb6a228693f9c434f82102", | ||
"schemas": [ | ||
{ | ||
"identifier": "extract_text/schema.py", | ||
"hash": "5643b10a9793d0fd402d7d5514937986" | ||
"hash": "51cc611ba8d27bf904f9251f8512db10" | ||
}, | ||
{ | ||
"identifier": "connection/schema.py", | ||
"hash": "cb60c2b5b62fafb9634d667a8ad96277" | ||
"hash": "da5382221ca2a33a2f854e17b068d502" | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
50 changes: 23 additions & 27 deletions
50
plugins/pdf_reader/komand_pdf_reader/actions/extract_text/action.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,37 @@ | ||
import komand | ||
from .schema import ExtractTextInput, ExtractTextOutput | ||
import insightconnect_plugin_runtime | ||
from .schema import ExtractTextInput, ExtractTextOutput, Input, Output, Component | ||
|
||
# Custom imports below | ||
import PyPDF2 | ||
from insightconnect_plugin_runtime.exceptions import PluginException | ||
import pdfplumber | ||
from pdfminer.pdfparser import PDFSyntaxError | ||
import base64 | ||
import io | ||
|
||
|
||
class ExtractText(komand.Action): | ||
class ExtractText(insightconnect_plugin_runtime.Action): | ||
def __init__(self): | ||
super(self.__class__, self).__init__( | ||
name="extract_text", | ||
description="Extract text from PDF file", | ||
description=Component.DESCRIPTION, | ||
input=ExtractTextInput(), | ||
output=ExtractTextOutput(), | ||
) | ||
|
||
def run(self, params={}): | ||
pdf_text = "" | ||
try: | ||
if params.get("contents"): | ||
pdfFile = base64.b64decode(params.get("contents")) | ||
else: | ||
raise Exception("File contents missing!") | ||
except Exception as e: | ||
self.logger.error("File contents missing: ", e) | ||
raise | ||
try: | ||
with open("temp.pdf", "wb") as temp_pdf: | ||
temp_pdf.write(pdfFile) | ||
pdfReader = PyPDF2.PdfFileReader(open("temp.pdf", "rb")) | ||
pdftext = "" | ||
for page in range(pdfReader.numPages): | ||
pageObj = pdfReader.getPage(page) | ||
pdftext += pageObj.extractText().replace("\n", "") | ||
except Exception as e: | ||
self.logger.info("An error occurred while extracting text: ", e) | ||
raise | ||
return {"output": pdftext} | ||
|
||
def test(self): | ||
return {"output": "successful"} | ||
with io.BytesIO(base64.b64decode(params.get(Input.CONTENTS))) as f: | ||
pdf_file = pdfplumber.open(f) | ||
try: | ||
pages = pdf_file.pages | ||
for page in enumerate(pages): | ||
pdf_text += page[1].extract_text().replace("\n", " ") | ||
finally: | ||
pdf_file.close() | ||
except PDFSyntaxError: | ||
raise PluginException( | ||
cause="The provided content is not in PDF file format.", | ||
assistance="Please check that the input is correct and try again.", | ||
) | ||
return {Output.OUTPUT: pdf_text} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 2 additions & 2 deletions
4
plugins/pdf_reader/komand_pdf_reader/connection/connection.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
# List third-party dependencies here, separated by newlines. | ||
# All dependencies must be version-pinned, eg. requests==1.2.0 | ||
# See: https://pip.pypa.io/en/stable/user_guide/#requirements-files | ||
PyPDF2==1.26.0 | ||
pdfplumber==0.5.28 | ||
parameterized==0.8.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Oops, something went wrong.