Skip to content

Commit

Permalink
[MC-754][MC-838][MC-839][MC-840] PDF Reader plugin - Fix Extract Text…
Browse files Browse the repository at this point in the history
… action (#1038)

* [MC-754] Fix Extract Text action

* [MC-754] Update plugin spec

* [MC-754][MC-838][MC-839][MC-840] Update Dockerfile and Extract Text action

* [MC-754] Update Extract Text action

* [MC-754][MC-838][MC-839][MC-840] Update error messaging in Extract Text action

* Update plugins/pdf_reader/help.md

Co-authored-by: Max Berezin <[email protected]>
Co-authored-by: Mike Rinehart <[email protected]>
  • Loading branch information
3 people authored Dec 14, 2021
1 parent 4af4d59 commit 3bfddc6
Show file tree
Hide file tree
Showing 13 changed files with 149 additions and 78 deletions.
10 changes: 5 additions & 5 deletions plugins/pdf_reader/.CHECKSUM
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"spec": "032f61826d1ac192a09f96cd5edb3dda",
"manifest": "f0248c3c049406b59ed105c138c74f0a",
"setup": "3cafeb8492493912376dca6c2e3a18d8",
"spec": "a081c10928cf560b109b01f0c6e3e201",
"manifest": "afe7a3a3d929385fa0013ee9b605714b",
"setup": "b3ddc65f7ceb6a228693f9c434f82102",
"schemas": [
{
"identifier": "extract_text/schema.py",
"hash": "5643b10a9793d0fd402d7d5514937986"
"hash": "51cc611ba8d27bf904f9251f8512db10"
},
{
"identifier": "connection/schema.py",
"hash": "cb60c2b5b62fafb9634d667a8ad96277"
"hash": "da5382221ca2a33a2f854e17b068d502"
}
]
}
10 changes: 3 additions & 7 deletions plugins/pdf_reader/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
FROM komand/python-pypy3-plugin:2
# The three supported python parent images are:
# - komand/python-2-plugin
# - komand/python-3-plugin
# - komand/python-pypy3-plugin
#
# Update the tag to a full semver version
FROM rapid7/insightconnect-python-3-38-plugin:4

# Add any custom package dependencies here
# NOTE: Add pip packages to requirements.txt
Expand All @@ -22,4 +16,6 @@ RUN if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# Install plugin
RUN python setup.py build && python setup.py install

USER nobody

ENTRYPOINT ["/usr/local/bin/komand_pdf_reader"]
42 changes: 28 additions & 14 deletions plugins/pdf_reader/bin/komand_pdf_reader
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
#!/usr/bin/env python
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
from komand_pdf_reader import connection, actions, triggers
import os
import json
from sys import argv

Name = "PDF Reader"
Vendor = "rapid7"
Version = "1.0.2"
Description = "Tools for extracting text from a PDF"

Name = 'PDF Reader'
Vendor = 'rapid7'
Version = '1.0.1'
Description = 'Tools for extracting text from a PDF'


class ICONPdfReader(komand.Plugin):
def __init__(self):
super(self.__class__, self).__init__(
def main():
if 'http' in argv:
if os.environ.get("GUNICORN_CONFIG_FILE"):
with open(os.environ.get("GUNICORN_CONFIG_FILE")) as gf:
gunicorn_cfg = json.load(gf)
if gunicorn_cfg.get("worker_class", "sync") == "gevent":
from gevent import monkey
monkey.patch_all()
elif 'gevent' in argv:
from gevent import monkey
monkey.patch_all()

import insightconnect_plugin_runtime
from komand_pdf_reader import connection, actions, triggers

class ICONPdfReader(insightconnect_plugin_runtime.Plugin):
def __init__(self):
super(self.__class__, self).__init__(
name=Name,
vendor=Vendor,
version=Version,
description=Description,
connection=connection.Connection()
)
self.add_action(actions.ExtractText())
)
self.add_action(actions.ExtractText())


def main():
"""Run plugin"""
cli = komand.CLI(ICONPdfReader())
cli = insightconnect_plugin_runtime.CLI(ICONPdfReader())
cli.run()


Expand Down
29 changes: 21 additions & 8 deletions plugins/pdf_reader/help.md

Large diffs are not rendered by default.

50 changes: 23 additions & 27 deletions plugins/pdf_reader/komand_pdf_reader/actions/extract_text/action.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,37 @@
import komand
from .schema import ExtractTextInput, ExtractTextOutput
import insightconnect_plugin_runtime
from .schema import ExtractTextInput, ExtractTextOutput, Input, Output, Component

# Custom imports below
import PyPDF2
from insightconnect_plugin_runtime.exceptions import PluginException
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError
import base64
import io


class ExtractText(insightconnect_plugin_runtime.Action):
    """Action that extracts the text content of a base64-encoded PDF."""

    def __init__(self):
        super(self.__class__, self).__init__(
            name="extract_text",
            description=Component.DESCRIPTION,
            input=ExtractTextInput(),
            output=ExtractTextOutput(),
        )

    def run(self, params={}):
        """Return the concatenated text of every page of the supplied PDF.

        Args:
            params: Action input. ``params[Input.CONTENTS]`` holds the
                base64-encoded PDF bytes. The default dict is only read,
                never mutated.

        Returns:
            A dict mapping ``Output.OUTPUT`` to the text of all pages, with
            newlines inside each page replaced by single spaces. Pages are
            joined without a separator.

        Raises:
            PluginException: If the decoded content cannot be parsed as a PDF.
        """
        try:
            # Parse from memory so nothing is written to the container's disk.
            with io.BytesIO(base64.b64decode(params.get(Input.CONTENTS))) as f:
                pdf_file = pdfplumber.open(f)
                try:
                    # extract_text() yields None for a page that has no
                    # extractable characters (e.g. blank or image-only
                    # pages); treat that as empty text instead of failing
                    # with AttributeError on None.replace().
                    page_texts = [page.extract_text() or "" for page in pdf_file.pages]
                finally:
                    pdf_file.close()
        except PDFSyntaxError:
            raise PluginException(
                cause="The provided content is not in PDF file format.",
                assistance="Please check that the input is correct and try again.",
            )
        pdf_text = "".join(text.replace("\n", " ") for text in page_texts)
        return {Output.OUTPUT: pdf_text}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
import insightconnect_plugin_runtime
import json


Expand All @@ -15,7 +15,7 @@ class Output:
OUTPUT = "output"


class ExtractTextInput(komand.Input):
class ExtractTextInput(insightconnect_plugin_runtime.Input):
schema = json.loads("""
{
"type": "object",
Expand All @@ -40,7 +40,7 @@ def __init__(self):
super(self.__class__, self).__init__(self.schema)


class ExtractTextOutput(komand.Output):
class ExtractTextOutput(insightconnect_plugin_runtime.Output):
schema = json.loads("""
{
"type": "object",
Expand Down
4 changes: 2 additions & 2 deletions plugins/pdf_reader/komand_pdf_reader/connection/connection.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import komand
import insightconnect_plugin_runtime
from .schema import ConnectionSchema

# Custom imports below


class Connection(komand.Connection):
class Connection(insightconnect_plugin_runtime.Connection):
def __init__(self):
super(self.__class__, self).__init__(input=ConnectionSchema())

Expand Down
4 changes: 2 additions & 2 deletions plugins/pdf_reader/komand_pdf_reader/connection/schema.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
import insightconnect_plugin_runtime
import json


class Input:
pass

class ConnectionSchema(komand.Input):
class ConnectionSchema(insightconnect_plugin_runtime.Input):
schema = json.loads("""
{}
""")
Expand Down
5 changes: 3 additions & 2 deletions plugins/pdf_reader/plugin.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ products: [insightconnect]
name: pdf_reader
title: PDF Reader
description: Tools for extracting text from a PDF
version: 1.0.1
version: 1.0.2
vendor: rapid7
support: community
status: []
Expand All @@ -17,7 +17,7 @@ tags:
- Extract
hub_tags:
use_cases: [data_utility]
keywords: [PDF, Reader, Extract]
keywords: [pdf, reader, extraction]
features: []
actions:
extract_text:
Expand All @@ -29,6 +29,7 @@ actions:
description: PDF file to extract text from
type: bytes
required: true
example: 
output:
output:
title: PDF Text
Expand Down
3 changes: 2 additions & 1 deletion plugins/pdf_reader/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# List third-party dependencies here, separated by newlines.
# All dependencies must be version-pinned, eg. requests==1.2.0
# See: https://pip.pypa.io/en/stable/user_guide/#requirements-files
PyPDF2==1.26.0
pdfplumber==0.5.28
parameterized==0.8.1
14 changes: 7 additions & 7 deletions plugins/pdf_reader/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from setuptools import setup, find_packages


setup(name='pdf_reader-rapid7-plugin',
version='1.0.1',
description='Tools for extracting text from a PDF',
author='rapid7',
author_email='',
url='',
setup(name="pdf_reader-rapid7-plugin",
version="1.0.2",
description="Tools for extracting text from a PDF",
author="rapid7",
author_email="",
url="",
packages=find_packages(),
install_requires=['komand'], # Add third-party dependencies to requirements.txt, not here!
install_requires=['insightconnect-plugin-runtime'], # Add third-party dependencies to requirements.txt, not here!
scripts=['bin/komand_pdf_reader']
)
Empty file.
Loading

0 comments on commit 3bfddc6

Please sign in to comment.