Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MC-754][MC-838][MC-839][MC-840] PDF Reader plugin - Fix Extract Text action #1038

Merged
merged 9 commits into from
Dec 14, 2021
10 changes: 5 additions & 5 deletions plugins/pdf_reader/.CHECKSUM
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"spec": "032f61826d1ac192a09f96cd5edb3dda",
"manifest": "f0248c3c049406b59ed105c138c74f0a",
"setup": "3cafeb8492493912376dca6c2e3a18d8",
"spec": "a081c10928cf560b109b01f0c6e3e201",
"manifest": "afe7a3a3d929385fa0013ee9b605714b",
"setup": "b3ddc65f7ceb6a228693f9c434f82102",
"schemas": [
{
"identifier": "extract_text/schema.py",
"hash": "5643b10a9793d0fd402d7d5514937986"
"hash": "51cc611ba8d27bf904f9251f8512db10"
},
{
"identifier": "connection/schema.py",
"hash": "cb60c2b5b62fafb9634d667a8ad96277"
"hash": "da5382221ca2a33a2f854e17b068d502"
}
]
}
10 changes: 3 additions & 7 deletions plugins/pdf_reader/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
FROM komand/python-pypy3-plugin:2
# The three supported python parent images are:
# - komand/python-2-plugin
# - komand/python-3-plugin
# - komand/python-pypy3-plugin
#
# Update the tag to a full semver version
FROM rapid7/insightconnect-python-3-38-plugin:4

# Add any custom package dependencies here
# NOTE: Add pip packages to requirements.txt
Expand All @@ -22,4 +16,6 @@ RUN if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# Install plugin
RUN python setup.py build && python setup.py install

USER nobody

ENTRYPOINT ["/usr/local/bin/komand_pdf_reader"]
42 changes: 28 additions & 14 deletions plugins/pdf_reader/bin/komand_pdf_reader
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
#!/usr/bin/env python
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
from komand_pdf_reader import connection, actions, triggers
import os
import json
from sys import argv

Name = "PDF Reader"
Vendor = "rapid7"
Version = "1.0.2"
Description = "Tools for extracting text from a PDF"

Name = 'PDF Reader'
Vendor = 'rapid7'
Version = '1.0.1'
Description = 'Tools for extracting text from a PDF'


class ICONPdfReader(komand.Plugin):
def __init__(self):
super(self.__class__, self).__init__(
def main():
if 'http' in argv:
if os.environ.get("GUNICORN_CONFIG_FILE"):
with open(os.environ.get("GUNICORN_CONFIG_FILE")) as gf:
gunicorn_cfg = json.load(gf)
if gunicorn_cfg.get("worker_class", "sync") == "gevent":
from gevent import monkey
monkey.patch_all()
elif 'gevent' in argv:
from gevent import monkey
monkey.patch_all()

import insightconnect_plugin_runtime
from komand_pdf_reader import connection, actions, triggers

class ICONPdfReader(insightconnect_plugin_runtime.Plugin):
def __init__(self):
super(self.__class__, self).__init__(
name=Name,
vendor=Vendor,
version=Version,
description=Description,
connection=connection.Connection()
)
self.add_action(actions.ExtractText())
)
self.add_action(actions.ExtractText())


def main():
"""Run plugin"""
cli = komand.CLI(ICONPdfReader())
cli = insightconnect_plugin_runtime.CLI(ICONPdfReader())
cli.run()


Expand Down
29 changes: 21 additions & 8 deletions plugins/pdf_reader/help.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,41 +1,37 @@
import komand
from .schema import ExtractTextInput, ExtractTextOutput
import insightconnect_plugin_runtime
from .schema import ExtractTextInput, ExtractTextOutput, Input, Output, Component

# Custom imports below
import PyPDF2
from insightconnect_plugin_runtime.exceptions import PluginException
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError
import base64
import io


class ExtractText(komand.Action):
class ExtractText(insightconnect_plugin_runtime.Action):
def __init__(self):
super(self.__class__, self).__init__(
name="extract_text",
description="Extract text from PDF file",
description=Component.DESCRIPTION,
input=ExtractTextInput(),
output=ExtractTextOutput(),
)

def run(self, params={}):
pdf_text = ""
try:
if params.get("contents"):
pdfFile = base64.b64decode(params.get("contents"))
else:
raise Exception("File contents missing!")
except Exception as e:
self.logger.error("File contents missing: ", e)
raise
try:
with open("temp.pdf", "wb") as temp_pdf:
temp_pdf.write(pdfFile)
pdfReader = PyPDF2.PdfFileReader(open("temp.pdf", "rb"))
pdftext = ""
for page in range(pdfReader.numPages):
pageObj = pdfReader.getPage(page)
pdftext += pageObj.extractText().replace("\n", "")
except Exception as e:
self.logger.info("An error occurred while extracting text: ", e)
raise
return {"output": pdftext}

def test(self):
return {"output": "successful"}
with io.BytesIO(base64.b64decode(params.get(Input.CONTENTS))) as f:
pdf_file = pdfplumber.open(f)
try:
pages = pdf_file.pages
for page in enumerate(pages):
pdf_text += page[1].extract_text().replace("\n", " ")
finally:
pdf_file.close()
except PDFSyntaxError:
raise PluginException(
cause="The provided content is not from the PDF file.",
# Review thread (dsliwinski-r7): conversation marked as resolved.
assistance="Please check that the input is correct and try again.",
)
return {Output.OUTPUT: pdf_text}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
import insightconnect_plugin_runtime
import json


Expand All @@ -15,7 +15,7 @@ class Output:
OUTPUT = "output"


class ExtractTextInput(komand.Input):
class ExtractTextInput(insightconnect_plugin_runtime.Input):
schema = json.loads("""
{
"type": "object",
Expand All @@ -40,7 +40,7 @@ def __init__(self):
super(self.__class__, self).__init__(self.schema)


class ExtractTextOutput(komand.Output):
class ExtractTextOutput(insightconnect_plugin_runtime.Output):
schema = json.loads("""
{
"type": "object",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import komand
import insightconnect_plugin_runtime
from .schema import ConnectionSchema

# Custom imports below


class Connection(komand.Connection):
class Connection(insightconnect_plugin_runtime.Connection):
def __init__(self):
super(self.__class__, self).__init__(input=ConnectionSchema())

Expand Down
4 changes: 2 additions & 2 deletions plugins/pdf_reader/komand_pdf_reader/connection/schema.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# GENERATED BY KOMAND SDK - DO NOT EDIT
import komand
import insightconnect_plugin_runtime
import json


class Input:
pass

class ConnectionSchema(komand.Input):
class ConnectionSchema(insightconnect_plugin_runtime.Input):
schema = json.loads("""
{}
""")
Expand Down
5 changes: 3 additions & 2 deletions plugins/pdf_reader/plugin.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ products: [insightconnect]
name: pdf_reader
title: PDF Reader
description: Tools for extracting text from a PDF
version: 1.0.1
version: 1.0.2
vendor: rapid7
support: community
status: []
Expand All @@ -17,7 +17,7 @@ tags:
- Extract
hub_tags:
use_cases: [data_utility]
keywords: [PDF, Reader, Extract]
keywords: [pdf, reader, extraction]
features: []
actions:
extract_text:
Expand All @@ -29,6 +29,7 @@ actions:
description: PDF file to extract text from
type: bytes
required: true
example: 
# Review thread (blaxminarayan-r7): conversation marked as resolved.
output:
output:
title: PDF Text
Expand Down
3 changes: 2 additions & 1 deletion plugins/pdf_reader/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# List third-party dependencies here, separated by newlines.
# All dependencies must be version-pinned, eg. requests==1.2.0
# See: https://pip.pypa.io/en/stable/user_guide/#requirements-files
PyPDF2==1.26.0
pdfplumber==0.5.28
parameterized==0.8.1
14 changes: 7 additions & 7 deletions plugins/pdf_reader/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from setuptools import setup, find_packages


setup(name='pdf_reader-rapid7-plugin',
version='1.0.1',
description='Tools for extracting text from a PDF',
author='rapid7',
author_email='',
url='',
setup(name="pdf_reader-rapid7-plugin",
version="1.0.2",
description="Tools for extracting text from a PDF",
author="rapid7",
author_email="",
url="",
packages=find_packages(),
install_requires=['komand'], # Add third-party dependencies to requirements.txt, not here!
install_requires=['insightconnect-plugin-runtime'], # Add third-party dependencies to requirements.txt, not here!
scripts=['bin/komand_pdf_reader']
)
Empty file.
Loading