Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add simple wrapper to extract text from pdf #330

Merged
merged 12 commits into from
Nov 7, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
[#307](https://github.com/pdfminer/pdfminer.six/pull/307))

### Added
- Simple wrapper to easily extract text from a PDF file ([#330](https://github.com/pdfminer/pdfminer.six/pull/330))
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))

### Fixed
Expand Down
45 changes: 45 additions & 0 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
import six
import sys

# Conditional import: on Python 2, io.StringIO accepts only unicode text, so BytesIO is used as the in-memory buffer instead
if sys.version_info > (3, 0):
from io import StringIO
else:
from io import BytesIO as StringIO

from .pdfdocument import PDFDocument
from .pdfparser import PDFParser
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
Expand All @@ -15,6 +21,7 @@
from .converter import XMLConverter, HTMLConverter, TextConverter
from .cmapdb import CMapDB
from .image import ImageWriter
from .layout import LAParams


def extract_text_to_fp(inf, outfp,
Expand Down Expand Up @@ -88,3 +95,41 @@ def extract_text_to_fp(inf, outfp,
interpreter.process_page(page)

device.close()


def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """
    Parse a PDF file and return the text it contains.

    All arguments except ``pdf_file`` are optional.

    :param pdf_file: Path to the PDF file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse.
    :param caching: If resources should be cached. Forwarded to both the
        resource manager and the page iterator.
    :param codec: Text decoding codec.
    :param laparams: LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: A string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp, StringIO() as output_string:
        # Honour the caller's `caching` choice in the resource manager too;
        # previously it was only passed to PDFPage.get_pages, so
        # caching=False still left the manager at its default setting.
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
            check_extractable=True,
        ):
            interpreter.process_page(page)

        # Close the device once all pages are processed, as
        # extract_text_to_fp does.
        device.close()

        # Read the buffer before the `with` block closes it.
        return output_string.getvalue()

38 changes: 38 additions & 0 deletions tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import unittest

from helpers import absolute_sample_path
from pdfminer.high_level import extract_text


def run(sample_path):
    """Return the text extract_text() produces for the named sample PDF."""
    return extract_text(absolute_sample_path(sample_path))


# Expected extract_text() output, keyed by sample PDF file name.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nWorld\n\nHello \n\n"
        "H e l l o \n\nH e l l o \n\n"
        "W o r l d\n\nW o r l d\n\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
}


class TestExtractText(unittest.TestCase):
    """Compare extract_text() output for each sample PDF with its expected text."""

    def _assert_extracted_text(self, test_file):
        # Extract first, then look up the expected string for the same file.
        extracted = run(test_file)
        self.assertEqual(extracted, test_strings[test_file])

    def test_simple1(self):
        self._assert_extracted_text("simple1.pdf")

    def test_simple2(self):
        self._assert_extracted_text("simple2.pdf")

    def test_simple3(self):
        self._assert_extracted_text("simple3.pdf")


# Run the test cases when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()