Skip to content

Commit

Permalink
[loaders] Use pdfminer high_level function to load the file
Browse files (browse the repository at this point in the history)
  • Loading branch information
jean-garret authored and jstockwin committed Apr 17, 2020
1 parent 7e47e67 commit c12de8d
Showing 1 changed file with 6 additions and 27 deletions.
33 changes: 6 additions & 27 deletions py_pdf_parser/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

from pdfminer import converter, pdfdocument, pdfinterp, pdfpage, pdfparser
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LAParams

from .components import PDFDocument
Expand Down Expand Up @@ -65,36 +65,15 @@ def load(
Returns:
PDFDocument: A PDFDocument with the file loaded.
Raises:
pdfminer.pdfpage.PDFTextExtractionNotAllowed: If the document does not allow
text extraction.
"""
if la_params is None:
la_params = {}

parser = pdfparser.PDFParser(pdf_file)
document = pdfdocument.PDFDocument(parser)

if not document.is_extractable:
raise pdfpage.PDFTextExtractionNotAllowed

resource_manager = pdfinterp.PDFResourceManager()
device = converter.PDFPageAggregator(
resource_manager, laparams=LAParams(**la_params)
)
interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

pages: Dict[int, Page] = {}
for page in pdfpage.PDFPage.create_pages(document):
interpreter.process_page(page)
results = device.get_result()

page_number = results.pageid

elements = [
element for element in results if isinstance(element, LTTextContainer)
]
for page_number, page in enumerate(
extract_pages(pdf_file, laparams=LAParams(**la_params)), 1
):
elements = [element for element in page if isinstance(element, LTTextContainer)]

if not elements:
logger.warning(
Expand All @@ -103,7 +82,7 @@ def load(
continue

pages[page_number] = Page(
width=results.width, height=results.height, elements=elements
width=page.width, height=page.height, elements=elements
)

return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)

0 comments on commit c12de8d

Please sign in to comment.