Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add page range support to PDF converters. #3965

Merged
merged 14 commits into from
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions haystack/nodes/file_converter/pdf.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import List, Optional, Dict, Any

import logging
import tempfile
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

try:
from pdf2image import convert_from_path
Expand All @@ -16,7 +15,6 @@
from haystack.nodes.file_converter.image import ImageToTextConverter
from haystack.schema import Document


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -81,6 +79,8 @@ def convert(
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
start_page: Optional[int] = None,
end_page: Optional[int] = None,
danielbichuetti marked this conversation as resolved.
Show resolved Hide resolved
) -> List[Document]:
"""
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
Expand All @@ -106,6 +106,8 @@ def convert(
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param start_page: The 1-based page number at which to start the conversion. Defaults to the first page.
:param end_page: The page number at which to end the conversion (inclusive). Defaults to the last page.
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
Expand All @@ -116,7 +118,9 @@ def convert(

keep_physical_layout = self.keep_physical_layout

pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)
pages = self._read_pdf(
file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page
)

cleaned_pages = []
for page in pages:
Expand Down Expand Up @@ -160,7 +164,14 @@ def convert(
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
return [document]

def _read_pdf(
    self,
    file_path: Path,
    layout: bool,
    encoding: Optional[str] = None,
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
) -> List[str]:
    """
    Extract pages from the pdf file at file_path by running the `pdftotext` command line tool.

    :param file_path: Path of the pdf file.
    :param layout: If True, `pdftotext` is called with `-layout`, otherwise with `-raw`
                   (pages are then read in the content stream order).
    :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                     (See list of available encodings by running `pdftotext -listenc` in the terminal)
    :param start_page: The 1-based page number at which to start the conversion, passed as `-f` to `pdftotext`.
                       `None` (or 0) means the first page.
    :param end_page: The page number at which to end the conversion, passed as `-l` to `pdftotext`.
                     `None` means no `-l` option is passed.
    :return: One string per page. Every page skipped before `start_page` is represented by an empty
             string, so the list index of a page still matches its page number in the file.
    """
    if not encoding:
        encoding = self.encoding

    start_page = start_page or 1

    command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]
    if end_page is not None:
        command.extend(["-l", str(end_page)])
    # The input file and "-" (write to stdout) have to come after all options.
    command.extend([str(file_path), "-"])

    output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
    document = output.stdout.decode(errors="ignore")
    # Pad with one form feed per skipped page so that later page numbering stays correct.
    document = "\f" * (start_page - 1) + document
    pages = document.split("\f")
    # The text normally ends with a form feed, which leaves one empty trailing element.
    # Only drop it when it really is empty, so a final page without a closing form feed is kept.
    if pages[-1] == "":
        pages.pop()

    return pages


Expand Down Expand Up @@ -221,6 +244,8 @@ def convert(
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
start_page: Optional[int] = None,
end_page: Optional[int] = None,
) -> List[Document]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
Expand All @@ -245,20 +270,24 @@ def convert(
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param start_page: The 1-based page number at which to start the conversion. Defaults to the first page.
:param end_page: The page number at which to end the conversion (inclusive). Defaults to the last page.
"""
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys

start_page = start_page or 1

pages = []
try:
images = convert_from_path(file_path)
images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
for image in images:
temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
image.save(temp_img.name)
pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
except Exception as exception:
logger.error("File %s has an error:\n%s", file_path, exception)

raw_text = "\f".join(pages)
raw_text = "\f" * (start_page - 1) + "\f".join(pages) # tracking skipped pages for correct page numbering
document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
return [document]
28 changes: 28 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
ParsrConverter,
TextConverter,
CsvTextConverter,
PreProcessor,
)

from ..conftest import SAMPLES_PATH
Expand Down Expand Up @@ -111,6 +112,33 @@ def test_pdf_ligatures(Converter):
assert "ɪ" not in document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter, PDFToTextOCRConverter])
def test_page_range(Converter):
    """Converting from page 2 onward must still yield one entry per page of the sample PDF."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    converted = Converter().convert(file_path=pdf_path, start_page=2)
    pages = converted[0].content.split("\f")

    # The sample PDF has four pages; the skipped first page must still be counted.
    assert len(pages) == 4
    assert pages[0] == ""  # page 1 was skipped
    assert pages[1] != ""  # page 2 has content
    assert pages[2] == ""  # page 3 is empty


@pytest.mark.parametrize("Converter", [PDFToTextConverter, PDFToTextOCRConverter])
def test_page_range_numbers(Converter):
    """Page numbers added by the PreProcessor must account for pages skipped via start_page."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    converted = Converter().convert(file_path=pdf_path, start_page=2)[0]

    splitter = PreProcessor(
        split_by="word",
        split_length=5,
        split_overlap=0,
        split_respect_sentence_boundary=False,
        add_page_number=True,
    )
    chunks = splitter.process([converted])

    # The second chunk is expected to be attributed to page 4 of the original file.
    assert chunks[1].meta["page"] == 4


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
Expand Down