diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index a18e39b540..c991f28e6a 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -1,9 +1,8 @@ -from typing import List, Optional, Dict, Any - import logging -import tempfile import subprocess +import tempfile from pathlib import Path +from typing import Any, Dict, List, Optional try: from pdf2image import convert_from_path @@ -16,7 +15,6 @@ from haystack.nodes.file_converter.image import ImageToTextConverter from haystack.schema import Document - logger = logging.getLogger(__name__) @@ -81,6 +79,8 @@ def convert( valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, ) -> List[Document]: """ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) @@ -106,6 +106,8 @@ def convert( attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param start_page: The 1-based page number at which the conversion starts. Defaults to the first page. + :param end_page: The 1-based page number (inclusive) at which the conversion ends. Defaults to the last page. 
""" if remove_numeric_tables is None: remove_numeric_tables = self.remove_numeric_tables @@ -116,7 +118,9 @@ def convert( keep_physical_layout = self.keep_physical_layout - pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding) + pages = self._read_pdf( + file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page + ) cleaned_pages = [] for page in pages: @@ -160,7 +164,14 @@ def convert( document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys) return [document] - def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = None) -> List[str]: + def _read_pdf( + self, + file_path: Path, + layout: bool, + encoding: Optional[str] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, + ) -> List[str]: """ Extract pages from the pdf file at file_path. @@ -169,15 +180,27 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non the content stream order. :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. (See list of available encodings by running `pdftotext -listenc` in the terminal) + :param start_page: The 1-based page number at which the conversion starts. Defaults to the first page. + :param end_page: The 1-based page number (inclusive) at which the conversion ends. Defaults to the last page. 
""" if not encoding: encoding = self.encoding - command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", str(file_path), "-"] + start_page = start_page or 1 + + command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)] + + if end_page is not None: + command.extend(["-l", str(end_page)]) + + command.extend([str(file_path), "-"]) + output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False) document = output.stdout.decode(errors="ignore") + document = "\f" * (start_page - 1) + document # prepend one form feed per skipped page so page numbering stays correct pages = document.split("\f") pages = pages[:-1] # the last page in the split is always empty. + return pages @@ -221,6 +244,8 @@ def convert( valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, ) -> List[Document]: """ Convert a file to a dictionary containing the text and any associated meta data. @@ -245,13 +270,17 @@ def convert( attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param start_page: The 1-based page number at which the conversion starts. Defaults to the first page. + :param end_page: The 1-based page number (inclusive) at which the conversion ends. Defaults to the last page. 
""" if id_hash_keys is None: id_hash_keys = self.id_hash_keys + start_page = start_page or 1 + pages = [] try: - images = convert_from_path(file_path) + images = convert_from_path(file_path, first_page=start_page, last_page=end_page) for image in images: temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg") image.save(temp_img.name) @@ -259,6 +288,6 @@ def convert( except Exception as exception: logger.error("File %s has an error:\n%s", file_path, exception) - raw_text = "\f".join(pages) + raw_text = "\f" * (start_page - 1) + "\f".join(pages) # prepend one form feed per skipped page so page numbering stays correct document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys) return [document] diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index c34ea1672f..e8fd95ab71 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -19,6 +19,7 @@ ParsrConverter, TextConverter, CsvTextConverter, + PreProcessor, ) from ..conftest import SAMPLES_PATH @@ -111,6 +112,33 @@ def test_pdf_ligatures(Converter): assert "ɪ" not in document.content +@pytest.mark.parametrize("Converter", [PDFToTextConverter]) +def test_page_range(Converter): + converter = Converter() + document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0] + pages = document.content.split("\f") + + assert ( + len(pages) == 4 + ) # the sample PDF file has four pages; the first one is skipped, but the total page count must be preserved + assert pages[0] == "" # page 1 was skipped. + assert pages[1] != "" # page 2 is not empty. + assert pages[2] == "" # page 3 is empty. 
+ + +@pytest.mark.parametrize("Converter", [PDFToTextConverter]) +def test_page_range_numbers(Converter): + converter = Converter() + document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0] + + preprocessor = PreProcessor( + split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True + ) + documents = preprocessor.process([document]) + + assert documents[1].meta["page"] == 4 + + @pytest.mark.tika @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) def test_table_removal(Converter):