deepset-ai · sjrl · Jan 30, 2023 · Jan 26, 2023 · Jan 26, 2023 · Jan 26, 2023
@@ -1,9 +1,8 @@
-from typing import List, Optional, Dict, Any
-
 import logging
-import tempfile
 import subprocess
+import tempfile
 from pathlib import Path
+from typing import Any, Dict, List, Optional
 
 try:
     from pdf2image import convert_from_path
@@ -16,7 +15,6 @@
 from haystack.nodes.file_converter.image import ImageToTextConverter
 from haystack.schema import Document
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -81,6 +79,8 @@ def convert(
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
         id_hash_keys: Optional[List[str]] = None,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
     ) -> List[Document]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -106,6 +106,8 @@ def convert(
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param start_page: The 1-based page number where the conversion starts. Defaults to the first page.
+        :param end_page: The page number where the conversion ends (inclusive). Defaults to the last page.
         """
         if remove_numeric_tables is None:
             remove_numeric_tables = self.remove_numeric_tables
@@ -116,7 +118,9 @@ def convert(
 
         keep_physical_layout = self.keep_physical_layout
 
-        pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)
+        pages = self._read_pdf(
+            file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page
+        )
 
         cleaned_pages = []
         for page in pages:
@@ -160,7 +164,14 @@ def convert(
         document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
         return [document]
 
-    def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = None) -> List[str]:
+    def _read_pdf(
+        self,
+        file_path: Path,
+        layout: bool,
+        encoding: Optional[str] = None,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
+    ) -> List[str]:
         """
         Extract pages from the pdf file at file_path.
 
@@ -169,15 +180,27 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non
                        the content stream order.
         :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                          (See list of available encodings by running `pdftotext -listenc` in the terminal)
+        :param start_page: The 1-based page number where the conversion starts. Defaults to the first page.
+        :param end_page: The page number where the conversion ends (inclusive). Defaults to the last page.
         """
         if not encoding:
             encoding = self.encoding
 
-        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", str(file_path), "-"]
+        start_page = start_page or 1
+
+        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]
+
+        if end_page is not None:
+            command.extend(["-l", str(end_page)])
+
+        command.extend([str(file_path), "-"])
+
         output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
         document = output.stdout.decode(errors="ignore")
+        document = "\f" * (start_page - 1) + document  # tracking skipped pages for correct page numbering
         pages = document.split("\f")
         pages = pages[:-1]  # the last page in the split is always empty.
+
         return pages
 
 
@@ -221,6 +244,8 @@ def convert(
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
         id_hash_keys: Optional[List[str]] = None,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
     ) -> List[Document]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.
@@ -245,20 +270,24 @@ def convert(
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param start_page: The 1-based page number where the conversion starts. Defaults to the first page.
+        :param end_page: The page number where the conversion ends (inclusive). Defaults to the last page.
         """
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
+        start_page = start_page or 1
+
         pages = []
         try:
-            images = convert_from_path(file_path)
+            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
             for image in images:
                 temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
                 image.save(temp_img.name)
                 pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
         except Exception as exception:
             logger.error("File %s has an error:\n%s", file_path, exception)
 
-        raw_text = "\f".join(pages)
+        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
         document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
         return [document]
@@ -19,6 +19,7 @@
     ParsrConverter,
     TextConverter,
     CsvTextConverter,
+    PreProcessor,
 )
 
 from ..conftest import SAMPLES_PATH
@@ -111,6 +112,33 @@ def test_pdf_ligatures(Converter):
     assert "ɪ" not in document.content
 
 
+@pytest.mark.parametrize("Converter", [PDFToTextConverter, PDFToTextOCRConverter])
+def test_page_range(Converter):
+    converter = Converter()
+    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
+    pages = document.content.split("\f")
+
+    assert (
+        len(pages) == 4
+    )  # the sample PDF has four pages; the first one was skipped, but the page count must still be four
+    assert pages[0] == ""  # page 1 was skipped, so its slot is empty.
+    assert pages[1] != ""  # page 2 is not empty.
+    assert pages[2] == ""  # page 3 is empty.
+
+
+@pytest.mark.parametrize("Converter", [PDFToTextConverter, PDFToTextOCRConverter])
+def test_page_range_numbers(Converter):
+    converter = Converter()
+    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
+
+    preprocessor = PreProcessor(
+        split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
+    )
+    documents = preprocessor.process([document])
+
+    assert documents[1].meta["page"] == 4
+
+
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter):