pdfminer · pietermarsman · Feb 1, 2022 · Oct 13, 2021 · Oct 13, 2021 · Oct 13, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
 - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
+- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
 
 ### Fixed
 - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))

diff --git a/pdfminer/data_structures.py b/pdfminer/data_structures.py
@@ -0,0 +1,53 @@
+import functools
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from pdfminer import settings
+from pdfminer.pdfparser import PDFSyntaxError
+from pdfminer.pdftypes import list_value, int_value, dict_value
+from pdfminer.utils import choplist
+
+
+class NumberTree:
+    """A PDF number tree: integer keys mapped to arbitrary PDF objects.
+
+    See Section 3.8.6 of the PDF Reference.
+    """
+    def __init__(self, obj: Any):
+        self._obj = dict_value(obj)
+        self.nums: Optional[Iterable[Any]] = None
+        self.kids: Optional[Iterable[Any]] = None
+        self.limits: Optional[Iterable[Any]] = None
+
+        if 'Nums' in self._obj:
+            self.nums = list_value(self._obj['Nums'])
+        if 'Kids' in self._obj:
+            self.kids = list_value(self._obj['Kids'])
+        if 'Limits' in self._obj:
+            self.limits = list_value(self._obj['Limits'])
+
+    def _parse(self) -> List[Tuple[int, Any]]:
+        """Collect the (key, value) pairs of this node and all descendants."""
+        entries: List[Tuple[int, Any]] = []
+        if self.nums:  # Leaf node: flat [key1, value1, key2, value2, ...]
+            for key, value in choplist(2, self.nums):
+                entries.append((int_value(key), value))
+
+        if self.kids:  # Root or intermediate node
+            for child_ref in self.kids:
+                entries += NumberTree(child_ref)._parse()
+
+        return entries
+
+    # Memoised per instance; lru_cache would pin every tree in a global cache.
+    @functools.cached_property
+    def values(self) -> List[Tuple[int, Any]]:
+        """Return all entries sorted by key (STRICT: raise if unsorted)."""
+        values = self._parse()
+
+        if settings.STRICT:
+            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
+                raise PDFSyntaxError('Number tree elements are out of order')
+        else:
+            values.sort(key=lambda t: t[0])
+
+        return values
diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 import struct
@@ -10,12 +11,14 @@
 
 from . import settings
 from .arcfour import Arcfour
+from .data_structures import NumberTree
 from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
 from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
     PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
     uint_value, dict_value, stream_value
 from .psparser import PSEOF, literal_name, LIT, KWD
-from .utils import choplist, nunpack, decode_text
+from .utils import choplist, decode_text, nunpack, format_int_roman, \
+    format_int_alpha
 
 log = logging.getLogger(__name__)
 
@@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
     pass
 
 
+class PDFNoPageLabels(PDFException):
+    """Raised by get_page_labels when no page labels can be read."""
+
+
 class PDFDestinationNotFound(PDFException):
     pass
 
@@ -890,6 +897,24 @@ def search(entry: object, level: int
             return
         return search(self.catalog['Outlines'], 0)
 
+    def get_page_labels(self) -> Iterator[str]:
+        """
+        Return an iterator of page label strings, one per page.
+
+        Raises PDFNoPageLabels if reading /PageLabels from the catalog
+        fails with KeyError or PDFTypeError.
+
+        The iterator is unbounded: callers must stop consuming it themselves.
+        """
+        catalog = self.catalog
+        assert catalog is not None
+        try:
+            label_tree = PageLabels(catalog['PageLabels'])
+        except (PDFTypeError, KeyError):
+            raise PDFNoPageLabels
+
+        return label_tree.labels
+
     def lookup_name(
         self,
         cat: str,
@@ -989,3 +1014,61 @@ def read_xref_from(
             pos = int_value(trailer['Prev'])
             self.read_xref_from(parser, pos, xrefs)
         return
+
+
+class PageLabels(NumberTree):
+    """PageLabels from the document catalog.
+
+    See Section 8.3.1 in the PDF Reference.
+    """
+
+    @property
+    def labels(self) -> Iterator[str]:
+        """Yield page labels in page order; the last range is unbounded."""
+        ranges = self.values
+
+        # The tree must begin with page index 0
+        if not ranges or ranges[0][0] != 0:
+            if settings.STRICT:
+                raise PDFSyntaxError('PageLabels is missing page index 0')
+            # Try to cope, by assuming empty labels for the initial pages.
+            # Build a new list: inserting in place would alter self.values.
+            ranges = [(0, {})] + ranges
+
+        for next_idx, (start, label_dict_unchecked) in enumerate(ranges, 1):
+            label_dict = dict_value(label_dict_unchecked)
+            style = label_dict.get('S')
+            prefix = decode_text(str_value(label_dict.get('P', b'')))
+            first_value = int_value(label_dict.get('St', 1))
+
+            if next_idx == len(ranges):
+                # This is the last specified range. It continues until the end
+                # of the document.
+                values: Iterable[int] = itertools.count(first_value)
+            else:
+                end, _ = ranges[next_idx]
+                range_length = end - start
+                values = range(first_value, first_value + range_length)
+
+            for value in values:
+                label = self._format_page_label(value, style)
+                yield prefix + label
+
+    @staticmethod
+    def _format_page_label(value: int, style: Any) -> str:
+        """Return value rendered in the given numbering style ('' if none)."""
+        if style is None:
+            return ''
+        if style is LIT('D'):  # Decimal arabic numerals
+            return str(value)
+        if style is LIT('R'):  # Uppercase roman numerals
+            return format_int_roman(value).upper()
+        if style is LIT('r'):  # Lowercase roman numerals
+            return format_int_roman(value)
+        if style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
+            return format_int_alpha(value).upper()
+        if style is LIT('a'):  # Lowercase letters a-z, aa-zz...
+            return format_int_alpha(value)
+        # Unrecognised style: warn, and contribute no numeric portion.
+        log.warning('Unknown page label style: %r', style)
+        return ''
diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py
@@ -1,9 +1,11 @@
+import itertools
 import logging
 from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
 
 from pdfminer.utils import Rect
 from . import settings
-from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
+from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
+    PDFNoPageLabels
 from .pdfparser import PDFParser
 from .pdftypes import PDFObjectNotFound
 from .pdftypes import dict_value
@@ -38,23 +40,27 @@ class PDFPage:
       rotate: the page rotation (in degree).
       annots: the page annotations.
       beads: a chain that represents natural reading order.
+      label: the page's label (typically, the logical page number).
     """
 
     def __init__(
         self,
         doc: PDFDocument,
         pageid: object,
-        attrs: object
+        attrs: object,
+        label: Optional[str]
     ) -> None:
         """Initialize a page object.
 
         doc: a PDFDocument object.
         pageid: any Python object that can uniquely identify the page.
         attrs: a dictionary of page attributes.
+        label: page label string.
         """
         self.doc = doc
         self.pageid = pageid
         self.attrs = dict_value(attrs)
+        self.label = label
         self.lastmod = resolve1(self.attrs.get('LastModified'))
         self.resources: Dict[object, object] = \
             resolve1(self.attrs.get('Resources', dict()))
@@ -109,11 +115,17 @@ def search(
             elif tree_type is LITERAL_PAGE:
                 log.info('Page: %r', tree)
                 yield (objid, tree)
+
+        try:
+            page_labels: Iterator[Optional[str]] = document.get_page_labels()
+        except PDFNoPageLabels:
+            page_labels = itertools.repeat(None)
+
         pages = False
         if 'Pages' in document.catalog:
             objects = search(document.catalog['Pages'], document.catalog)
             for (objid, tree) in objects:
-                yield cls(document, objid, tree)
+                yield cls(document, objid, tree, next(page_labels))
                 pages = True
         if not pages:
             # fallback when /Pages is missing.
@@ -123,7 +135,7 @@ def search(
                         obj = document.getobj(objid)
                         if isinstance(obj, dict) \
                                 and obj.get('Type') is LITERAL_PAGE:
-                            yield cls(document, objid, obj)
+                            yield cls(document, objid, obj, next(page_labels))
                     except PDFObjectNotFound:
                         pass
         return

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -3,6 +3,7 @@
 """
 import io
 import pathlib
+import string
 import struct
 from html import escape
 from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
@@ -527,3 +528,47 @@ def find(self, bbox: Rect) -> Iterator[LTComponentT]:
                         or y1 <= obj.y0:
                     continue
                 yield obj
+
+
+ROMAN_ONES = ['i', 'x', 'c', 'm']
+ROMAN_FIVES = ['v', 'l', 'd']
+
+
+def format_int_roman(value: int) -> str:
+    """Render an integer in the range 1..3999 as lowercase Roman numerals."""
+
+    assert 0 < value < 4000
+    groups: List[str] = []  # one group per decimal digit, units first
+    place = 0
+
+    while value != 0:
+        value, digit = divmod(value, 10)
+        unit = ROMAN_ONES[place]
+        if digit == 9:
+            # Subtractive form borrows the next decimal place's unit (ix, xc)
+            group = unit + ROMAN_ONES[place + 1]
+        elif digit == 4:
+            # Subtractive form in front of this place's five symbol (iv, xl)
+            group = unit + ROMAN_FIVES[place]
+        elif digit >= 5:
+            group = ROMAN_FIVES[place] + unit * (digit - 5)
+        else:
+            group = unit * digit
+        groups.append(group)
+        place += 1
+
+    return ''.join(reversed(groups))
+
+
+def format_int_alpha(value: int) -> str:
+    """Format a positive number as a, b, ..., z, aa, ab, ..., az, ba, ..."""
+    # NOTE(review): PDF Reference words this style as "aa to zz for the next
+    # 26" pages -- confirm whether repeated letters (aa, bb, ...) are meant.
+    assert value > 0
+    alphabet = string.ascii_lowercase
+    letters: List[str] = []  # least significant letter first
+
+    while value != 0:
+        value, offset = divmod(value - 1, len(alphabet))
+        letters.append(alphabet[offset])
+    return ''.join(reversed(letters))
diff --git a/samples/contrib/pagelabels.pdf b/samples/contrib/pagelabels.pdf
diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
@@ -1,9 +1,11 @@
+import itertools
+
 from nose.tools import assert_equal, raises
 
 from helpers import absolute_sample_path
-from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
 from pdfminer.pdfparser import PDFParser
-from pdfminer.pdftypes import PDFObjectNotFound
+from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
 
 
 class TestPdfDocument(object):
@@ -25,3 +27,21 @@ def test_encrypted_no_id(self):
             doc = PDFDocument(parser)
             assert_equal(doc.info,
                          [{'Producer': b'European Patent Office'}])
+
+    def test_page_labels(self):
+        """The first Count labels of the sample match the known sequence."""
+        path = absolute_sample_path('contrib/pagelabels.pdf')
+        with open(path, 'rb') as fp:
+            doc = PDFDocument(PDFParser(fp))
+            pages = dict_value(doc.catalog['Pages'])
+            total_pages = int_value(pages['Count'])
+            labels = itertools.islice(doc.get_page_labels(), total_pages)
+            assert_equal(list(labels), ['iii', 'iv', '1', '2', '1'])
+
+    @raises(PDFNoPageLabels)
+    def test_no_page_labels(self):
+        """get_page_labels is expected to raise for the simple1.pdf sample."""
+        path = absolute_sample_path('simple1.pdf')
+        with open(path, 'rb') as fp:
+            doc = PDFDocument(PDFParser(fp))
+            doc.get_page_labels()
diff --git a/tests/test_pdfpage.py b/tests/test_pdfpage.py
@@ -0,0 +1,18 @@
+from nose.tools import assert_equal
+
+from helpers import absolute_sample_path
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfpage import PDFPage
+
+
+class TestPdfPage(object):
+    def test_page_labels(self):
+        """Every page of the sample PDF carries its expected label."""
+        path = absolute_sample_path('contrib/pagelabels.pdf')
+        expected_labels = ['iii', 'iv', '1', '2', '1']
+        with open(path, 'rb') as fp:
+            doc = PDFDocument(PDFParser(fp))
+            # Compare whole lists so that missing pages fail the test too.
+            labels = [page.label for page in PDFPage.create_pages(doc)]
+        assert_equal(labels, expected_labels)