Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: page labels #680

Merged
merged 11 commits into from
Feb 1, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))

### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
Expand Down
53 changes: 53 additions & 0 deletions pdfminer/data_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import functools
from typing import Any, Dict, Iterable, List, Optional, Tuple

from pdfminer import settings
from pdfminer.pdfparser import PDFSyntaxError
from pdfminer.pdftypes import list_value, int_value, dict_value
from pdfminer.utils import choplist


class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    A node may carry 'Nums' (a flat list of alternating integer keys and
    values), 'Kids' (references to child nodes) and 'Limits'. The flattened
    (key, value) pairs of the whole tree are exposed through `values`.
    """

    def __init__(self, obj: Any):
        """Wrap a number tree node.

        obj: the node object; it is resolved with dict_value().
        """
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        # Per-instance memo for `values`; None means "not computed yet".
        # A functools.lru_cache on the method would key a module-global
        # cache on `self` and keep every tree alive for the process lifetime.
        self._values: Optional[List[Tuple[int, Any]]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Return the (key, value) pairs of this node and all its children.

        Pairs are returned in document order; no sorting happens here.
        """
        items: List[Tuple[int, Any]] = []
        if self.nums:  # Leaf node
            # 'Nums' is a flat [key, value, key, value, ...] list.
            items.extend(
                (int_value(k), v) for k, v in choplist(2, self.nums))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items.extend(NumberTree(child_ref)._parse())

        return items

    @property
    def values(self) -> List[Tuple[int, Any]]:
        """The tree's (key, value) pairs, ordered by key.

        The list is computed on first access and then reused, so callers
        must not modify it in place.

        Raises PDFSyntaxError in strict mode if the keys are out of order;
        otherwise the pairs are sorted by key.
        """
        if self._values is None:
            values = self._parse()

            if settings.STRICT:
                in_order = all(
                    a[0] <= b[0] for a, b in zip(values, values[1:]))
                if not in_order:
                    raise PDFSyntaxError(
                        'Number tree elements are out of order')
            else:
                values.sort(key=lambda t: t[0])

            # Only remember a successful result; a strict-mode failure is
            # re-evaluated (and raised again) on the next access.
            self._values = values

        return self._values
85 changes: 84 additions & 1 deletion pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import logging
import re
import struct
Expand All @@ -10,12 +11,14 @@

from . import settings
from .arcfour import Arcfour
from .data_structures import NumberTree
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
uint_value, dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text
from .utils import choplist, decode_text, nunpack, format_int_roman, \
format_int_alpha

log = logging.getLogger(__name__)

Expand All @@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
pass


class PDFNoPageLabels(PDFException):
    """Raised when a document's catalog provides no usable page labels."""


class PDFDestinationNotFound(PDFException):
    # NOTE(review): presumably raised when a destination lookup fails;
    # no raise site is visible in this view - confirm against callers.
    pass

Expand Down Expand Up @@ -890,6 +897,24 @@ def search(entry: object, level: int
return
return search(self.catalog['Outlines'], 0)

def get_page_labels(self) -> Iterator[str]:
    """
    Return an iterator over the document's page label strings.

    Labels are produced one per page, starting with the first page. The
    iterator never ends on its own (the last label range is open-ended),
    so callers should bound it, e.g. with itertools.islice or by pairing
    it with the pages.

    Raises PDFNoPageLabels if the catalog has no 'PageLabels' entry or the
    entry cannot be used to build the label tree.
    """
    assert self.catalog is not None

    try:
        label_tree = PageLabels(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        raise PDFNoPageLabels
    else:
        return label_tree.labels

def lookup_name(
self,
cat: str,
Expand Down Expand Up @@ -989,3 +1014,61 @@ def read_xref_from(
pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs)
return


class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate the label string of every page, in page order.

        Each entry of the number tree maps the index of the first page of a
        labelling range to a dictionary with optional 'S' (style), 'P'
        (prefix) and 'St' (first numeric value, default 1) entries.

        The last range has no end, so the iteration is unbounded.

        Raises PDFSyntaxError in strict mode if no range starts at page
        index 0.
        """
        # Work on a copy: self.values is shared, cached state of the tree
        # and the fallback below must not modify it.
        ranges = list(self.values)

        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            # Try to cope, by assuming empty labels for the initial pages
            ranges.insert(0, (0, {}))

        last_index = len(ranges) - 1
        for index, (start, label_dict_unchecked) in enumerate(ranges):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if index == last_index:
                # This is the last specified range. It continues until the
                # end of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                # The range ends where the following one starts.
                end, _ = ranges[index + 1]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                yield prefix + self._format_page_label(value, style)

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        value: the numeric part of the label.
        style: the 'S' entry of the label dictionary, or None when the
            label has no numeric part.

        Returns the formatted number; an empty string for a missing or
        unknown style (the latter is logged as a warning).
        """
        if style is None:
            label = ''
        elif style is LIT('D'):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label
20 changes: 16 additions & 4 deletions pdfminer/pdfpage.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import itertools
import logging
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple

from pdfminer.utils import Rect
from . import settings
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
PDFNoPageLabels
from .pdfparser import PDFParser
from .pdftypes import PDFObjectNotFound
from .pdftypes import dict_value
Expand Down Expand Up @@ -38,23 +40,27 @@ class PDFPage:
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
label: the page's label (typically, the logical page number).
"""

def __init__(
self,
doc: PDFDocument,
pageid: object,
attrs: object
attrs: object,
label: Optional[str]
) -> None:
"""Initialize a page object.

doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
label: page label string.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources: Dict[object, object] = \
resolve1(self.attrs.get('Resources', dict()))
Expand Down Expand Up @@ -109,11 +115,17 @@ def search(
elif tree_type is LITERAL_PAGE:
log.info('Page: %r', tree)
yield (objid, tree)

try:
page_labels: Iterator[Optional[str]] = document.get_page_labels()
except PDFNoPageLabels:
page_labels = itertools.repeat(None)

pages = False
if 'Pages' in document.catalog:
objects = search(document.catalog['Pages'], document.catalog)
for (objid, tree) in objects:
yield cls(document, objid, tree)
yield cls(document, objid, tree, next(page_labels))
pages = True
if not pages:
# fallback when /Pages is missing.
Expand All @@ -123,7 +135,7 @@ def search(
obj = document.getobj(objid)
if isinstance(obj, dict) \
and obj.get('Type') is LITERAL_PAGE:
yield cls(document, objid, obj)
yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound:
pass
return
Expand Down
45 changes: 45 additions & 0 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import io
import pathlib
import string
import struct
from html import escape
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
Expand Down Expand Up @@ -527,3 +528,47 @@ def find(self, bbox: Rect) -> Iterator[LTComponentT]:
or y1 <= obj.y0:
continue
yield obj


# Numeral for "1" and "5" at each decimal place (units, tens, hundreds,
# thousands). There is no "five thousand" numeral, hence the 3999 limit.
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    value: an integer in the range 1..3999.
    """
    assert 0 < value < 4000

    groups: List[str] = []  # one numeral group per decimal digit, units first
    place = 0
    remaining = value

    while remaining != 0:
        remaining, digit = divmod(remaining, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # Subtractive form: one before the next place's one (ix, xc, cm)
            group = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # Subtractive form: one before this place's five (iv, xl, cd)
            group = one + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            group = one * digit
        groups.append(group)
        place += 1

    # Groups were collected least-significant first.
    return ''.join(reversed(groups))


def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    This is the PDF page-label letter style: a to z for the first 26
    values, then aa to zz (the same letter doubled) for the next 26, then
    aaa to zzz, and so on. For example 1 -> 'a', 27 -> 'aa', 28 -> 'bb'.

    value: a positive integer.
    """
    assert value > 0

    alphabet = string.ascii_lowercase
    # (value - 1) // 26 full passes through the alphabet precede this value;
    # each completed pass adds one more repetition of the letter.
    completed_passes, letter_index = divmod(value - 1, len(alphabet))
    return alphabet[letter_index] * (completed_passes + 1)
Binary file added samples/contrib/pagelabels.pdf
Binary file not shown.
24 changes: 22 additions & 2 deletions tests/test_pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import itertools

from nose.tools import assert_equal, raises

from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value


class TestPdfDocument(object):
Expand All @@ -25,3 +27,21 @@ def test_encrypted_no_id(self):
doc = PDFDocument(parser)
assert_equal(doc.info,
[{'Producer': b'European Patent Office'}])

def test_page_labels(self):
    """Labels of contrib/pagelabels.pdf match the expected sequence."""
    sample = absolute_sample_path('contrib/pagelabels.pdf')
    with open(sample, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        pages_dict = dict_value(doc.catalog['Pages'])
        total_pages = int_value(pages_dict['Count'])
        # get_page_labels() is unbounded, so take one label per page only.
        labels = itertools.islice(doc.get_page_labels(), total_pages)
        assert_equal(list(labels), ['iii', 'iv', '1', '2', '1'])

@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """get_page_labels() on simple1.pdf is expected to raise."""
    sample = absolute_sample_path('simple1.pdf')
    with open(sample, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        doc.get_page_labels()
18 changes: 18 additions & 0 deletions tests/test_pdfpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from nose.tools import assert_equal

from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage


class TestPdfPage(object):
    """Tests for PDFPage."""

    def test_page_labels(self):
        """Every page of contrib/pagelabels.pdf carries the expected label."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Collect while the file is still open: pages are generated
            # lazily from the document.
            labels = [page.label for page in PDFPage.create_pages(doc)]

        # Compare the full list so that missing or extra pages fail the
        # test instead of passing vacuously.
        assert_equal(labels, expected_labels)
Loading