Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: page labels #680

Merged
merged 11 commits into from
Feb 1, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))

### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
Expand Down
107 changes: 106 additions & 1 deletion pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import logging
import re
import struct
Expand All @@ -15,7 +16,8 @@
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
uint_value, dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text
from .utils import (choplist, decode_text, format_int_alpha, format_int_roman,
nunpack)

log = logging.getLogger(__name__)

Expand All @@ -36,6 +38,10 @@ class PDFNoOutlines(PDFException):
pass


class PDFNoPageLabels(PDFException):
    """Raised by get_page_labels when the document catalog yields no
    usable 'PageLabels' dictionary."""
    pass


class PDFDestinationNotFound(PDFException):
    # NOTE(review): not raised anywhere in the visible code; presumably
    # signals a failed destination lookup -- confirm against the callers.
    pass

Expand Down Expand Up @@ -895,6 +901,105 @@ def search(entry: object, level: int
return
return search(self.catalog['Outlines'], 0)

def get_page_labels(
    self,
    total_pages: Optional[int] = None
) -> Iterator[str]:
    """
    Generate page label strings for the PDF document.

    If the document includes page labels, generates strings, one per page.
    If not, raises PDFNoPageLabels.

    total_pages: the number of pages in the document, if known. When it
        is specified, no label is generated for a page index at or beyond
        it, whichever label range that index falls in. When it is not
        specified, the last label range is unbounded and the caller must
        stop consuming the iterator itself.

    Raises PDFNoPageLabels (at call time) if reading the catalog's
    'PageLabels' entry as a dictionary fails with KeyError or
    PDFTypeError. Raises PDFSyntaxError (at call time, strict mode only)
    if the label ranges are out of order or do not start at page index 0.
    Errors in an individual label dictionary surface lazily, while
    iterating.
    """
    assert self.catalog is not None

    # NOTE: this must remain a plain function that *returns* an iterator.
    # Turning it into a generator (a bare ``yield`` in this body) would
    # defer PDFNoPageLabels to the first next() call, and callers wrap
    # only the call itself in try/except.
    try:
        labels_tree = dict_value(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        raise PDFNoPageLabels

    def walk_number_tree(
        td: Dict[Any, Any]
    ) -> Iterator[Tuple[int, Dict[Any, Any]]]:
        """
        Walk number tree node dictionary yielding (page index, dict) pairs.

        See PDF spec, section 3.8.5.
        """
        if 'Nums' in td:  # Leaf node
            objs = list_value(td['Nums'])
            for (k, v) in choplist(2, objs):
                yield int_value(k), dict_value(v)

        if 'Kids' in td:  # Intermediate node
            for child_ref in list_value(td['Kids']):
                yield from walk_number_tree(dict_value(child_ref))

    # Extract and sanity-check ranges
    ranges = list(walk_number_tree(labels_tree))

    # The tree should be sorted
    if settings.STRICT:
        if not all(a[0] <= b[0] for a, b in zip(ranges, ranges[1:])):
            raise PDFSyntaxError('PageLabels are out of order')
    else:
        ranges.sort(key=lambda t: t[0])

    # The tree must begin with page index 0
    if not ranges or ranges[0][0] != 0:
        if settings.STRICT:
            raise PDFSyntaxError('PageLabels is missing page index 0')

        # Try to cope, by assuming empty labels for the initial pages
        ranges.insert(0, (0, {}))

    def format_value(value: int, style: Any) -> str:
        """Format one numeric value in the given numbering style."""
        # Styles are compared by identity against LIT(...) objects, as
        # elsewhere in this method's original implementation.
        if style is LIT('D'):  # Decimal arabic numerals
            return str(value)
        if style is LIT('R'):  # Uppercase roman numerals
            return format_int_roman(value).upper()
        if style is LIT('r'):  # Lowercase roman numerals
            return format_int_roman(value)
        if style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            return format_int_alpha(value).upper()
        if style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            return format_int_alpha(value)
        # No (or unrecognised) style: the label is the prefix alone.
        return ''

    def emit_labels(
        ranges: List[Tuple[int, Dict[Any, Any]]],
        total_pages: Optional[int]
    ) -> Iterator[str]:
        """
        Walk a list of ranges and label dicts, yielding label strings.
        """
        for (i, (range_start, label_dict)) in enumerate(ranges):
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first = int_value(label_dict.get('St', 1))

            # Page index just past this range; None means "unknown".
            range_end: Optional[int]
            if i + 1 < len(ranges):
                range_end = ranges[i + 1][0]
                # Clamp to the document length so that range starts lying
                # beyond the last page cannot produce surplus labels.
                if total_pages is not None:
                    range_end = min(range_end, total_pages)
            else:
                # This is the last specified range. It continues until
                # the end of the document, which may be unknown.
                range_end = total_pages

            if range_end is None:
                values: Iterable[int] = itertools.count(first)
            else:
                # Empty when the range starts at or after range_end.
                values = range(first, first + range_end - range_start)

            for value in values:
                yield prefix + format_value(value, style)

    return emit_labels(ranges, total_pages)

def lookup_name(
self,
cat: str,
Expand Down
22 changes: 18 additions & 4 deletions pdfminer/pdfpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
PDFNoPageLabels


log = logging.getLogger(__name__)
Expand Down Expand Up @@ -38,23 +39,27 @@ class PDFPage:
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
label: the page's label (typically, the logical page number).
"""

def __init__(
self,
doc: PDFDocument,
pageid: object,
attrs: object
attrs: object,
label: Optional[str]
) -> None:
"""Initialize a page object.

doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
label: page label string.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources: Dict[object, object] = \
resolve1(self.attrs.get('Resources', dict()))
Expand Down Expand Up @@ -110,11 +115,18 @@ def search(
elif tree_type is LITERAL_PAGE:
log.info('Page: %r', tree)
yield (objid, tree)

try:
page_labels: Optional[Iterator[str]] = document.get_page_labels()
except PDFNoPageLabels:
page_labels = None

pages = False
if 'Pages' in document.catalog:
objects = search(document.catalog['Pages'], document.catalog)
for (objid, tree) in objects:
yield cls(document, objid, tree)
yield cls(document, objid, tree,
next(page_labels) if page_labels else None)
pages = True
if not pages:
# fallback when /Pages is missing.
Expand All @@ -124,7 +136,9 @@ def search(
obj = document.getobj(objid)
if isinstance(obj, dict) \
and obj.get('Type') is LITERAL_PAGE:
yield cls(document, objid, obj)
yield cls(
document, objid, obj,
next(page_labels) if page_labels else None)
except PDFObjectNotFound:
pass
return
Expand Down
45 changes: 45 additions & 0 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import io
import pathlib
import string
import struct
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
List, Optional, Set, TextIO, Tuple, TypeVar, Union,
Expand Down Expand Up @@ -528,3 +529,47 @@ def find(self, bbox: Rect) -> Iterator[LTComponentT]:
or y1 <= obj.y0:
continue
yield obj


# Symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by decimal place.
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    value: the integer to format; the assertion below requires
        0 < value < 4000.
    Returns the numeral built one decimal digit at a time, using
    subtractive notation for the digits 4 and 9.
    """

    assert 0 < value < 4000
    groups: List[str] = []
    place = 0
    remaining = value

    # Peel off decimal digits from least to most significant; each digit
    # becomes one self-contained group of symbols for its place.
    while remaining != 0:
        remaining, digit = divmod(remaining, 10)
        unit = ROMAN_ONES[place]
        if digit == 9:
            group = unit + ROMAN_ONES[place + 1]
        elif digit == 4:
            group = unit + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + unit * (digit - 5)
        else:
            group = unit * digit
        groups.append(group)
        place += 1

    # Groups were collected least-significant first.
    return ''.join(reversed(groups))


def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    value: the integer to format; the assertion below requires value > 0.
    Returns the letters for value in a 26-symbol numbering with no zero
    digit (1 -> 'a', 26 -> 'z', 27 -> 'aa').
    """

    assert value > 0
    alphabet = string.ascii_lowercase
    letters: List[str] = []
    remaining = value

    while remaining != 0:
        # Subtracting one first maps 1..26 onto letter offsets 0..25.
        remaining, offset = divmod(remaining - 1, len(alphabet))
        letters.append(alphabet[offset])

    # Letters were collected least-significant first.
    return ''.join(reversed(letters))
Binary file added samples/contrib/pagelabels.pdf
Binary file not shown.
22 changes: 20 additions & 2 deletions tests/test_pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from nose.tools import assert_equal, raises

from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value


class TestPdfDocument(object):
Expand All @@ -25,3 +25,21 @@ def test_encrypted_no_id(self):
doc = PDFDocument(parser)
assert_equal(doc.info,
[{'Producer': b'European Patent Office'}])

def test_page_labels(self):
    """Labels generated for the sample match the expected sequence."""
    sample = absolute_sample_path('contrib/pagelabels.pdf')
    with open(sample, 'rb') as stream:
        document = PDFDocument(PDFParser(stream))
        # Bound the iteration by the page count stored in the page tree.
        pages_root = dict_value(document.catalog['Pages'])
        page_count = int_value(pages_root['Count'])
        labels = list(document.get_page_labels(page_count))
        assert_equal(labels, ['iii', 'iv', '1', '2', '1'])

@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """Requesting labels from the simple1.pdf sample raises
    PDFNoPageLabels."""
    sample = absolute_sample_path('simple1.pdf')
    with open(sample, 'rb') as stream:
        document = PDFDocument(PDFParser(stream))
        # The call itself, not iteration, is expected to raise.
        document.get_page_labels()
18 changes: 18 additions & 0 deletions tests/test_pdfpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from nose.tools import assert_equal

from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage


class TestPdfPage(object):
    """Tests for the page objects produced by PDFPage.create_pages."""

    def test_page_labels(self):
        """Pages of the sample carry exactly the expected labels, in order."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Collect while the file is still open: create_pages is
            # consumed lazily and reads from the parser's stream.
            labels = [page.label for page in PDFPage.create_pages(doc)]

        # Comparing whole lists also fails when too few (or no) pages are
        # yielded, which a per-page indexed comparison would let through,
        # and reports extra pages as a clear mismatch.
        assert_equal(labels, expected_labels)
34 changes: 33 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from helpers import absolute_sample_path
from pdfminer.layout import LTComponent
from pdfminer.utils import open_filename, Plane, shorten_str
from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
Plane, shorten_str)


class TestOpenFilename:
Expand Down Expand Up @@ -76,3 +77,34 @@ def test_shorten_short_str_is_same(self):

def test_shorten_to_really_short(self):
    """Shortening 'Hello World' to 5 characters gives 'Hello'."""
    shortened = shorten_str('Hello World', 5)
    assert_equal('Hello', shortened)

def test_format_int_alpha(self):
    """format_int_alpha yields the expected letters around each rollover."""
    cases = [
        ('a', 1),
        ('b', 2),
        ('z', 26),
        ('aa', 27),
        ('ab', 28),
        ('az', 26*2),
        ('ba', 26*2 + 1),
        ('zz', 26*27),
        ('aaa', 26*27 + 1),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_alpha(number))

def test_format_int_roman(self):
    """format_int_roman yields the expected lowercase numerals."""
    cases = [
        ('i', 1),
        ('ii', 2),
        ('iii', 3),
        ('iv', 4),
        ('v', 5),
        ('vi', 6),
        ('vii', 7),
        ('viii', 8),
        ('ix', 9),
        ('x', 10),
        ('xi', 11),
        ('xx', 20),
        ('xl', 40),
        ('xlv', 45),
        ('l', 50),
        ('xc', 90),
        ('xci', 91),
        ('c', 100),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_roman(number))