Skip to content

Commit

Permalink
Always try to get CMap, even if name is not recognized (#438)
Browse files Browse the repository at this point in the history
* Add trying to get cmap from pickle file. And cleaning up a bit.

* Don't use keyword argument for dict.get

* Add docs

* Make _get_cmap_name static

* Add test

* Add CHANGELOG.md

* Remove identity mappings from IDENTITY_ENCODER because that's now the default if the key is not in there

* Add CJK characters to expected output of simple3.pdf

* Fix line length

* Add comment
  • Loading branch information
pietermarsman authored Jul 23, 2020
1 parent 3cebf5e commit 4f65242
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 20 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))

### Fixed
### Fixed
- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))

### Changed
Expand Down
44 changes: 26 additions & 18 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import sys
from io import BytesIO


from . import settings
from .cmapdb import CMap
from .cmapdb import CMapDB
Expand Down Expand Up @@ -133,16 +132,12 @@ def do_keyword(self, pos, token):
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
None, '-')

# Note: DLIdent-* isn't found in the PDF Reference but has been kept, as
# it is harmless and has the possibility of being a type.
# (inferred from a bug report/PR)
IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
'Identity-V': 'Identity-V',
'DLIdent-H': 'Identity-H',
'DLIdent-V': 'Identity-V',
'OneByteIdentityH': 'OneByteIdentityH',
'OneByteIdentityV': 'OneByteIdentityV',
}
# Aliases for cmap names: a name found as a key here is replaced by the
# corresponding value before the cmap is looked up. A name that is not a key
# in this mapping is kept unchanged.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
'DLIdent-H': 'Identity-H',
'DLIdent-V': 'Identity-V',
}


def getdict(data):
Expand Down Expand Up @@ -725,13 +720,28 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
return

def get_cmap_from_spec(self, spec, strict):
"""
"""Get cmap from font specification
For certain PDFs, Encoding Type isn't mentioned as an attribute of
Encoding but as an attribute of CMapName, where CMapName is an
attribute of spec['Encoding'].
The horizontal/vertical modes are mentioned under different names,
such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.
"""
cmap_name = self._get_cmap_name(spec, strict)

try:
return CMapDB.get_cmap(cmap_name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
return CMap()

@staticmethod
def _get_cmap_name(spec, strict):
"""Get cmap name from font specification"""
cmap_name = 'unknown' # default value

try:
spec_encoding = spec['Encoding']
if hasattr(spec_encoding, 'name'):
Expand All @@ -741,18 +751,16 @@ def get_cmap_from_spec(self, spec, strict):
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
cmap_name = 'unknown'

if type(cmap_name) is PDFStream:
if 'CMapName' in cmap_name:
cmap_name = cmap_name.get('CMapName').name
else:
if strict:
raise PDFFontError('CMapName unspecified for encoding')
cmap_name = 'unknown'
if cmap_name in IDENTITY_ENCODER:
return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
else:
return CMap()

cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
return cmap_name

def __repr__(self):
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
Expand Down
3 changes: 2 additions & 1 deletion tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def run_with_file(sample_path):
"H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
"simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
}

Expand Down
21 changes: 21 additions & 0 deletions tests/test_pdffont.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from nose.tools import assert_equal, assert_greater

from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.psparser import PSLiteral


def test_get_cmap_from_pickle():
    """Check that a named cmap is read from pdfminer/cmap.

    Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
    """
    expected_name = 'UniGB-UCS2-H'
    font_spec = {'Encoding': PSLiteral(expected_name)}
    cid_font = PDFCIDFont(PDFResourceManager(), font_spec)

    # Non-strict lookup (strict=False), as in the original regression case.
    loaded_cmap = cid_font.get_cmap_from_spec(font_spec, False)

    # The returned cmap must carry the requested name and hold some mappings.
    assert_equal(loaded_cmap.attrs.get('CMapName'), expected_name)
    assert_greater(len(loaded_cmap.code2cid), 0)

0 comments on commit 4f65242

Please sign in to comment.