Always try to get CMap, even if name is not recognized (#438)

* Try to get the cmap from the pickle file, and clean up a bit
* Don't use a keyword argument for dict.get
* Add docs
* Make _get_cmap_name static
* Add test
* Add CHANGELOG.md entry
* Remove identity mappings from IDENTITY_ENCODER because keeping the original name is now the default if the key is not in there
* Add CJK characters to expected output of simple3.pdf
* Fix line length
* Add comment
pdfminer · Jul 23, 2020 · 4f65242 · 4f65242
1 parent 3cebf5e
commit 4f65242
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
 
-### Fixed
+### Fixed
+- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
 - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
 
 ### Changed

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -3,7 +3,6 @@
 import sys
 from io import BytesIO
 
-
 from . import settings
 from .cmapdb import CMap
 from .cmapdb import CMapDB
@@ -133,16 +132,12 @@ def do_keyword(self, pos, token):
 NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
            None, '-')
 
-# Note: DLIdent-* isn't found in PDF Reference but is been kept as
-# it is harmless and have possibility of been a type.
-# (induced from bug report/PR)
-IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
-                    'Identity-V': 'Identity-V',
-                    'DLIdent-H': 'Identity-H',
-                    'DLIdent-V': 'Identity-V',
-                    'OneByteIdentityH': 'OneByteIdentityH',
-                    'OneByteIdentityV': 'OneByteIdentityV',
-                    }
+# Mapping of cmap names. Original cmap name is kept if not in the mapping.
+# (missing reference for why DLIdent is mapped to Identity)
+IDENTITY_ENCODER = {
+    'DLIdent-H': 'Identity-H',
+    'DLIdent-V': 'Identity-V',
+}
 
 
 def getdict(data):
@@ -725,13 +720,28 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
         return
 
     def get_cmap_from_spec(self, spec, strict):
-        """
+        """Get cmap from font specification
+
         For certain PDFs, Encoding Type isn't mentioned as an attribute of
         Encoding but as an attribute of CMapName, where CMapName is an
         attribute of spec['Encoding'].
         The horizontal/vertical modes are mentioned with different name
         such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
         """
+        cmap_name = self._get_cmap_name(spec, strict)
+
+        try:
+            return CMapDB.get_cmap(cmap_name)
+        except CMapDB.CMapNotFound as e:
+            if strict:
+                raise PDFFontError(e)
+            return CMap()
+
+    @staticmethod
+    def _get_cmap_name(spec, strict):
+        """Get cmap name from font specification"""
+        cmap_name = 'unknown'  # default value
+
         try:
             spec_encoding = spec['Encoding']
             if hasattr(spec_encoding, 'name'):
@@ -741,18 +751,16 @@ def get_cmap_from_spec(self, spec, strict):
         except KeyError:
             if strict:
                 raise PDFFontError('Encoding is unspecified')
-            cmap_name = 'unknown'
+
         if type(cmap_name) is PDFStream:
             if 'CMapName' in cmap_name:
                 cmap_name = cmap_name.get('CMapName').name
             else:
                 if strict:
                     raise PDFFontError('CMapName unspecified for encoding')
-                cmap_name = 'unknown'
-        if cmap_name in IDENTITY_ENCODER:
-            return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
-        else:
-            return CMap()
+
+        cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
+        return cmap_name
 
     def __repr__(self):
         return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\

diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -28,7 +28,8 @@ def run_with_file(sample_path):
                                  "H e l l o  \n\nW o r l d\n\n"
                                  "H e l l o  \n\nW o r l d\n\n\f",
     "simple2.pdf": "\f",
-    "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
+    "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
+                   "World\n\nWorld\n\n\f",
     "simple4.pdf": "Text1\nText2\nText3\n\n\f"
 }
 

diff --git a/tests/test_pdffont.py b/tests/test_pdffont.py
@@ -0,0 +1,21 @@
+from nose.tools import assert_equal, assert_greater
+
+from pdfminer.pdffont import PDFCIDFont
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.psparser import PSLiteral
+
+
+def test_get_cmap_from_pickle():
+    """Test if cmap file is read from pdfminer/cmap
+
+    Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
+    """
+    cmap_name = 'UniGB-UCS2-H'
+    spec = {'Encoding': PSLiteral(cmap_name)}
+    resource_manager = PDFResourceManager()
+    font = PDFCIDFont(resource_manager, spec)
+
+    cmap = font.get_cmap_from_spec(spec, False)
+
+    assert_equal(cmap.attrs.get('CMapName'), cmap_name)
+    assert_greater(len(cmap.code2cid), 0)