Merge commit '1bf3c42b59125f4491d863e1c11dca7ebbe96adc' into develop

* commit '1bf3c42b59125f4491d863e1c11dca7ebbe96adc':
  Use charset-normalizer instead of chardet (pdfminer#744)
  Refactor ImageWriter and add method for exporting an image from bytes. (pdfminer#737)
  Log warning and continue gracefully if errors in cmap (pdfminer#731)
  Fix log.debug statement in lzw.py by ensuring that self.table is always set (pdfminer#732)
  Raise KeyError when name in name2unicode is not of type str (pdfminer#733)
  Convert fontname to str if it is bytes in HTMLConverter (pdfminer#734)
  Fix github actions tag regex
  Fix github actions tag regex
  Bump version
  Add github action for releasing to pypi if git tag is added. (pdfminer#727)
HiTalentAlgorithms · Apr 26, 2022 · 529905e · 529905e
2 parents 93ca149 + 1bf3c42
commit 529905e
Show file tree

Hide file tree

Showing 15 changed files with 256 additions and 178 deletions.
diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml
@@ -61,4 +61,4 @@ jobs:
           python -m pip install nox
       - name: Build docs
         run: |
-          nox --error-on-missing-interpreters --non-interactive --session docs
+          nox --error-on-missing-interpreters --non-interactive --session docs
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes in pdfminer.six will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [20220426]
+
+### Fixed
+
+- Merge master branch
+
 ## [20220329]
 
 ### Fixed
@@ -16,19 +22,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 - Fixed Bugs: Fixed bug with TypeError in `pdfminer/pdftypes.py:376`
 
+## [20220319]
+
+### Added
+
+- Added features: Add more than one `LTAnno` at a time
+
 ## [20220318]
 
 ### Fixed
 
 - Fixed Bugs: Fix font with CIDFont name and cmap empty,
-    - Fixed cmap without `endbfchar`
+  - Fixed cmap without `endbfchar`
 
 ## [20220317]
 
 ### Fixed
 
 - Fixed Bugs: When `paint_path()` is called with a `path` that has `h` command but length is 0, it will crash.
-    - Depends on Section8.5.1 in PDF Reference Document V1.7
+  - Depends on Section 8.5.1 in PDF Reference Document V1.7
 
 ## [20220314]
 
@@ -38,9 +50,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [20220308]
 
+### Fixed
+
+- `IndexError` when handling invalid bfrange code map in CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
+- `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
+- `TypeError` in encodingdb.py when name of unicode is not str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
+- `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734))
+
 ### Added
 
-- Added features: Add more than one `LTAnno` at a time
+- Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737))
+
+### Changed
+
+- Using charset-normalizer instead of chardet for less restrictive license ([#744](https://github.com/pdfminer/pdfminer.six/pull/744))
 
 ## [20220304]
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -27,8 +27,8 @@ Any contribution is appreciated! You might want to:
 ## Guideline for creating pull request
 
 * A pull request should close an existing issue.
-* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.  
-* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case 
+* Pull requests should be merged to master. Version tags are used to indicate the releases.
+* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
   of features, this will show that your code works correctly.
 * Code should work for Python 3.6+.
 * Code should be formatted with [black](https://github.com/psf/black). 

diff --git a/Makefile b/Makefile
@@ -1,37 +1,11 @@
 ##  Makefile (for maintenance purpose)
 ##
 
-PACKAGE=pdfminer
-
 PYTHON=python
-GIT=git
 RM=rm -f
 CP=cp -f
 MKDIR=mkdir
 
-all:
-
-install:
-	$(PYTHON) setup.py install --home=$(HOME)
-
-clean:
-	-$(PYTHON) setup.py clean
-	-$(RM) -r build dist MANIFEST
-	-cd $(PACKAGE) && $(MAKE) clean
-	-cd tools && $(MAKE) clean
-	-cd samples && $(MAKE) clean
-
-distclean: clean cmap_clean
-
-sdist: distclean MANIFEST.in
-	$(PYTHON) setup.py sdist
-register: distclean MANIFEST.in
-	$(PYTHON) setup.py sdist upload register
-
-WEBDIR=../euske.github.io/$(PACKAGE)
-publish:
-	$(CP) docs/*.html docs/*.png docs/*.css $(WEBDIR)
-
 CONV_CMAP=$(PYTHON) tools/conv_cmap.py
 CMAPSRC=cmaprsrc
 CMAPDST=pdfminer/cmap

diff --git a/mypy.ini b/mypy.ini
@@ -23,8 +23,11 @@ ignore_missing_imports = True
 [mypy-pytest.*]
 ignore_missing_imports = True
 
-[mypy-setuptools]
+[mypy-setuptools.*]
 ignore_missing_imports = True
 
-[mypy-nox]
+[mypy-nox.*]
+ignore_missing_imports = True
+
+[mypy-charset_normalizer.*]
 ignore_missing_imports = True
diff --git a/pdfminer/Makefile b/pdfminer/Makefile
diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "20220329"
+__version__ = "__VERSION__"  # auto replaced with tag in github actions
 
 if __name__ == "__main__":
     print(__version__)
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -29,6 +29,7 @@
     Tuple,
     Union,
     cast,
+    Set,
 )
 
 from .encodingdb import name2unicode
@@ -285,6 +286,7 @@ def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
         self.cmap = cmap
         # some ToUnicode maps don't have "begincmap" keyword.
         self._in_cmap = True
+        self._warnings: Set[str] = set()
         return
 
     def run(self) -> None:
@@ -312,20 +314,26 @@ def run(self) -> None:
     KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
 
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
+        """ToUnicode CMaps
+
+        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
+        """
         if token is self.KEYWORD_BEGINCMAP:
             self._in_cmap = True
             self.popall()
             return
+
         elif token is self.KEYWORD_ENDCMAP:
             self._in_cmap = False
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
                 if isinstance(cid, bytes) and isinstance(code, bytes):
                     self.cmap.add_cid2unichr(nunpack(cid), code)
             return
+
         if not self._in_cmap:
             return
-        #
+
         if token is self.KEYWORD_DEF:
             try:
                 ((_, k), (_, v)) = self.pop(2)
@@ -354,33 +362,47 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
         if token is self.KEYWORD_BEGINCIDRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDCIDRANGE:
             objs = [obj for (__, obj) in self.popall()]
-            for (s, e, cid) in choplist(3, objs):
-                if (
-                    not isinstance(s, bytes)
-                    or not isinstance(e, bytes)
-                    or not isinstance(cid, int)
-                    or len(s) != len(e)
-                ):
+            for (start_byte, end_byte, cid) in choplist(3, objs):
+                if not isinstance(start_byte, bytes):
+                    self._warn_once("The start object of begincidrange is not a byte.")
+                    continue
+                if not isinstance(end_byte, bytes):
+                    self._warn_once("The end object of begincidrange is not a byte.")
+                    continue
+                if not isinstance(cid, int):
+                    self._warn_once("The cid object of begincidrange is not a byte.")
+                    continue
+                if len(start_byte) != len(end_byte):
+                    self._warn_once(
+                        "The start and end byte of begincidrange have "
+                        "different lengths."
+                    )
                     continue
-                sprefix = s[:-4]
-                eprefix = e[:-4]
-                if sprefix != eprefix:
+                start_prefix = start_byte[:-4]
+                end_prefix = end_byte[:-4]
+                if start_prefix != end_prefix:
+                    self._warn_once(
+                        "The prefix of the start and end byte of "
+                        "begincidrange are not the same."
+                    )
                     continue
-                svar = s[-4:]
-                evar = e[-4:]
-                s1 = nunpack(svar)
-                e1 = nunpack(evar)
+                svar = start_byte[-4:]
+                evar = end_byte[-4:]
+                start = nunpack(svar)
+                end = nunpack(evar)
                 vlen = len(svar)
-                for i in range(e1 - s1 + 1):
-                    x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
+                for i in range(end - start + 1):
+                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                     self.cmap.add_cid2unichr(cid + i, x)
             return
 
         if token is self.KEYWORD_BEGINCIDCHAR:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDCIDCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
@@ -391,34 +413,44 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
         if token is self.KEYWORD_BEGINBFRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDBFRANGE:
             objs = [obj for (__, obj) in self.popall()]
-            for (s, e, code) in choplist(3, objs):
-                if (
-                    not isinstance(s, bytes)
-                    or not isinstance(e, bytes)
-                    or len(s) != len(e)
-                ):
+            for (start_byte, end_byte, code) in choplist(3, objs):
+                if not isinstance(start_byte, bytes):
+                    self._warn_once("The start object is not a byte.")
+                    continue
+                if not isinstance(end_byte, bytes):
+                    self._warn_once("The end object is not a byte.")
+                    continue
+                if len(start_byte) != len(end_byte):
+                    self._warn_once("The start and end byte have different lengths.")
                     continue
-                s1 = nunpack(s)
-                e1 = nunpack(e)
+                start = nunpack(start_byte)
+                end = nunpack(end_byte)
                 if isinstance(code, list):
-                    for i in range(e1 - s1 + 1):
-                        self.cmap.add_cid2unichr(s1 + i, code[i])
+                    if len(code) != end - start + 1:
+                        self._warn_once(
+                            "The difference between the start and end "
+                            "offsets does not match the code length."
+                        )
+                    for cid, unicode_value in zip(range(start, end + 1), code):
+                        self.cmap.add_cid2unichr(cid, unicode_value)
                 else:
                     assert isinstance(code, bytes)
                     var = code[-4:]
                     base = nunpack(var)
                     prefix = code[:-4]
                     vlen = len(var)
-                    for i in range(e1 - s1 + 1):
+                    for i in range(end - start + 1):
                         x = prefix + struct.pack(">L", base + i)[-vlen:]
-                        self.cmap.add_cid2unichr(s1 + i, x)
+                        self.cmap.add_cid2unichr(start + i, x)
             return
 
         if token is self.KEYWORD_BEGINBFCHAR:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDBFCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
@@ -429,12 +461,23 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
         if token is self.KEYWORD_BEGINNOTDEFRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDNOTDEFRANGE:
             self.popall()
             return
 
         self.push((pos, token))
-        return
+
+    def _warn_once(self, msg: str) -> None:
+        """Warn once for each unique message"""
+        if msg not in self._warnings:
+            self._warnings.add(msg)
+            base_msg = (
+                "Ignoring (part of) ToUnicode map because the PDF data "
+                "does not conform to the format. This could result in "
+                "(cid) values in the output. "
+            )
+            log.warning(base_msg + msg)
 
 
 def main(argv: List[str]) -> None:

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -40,7 +40,7 @@
 from .pdfinterp import PDFGraphicState, PDFResourceManager
 from .pdfpage import PDFPage
 from .pdftypes import PDFStream
-from .utils import AnyIO, Point, Matrix, Rect, PathSegment
+from .utils import AnyIO, Point, Matrix, Rect, PathSegment, make_compat_str
 from .utils import apply_matrix_pt
 from .utils import bbox2str
 from .utils import enc
@@ -639,7 +639,8 @@ def render(item: LTItem) -> None:
                             render(child)
                         self.end_div("textbox")
                     elif isinstance(item, LTChar):
-                        self.put_text(item.get_text(), item.fontname, item.size)
+                        fontname = make_compat_str(item.fontname)
+                        self.put_text(item.get_text(), fontname, item.size)
                     elif isinstance(item, LTText):
                         self.write_text(item.get_text())
             return

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
@@ -25,6 +25,12 @@ def name2unicode(name: str) -> str:
     :returns unicode character if name resembles something,
     otherwise a KeyError
     """
+    if not isinstance(name, str):
+        raise KeyError(
+            'Could not convert unicode name "%s" to character because '
+            "it should be of type str but is of type %s" % (name, type(name))
+        )
+
     name = name.split(".")[0]
     components = name.split("_")