From cc294805325020d305e7d57271c0937dd8a3dadb Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sat, 19 Mar 2022 21:10:02 +0100 Subject: [PATCH 1/5] Refactor ImageWriter and add method for exporting an image from bytes. E.g. when FlateDecode just results in a list of RGB bytes. --- pdfminer/image.py | 196 ++++++++++++++++++++++++++-------------------- 1 file changed, 113 insertions(+), 83 deletions(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index fb300314..b3f45a33 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -2,14 +2,19 @@ import os.path import struct from io import BytesIO -from typing import BinaryIO, Tuple, List, Any +from typing import BinaryIO, Tuple, Literal from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter from .layout import LTImage from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB -from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE +from .pdftypes import ( + LITERALS_DCT_DECODE, + LITERALS_JBIG2_DECODE, + LITERALS_JPX_DECODE, + LITERALS_FLATE_DECODE, +) PIL_ERROR_MESSAGE = ( "Could not import Pillow. This dependency of pdfminer.six is not " @@ -90,14 +95,40 @@ def __init__(self, outdir: str) -> None: def export_image(self, image: LTImage) -> str: (width, height) = image.srcsize - is_jbig2 = self.is_jbig2_image(image) - ext = self._get_image_extension(image, width, height, is_jbig2) - name, path = self._create_unique_image_name(self.outdir, image.name, ext) + filters = image.stream.get_filters() + + if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: + name = self._save_jpeg(image) + + elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE: + name = self._save_jpeg2000(image) + + elif self._is_jbig2_iamge(image): + name = self._save_jbig2(image) + + elif image.bits == 1: + name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits) + + elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: + name = self._save_bmp(image, width, height, width * 3, image.bits * 3) + + elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: + name = self._save_bmp(image, width, height, width, image.bits) + + elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE: + name = self._save_bytes(image) + + else: + name = self._save_raw(image) + + return name + + def _save_jpeg(self, image: LTImage) -> str: + raw_data = image.stream.get_rawdata() + assert raw_data is not None - fp = open(path, "wb") - if ext == ".jpg": - raw_data = image.stream.get_rawdata() - assert raw_data is not None + name, path = self._create_unique_image_name(image, ".jpg") + with open(path, "wb") as fp: if LITERAL_DEVICE_CMYK in image.colorspace: try: from PIL import Image, ImageChops # type: ignore[import] @@ -111,9 +142,17 @@ def export_image(self, image: LTImage) -> str: i.save(fp, "JPEG") else: fp.write(raw_data) - elif ext == ".jp2": + + return name + + def _save_jpeg2000(self, image: LTImage) -> str: + raw_data = image.stream.get_rawdata() + assert raw_data is not None + + name, path = self._create_unique_image_name(image, ".jp2") + with open(path, "wb") as fp: try: - from PIL import Image + from PIL import Image # type: ignore[import] except ImportError: raise ImportError(PIL_ERROR_MESSAGE) @@ -121,14 +160,22 @@ def export_image(self, image: LTImage) -> str: # that I have tried cannot open the file. However, # open and saving with PIL produces a file that # seems to be easily opened by other programs - raw_data = image.stream.get_rawdata() - assert raw_data is not None ifp = BytesIO(raw_data) i = Image.open(ifp) i.save(fp, "JPEG2000") - elif is_jbig2: + return name + + def _save_jbig2(self, image: LTImage) -> str: + name, path = self._create_unique_image_name(image, ".jb2") + with open(path, "wb") as fp: input_stream = BytesIO() - global_streams = self.jbig2_global(image) + + global_streams = [] + filters = image.stream.get_filters() + for filter_name, params in filters: + if filter_name in LITERALS_JBIG2_DECODE: + global_streams.append(params["JBIG2Globals"].resolve()) + if len(global_streams) > 1: msg = ( "There should never be more than one JBIG2Globals " @@ -144,86 +191,69 @@ def export_image(self, image: LTImage) -> str: writer = JBIG2StreamWriter(fp) writer.write_file(segments) - elif image.bits == 1: - bmp = BMPWriter(fp, 1, width, height) - data = image.stream.get_data() - i = 0 - width = (width + 7) // 8 - for y in range(height): - bmp.write_line(y, data[i : i + width]) - i += width - elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: - bmp = BMPWriter(fp, 24, width, height) - data = image.stream.get_data() - i = 0 - width = width * 3 - for y in range(height): - bmp.write_line(y, data[i : i + width]) - i += width - elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: - bmp = BMPWriter(fp, 8, width, height) + return name + + def _save_bmp( + self, image: LTImage, width: int, height: int, bytes_per_line: int, + bits: int + ) -> str: + name, path = self._create_unique_image_name(image, ".bmp") + with open(path, "wb") as fp: + bmp = BMPWriter(fp, bits, width, height) data = image.stream.get_data() i = 0 for y in range(height): - bmp.write_line(y, data[i : i + width]) - i += width - else: - fp.write(image.stream.get_data()) - fp.close() + bmp.write_line(y, data[i: i + bytes_per_line]) + i += bytes_per_line return name - @staticmethod - def is_jbig2_image(image: LTImage) -> bool: - filters = image.stream.get_filters() - is_jbig2 = False - for filter_name, params in filters: - if filter_name in LITERALS_JBIG2_DECODE: - is_jbig2 = True - break - return is_jbig2 + def _save_bytes(self, image: LTImage) -> str: + name, path = self._create_unique_image_name(image, ".jpg") + width, height = image.srcsize + channels = len(image.stream.get_data()) / width / height / (image.bits / 8) + with open(path, "wb") as fp: + try: + from PIL import Image # type: ignore[import] + except ImportError: + raise ImportError(PIL_ERROR_MESSAGE) + + mode: Literal["1", "8", "RGB", "CMYK"] + if image.bits == 1: + mode = "1" + elif image.bits == 8 and channels == 1: + mode = "8" + elif image.bits == 8 and channels == 3: + mode = "RGB" + elif image.bits == 8 and channels == 4: + mode = "CMYK" + + img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw") + img.save(fp) + + return name + + def _save_raw(self, image: LTImage) -> str: + ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1]) + name, path = self._create_unique_image_name(image, ext) + + with open(path, "wb") as fp: + fp.write(image.stream.get_data()) + return name @staticmethod - def jbig2_global(image: LTImage) -> List[Any]: - global_streams = [] + def _is_jbig2_iamge(image: LTImage) -> bool: filters = image.stream.get_filters() for filter_name, params in filters: if filter_name in LITERALS_JBIG2_DECODE: - global_streams.append(params["JBIG2Globals"].resolve()) - return global_streams + return True + return False - @staticmethod - def _get_image_extension( - image: LTImage, width: int, height: int, is_jbig2: bool - ) -> str: - filters = image.stream.get_filters() - if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: - ext = ".jpg" - elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE: - ext = ".jp2" - elif is_jbig2: - ext = ".jb2" - elif ( - image.bits == 1 - or image.bits == 8 - and ( - LITERAL_DEVICE_RGB in image.colorspace - or LITERAL_DEVICE_GRAY in image.colorspace - ) - ): - ext = ".%dx%d.bmp" % (width, height) - else: - ext = ".%d.%dx%d.img" % (image.bits, width, height) - return ext - - @staticmethod - def _create_unique_image_name( - dirname: str, image_name: str, ext: str - ) -> Tuple[str, str]: - name = image_name + ext - path = os.path.join(dirname, name) + def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]: + name = image.name + ext + path = os.path.join(self.outdir, name) img_index = 0 while os.path.exists(path): - name = "%s.%d%s" % (image_name, img_index, ext) - path = os.path.join(dirname, name) + name = "%s.%d%s" % (image.name, img_index, ext) + path = os.path.join(self.outdir, name) img_index += 1 return name, path From b71477c97140817762df3f3d61162f4d521d4cc8 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 21 Mar 2022 22:52:56 +0100 Subject: [PATCH 2/5] Added docstrings --- pdfminer/image.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pdfminer/image.py b/pdfminer/image.py index b3f45a33..4e822b4b 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -93,6 +93,7 @@ def __init__(self, outdir: str) -> None: os.makedirs(self.outdir) def export_image(self, image: LTImage) -> str: + """Save an LTImage to disk""" (width, height) = image.srcsize filters = image.stream.get_filters() @@ -124,6 +125,7 @@ def export_image(self, image: LTImage) -> str: return name def _save_jpeg(self, image: LTImage) -> str: + """Save a JPEG encoded image""" raw_data = image.stream.get_rawdata() assert raw_data is not None @@ -146,6 +148,7 @@ def _save_jpeg(self, image: LTImage) -> str: return name def _save_jpeg2000(self, image: LTImage) -> str: + """Save a JPEG 2000 encoded image""" raw_data = image.stream.get_rawdata() assert raw_data is not None @@ -166,6 +169,7 @@ def _save_jpeg2000(self, image: LTImage) -> str: return name def _save_jbig2(self, image: LTImage) -> str: + """Save a JBIG2 encoded image""" name, path = self._create_unique_image_name(image, ".jb2") with open(path, "wb") as fp: input_stream = BytesIO() @@ -197,6 +201,7 @@ def _save_bmp( self, image: LTImage, width: int, height: int, bytes_per_line: int, bits: int ) -> str: + """Save a BMP encoded image""" name, path = self._create_unique_image_name(image, ".bmp") with open(path, "wb") as fp: bmp = BMPWriter(fp, bits, width, height) @@ -208,6 +213,7 @@ def _save_bmp( return name def _save_bytes(self, image: LTImage) -> str: + """Save an image without encoding, just bytes""" name, path = self._create_unique_image_name(image, ".jpg") width, height = image.srcsize channels = len(image.stream.get_data()) / width / height / (image.bits / 8) @@ -233,6 +239,7 @@ def _save_bytes(self, image: LTImage) -> str: return name def _save_raw(self, image: LTImage) -> str: + """Save an image with unknown encoding""" ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1]) name, path = self._create_unique_image_name(image, ext) From d7f3187a73eac787d065bd6e470c0ffad470215e Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 21 Mar 2022 22:54:57 +0100 Subject: [PATCH 3/5] Add CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07beb6dc..82c3d455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733)) - `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734)) +### Added + +- Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737)) + ## [20220319] ### Added From c3f17195bd3382c4bb2344f7adc0d1a290c01545 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 21 Mar 2022 22:59:21 +0100 Subject: [PATCH 4/5] Run black --- pdfminer/image.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index 4e822b4b..e1fbd630 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -2,7 +2,12 @@ import os.path import struct from io import BytesIO -from typing import BinaryIO, Tuple, Literal +from typing import BinaryIO, Tuple + +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal # type: ignore[misc] from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter from .layout import LTImage From 6fc4dbfa5162dea3296342a8f6739b90fa37a834 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 21 Mar 2022 22:59:49 +0100 Subject: [PATCH 5/5] Run black --- pdfminer/image.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index e1fbd630..2b412534 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -203,8 +203,7 @@ def _save_jbig2(self, image: LTImage) -> str: return name def _save_bmp( - self, image: LTImage, width: int, height: int, bytes_per_line: int, - bits: int + self, image: LTImage, width: int, height: int, bytes_per_line: int, bits: int ) -> str: """Save a BMP encoded image""" name, path = self._create_unique_image_name(image, ".bmp") @@ -213,7 +212,7 @@ def _save_bmp( data = image.stream.get_data() i = 0 for y in range(height): - bmp.write_line(y, data[i: i + bytes_per_line]) + bmp.write_line(y, data[i : i + bytes_per_line]) i += bytes_per_line return name