Skip to content

Commit

Permalink
Fixes jbig2 writer to write valid jb2 files
Browse files Browse the repository at this point in the history
See: #653

Squashed commit of the following:

commit 8748c9f
Author: Pieter Marsman <[email protected]>
Date:   Sun Jan 23 21:40:50 2022 +0100

    Remove prints in test

commit bb97725
Author: Pieter Marsman <[email protected]>
Date:   Sun Jan 23 21:35:12 2022 +0100

    Cleanup exception handling for jbig2 global streams

commit cf0b47b
Merge: a5831d1 708dd20
Author: Pieter Marsman <[email protected]>
Date:   Sun Jan 23 21:29:15 2022 +0100

    Merge branch 'develop' into jbig2_fix

commit a5831d1
Author: Forest Gregg <[email protected]>
Date:   Sun Aug 1 22:59:17 2021 -0400

    flake8 tests

commit 18ffa29
Author: Forest Gregg <[email protected]>
Date:   Sun Aug 1 22:52:11 2021 -0400

    add description in changelog

commit 6c7ee43
Author: Forest Gregg <[email protected]>
Date:   Sun Aug 1 22:43:36 2021 -0400

    Fixes jbig2 writer to write valid jb2 files

    - closes #652
  • Loading branch information
pietermarsman committed Jan 23, 2022
1 parent 708dd20 commit aa5dec2
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653))

## [20211012]

Expand Down
16 changes: 16 additions & 0 deletions pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ def export_image(self, image: LTImage) -> str:
i.save(fp, 'JPEG2000')
elif is_jbig2:
input_stream = BytesIO()
global_streams = self.jbig2_global(image)
if len(global_streams) > 1:
msg = 'There should never be more than one JBIG2Globals ' \
'associated with a JBIG2 embedded image'
raise ValueError(msg)
if len(global_streams) == 1:
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
input_stream.write(image.stream.get_data())
input_stream.seek(0)
reader = JBIG2StreamReader(input_stream)
Expand Down Expand Up @@ -157,6 +164,15 @@ def is_jbig2_image(image: LTImage) -> bool:
break
return is_jbig2

@staticmethod
def jbig2_global(image):
    """Collect the JBIG2Globals objects attached to an image's JBIG2 filters.

    Walks the ``(filter_name, params)`` pairs returned by
    ``image.stream.get_filters()`` and, for every filter whose name is in
    ``LITERALS_JBIG2_DECODE``, resolves the ``'JBIG2Globals'`` entry of its
    parameters.

    :param image: object exposing ``stream.get_filters()``.
    :return: list of resolved ``JBIG2Globals`` objects; empty when no
        JBIG2 filter carries a globals entry.
    """
    global_streams = []
    for filter_name, params in image.stream.get_filters():
        if filter_name not in LITERALS_JBIG2_DECODE:
            continue
        # JBIG2Globals is an optional decode parameter: a JBIG2 stream that
        # is self-contained has none, so a missing entry (or missing/empty
        # parameters) must not raise KeyError here.
        if params and 'JBIG2Globals' in params:
            # presumably an indirect PDF reference; resolve() is expected
            # to yield the referenced stream -- confirm against PDFObjRef.
            global_streams.append(params['JBIG2Globals'].resolve())
    return global_streams

@staticmethod
def _get_image_extension(
image: LTImage,
Expand Down
18 changes: 13 additions & 5 deletions pdfminer/jbig2.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,11 @@
# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50
SEG_TYPE_END_OF_FILE = 51

# file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010


def bit_set(bit_pos: int, value: int) -> bool:
Expand Down Expand Up @@ -243,8 +242,12 @@ def write_file(
fix_last_page: bool = True
) -> int:
header = FILE_HEADER_ID
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
header_flags = FILE_HEAD_FLAG_SEQUENTIAL
header += pack(">B", header_flags)
# The embedded JBIG2 files in a PDF always
# only have one page
number_of_pages = pack(">L", 1)
header += number_of_pages
self.stream.write(header)
data_len = len(header)

Expand All @@ -254,7 +257,11 @@ def write_file(
for segment in segments:
seg_num = cast(int, segment["number"])

eof_segment = self.get_eof_segment(seg_num + 1)
if fix_last_page:
seg_num_offset = 2
else:
seg_num_offset = 1
eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
data = self.encode_segment(eof_segment)

self.stream.write(data)
Expand Down Expand Up @@ -305,7 +312,8 @@ def encode_retention_flags(
if ref_count <= 4:
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
for ref_index, ref_retain in enumerate(retain_segments):
flags_byte |= 1 << ref_index
if ref_retain:
flags_byte |= 1 << ref_index
flags.append(flags_byte)
else:
bytes_count = math.ceil((ref_count + 1) / 8)
Expand Down
Binary file added samples/contrib/XIPLAYER0.jb2
Binary file not shown.
3 changes: 0 additions & 3 deletions tests/test_font_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,4 @@ def test_font_size():
for char in line:
if isinstance(char, LTChar):
actual_size = int(round(char.size))
print(char, actual_size, expected_size)
assert expected_size == actual_size
else:
print(repr(line.get_text()))
19 changes: 16 additions & 3 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from shutil import rmtree
from tempfile import mkdtemp
import filecmp

import tools.pdf2txt as pdf2txt
from helpers import absolute_sample_path
Expand Down Expand Up @@ -144,9 +145,21 @@ def test_jbig2_image_export(self):
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
"""
image_files = self.extract_images(
absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
assert image_files[0].endswith('.jb2')
input_file = absolute_sample_path(
'../samples/contrib/pdf-with-jbig2.pdf')
output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir',
output_dir, input_file]
pdf2txt.main(commands)
image_files = os.listdir(output_dir)
try:
assert image_files[0].endswith('.jb2')
assert filecmp.cmp(output_dir + '/' + image_files[0],
absolute_sample_path(
'../samples/contrib/XIPLAYER0.jb2'))
finally:
rmtree(output_dir)

def test_contrib_matplotlib(self):
"""Test a pdf with Type3 font"""
Expand Down

0 comments on commit aa5dec2

Please sign in to comment.