Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create patch from .tar instead of .tar.gz #93

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b7454ac
add Patcher gzip methods, with test
dennisvang Nov 20, 2023
55561d2
replace gzip_compress and gzip_decompress by a single gzip() method
dennisvang Nov 20, 2023
98fb75f
implement compression/decompression in patch methods
dennisvang Nov 20, 2023
48c04da
only accept .tar.gz files for the patch methods
dennisvang Nov 20, 2023
5f7a705
separate tests for gzip compress and decompress
dennisvang Nov 20, 2023
c2c1e7d
formatting
dennisvang Nov 20, 2023
f60b6dc
use Patcher in client._apply_updates()
dennisvang Nov 20, 2023
18820d2
use gzip.compress/gzip.decompress instead of gzip.open
dennisvang Nov 21, 2023
dfa0b3d
add tests for gzip header and reproducibility
dennisvang Nov 21, 2023
a5261d1
force gzip mtime to zero for patch
dennisvang Nov 21, 2023
65d5345
replace shutil.make_archive by tarfile and gzip
dennisvang Nov 21, 2023
f970aef
fix assertion in client._apply_updates
dennisvang Nov 21, 2023
072ffbb
todo: patch size condition
dennisvang Nov 23, 2023
d816c0d
workaround for permission error with TemporaryFile on Windows
dennisvang Nov 23, 2023
032b18f
fix sequential patch updates and corresponding test__apply_updates
dennisvang Nov 23, 2023
e41eb2f
fix expiration dates in repo example
dennisvang Nov 23, 2023
968ea12
add test for gzip header OS flag
dennisvang Nov 23, 2023
5fcc8f2
comments
dennisvang Nov 23, 2023
72c3638
add option to specify TEST_EXPIRATION env variable for repo workflow …
dennisvang Nov 24, 2023
4cf2da7
force OS field to 255 (unknown) in gzip header
dennisvang Nov 24, 2023
8af85b3
add OS values for macos and windows to gzip header test
dennisvang Nov 24, 2023
4ffe9f2
move gzip code from Patcher into separate GZipper class
dennisvang Nov 24, 2023
914cc9f
do full update if total patch size exceeds 80% of full archive size
dennisvang Nov 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions examples/repo/repo_workflow_example.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import copy
import logging
import os
import pathlib
import secrets # from python 3.9+ we can use random.randbytes
import shutil
Expand Down Expand Up @@ -45,7 +46,15 @@
TARGETS_DIR = REPO_DIR / DEFAULT_TARGETS_DIR_NAME

# Settings
EXPIRATION_DAYS = dict(root=365, targets=100, snapshot=7, timestamp=1)
_TEST_EXPIRATION = int(os.getenv('TEST_EXPIRATION', 0)) # for creating test repo data
if _TEST_EXPIRATION:
logger.warning(f'using TEST_EXPIRATION: {_TEST_EXPIRATION} days')
EXPIRATION_DAYS = dict(
root=_TEST_EXPIRATION or 365,
targets=_TEST_EXPIRATION or 100,
snapshot=_TEST_EXPIRATION or 7,
timestamp=_TEST_EXPIRATION or 1,
)
THRESHOLDS = dict(root=2, targets=1, snapshot=1, timestamp=1)
KEY_MAP = copy.deepcopy(DEFAULT_KEY_MAP)
KEY_MAP['root'].append('root_two') # use two keys for root
Expand Down Expand Up @@ -134,7 +143,7 @@
repo = Repository.from_config()

# Re-sign expired roles (downstream roles are refreshed automatically)
repo.refresh_expiration_date(role_name='snapshot', days=9)
repo.refresh_expiration_date(role_name='snapshot', days=_TEST_EXPIRATION or 9)
repo.publish_changes(private_key_dirs=[ONLINE_DIR])

# Time goes by
Expand Down
40 changes: 26 additions & 14 deletions src/tufup/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import bsdiff4
import logging
import pathlib
import shutil
Expand All @@ -12,12 +11,13 @@
from tuf.api.exceptions import DownloadError, UnsignedMetadataError
import tuf.ngclient

from tufup.common import TargetMeta
from tufup.common import Patcher, TargetMeta
from tufup.utils.platform_specific import install_update

logger = logging.getLogger(__name__)

DEFAULT_EXTRACT_DIR = pathlib.Path(tempfile.gettempdir()) / 'tufup'
MAX_SIZE_RATIO = 0.8 # do full update if patch-size/full-size > MAX_SIZE_RATIO


class Client(tuf.ngclient.Updater):
Expand Down Expand Up @@ -199,7 +199,9 @@ def check_for_updates(
# is not available, we must do a full update)
self.new_targets = new_patches
no_patches = total_patch_size == 0
patches_too_big = total_patch_size > self.new_archive_info.length
patches_too_big = (
total_patch_size / self.new_archive_info.length > MAX_SIZE_RATIO
)
current_archive_not_found = not self.current_archive_local_path.exists()
if not patch or no_patches or patches_too_big or current_archive_not_found:
self.new_targets = {new_archive_meta: self.new_archive_info}
Expand Down Expand Up @@ -239,24 +241,34 @@ def _apply_updates(
Note this has a side-effect: if self.extract_dir is not specified,
an extract_dir is created in a platform-specific temporary location.
"""
# patch current archive (if we have patches) or use new full archive
archive_bytes = None
# patch current archive or use new full archive (if there are multiple
# patches, these are applied sequentially)
patched_archive_path = None
for target, file_path in sorted(self.downloaded_target_files.items()):
if target.is_archive:
# just ensure the full archive file is available
assert len(self.downloaded_target_files) == 1
assert self.new_archive_local_path.exists()
elif target.is_patch:
# create new archive by patching current archive (patches
# must be sorted by increasing version)
if archive_bytes is None:
archive_bytes = self.current_archive_local_path.read_bytes()
archive_bytes = bsdiff4.patch(archive_bytes, file_path.read_bytes())
if archive_bytes:
if patched_archive_path is None:
patched_archive_path = self.current_archive_local_path
# create new archive by patching current archive (patches must be
# sorted by increasing version)
patched_archive_path = Patcher.apply_patch(
src_path=patched_archive_path, patch_path=file_path
)
logger.debug(f'patch applied: {file_path}')
Fixed Show fixed Hide fixed
if patched_archive_path:
# verify that the hash of the final result of the (possibly) sequential
# patching process matches that of the full archive specified in the tuf
# metadata
# todo: implement fallback to full update if patch update fails
# (this means we have to step back and download the full archive first...)
assert patched_archive_path.name == self.new_archive_local_path.name
# verify the patched archive length and hash
self.new_archive_info.verify_length_and_hashes(data=archive_bytes)
# write the patched new archive
self.new_archive_local_path.write_bytes(archive_bytes)
self.new_archive_info.verify_length_and_hashes(
data=patched_archive_path.read_bytes()
)
# extract archive to temporary directory
if self.extract_dir is None:
self.extract_dir = DEFAULT_EXTRACT_DIR
Expand Down
120 changes: 116 additions & 4 deletions src/tufup/common.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import gzip
import logging
import pathlib
import re
from tempfile import TemporaryDirectory
from typing import Optional, Union

import bsdiff4
from packaging.version import Version, InvalidVersion

logger = logging.getLogger(__name__)

SUFFIX_ARCHIVE = '.tar.gz'
SUFFIX_TAR = '.tar'
SUFFIX_GZIP = '.gz'
SUFFIX_ARCHIVE = SUFFIX_TAR + SUFFIX_GZIP
SUFFIX_PATCH = '.patch'


Expand Down Expand Up @@ -129,6 +133,79 @@ def compose_filename(cls, name: str, version: str, is_archive: bool):
return cls.filename_pattern.format(name=name, version=version, suffix=suffix)


class GZipper(object):
    """Reproducible gzip compression/decompression of files on disk."""

    @staticmethod
    def _fix_gzip_header(file_path: pathlib.Path):
        """
        quick & dirty fix to ensure that the OS byte in the gzip header is 255 "unknown"
        """
        # per RFC 1952, the OS field is the tenth byte of the header (offset 9),
        # and the value 255 means "unknown"
        os_field_offset = 9
        os_field_value = b'\xff'
        resolved_path = pathlib.Path(file_path).resolve()
        with resolved_path.open(mode='r+b') as file_obj:
            file_obj.seek(os_field_offset)
            file_obj.write(os_field_value)
        logger.debug(f'gzip header fixed for {resolved_path}')

    @classmethod
    def gzip(
        cls, src_path: pathlib.Path, dst_path: Optional[pathlib.Path] = None, **kwargs
    ) -> pathlib.Path:
        """
        Compress or decompress a file using gzip, returning the output path.

        The direction is derived from src_path.suffix: a `.gz` source is
        decompressed, anything else is compressed. If dst_path is omitted, it
        is derived from src_path by removing (decompress) or appending
        (compress) the `.gz` suffix.

        Supported kwargs, i.e. `compresslevel` and/or `mtime`, are forwarded to
        `gzip.compress()` [5]; they are ignored (with a warning) when
        decompressing.

        # Notes

        - See GZIP header definition in rfc1952 spec [6], and python's implementation
        in the gzip module [7].

        - The default gzip header [6] includes MTIME (timestamp) and OS fields,
        and may include an FNAME (filename) field, which makes the output
        unreproducible. To fix this we need the equivalent of `gzip --no-name`
        from GNU gzip [1].

        - Both `gzip.open()` and the `gzip.GzipFile` class set the FNAME field in
        the header. To prevent this, we use `gzip.compress()` instead, which also
        supports an `mtime` argument to set a fixed timestamp [2]. The OS byte is
        forced to "unknown" afterwards via `_fix_gzip_header()`.

        - To ensure identical gzip output, we need to make sure the same algorithm
        is used, with the same compression setting.

        - BEWARE: The output of the gzip compression depends on the implementation,
        so there is no guarantee that different operating systems will yield
        identical compressed data, even if all settings and headers are equal.

        [1]: https://www.gnu.org/software/gzip/manual/gzip.html#Invoking-gzip
        [2]: https://docs.python.org/3/library/gzip.html#examples-of-usage
        [3]: https://reproducible-builds.org/docs/source-date-epoch/
        [4]: https://www.gnu.org/software/gzip/manual/gzip.html#Environment
        [5]: https://docs.python.org/3/library/gzip.html#gzip.compress
        [6]: https://datatracker.ietf.org/doc/html/rfc1952#page-5
        [7]: https://github.com/python/cpython/blob/d9fc15222e96942e30ea8b0561dec5c82ecb4663/Lib/gzip.py#L599
        """
        compressing = src_path.suffix != SUFFIX_GZIP
        if not compressing and kwargs:
            # gzip.decompress takes no options, so anything passed is dropped
            logger.warning(f'gzip.decompress does not accept kwargs: {kwargs}')
            kwargs = dict()
        gzip_function = gzip.compress if compressing else gzip.decompress
        dst_suffix = src_path.suffix + SUFFIX_GZIP if compressing else ''
        if dst_path is None:
            dst_path = src_path.with_suffix(dst_suffix)
        logger.debug(f'gzip {gzip_function.__name__} {src_path} into {dst_path}')
        dst_path.write_bytes(gzip_function(data=src_path.read_bytes(), **kwargs))
        if compressing:
            # only compressed output needs the reproducible-header fix
            cls._fix_gzip_header(dst_path)
        return dst_path


class Patcher(object):
@classmethod
def create_patch(
Expand All @@ -137,20 +214,55 @@ def create_patch(
"""
Create a binary patch file based on source and destination files.

The source and destination files are decompressed first, so the patch is
created based on the .tar archives. A diff based on the .tar.gz files could
become very large, making it practically useless.

Patch file path matches destination file path, except for suffix.
"""
# replace suffix twice, in case we have a .tar.gz
# only accept .tar.gz files
for path in [src_path, dst_path]:
assert path.suffix == SUFFIX_GZIP, f'not a .gz file: {path}'
# replace suffix (twice, for .tar.gz)
patch_path = dst_path.with_suffix('').with_suffix(SUFFIX_PATCH)
bsdiff4.file_diff(src_path=src_path, dst_path=dst_path, patch_path=patch_path)
# decompress files to prevent large diff
with TemporaryDirectory() as tmp_dir:
tmp_dir_path = pathlib.Path(tmp_dir)
decompressed_paths = dict(src_path=src_path, dst_path=dst_path)
for key, path in decompressed_paths.items():
decompressed_paths[key] = tmp_dir_path / path.with_suffix('').name
GZipper.gzip(src_path=path, dst_path=decompressed_paths[key])
# create patch
bsdiff4.file_diff(**decompressed_paths, patch_path=patch_path)
return patch_path

@classmethod
def apply_patch(cls, src_path: pathlib.Path, patch_path: pathlib.Path):
"""
Apply binary patch file to source file to create destination file.

Patches are based on the (uncompressed) .tar archives, so the source .tar.gz
archive is decompressed, then the patch is applied, and the resulting .tar is
compressed again, to save storage space.

Destination file path matches patch file path, except for suffix.
"""
# only accept .tar.gz files
assert src_path.suffix == SUFFIX_GZIP, f'not a .gz file: {src_path}'
dst_path = patch_path.with_suffix(SUFFIX_ARCHIVE)
bsdiff4.file_patch(src_path=src_path, dst_path=dst_path, patch_path=patch_path)
# decompress archive, apply patch, and compress again
with TemporaryDirectory() as tmp_dir:
tmp_dir_path = pathlib.Path(tmp_dir)
# decompress
decompressed_src_path = tmp_dir_path / src_path.with_suffix('').name
GZipper.gzip(src_path=src_path, dst_path=decompressed_src_path)
# apply patch to .tar archives
decompressed_dst_path = tmp_dir_path / dst_path.with_suffix('').name
bsdiff4.file_patch(
src_path=decompressed_src_path,
dst_path=decompressed_dst_path,
patch_path=patch_path,
)
# compress result (mtime=0 for reproducibility)
GZipper.gzip(src_path=decompressed_dst_path, dst_path=dst_path, mtime=0)
return dst_path
30 changes: 15 additions & 15 deletions src/tufup/repo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import tempfile
from copy import deepcopy
from datetime import datetime, timedelta
import inspect
import json
import logging
import pathlib
import tarfile

try:
# workaround for PyInstaller issue 6911 (setuptools issue 3089)
Expand Down Expand Up @@ -37,7 +39,7 @@
)
from tuf.api.serialization.json import JSONSerializer

from tufup.common import Patcher, SUFFIX_ARCHIVE, SUFFIX_PATCH, TargetMeta
from tufup.common import GZipper, Patcher, SUFFIX_PATCH, TargetMeta
from tufup.utils.platform_specific import _patched_resolve

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -79,12 +81,7 @@ def make_gztar_archive(
dst_dir: Union[pathlib.Path, str],
app_name: str,
version: str,
**kwargs, # allowed kwargs are passed on to shutil.make_archive
) -> Optional[TargetMeta]:
# remove disallowed kwargs
for key in ['base_name', 'root_dir', 'format']:
if kwargs.pop(key, None):
logger.warning(f'{key} ignored: using default')
# ensure paths
src_dir = pathlib.Path(src_dir)
dst_dir = pathlib.Path(dst_dir)
Expand All @@ -97,15 +94,18 @@ def make_gztar_archive(
if input(f'Found existing archive: {archive_path}.\nOverwrite? [n]/y') != 'y':
print('Using existing archive.')
return TargetMeta(archive_path)
# make archive
base_name = str(dst_dir / archive_filename.replace(SUFFIX_ARCHIVE, ''))
archive_path_str = shutil.make_archive(
base_name=base_name, # archive file path, no suffix
root_dir=str(src_dir), # paths in archive will be relative to root_dir
format='gztar',
**kwargs,
)
return TargetMeta(target_path=archive_path_str)
# create archive
with tempfile.TemporaryDirectory() as temp_dir:
# we use TemporaryDirectory because a TemporaryFile cannot be opened multiple
# times on Windows ([Errno 13] Permission denied)
temp_file_path = pathlib.Path(temp_dir) / 'temp.tar'
# make temporary tar archive
with tarfile.open(temp_file_path, mode='w') as tar:
for path in src_dir.iterdir():
tar.add(name=path, arcname=path.relative_to(src_dir), recursive=True)
# compress tar archive using gzip (force mtime to zero for reproducibility)
GZipper.gzip(src_path=temp_file_path, dst_path=archive_path, mtime=0)
return TargetMeta(target_path=archive_path)


class RolesDict(TypedDict):
Expand Down
2 changes: 1 addition & 1 deletion src/tufup/utils/platform_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _install_update_win(
batch_template_extra_kwargs: Optional[dict] = None,
log_file_name: Optional[str] = None,
robocopy_options_override: Optional[List[str]] = None,
process_creation_flags = None,
process_creation_flags=None,
):
"""
Create a batch script that moves files from src to dst, then run the
Expand Down
4 changes: 2 additions & 2 deletions tests/data/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Large parts of the test data were copied verbatim from the `python-tuf` [repository_data][1] folder.
These test data were generated using the examples/repo/repo_workflow_example.py script.

[1]: https://github.com/theupdateframework/python-tuf/tree/develop/tests/repository_data
(expiration dates were set to some time far in the future)
1 change: 1 addition & 0 deletions tests/data/keystore/offline_secrets_1/root
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "91eea8b502cc1fe46b864d35b519d6f59a91987ebbb10af91e4d6c4cadb34073", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "385ec89459a2c7d633b7ad23d2907d3f657d62d4a4905004f4ab3cfc677d55a9", "private": "9fb5e17040ad57431f4f7938c921f188585586941d20a90c8a3d974849ed8628"}}
1 change: 1 addition & 0 deletions tests/data/keystore/offline_secrets_1/targets
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "4db5632432d8c6583e17070c3f39100b6b5bdb4359044a662e480c09a4c0888e", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "bda6d9338ebfb060a42524b412b511b78d5dbcf04832f251fb0fe9f6952bc75f", "private": "d210760a863a2b62d6a733fb5e56d9219cb4eab46b9cc22b13629d5a249658a0"}}
1 change: 1 addition & 0 deletions tests/data/keystore/offline_secrets_2/root_three
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "008f09c49428b92a5747d6de45896efbedb7d50105ad5441d19e4db4bb7f84e0", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "c7c45c87f2779950bc91fc00fe8218f364a4289061385cbe8058eb5a9a36307d", "private": "b2ecdf0815ecedd7e4bcecd2a2adfadebdd0320310e6a6e84ac0745f680e21c7"}}
1 change: 1 addition & 0 deletions tests/data/keystore/offline_secrets_2/root_three.pub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "c7c45c87f2779950bc91fc00fe8218f364a4289061385cbe8058eb5a9a36307d"}}
1 change: 1 addition & 0 deletions tests/data/keystore/offline_secrets_2/root_two
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "3b087d05c50176b029f7cd31ed3f9df65d6c383d7bd98bca398cda8e07b41c3b", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "3bd5a2ea888092c83b18f620a85e0509c811c2451e7916e24c5021b4e930ecaa", "private": "25c0ff6523252ec4c29ddbf46759377a680f76e94ad87f889c25547b976dd61d"}}
1 change: 1 addition & 0 deletions tests/data/keystore/online_secrets/snapshot
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "ad33be8011022f94a6552fa8a87c8a6cc78ca06d560e6faf75ca0d7235690d29", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "fd84c37dcfec7c715acf32f4c9a12f6dadb96b0614f865a216545bb8c8c49cc3", "private": "36a41729e9cb04099b8a2f4cdd73446d9f07d06ed63bb3a7d169a8693f33ffc1"}}
1 change: 1 addition & 0 deletions tests/data/keystore/online_secrets/timestamp
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid": "5df555c76d911fa3738017ccc675c02e5045f7dbf491ad5c452e9e3700649093", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "e9b4dcae339296b70c791d687e1e881524a3a1a0c69115fe05239f217e8bb1bc", "private": "6ff20938358f1d60e22710059a1225e72f3207728759e5fa762b80da325a5a1f"}}
1 change: 0 additions & 1 deletion tests/data/keystore/root

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data/keystore/root.pub
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "0e492fadf5643a11049e2d7e59db6b8fc766945315f5bdc5648bd94fe2b427cb"}}
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "385ec89459a2c7d633b7ad23d2907d3f657d62d4a4905004f4ab3cfc677d55a9"}}
1 change: 1 addition & 0 deletions tests/data/keystore/root_two.pub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "3bd5a2ea888092c83b18f620a85e0509c811c2451e7916e24c5021b4e930ecaa"}}
1 change: 0 additions & 1 deletion tests/data/keystore/snapshot

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data/keystore/snapshot.pub
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "f3be5f4d498ca80145c84f6ca1d443b139efcbd2dde91219396aa3a0b5d7a987"}}
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "fd84c37dcfec7c715acf32f4c9a12f6dadb96b0614f865a216545bb8c8c49cc3"}}
1 change: 0 additions & 1 deletion tests/data/keystore/targets

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data/keystore/targets.pub
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "2c12e0cd2837cfe0448d77c93c0258ba8cbc2af89351b9b0bad3aae43e6433bf"}}
{"keytype": "ed25519", "scheme": "ed25519", "keyid_hash_algorithms": ["sha256", "sha512"], "keyval": {"public": "bda6d9338ebfb060a42524b412b511b78d5dbcf04832f251fb0fe9f6952bc75f"}}
Loading