Add support for locking VCS requirements. (#1687)
This supports every form of VCS requirement that Pip supports, whether it
appears as a direct or a transitive requirement. VCS source archives fetched
by Pip are sha256-hashed by Pex, providing a stable and secure fingerprint
even for VCS requirements that point at possibly mutable tags or branches,
or at insecure commit ids.

Fixes #1556
jsirois authored Mar 25, 2022
1 parent 8c824c1 commit c0bb954
Showing 24 changed files with 1,536 additions and 324 deletions.
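
To make the commit message concrete, here is a minimal sketch, not code from this commit, of fingerprinting a source archive that Pip fetched for a VCS requirement using the new pex.hashing module added below. The archive path is a hypothetical example.

from pex.hashing import Sha256, file_hash

# Hypothetical location of a source archive Pip built from a VCS checkout.
archive = "/tmp/pip-downloads/project-1.0.zip"

digest = Sha256()
file_hash(archive, digest=digest)
fingerprint = digest.hexdigest()  # A Sha256Fingerprint, which subclasses str.
print(fingerprint.algorithm, fingerprint)  # -> sha256 <hex digest>
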
208 changes: 208 additions & 0 deletions pex/hashing.py
@@ -0,0 +1,208 @@
# Copyright 2022 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import absolute_import

import hashlib
import os

from pex.typing import TYPE_CHECKING, Generic

if TYPE_CHECKING:
from typing import IO, Callable, Iterable, Iterator, Optional, Protocol, Type, TypeVar

class HintedDigest(Protocol):
@property
def block_size(self):
# type: () -> int
pass

def update(self, data):
# type: (bytes) -> None
pass

class Hasher(HintedDigest, Protocol):
@property
def name(self):
# type: () -> str
pass

def digest(self):
# type: () -> bytes
pass

def hexdigest(self):
# type: () -> str
pass


class Fingerprint(str):
class Algorithm(object):
def __get__(
self,
_instance, # type: Optional[Fingerprint]
owner, # type: Type[Fingerprint]
):
# type: (...) -> str

alg = getattr(owner, "_alg", None)
if alg is None:
alg = owner.__name__[: -len(Fingerprint.__name__)].lower()
setattr(owner, "_alg", alg)
return alg

algorithm = Algorithm()

@classmethod
def new_hasher(cls, data=b""):
# type: (bytes) -> HashlibHasher
return HashlibHasher(cls, data=data)

def __eq__(self, other):
if isinstance(other, Fingerprint) and type(self) != type(other):
return False
return super(Fingerprint, self).__eq__(other)

def __ne__(self, other):
return not self == other


def new_fingerprint(
algorithm, # type: str
hexdigest, # type: str
):
# type: (...) -> Fingerprint

for subclass in Fingerprint.__subclasses__():
if subclass.algorithm == algorithm:
return subclass(hexdigest)

raise ValueError(
"There is no fingerprint type registered for hash algorithm {algorithm}. The supported "
"algorithms are: {algorithms}".format(
algorithm=algorithm,
algorithms=", ".join(fp.algorithm for fp in Fingerprint.__subclasses__()),
)
)


class Sha1Fingerprint(Fingerprint):
pass


class Sha256Fingerprint(Fingerprint):
pass


if TYPE_CHECKING:
_F = TypeVar("_F", bound=Fingerprint)


class HashlibHasher(Generic["_F"]):
def __init__(
self,
hexdigest_type, # type: Type[_F]
data=b"", # type: bytes
):
# type: (...) -> None
self._hexdigest_type = hexdigest_type
self._hasher = hashlib.new(hexdigest_type.algorithm, data)

@property
def name(self):
# type: () -> str
return self._hasher.name

@property
def block_size(self):
# type: () -> int
return self._hasher.block_size

def update(self, data):
# type: (bytes) -> None
self._hasher.update(data)

def digest(self):
# type: () -> bytes
return self._hasher.digest()

def hexdigest(self):
# type: () -> _F
return self._hexdigest_type(self._hasher.hexdigest())


class Sha1(HashlibHasher[Sha1Fingerprint]):
def __init__(self, data=b""):
# type: (bytes) -> None
super(Sha1, self).__init__(hexdigest_type=Sha1Fingerprint, data=data)


class Sha256(HashlibHasher[Sha256Fingerprint]):
def __init__(self, data=b""):
# type: (bytes) -> None
super(Sha256, self).__init__(hexdigest_type=Sha256Fingerprint, data=data)


class MultiDigest(object):
def __init__(self, digests):
# type: (Iterable[HintedDigest]) -> None
self._digests = digests
self._block_size = max(digest.block_size for digest in digests)

@property
def block_size(self):
# type: () -> int
return self._block_size

def update(self, data):
# type: (bytes) -> None
for digest in self._digests:
digest.update(data)


def update_hash(
filelike, # type: IO[bytes]
digest, # type: HintedDigest
):
# type: (...) -> None
"""Update the digest of a single file in a memory-efficient manner."""
block_size = digest.block_size * 1024
for chunk in iter(lambda: filelike.read(block_size), b""):
digest.update(chunk)


def file_hash(
path, # type: str
digest, # type: HintedDigest
):
# type: (...) -> None
"""Digest of a single file in a memory-efficient manner."""
with open(path, "rb") as fp:
update_hash(filelike=fp, digest=digest)


def dir_hash(
directory, # type: str
digest, # type: HintedDigest
dir_filter=lambda dirs: dirs, # type: Callable[[Iterable[str]], Iterable[str]]
file_filter=lambda files: files, # type: Callable[[Iterable[str]], Iterable[str]]
):
# type: (...) -> None
"""Digest the contents of a directory in a reproducible manner."""

def iter_files():
# type: () -> Iterator[str]
normpath = os.path.realpath(os.path.normpath(directory))
for root, dirs, files in os.walk(normpath):
dirs[:] = list(dir_filter(dirs))
for f in file_filter(files):
yield os.path.relpath(os.path.join(root, f), normpath)

names = sorted(iter_files())

# Always use / as the path separator, since that's what zip uses.
hashed_names = [n.replace(os.sep, "/") for n in names]
digest.update("".join(hashed_names).encode("utf-8"))

for name in names:
file_hash(os.path.join(directory, name), digest)
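
As an illustrative aside rather than part of the diff, the dir_hash helper above can fingerprint an unpacked checkout reproducibly. The directory path and the filters are assumptions for the example; pex.common also ships filter_pyc_dirs and filter_pyc_files for the same purpose.

from pex.hashing import Sha256, dir_hash

digest = Sha256()
dir_hash(
    directory="/tmp/checkouts/project",  # Hypothetical VCS checkout.
    digest=digest,
    # Skip bytecode artifacts so only source content affects the fingerprint.
    dir_filter=lambda dirs: [d for d in dirs if d != "__pycache__"],
    file_filter=lambda files: [f for f in files if not f.endswith(".pyc")],
)
print(digest.hexdigest())  # Stable across runs for identical directory content.

MultiDigest above makes it possible to feed the same walk to several hashers, for example sha1 and sha256, in a single pass.
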
19 changes: 7 additions & 12 deletions pex/pep_376.py
@@ -15,7 +15,7 @@
from contextlib import closing
from fileinput import FileInput

from pex import dist_metadata
from pex import dist_metadata, hashing
from pex.common import (
filter_pyc_dirs,
filter_pyc_files,
@@ -24,18 +24,13 @@
safe_mkdir,
safe_open,
)
from pex.compatibility import PY2, get_stdout_bytes_buffer, urlparse
from pex.compatibility import get_stdout_bytes_buffer, urlparse
from pex.interpreter import PythonInterpreter
from pex.third_party.pkg_resources import EntryPoint
from pex.typing import TYPE_CHECKING, cast
from pex.util import CacheHelper
from pex.venv.virtualenv import Virtualenv

if TYPE_CHECKING:
if PY2:
from hashlib import _hash as _Hash
else:
from hashlib import _Hash
from typing import (
Callable,
Container,
@@ -50,6 +45,8 @@

import attr # vendor:skip

from pex.hashing import Hasher

class CSVWriter(Protocol):
def writerow(self, row):
# type: (Iterable[Union[str, int]]) -> None
@@ -65,15 +62,15 @@ class Digest(object):
encoded_hash = attr.ib() # type: str

def new_hasher(self):
# type: () -> _Hash
# type: () -> Hasher
return hashlib.new(self.algorithm)


@attr.s(frozen=True)
class Hash(object):
@classmethod
def create(cls, hasher):
# type: (_Hash) -> Hash
# type: (Hasher) -> Hash

# The fingerprint encoding is defined for PEP-376 RECORD files as `urlsafe-base64-nopad`
# which is fully spelled out in code in PEP-427:
@@ -270,9 +267,7 @@ def reinstall(
self._reinstall_site_packages(site_packages_dir, symlink=symlink),
):
hasher = hashlib.sha256()
with open(dst, "rb") as hash_fp:
CacheHelper.update_hash(hash_fp, digest=hasher)

hashing.file_hash(dst, digest=hasher)
installed_files.append(
InstalledFile(
path=os.path.relpath(dst, site_packages_dir),
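
For context on the Hash.create change above, a hedged sketch of the urlsafe-base64-nopad encoding that PEP-427 spells out for PEP-376 RECORD hashes. This mirrors the idea only and is not the commit's exact code.

import base64
import hashlib

hasher = hashlib.sha256(b"example file contents")
encoded = base64.urlsafe_b64encode(hasher.digest()).rstrip(b"=").decode("ascii")
record_hash = "{alg}={enc}".format(alg=hasher.name, enc=encoded)
# record_hash looks like "sha256=..." and fills the hash column of a RECORD row.
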
4 changes: 2 additions & 2 deletions pex/pex_info.py
@@ -78,10 +78,10 @@ def from_pex(cls, pex):
with open_zip(pex) as zf:
pex_info = zf.read(cls.PATH)
elif os.path.isfile(pex): # Venv PEX
with open(os.path.join(os.path.dirname(pex), cls.PATH)) as fp:
with open(os.path.join(os.path.dirname(pex), cls.PATH), "rb") as fp:
pex_info = fp.read()
else: # Directory (Either loose or installed) PEX
with open(os.path.join(pex, cls.PATH)) as fp:
with open(os.path.join(pex, cls.PATH), "rb") as fp:
pex_info = fp.read()
return cls.from_json(pex_info)
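
A small sketch, using hypothetical paths, of the reasoning behind switching to binary mode above: the zip branch already yields bytes from ZipFile.read, so reading the venv and directory PEX-INFO files with "rb" makes every branch hand the same type to from_json.

import zipfile

with zipfile.ZipFile("/tmp/app.pex") as zf:  # Hypothetical zipapp PEX.
    assert isinstance(zf.read("PEX-INFO"), bytes)

with open("/tmp/app/PEX-INFO", "rb") as fp:  # Hypothetical loose PEX directory.
    assert isinstance(fp.read(), bytes)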
