"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes."""

from hashlib import sha256
from typing import BinaryIO, Optional


def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Computes the sha256 hash of the given file object, by chunks of size `chunk_size`.

    The file object is read from its current position until `read` returns
    an empty result; it is neither rewound nor closed.

    Args:
        fileobj (file-like object):
            The File object to compute sha256 for, typically obtained with `open(path, "rb")`
        chunk_size (`int`, *optional*):
            The number of bytes to read from `fileobj` at once, defaults to 1MB.
            A value of `0` is treated like `None` and falls back to the default.

    Returns:
        `bytes`: `fileobj`'s sha256 hash as bytes
    """
    # `read(0)` returns b"" immediately, which would end the loop before any
    # data is hashed and yield the digest of empty input. Treat 0 like None.
    chunk_size = chunk_size or 1024 * 1024

    sha = sha256()
    while True:
        chunk = fileobj.read(chunk_size)
        # Check for end-of-stream first so the hasher is never fed the
        # empty sentinel chunk.
        if not chunk:
            break
        sha.update(chunk)
    return sha.digest()