diff --git a/dandi/download.py b/dandi/download.py index 6e14908a5..71ff2641b 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -53,7 +53,7 @@ from .support.digests import get_digest, get_zarr_checksum from .support.iterators import IteratorWithAggregation from .support.pyout import naturalsize -from .support.typing import Literal +from .support.typing import Literal, Protocol from .utils import ( abbrev_prompt, ensure_datetime, @@ -520,6 +520,14 @@ def _populate_dandiset_yaml( } +class Hasher(Protocol): + def update(self, data: bytes) -> None: + ... + + def hexdigest(self) -> str: + ... + + def _download_file( downloader: Callable[[int], Iterator[bytes]], path: Path, @@ -646,20 +654,26 @@ def _download_file( yield {"status": "downloading"} - algo, digester, digest, downloaded_digest = None, None, None, None + algo: Optional[str] = None + digester: Optional[Callable[[], Hasher]] = None + digest: Optional[str] = None + downloaded_digest: Optional[Hasher] = None if digests: # choose first available for now. # TODO: reuse that sorting based on speed for algo, digest in digests.items(): - if algo == "dandi-etag": + if algo == "dandi-etag" and size is not None: from dandischema.digests.dandietag import ETagHashlike - digester = lambda: ETagHashlike(size) # noqa: E731 + # Bind `size` to a non-Optional local so that mypy knows it + # is not None inside the lambda: + etag_size: int = size + digester = lambda: ETagHashlike(etag_size) # noqa: E731 else: digester = getattr(hashlib, algo, None) - if digester: + if digester is not None: break - if not digester: + if digester is None: lgr.warning("Found no digests in hashlib for any of %s", str(digests)) # TODO: how do we discover the total size???? 
@@ -725,12 +739,12 @@ def _download_file( if downloaded_digest and not resuming: assert downloaded_digest is not None - downloaded_digest = downloaded_digest.hexdigest() # we care only about hex + final_digest = downloaded_digest.hexdigest() # we care only about hex if digest_callback is not None: assert isinstance(algo, str) - digest_callback(algo, downloaded_digest) - if digest != downloaded_digest: - msg = f"{algo}: downloaded {downloaded_digest} != {digest}" + digest_callback(algo, final_digest) + if digest != final_digest: + msg = f"{algo}: downloaded {final_digest} != {digest}" yield {"checksum": "differs", "status": "error", "message": msg} lgr.debug("%s is different: %s.", path, msg) return diff --git a/dandi/files/bids.py b/dandi/files/bids.py index 8888b3e75..0d7abb8a7 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -92,7 +92,9 @@ def _validate(self) -> None: self._asset_errors: dict[str, list[ValidationResult]] = defaultdict( list ) - self._asset_metadata = defaultdict(BareAsset.unvalidated) + # Don't apply eta-reduction to the lambda, as mypy needs to be + # assured that defaultdict's argument takes no parameters. + self._asset_metadata = defaultdict(lambda: BareAsset.unvalidated()) for result in results: if result.id in BIDS_ASSET_ERRORS: assert result.path diff --git a/dandi/metadata.py b/dandi/metadata.py index bb3e00b7a..08ede2836 100644 --- a/dandi/metadata.py +++ b/dandi/metadata.py @@ -18,18 +18,18 @@ Type, TypeVar, Union, - cast, ) from uuid import uuid4 from xml.dom.minidom import parseString from dandischema import models +from pydantic import ByteSize, parse_obj_as import requests import tenacity from . 
import __version__, get_logger from .consts import metadata_all_fields -from .misctypes import Digest, LocalReadableFile, Readable +from .misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, Readable from .pynwb_utils import ( _get_pynwb_metadata, get_neurodata_types, @@ -90,20 +90,21 @@ def get_metadata( dandiset_path, bids_dataset_description=bids_dataset_description, ) + assert isinstance(df, bids.BIDSAsset) if not digest: - _digest = "0" * 32 + "-1" - digest = Digest.dandi_etag(_digest) + digest = DUMMY_DANDI_ETAG path_metadata = df.get_metadata(digest=digest) - assert isinstance(df, bids.BIDSAsset) meta["bids_version"] = df.get_validation_bids_version() # there might be a more elegant way to do this: - for key in metadata_all_fields: - try: - value = getattr(path_metadata.wasAttributedTo[0], key) - except AttributeError: - pass - else: - meta[key] = value + if path_metadata.wasAttributedTo is not None: + attributed = path_metadata.wasAttributedTo[0] + for key in metadata_all_fields: + try: + value = getattr(attributed, key) + except AttributeError: + pass + else: + meta[key] = value if r.get_filename().endswith((".NWB", ".nwb")): if nwb_has_external_links(r): @@ -623,7 +624,7 @@ def extract_anatomy(metadata: dict) -> Optional[List[models.Anatomy]]: def extract_model(modelcls: Type[M], metadata: dict, **kwargs: Any) -> M: - m = cast(M, modelcls.unvalidated()) + m = modelcls.unvalidated() for field in m.__fields__.keys(): value = kwargs.get(field, extract_field(field, metadata)) if value is not None: @@ -1002,13 +1003,14 @@ def add_common_metadata( metadata.blobDateModified = mtime if mtime > metadata.dateModified: lgr.warning("mtime %s of %s is in the future", mtime, r) - metadata.contentSize = r.get_size() + size = r.get_size() if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum: m = re.fullmatch( r"(?P[0-9a-f]{32})-(?P[0-9]+)--(?P[0-9]+)", digest.value ) if m: - metadata.contentSize = int(m["size"]) + size = 
int(m["size"]) + metadata.contentSize = parse_obj_as(ByteSize, size) if metadata.wasGeneratedBy is None: metadata.wasGeneratedBy = [] metadata.wasGeneratedBy.append(get_generator(start_time, end_time)) diff --git a/dandi/support/typing.py b/dandi/support/typing.py index fb138f98a..25b365d9f 100644 --- a/dandi/support/typing.py +++ b/dandi/support/typing.py @@ -1,6 +1,6 @@ import sys if sys.version_info >= (3, 8): - from typing import Literal, TypedDict # noqa: F401 + from typing import Literal, Protocol, TypedDict # noqa: F401 else: - from typing_extensions import Literal, TypedDict # noqa: F401 + from typing_extensions import Literal, Protocol, TypedDict # noqa: F401 diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 75e183db8..b55a6bad0 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -225,6 +225,7 @@ def test_upload_bids_metadata( if "sub-" in apath: metadata = dandiset.get_asset_by_path(apath).get_metadata() # Hard-coded check for the subject identifier set in the fixture: + assert metadata.wasAttributedTo is not None assert metadata.wasAttributedTo[0].identifier == "Sub1" diff --git a/setup.cfg b/setup.cfg index 385e53ba7..97356fa7b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ install_requires = bidsschematools >= 0.5.0 click click-didyoumean - dandischema ~= 0.7.0 + dandischema ~= 0.8.0 etelemetry >= 0.2.2 fasteners fscacher >= 0.3.0