Skip to content

Commit

Permalink
Merge pull request #616 from dandi/gh-365
Browse files Browse the repository at this point in the history
Add "sync" option for upload & download
  • Loading branch information
yarikoptic authored May 10, 2021
2 parents a3b6a6b + f698a67 commit 5cead67
Show file tree
Hide file tree
Showing 10 changed files with 262 additions and 35 deletions.
6 changes: 5 additions & 1 deletion dandi/cli/cmd_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def get_metavar(self, param):
default="all",
show_default=True,
)
@click.option(
"--sync", is_flag=True, help="Delete local assets that do not exist on the server"
)
@instance_option()
# Might be a cool feature, not unlike verifying a checksum, we verify that
# downloaded file passes the validator, and if not -- alert
Expand All @@ -91,7 +94,7 @@ def get_metavar(self, param):
@click.argument("url", nargs=-1)
@map_to_click_exceptions
def download(
url, output_dir, existing, jobs, format, download_types, dandi_instance=None
url, output_dir, existing, jobs, format, download_types, sync, dandi_instance=None
):
"""Download a file or entire folder from DANDI"""
# We need to import the download module rather than the download function
Expand Down Expand Up @@ -133,5 +136,6 @@ def download(
jobs=jobs,
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
sync=sync,
# develop_debug=develop_debug
)
5 changes: 5 additions & 0 deletions dandi/cli/cmd_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def get_metavar(self, param):
type=IntColonInt(),
help="Number of files to upload in parallel and, optionally, number of upload threads per file",
)
@click.option(
"--sync", is_flag=True, help="Delete assets on the server that do not exist locally"
)
@click.option(
"--validation",
help="Data must pass validation before the upload. Use of this option is highly discouraged.",
Expand Down Expand Up @@ -80,6 +83,7 @@ def get_metavar(self, param):
def upload(
paths,
jobs,
sync,
existing="refresh",
validation="require",
dandiset_path=None,
Expand Down Expand Up @@ -121,4 +125,5 @@ def upload(
devel_debug=devel_debug,
jobs=jobs,
jobs_per_file=jobs_per_file,
sync=sync,
)
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def test_download_defaults(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -36,6 +37,7 @@ def test_download_all_types(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -51,6 +53,7 @@ def test_download_metadata_only(mocker):
jobs=6,
get_metadata=True,
get_assets=False,
sync=False,
)


Expand All @@ -66,6 +69,7 @@ def test_download_assets_only(mocker):
jobs=6,
get_metadata=False,
get_assets=True,
sync=False,
)


Expand Down Expand Up @@ -96,6 +100,7 @@ def test_download_gui_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -117,6 +122,7 @@ def test_download_api_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -142,6 +148,7 @@ def test_download_url_instance_match(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand Down
108 changes: 77 additions & 31 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,15 @@
from .dandiset import Dandiset
from . import get_logger
from .support.pyout import naturalsize
from .utils import ensure_datetime, flattened, is_same_time
from .utils import (
abbrev_prompt,
ensure_datetime,
find_files,
flattened,
is_same_time,
on_windows,
pluralize,
)

lgr = get_logger()

Expand All @@ -30,12 +38,31 @@ def download(
jobs=1,
get_metadata=True,
get_assets=True,
sync=False,
):
# TODO: unduplicate with upload. For now stole from that one
# We will again use pyout to provide a neat table summarizing our progress
# with upload etc
from .support import pyout as pyouts

urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# if no paths provided etc, we will download dandiset path
# we are at, BUT since we are not git -- we do not even know
# on which instance it exists! Thus ATM we would do nothing but crash
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, parsed_url.dandiset_id)
else:
output_path = output_dir

# dandi.cli.formatters are used in cmd_ls to provide switchable
pyout_style = pyouts.get_style(hide_if_missing=False)

Expand All @@ -56,8 +83,8 @@ def download(
kw["yield_generator_for_fields"] = rec_fields[1:] # all but path

gen_ = download_generator(
urls,
output_dir,
parsed_url,
output_path,
existing=existing,
get_metadata=get_metadata,
get_assets=get_assets,
Expand All @@ -81,10 +108,52 @@ def download(
else:
raise ValueError(format)

if sync and not isinstance(parsed_url, SingleAssetURL):
client = parsed_url.get_client()
with client.session():
asset_paths = {asset["path"] for asset in parsed_url.get_assets(client)}
if isinstance(parsed_url, DandisetURL):
prefix = os.curdir
download_dir = output_path
elif isinstance(parsed_url, MultiAssetURL):
folder_path = op.normpath(parsed_url.path)
prefix = folder_path
download_dir = op.join(output_path, op.basename(folder_path))
else:
raise NotImplementedError(
f"Unexpected URL type {type(parsed_url).__name__}"
)
to_delete = []
for p in find_files(".*", download_dir, exclude_datalad=True):
if p == op.join(output_path, dandiset_metadata_file):
continue
a_path = op.normpath(op.join(prefix, op.relpath(p, download_dir)))
if on_windows:
a_path = a_path.replace("\\", "/")
if a_path not in asset_paths:
to_delete.append(p)
if to_delete:
while True:
opt = abbrev_prompt(
f"Delete {pluralize(len(to_delete), 'local asset')}?",
"yes",
"no",
"list",
)
if opt == "list":
for p in to_delete:
print(p)
elif opt == "yes":
for p in to_delete:
os.unlink(p)
break
else:
break


def download_generator(
urls,
output_dir,
parsed_url,
output_path,
*,
assets_it=None,
yield_generator_for_fields=None,
Expand All @@ -107,38 +176,15 @@ def download_generator(
summary statistics while already downloading. TODO: reimplement properly!
"""
urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# if no paths provided etc, we will download dandiset path
# we are at, BUT since we are not git -- we do not even know
# on which instance it exists! Thus ATM we would do nothing but crash
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])
with parsed_url.navigate() as (client, dandiset, assets):
if assets_it:
assets_it.gen = assets
assets = assets_it

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if dandiset:
identifier = Dandiset._get_identifier(dandiset)
if not identifier:
raise ValueError(f"Cannot deduce dandiset identifier from {dandiset}")
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, identifier)
if get_metadata:
for resp in _populate_dandiset_yaml(
output_path, dandiset, existing
):
yield dict(path=dandiset_metadata_file, **resp)
else:
output_path = output_dir
else:
output_path = output_dir
if isinstance(parsed_url, DandisetURL) and get_metadata:
for resp in _populate_dandiset_yaml(output_path, dandiset, existing):
yield dict(path=dandiset_metadata_file, **resp)

# TODO: do analysis of assets for early detection of needed renames etc
# to avoid any need for late treatment of existing and also for
Expand Down
4 changes: 2 additions & 2 deletions dandi/tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,11 +284,11 @@ def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory):
(dspath / "subdir2" / "banana.txt").write_text("Banana\n")
(dspath / "subdir2" / "coconut.txt").write_text("Coconut\n")

def upload_dandiset(**kwargs):
def upload_dandiset(paths=None, **kwargs):
with monkeypatch.context() as m:
m.setenv("DANDI_API_KEY", local_dandi_api["api_key"])
upload(
paths=[],
paths=paths or [],
dandiset_path=dspath,
dandi_instance=local_dandi_api["instance_id"],
devel_debug=True,
Expand Down
63 changes: 63 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,66 @@ def test_download_asset_id(local_dandi_api, text_dandiset, tmp_path):
tmp_path / "coconut.txt"
]
assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"


@pytest.mark.parametrize("confirm", [True, False])
def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path):
    """Syncing a download prompts before deleting a local-only asset."""
    dandiset_id = text_dandiset["dandiset_id"]
    # Remove the asset on the server so the local copy becomes an orphan.
    text_dandiset["client"].delete_asset_bypath(dandiset_id, "draft", "file.txt")
    dspath = tmp_path / dandiset_id
    os.rename(text_dandiset["dspath"], dspath)
    answer = "yes" if confirm else "no"
    confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value=answer)
    url = f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}"
    download(url, tmp_path, existing="overwrite", sync=True)
    confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list")
    # The orphan is deleted iff the prompt was answered "yes".
    assert (dspath / "file.txt").exists() == (not confirm)


def test_download_sync_folder(local_dandi_api, mocker, text_dandiset):
    """Sync of a single folder must only delete orphans inside that folder."""
    dandiset_id = text_dandiset["dandiset_id"]
    dspath = text_dandiset["dspath"]
    # Orphan one asset inside subdir2 and one outside it.
    for asset_path in ("file.txt", "subdir2/banana.txt"):
        text_dandiset["client"].delete_asset_bypath(dandiset_id, "draft", asset_path)
    confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes")
    url = f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}/subdir2/"
    download(url, dspath, existing="overwrite", sync=True)
    # Only the one orphan under subdir2 should have been offered for deletion.
    confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list")
    # file.txt lies outside the downloaded folder, so it must survive.
    assert (dspath / "file.txt").exists()
    assert not (dspath / "subdir2" / "banana.txt").exists()


def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_path):
    """Answering "list" prints the orphaned paths and then re-asks the question."""
    dandiset_id = text_dandiset["dandiset_id"]
    text_dandiset["client"].delete_asset_bypath(dandiset_id, "draft", "file.txt")
    dspath = tmp_path / dandiset_id
    os.rename(text_dandiset["dspath"], dspath)
    # First answer lists the candidates, second answer confirms deletion.
    input_mock = mocker.patch("dandi.utils.input", side_effect=["list", "yes"])
    download(
        f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}",
        tmp_path,
        existing="overwrite",
        sync=True,
    )
    assert not (dspath / "file.txt").exists()
    prompt = "Delete 1 local asset? ([y]es/[n]o/[l]ist): "
    # The prompt must have been issued twice: once before and once after "list".
    assert input_mock.call_args_list == [mocker.call(prompt)] * 2
    assert capsys.readouterr().out.splitlines()[-1] == str(dspath / "file.txt")
35 changes: 35 additions & 0 deletions dandi/tests/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,38 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_
download_dir / dandiset_id / "file.txt",
]
assert files[1].read_bytes() == contents


@pytest.mark.parametrize("confirm", [True, False])
def test_upload_sync(confirm, mocker, text_dandiset):
    """Syncing an upload prompts before deleting a server-only asset."""
    # Remove the local file so the server copy becomes an orphan.
    (text_dandiset["dspath"] / "file.txt").unlink()
    confirm_mock = mocker.patch("click.confirm", return_value=confirm)
    text_dandiset["reupload"](sync=True)
    confirm_mock.assert_called_with("Delete 1 asset on server?")
    asset = text_dandiset["client"].get_asset_bypath(
        text_dandiset["dandiset_id"], "draft", "file.txt"
    )
    # The server asset is gone iff the prompt was confirmed.
    assert (asset is None) == confirm


def test_upload_sync_folder(mocker, text_dandiset):
    """Upload-sync limited to one folder must leave assets outside it alone."""
    dspath = text_dandiset["dspath"]
    # Orphan one server asset inside subdir2 and one outside it.
    (dspath / "file.txt").unlink()
    (dspath / "subdir2" / "banana.txt").unlink()
    confirm_mock = mocker.patch("click.confirm", return_value=True)
    text_dandiset["reupload"](paths=[dspath / "subdir2"], sync=True)
    # Only the one orphan under subdir2 should have been offered for deletion.
    confirm_mock.assert_called_with("Delete 1 asset on server?")

    def remote(asset_path):
        # Look up an asset on the draft version of the dandiset.
        return text_dandiset["client"].get_asset_bypath(
            text_dandiset["dandiset_id"], "draft", asset_path
        )

    # file.txt lies outside the uploaded folder, so it must survive.
    assert remote("file.txt") is not None
    assert remote("subdir2/banana.txt") is None
Loading

0 comments on commit 5cead67

Please sign in to comment.