Add "sync" option for upload & download #616

Merged 4 commits on May 10, 2021
6 changes: 5 additions & 1 deletion dandi/cli/cmd_download.py
@@ -73,6 +73,9 @@ def get_metavar(self, param):
default="all",
show_default=True,
)
@click.option(
"--sync", is_flag=True, help="Delete local assets that do not exist on the server"
)
@instance_option()
# Might be a cool feature: not unlike verifying a checksum, we could
# verify that a downloaded file passes the validator, and alert if not
@@ -91,7 +94,7 @@ def get_metavar(self, param):
@click.argument("url", nargs=-1)
@map_to_click_exceptions
def download(
url, output_dir, existing, jobs, format, download_types, dandi_instance=None
url, output_dir, existing, jobs, format, download_types, sync, dandi_instance=None
):
"""Download a file or entire folder from DANDI"""
# We need to import the download module rather than the download function
@@ -133,5 +136,6 @@ def download(
jobs=jobs,
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
sync=sync,
# develop_debug=develop_debug
)
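
(Side note: with this wiring, the new flag reaches the Python API roughly as sketched below; the URL and output directory are placeholder values, and the call shape mirrors the tests further down.)

from dandi.download import download

# Download a dandiset, refreshing out-of-date local copies, and delete
# any local assets that no longer exist on the server (what --sync does).
download(
    "dandi://dandi/000001",   # placeholder instance/dandiset URL
    "/tmp/dandi-downloads",   # placeholder output directory
    existing="overwrite",
    sync=True,
)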
5 changes: 5 additions & 0 deletions dandi/cli/cmd_upload.py
@@ -50,6 +50,9 @@ def get_metavar(self, param):
type=IntColonInt(),
help="Number of files to upload in parallel and, optionally, number of upload threads per file",
)
@click.option(
"--sync", is_flag=True, help="Delete assets on the server that do not exist locally"
)
@click.option(
"--validation",
help="Data must pass validation before the upload. Use of this option is highly discouraged.",
@@ -80,6 +83,7 @@ def get_metavar(self, param):
def upload(
paths,
jobs,
sync,
existing="refresh",
validation="require",
dandiset_path=None,
@@ -121,4 +125,5 @@ def upload(
devel_debug=devel_debug,
jobs=jobs,
jobs_per_file=jobs_per_file,
sync=sync,
)
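
(Likewise on the upload side: a minimal sketch of the Python-level call, assuming upload() is importable from dandi.upload as the test fixture below suggests; the dandiset path and instance id are placeholders.)

from dandi.upload import upload  # assumed import location

# Upload the whole dandiset and delete server-side assets that no longer
# exist locally (what --sync does); the user is prompted before deletion.
upload(
    paths=[],                           # empty list means the whole dandiset
    dandiset_path="/path/to/dandiset",  # placeholder local dandiset path
    dandi_instance="dandi",             # placeholder instance id
    sync=True,
)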
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
@@ -21,6 +21,7 @@ def test_download_defaults(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


@@ -36,6 +37,7 @@ def test_download_all_types(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


@@ -51,6 +53,7 @@ def test_download_metadata_only(mocker):
jobs=6,
get_metadata=True,
get_assets=False,
sync=False,
)


@@ -66,6 +69,7 @@ def test_download_assets_only(mocker):
jobs=6,
get_metadata=False,
get_assets=True,
sync=False,
)


@@ -96,6 +100,7 @@ def test_download_gui_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


@@ -117,6 +122,7 @@ def test_download_api_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


@@ -142,6 +148,7 @@ def test_download_url_instance_match(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


108 changes: 77 additions & 31 deletions dandi/download.py
@@ -16,7 +16,15 @@
from .dandiset import Dandiset
from . import get_logger
from .support.pyout import naturalsize
from .utils import ensure_datetime, flattened, is_same_time
from .utils import (
abbrev_prompt,
ensure_datetime,
find_files,
flattened,
is_same_time,
on_windows,
pluralize,
)

lgr = get_logger()

@@ -30,12 +38,31 @@ def download(
jobs=1,
get_metadata=True,
get_assets=True,
sync=False,
):
# TODO: deduplicate with upload; for now this was borrowed from there.
# We will again use pyout to provide a neat table summarizing our
# progress, as with upload
from .support import pyout as pyouts

urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# If no URLs were provided we would ideally download the dandiset we
# are in, BUT since we are not git, we do not even know which instance
# it lives on; thus for now we can do nothing but raise
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, parsed_url.dandiset_id)
else:
output_path = output_dir

# dandi.cli.formatters are used in cmd_ls to provide switchable
pyout_style = pyouts.get_style(hide_if_missing=False)

@@ -56,8 +83,8 @@ def download(
kw["yield_generator_for_fields"] = rec_fields[1:] # all but path

gen_ = download_generator(
urls,
output_dir,
parsed_url,
output_path,
existing=existing,
get_metadata=get_metadata,
get_assets=get_assets,
@@ -81,10 +108,52 @@ def download(
else:
raise ValueError(format)

if sync and not isinstance(parsed_url, SingleAssetURL):
client = parsed_url.get_client()
with client.session():
asset_paths = {asset["path"] for asset in parsed_url.get_assets(client)}
if isinstance(parsed_url, DandisetURL):
prefix = os.curdir
download_dir = output_path
elif isinstance(parsed_url, MultiAssetURL):
folder_path = op.normpath(parsed_url.path)
prefix = folder_path
download_dir = op.join(output_path, op.basename(folder_path))
else:
raise NotImplementedError(
f"Unexpected URL type {type(parsed_url).__name__}"
)
to_delete = []
for p in find_files(".*", download_dir, exclude_datalad=True):
if p == op.join(output_path, dandiset_metadata_file):
continue
a_path = op.normpath(op.join(prefix, op.relpath(p, download_dir)))
if on_windows:
a_path = a_path.replace("\\", "/")
if a_path not in asset_paths:
to_delete.append(p)
if to_delete:
while True:
opt = abbrev_prompt(
f"Delete {pluralize(len(to_delete), 'local asset')}?",
"yes",
"no",
"list",
)
if opt == "list":
for p in to_delete:
print(p)
elif opt == "yes":
for p in to_delete:
os.unlink(p)
break
else:
break


def download_generator(
urls,
output_dir,
parsed_url,
output_path,
*,
assets_it=None,
yield_generator_for_fields=None,
@@ -107,38 +176,15 @@ def download_generator(
summary statistics while already downloading. TODO: reimplement properly!

"""
urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# if no paths provided etc, we will download dandiset path
# we are at, BUT since we are not git -- we do not even know
# on which instance it exists! Thus ATM we would do nothing but crash
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])
with parsed_url.navigate() as (client, dandiset, assets):
if assets_it:
assets_it.gen = assets
assets = assets_it

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if dandiset:
identifier = Dandiset._get_identifier(dandiset)
if not identifier:
raise ValueError(f"Cannot deduce dandiset identifier from {dandiset}")
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, identifier)
if get_metadata:
for resp in _populate_dandiset_yaml(
output_path, dandiset, existing
):
yield dict(path=dandiset_metadata_file, **resp)
else:
output_path = output_dir
else:
output_path = output_dir
if isinstance(parsed_url, DandisetURL) and get_metadata:
for resp in _populate_dandiset_yaml(output_path, dandiset, existing):
yield dict(path=dandiset_metadata_file, **resp)

# TODO: do analysis of assets for early detection of needed renames etc
# to avoid any need for late treatment of existing and also for
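
(To summarize the sync logic added above: after the download finishes, the client gathers the asset paths present on the server, walks the local download directory, and flags every file whose server-relative path is not among them; the user is then prompted to delete, keep, or list the candidates. Below is a self-contained sketch of just the path comparison, with a stand-in for dandi.utils.find_files and hypothetical arguments.)

import os
import os.path as op

def iter_local_files(root):
    # Minimal stand-in for dandi.utils.find_files(".*", root, exclude_datalad=True)
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            yield op.join(dirpath, name)

def local_files_to_delete(download_dir, prefix, asset_paths, on_windows=False):
    # Collect local files whose server-relative path is not a remote asset.
    to_delete = []
    for p in iter_local_files(download_dir):
        a_path = op.normpath(op.join(prefix, op.relpath(p, download_dir)))
        if on_windows:
            a_path = a_path.replace("\\", "/")  # server asset paths use "/"
        if a_path not in asset_paths:
            to_delete.append(p)
    return to_delete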
4 changes: 2 additions & 2 deletions dandi/tests/fixtures.py
@@ -284,11 +284,11 @@ def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory):
(dspath / "subdir2" / "banana.txt").write_text("Banana\n")
(dspath / "subdir2" / "coconut.txt").write_text("Coconut\n")

def upload_dandiset(**kwargs):
def upload_dandiset(paths=None, **kwargs):
with monkeypatch.context() as m:
m.setenv("DANDI_API_KEY", local_dandi_api["api_key"])
upload(
paths=[],
paths=paths or [],
dandiset_path=dspath,
dandi_instance=local_dandi_api["instance_id"],
devel_debug=True,
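
(The widened fixture signature above is what lets the new upload tests scope a sync to a single folder, e.g. as used by test_upload_sync_folder below:)

# Re-upload only subdir2; with sync=True, server assets under it that are
# gone locally are flagged for deletion.
text_dandiset["reupload"](paths=[text_dandiset["dspath"] / "subdir2"], sync=True)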
63 changes: 63 additions & 0 deletions dandi/tests/test_download.py
@@ -180,3 +180,66 @@ def test_download_asset_id(local_dandi_api, text_dandiset, tmp_path):
tmp_path / "coconut.txt"
]
assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"


@pytest.mark.parametrize("confirm", [True, False])
def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path):
text_dandiset["client"].delete_asset_bypath(
text_dandiset["dandiset_id"], "draft", "file.txt"
)
dspath = tmp_path / text_dandiset["dandiset_id"]
os.rename(text_dandiset["dspath"], dspath)
confirm_mock = mocker.patch(
"dandi.download.abbrev_prompt", return_value="yes" if confirm else "no"
)
download(
f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}",
tmp_path,
existing="overwrite",
sync=True,
)
confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list")
if confirm:
assert not (dspath / "file.txt").exists()
else:
assert (dspath / "file.txt").exists()


def test_download_sync_folder(local_dandi_api, mocker, text_dandiset):
text_dandiset["client"].delete_asset_bypath(
text_dandiset["dandiset_id"], "draft", "file.txt"
)
text_dandiset["client"].delete_asset_bypath(
text_dandiset["dandiset_id"], "draft", "subdir2/banana.txt"
)
confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes")
download(
f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}/subdir2/",
text_dandiset["dspath"],
existing="overwrite",
sync=True,
)
confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list")
assert (text_dandiset["dspath"] / "file.txt").exists()
assert not (text_dandiset["dspath"] / "subdir2" / "banana.txt").exists()


def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_path):
text_dandiset["client"].delete_asset_bypath(
text_dandiset["dandiset_id"], "draft", "file.txt"
)
dspath = tmp_path / text_dandiset["dandiset_id"]
os.rename(text_dandiset["dspath"], dspath)
input_mock = mocker.patch("dandi.utils.input", side_effect=["list", "yes"])
download(
f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}",
tmp_path,
existing="overwrite",
sync=True,
)
assert not (dspath / "file.txt").exists()
assert input_mock.call_args_list == [
mocker.call("Delete 1 local asset? ([y]es/[n]o/[l]ist): "),
mocker.call("Delete 1 local asset? ([y]es/[n]o/[l]ist): "),
]
assert capsys.readouterr().out.splitlines()[-1] == str(dspath / "file.txt")
35 changes: 35 additions & 0 deletions dandi/tests/test_upload.py
@@ -151,3 +151,38 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_
download_dir / dandiset_id / "file.txt",
]
assert files[1].read_bytes() == contents


@pytest.mark.parametrize("confirm", [True, False])
def test_upload_sync(confirm, mocker, text_dandiset):
(text_dandiset["dspath"] / "file.txt").unlink()
confirm_mock = mocker.patch("click.confirm", return_value=confirm)
text_dandiset["reupload"](sync=True)
confirm_mock.assert_called_with("Delete 1 asset on server?")
asset = text_dandiset["client"].get_asset_bypath(
text_dandiset["dandiset_id"], "draft", "file.txt"
)
if confirm:
assert asset is None
else:
assert asset is not None


def test_upload_sync_folder(mocker, text_dandiset):
(text_dandiset["dspath"] / "file.txt").unlink()
(text_dandiset["dspath"] / "subdir2" / "banana.txt").unlink()
confirm_mock = mocker.patch("click.confirm", return_value=True)
text_dandiset["reupload"](paths=[text_dandiset["dspath"] / "subdir2"], sync=True)
confirm_mock.assert_called_with("Delete 1 asset on server?")
assert (
text_dandiset["client"].get_asset_bypath(
text_dandiset["dandiset_id"], "draft", "file.txt"
)
is not None
)
assert (
text_dandiset["client"].get_asset_bypath(
text_dandiset["dandiset_id"], "draft", "subdir2/banana.txt"
)
is None
)