Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "sync" option for upload & download #616

Merged
merged 4 commits into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion dandi/cli/cmd_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def get_metavar(self, param):
default="all",
show_default=True,
)
@click.option(
"--sync", is_flag=True, help="Delete local assets that do not exist on the server"
)
@instance_option()
# Might be a cool feature, not unlike verifying a checksum, we verify that
# downloaded file passes the validator, and if not -- alert
Expand All @@ -91,7 +94,7 @@ def get_metavar(self, param):
@click.argument("url", nargs=-1)
@map_to_click_exceptions
def download(
url, output_dir, existing, jobs, format, download_types, dandi_instance=None
url, output_dir, existing, jobs, format, download_types, sync, dandi_instance=None
):
"""Download a file or entire folder from DANDI"""
# We need to import the download module rather than the download function
Expand Down Expand Up @@ -133,5 +136,6 @@ def download(
jobs=jobs,
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
sync=sync,
# develop_debug=develop_debug
)
5 changes: 5 additions & 0 deletions dandi/cli/cmd_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def get_metavar(self, param):
type=IntColonInt(),
help="Number of files to upload in parallel and, optionally, number of upload threads per file",
)
@click.option(
"--sync", is_flag=True, help="Delete assets on the server that do not exist locally"
)
@click.option(
"--validation",
help="Data must pass validation before the upload. Use of this option is highly discouraged.",
Expand Down Expand Up @@ -80,6 +83,7 @@ def get_metavar(self, param):
def upload(
paths,
jobs,
sync,
existing="refresh",
validation="require",
dandiset_path=None,
Expand Down Expand Up @@ -121,4 +125,5 @@ def upload(
devel_debug=devel_debug,
jobs=jobs,
jobs_per_file=jobs_per_file,
sync=sync,
)
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def test_download_defaults(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -36,6 +37,7 @@ def test_download_all_types(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -51,6 +53,7 @@ def test_download_metadata_only(mocker):
jobs=6,
get_metadata=True,
get_assets=False,
sync=False,
)


Expand All @@ -66,6 +69,7 @@ def test_download_assets_only(mocker):
jobs=6,
get_metadata=False,
get_assets=True,
sync=False,
)


Expand Down Expand Up @@ -96,6 +100,7 @@ def test_download_gui_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -117,6 +122,7 @@ def test_download_api_instance_in_dandiset(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand All @@ -142,6 +148,7 @@ def test_download_url_instance_match(mocker):
jobs=6,
get_metadata=True,
get_assets=True,
sync=False,
)


Expand Down
87 changes: 56 additions & 31 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import time

import click
import humanize
import requests

Expand All @@ -16,7 +17,7 @@
from .dandiset import Dandiset
from . import get_logger
from .support.pyout import naturalsize
from .utils import ensure_datetime, flattened, is_same_time
from .utils import ensure_datetime, find_files, flattened, is_same_time, on_windows

lgr = get_logger()

Expand All @@ -30,12 +31,31 @@ def download(
jobs=1,
get_metadata=True,
get_assets=True,
sync=False,
):
# TODO: unduplicate with upload. For now stole from that one
# We will again use pyout to provide a neat table summarizing our progress
# with upload etc
from .support import pyout as pyouts

urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# if no paths provided etc, we will download dandiset path
# we are at, BUT since we are not git -- we do not even know
# on which instance it exists! Thus ATM we would do nothing but crash
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, parsed_url.dandiset_id)
else:
output_path = output_dir

# dandi.cli.formatters are used in cmd_ls to provide switchable
pyout_style = pyouts.get_style(hide_if_missing=False)

Expand All @@ -56,8 +76,8 @@ def download(
kw["yield_generator_for_fields"] = rec_fields[1:] # all but path

gen_ = download_generator(
urls,
output_dir,
parsed_url,
output_path,
existing=existing,
get_metadata=get_metadata,
get_assets=get_assets,
Expand All @@ -81,10 +101,38 @@ def download(
else:
raise ValueError(format)

if sync and not isinstance(parsed_url, SingleAssetURL):
client = parsed_url.get_client()
with client.session():
asset_paths = {asset["path"] for asset in parsed_url.get_assets(client)}
if isinstance(parsed_url, DandisetURL):
prefix = os.curdir
download_dir = output_path
elif isinstance(parsed_url, MultiAssetURL):
folder_path = op.normpath(parsed_url.path)
prefix = folder_path
download_dir = op.join(output_path, op.basename(folder_path))
else:
raise NotImplementedError(
f"Unexpected URL type {type(parsed_url).__name__}"
)
to_delete = []
for p in find_files(".*", download_dir, exclude_datalad=True):
if p == op.join(output_path, dandiset_metadata_file):
continue
a_path = op.normpath(op.join(prefix, op.relpath(p, download_dir)))
if on_windows:
a_path = a_path.replace("\\", "/")
if a_path not in asset_paths:
to_delete.append(p)
if to_delete and click.confirm(f"Delete {len(to_delete)} local assets?"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if to_delete and click.confirm(f"Delete {len(to_delete)} local assets?"):
if to_delete and click.confirm(f"Delete {len(to_delete)} local asset{'s' if len(to_delete) > 1 else ''}?"):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's improve UX right away, and instead of plain confirm give a choice for "[l]ist" or alike and if asked, list all the assets to be deleted so user could make an informed decision

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

filed related #620 on allowing for non-interactive, which might be done after or melded into this PR if we agree before it is ready

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for p in to_delete:
os.unlink(p)


def download_generator(
urls,
output_dir,
parsed_url,
output_path,
*,
assets_it=None,
yield_generator_for_fields=None,
Expand All @@ -107,38 +155,15 @@ def download_generator(
summary statistics while already downloading. TODO: reimplement properly!

"""
urls = flattened([urls])
if len(urls) > 1:
raise NotImplementedError("multiple URLs not supported")
if not urls:
# if no paths provided etc, we will download dandiset path
# we are at, BUT since we are not git -- we do not even know
# on which instance it exists! Thus ATM we would do nothing but crash
raise NotImplementedError("No URLs were provided. Cannot download anything")

parsed_url = parse_dandi_url(urls[0])
with parsed_url.navigate() as (client, dandiset, assets):
if assets_it:
assets_it.gen = assets
assets = assets_it

# TODO: if we are ALREADY in a dandiset - we can validate that it is the
# same dandiset and use that dandiset path as the one to download under
if dandiset:
identifier = Dandiset._get_identifier(dandiset)
if not identifier:
raise ValueError(f"Cannot deduce dandiset identifier from {dandiset}")
if isinstance(parsed_url, DandisetURL):
output_path = op.join(output_dir, identifier)
if get_metadata:
for resp in _populate_dandiset_yaml(
output_path, dandiset, existing
):
yield dict(path=dandiset_metadata_file, **resp)
else:
output_path = output_dir
else:
output_path = output_dir
if isinstance(parsed_url, DandisetURL) and get_metadata:
for resp in _populate_dandiset_yaml(output_path, dandiset, existing):
yield dict(path=dandiset_metadata_file, **resp)

# TODO: do analysis of assets for early detection of needed renames etc
# to avoid any need for late treatment of existing and also for
Expand Down
4 changes: 2 additions & 2 deletions dandi/tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,11 +284,11 @@ def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory):
(dspath / "subdir2" / "banana.txt").write_text("Banana\n")
(dspath / "subdir2" / "coconut.txt").write_text("Coconut\n")

def upload_dandiset(**kwargs):
def upload_dandiset(paths=None, **kwargs):
with monkeypatch.context() as m:
m.setenv("DANDI_API_KEY", local_dandi_api["api_key"])
upload(
paths=[],
paths=paths or [],
dandiset_path=dspath,
dandi_instance=local_dandi_api["instance_id"],
devel_debug=True,
Expand Down
40 changes: 40 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,43 @@ def test_download_asset_id(local_dandi_api, text_dandiset, tmp_path):
tmp_path / "coconut.txt"
]
assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"


@pytest.mark.parametrize("confirm", [True, False])
def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path):
    """
    ``download(..., sync=True)`` must prompt before deleting local files that
    no longer exist on the server, and must delete them only on confirmation.
    """
    # Remove the asset server-side so the local copy becomes an orphan.
    text_dandiset["client"].delete_asset_bypath(
        text_dandiset["dandiset_id"], "draft", "file.txt"
    )
    # Relocate the dandiset under tmp_path so download() writes over it.
    dspath = tmp_path / text_dandiset["dandiset_id"]
    os.rename(text_dandiset["dspath"], dspath)
    # Answer the deletion prompt with the parametrized response.
    confirm_mock = mocker.patch("click.confirm", return_value=confirm)
    download(
        f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}",
        tmp_path,
        existing="overwrite",
        sync=True,
    )
    confirm_mock.assert_called_with("Delete 1 local assets?")
    # The orphaned file is removed only when the prompt was confirmed.
    if confirm:
        assert not (dspath / "file.txt").exists()
    else:
        assert (dspath / "file.txt").exists()


def test_download_sync_folder(local_dandi_api, mocker, text_dandiset):
    """
    When syncing a download of a single folder, only local files under that
    folder may be deleted; orphans outside the folder must be left alone.
    """
    # Delete one asset outside the target folder and one inside it.
    text_dandiset["client"].delete_asset_bypath(
        text_dandiset["dandiset_id"], "draft", "file.txt"
    )
    text_dandiset["client"].delete_asset_bypath(
        text_dandiset["dandiset_id"], "draft", "subdir2/banana.txt"
    )
    # Auto-confirm the deletion prompt.
    confirm_mock = mocker.patch("click.confirm", return_value=True)
    download(
        f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}/subdir2/",
        text_dandiset["dspath"],
        existing="overwrite",
        sync=True,
    )
    # Only the orphan inside subdir2 counts toward the prompt.
    confirm_mock.assert_called_with("Delete 1 local assets?")
    # file.txt lies outside the synced folder and must survive.
    assert (text_dandiset["dspath"] / "file.txt").exists()
    assert not (text_dandiset["dspath"] / "subdir2" / "banana.txt").exists()
35 changes: 35 additions & 0 deletions dandi/tests/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,38 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_
download_dir / dandiset_id / "file.txt",
]
assert files[1].read_bytes() == contents


@pytest.mark.parametrize("confirm", [True, False])
def test_upload_sync(confirm, mocker, text_dandiset):
    """
    ``upload(..., sync=True)`` must prompt before deleting server-side assets
    with no local counterpart, and delete them only on confirmation.
    """
    # Remove the local file so its server-side asset becomes stale.
    (text_dandiset["dspath"] / "file.txt").unlink()
    # Answer the deletion prompt with the parametrized response.
    mocked_confirm = mocker.patch("click.confirm", return_value=confirm)
    text_dandiset["reupload"](sync=True)
    mocked_confirm.assert_called_with("Delete 1 assets on server?")
    remote_asset = text_dandiset["client"].get_asset_bypath(
        text_dandiset["dandiset_id"], "draft", "file.txt"
    )
    # The asset is gone from the server iff the prompt was confirmed.
    assert (remote_asset is None) == confirm


def test_upload_sync_folder(mocker, text_dandiset):
    """
    When syncing an upload of a single folder, only server-side assets under
    that folder may be deleted; stale assets elsewhere must be left alone.
    """
    dspath = text_dandiset["dspath"]
    # Delete one local file outside the target folder and one inside it.
    (dspath / "file.txt").unlink()
    (dspath / "subdir2" / "banana.txt").unlink()
    # Auto-confirm the deletion prompt.
    confirm_mock = mocker.patch("click.confirm", return_value=True)
    text_dandiset["reupload"](paths=[dspath / "subdir2"], sync=True)
    # Only the stale asset under subdir2 counts toward the prompt.
    confirm_mock.assert_called_with("Delete 1 assets on server?")

    def remote_asset(path):
        # Look up an asset in the draft version of the test dandiset.
        return text_dandiset["client"].get_asset_bypath(
            text_dandiset["dandiset_id"], "draft", path
        )

    # file.txt lies outside the synced folder and must survive on the server.
    assert remote_asset("file.txt") is not None
    assert remote_asset("subdir2/banana.txt") is None
25 changes: 25 additions & 0 deletions dandi/upload.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from functools import reduce
import os.path
from pathlib import Path, PurePosixPath
import re
import time

import click

from .consts import dandiset_identifier_regex, dandiset_metadata_file
from . import lgr
from .utils import ensure_datetime, get_instance
Expand All @@ -18,6 +22,7 @@ def upload(
devel_debug=False,
jobs=None,
jobs_per_file=None,
sync=False,
):
from .dandiapi import DandiAPIClient
from .dandiset import APIDandiset, Dandiset
Expand Down Expand Up @@ -61,6 +66,7 @@ def upload(
#
if not paths:
paths = [dandiset.path]
original_paths = paths

# Expand and validate all paths -- they should reside within dandiset
paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths)
Expand Down Expand Up @@ -335,3 +341,22 @@ def upload_agg(*ignored):
else:
rec.update(skip_file(exc))
out(rec)

if sync:
relpaths = []
for p in original_paths:
rp = os.path.relpath(p, dandiset.path)
relpaths.append("" if rp == "." else rp)
path_prefix = reduce(os.path.commonprefix, relpaths)
to_delete = []
for asset in client.get_dandiset_assets(
ds_identifier, "draft", path=path_prefix
):
if (
any(p == "" or path_is_subpath(asset["path"], p) for p in relpaths)
and not Path(dandiset.path, asset["path"]).exists()
):
to_delete.append(asset["asset_id"])
if to_delete and click.confirm(f"Delete {len(to_delete)} assets on server?"):
jwodder marked this conversation as resolved.
Show resolved Hide resolved
for asset_id in to_delete:
client.delete_asset(ds_identifier, "draft", asset_id)