From 0166d9f25f1aeee2e3a77800367830fc760d942e Mon Sep 17 00:00:00 2001 From: Joep Vanlier Date: Wed, 2 Oct 2024 12:04:18 +0200 Subject: [PATCH] download: allow non-url characters in filenames strips control characters from urls such that spaces and such are allowed in filenames coming from Zenodo --- changelog.md | 1 + lumicks/pylake/file_download.py | 32 ++++++++++++++++++++-- lumicks/pylake/tests/test_file_download.py | 28 +++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/changelog.md b/changelog.md index c6d1cb280..2877b3ca7 100644 --- a/changelog.md +++ b/changelog.md @@ -15,6 +15,7 @@ * Added improved printing of calibrations performed with `Pylake`. * Added parameter `titles` to customize title of each subplot in [`Kymo.plot_with_channels()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.kymo.Kymo.html#lumicks.pylake.kymo.Kymo.plot_with_channels). * Added [`KymoTrack.sample_from_channel()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.kymotracker.kymotrack.KymoTrack.html#lumicks.pylake.kymotracker.kymotrack.KymoTrack.sample_from_channel) to downsample channel data to the time points of a kymotrack. +* Added support for file names with spaces in [`lk.download_from_doi()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.download_from_doi.html#lumicks.pylake.download_from_doi). 
def strip_control_characters(url_str) -> str:
    """Percent-encode characters that are not valid in a URL

    Despite its name, nothing is removed: characters such as spaces or angle brackets in the
    path, query and fragment are replaced by their percent-encoded form (a space becomes
    ``%20``) so that the result can be passed to `urlopen`. Escapes that are already present
    (``%XX``) are left alone, which means the function can safely be applied more than once.

    Parameters
    ----------
    url_str : str
        URL to encode

    Returns
    -------
    str
        The URL with unsafe characters percent-encoded. Scheme and net location are returned
        unchanged.

    Raises
    ------
    ValueError
        if the URL does not contain a scheme (e.g. https) or net location
    """
    # Imported locally so that this function does not depend on the module-level import list.
    import re
    from urllib.parse import quote, urlsplit, urlunsplit

    def encode(component, safe):
        # `re.split` with a capturing group puts the matched escapes at the odd indices.
        # Those are copied verbatim; quoting them again would turn `%20` into `%2520`.
        pieces = re.split(r"(%[0-9A-Fa-f]{2})", component)
        return "".join(
            piece if index % 2 else quote(piece, safe=safe)
            for index, piece in enumerate(pieces)
        )

    # `urlsplit` rather than `urlparse`: the latter splits `;params` off the last path segment,
    # which would silently truncate file names that contain a semicolon.
    url = urlsplit(url_str)

    if not url.scheme or not url.netloc:
        raise ValueError(f"Invalid URL provided: {url_str}")

    # Path segments are file names, so everything except the segment separator is encoded.
    # The query and fragment carry structure (`key=value&key=value`); the delimiters that
    # RFC 3986 permits there have to survive or the query changes meaning.
    structured_safe = "/?:@!$&'()*+,;="

    return urlunsplit(
        (
            url.scheme,
            url.netloc,
            encode(url.path, safe="/"),
            encode(url.query, safe=structured_safe),
            encode(url.fragment, safe=structured_safe),
        )
    )
10.5281/zenodo.#)""" url = doi if doi.startswith("http") else urljoin("https://doi.org/", doi) + + url = strip_control_characters(url) + try: with urlopen(url) as response: return response.url @@ -23,7 +48,8 @@ def get_url_from_doi(doi): def download_record_metadata(record_number): """Download specific Zenodo record metadata""" zenodo_url = "https://zenodo.org/api/records/" - with urlopen(urljoin(zenodo_url, str(record_number))) as response: + + with urlopen(strip_control_characters(urljoin(zenodo_url, str(record_number)))) as response: if response.status == 200: return json.loads(response.read()) else: @@ -46,6 +72,8 @@ def download_file(url, target_path, download_path, show_progress=True, block_siz block_size : int Block size to use when downloading """ + url = strip_control_characters(url) + with urlopen(url) as response: size_bytes = int(response.headers.get("Content-Length", 0)) diff --git a/lumicks/pylake/tests/test_file_download.py b/lumicks/pylake/tests/test_file_download.py index 5d08cbb4c..88cc17ad5 100644 --- a/lumicks/pylake/tests/test_file_download.py +++ b/lumicks/pylake/tests/test_file_download.py @@ -5,9 +5,37 @@ get_url_from_doi, download_from_doi, download_record_metadata, + strip_control_characters, ) +@pytest.mark.parametrize( + "url, url_ref", + [ + ("https://zenodo.org/api/records/13880274", "https://zenodo.org/api/records/13880274"), + ("https://zenodo.org/api/records/test?yes", "https://zenodo.org/api/records/test?yes"), + ("https://zenodo.org/api/test?yes no", "https://zenodo.org/api/test?yes%20no"), + ("https://zenodo.org/api/test?yes>no", "https://zenodo.org/api/test?yes%3Eno"), + ( + "https://zenodo.org/space bar/test?yes>no", + "https://zenodo.org/space%20bar/test?yes%3Eno", + ), + ( + "https://zenodo.org/api/records/13880274/files/20220203-165412 Marker 0.85_NotOscillate.h5/content", + "https://zenodo.org/api/records/13880274/files/20220203-165412%20Marker%200.85_NotOscillate.h5/content", + ), + ], +) +def 
test_strip_control_characters_url(url, url_ref): + assert strip_control_characters(url) == url_ref + + +@pytest.mark.parametrize("invalid_url", ["https:://", "zenodo.org"]) +def test_invalid_url(invalid_url): + with pytest.raises(ValueError, match="Invalid URL provided"): + strip_control_characters(invalid_url) + + @pytest.mark.preflight def test_grab_record(): assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/records/4247279"