Skip to content

Commit

Permalink
download: allow non-url characters in filenames
Browse files Browse the repository at this point in the history
Percent-encodes characters that are not valid in URLs, so that spaces and similar characters are allowed in filenames coming from Zenodo.
  • Loading branch information
JoepVanlier committed Oct 7, 2024
1 parent 96551bc commit 0166d9f
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 2 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* Added improved printing of calibrations performed with `Pylake`.
* Added parameter `titles` to customize title of each subplot in [`Kymo.plot_with_channels()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.kymo.Kymo.html#lumicks.pylake.kymo.Kymo.plot_with_channels).
* Added [`KymoTrack.sample_from_channel()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.kymotracker.kymotrack.KymoTrack.html#lumicks.pylake.kymotracker.kymotrack.KymoTrack.sample_from_channel) to downsample channel data to the time points of a kymotrack.
* Added support for file names with spaces in [`lk.download_from_doi()`](https://lumicks-pylake.readthedocs.io/en/latest/_api/lumicks.pylake.download_from_doi.html#lumicks.pylake.download_from_doi).

## v1.5.2 | 2024-07-24

Expand Down
32 changes: 30 additions & 2 deletions lumicks/pylake/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,42 @@
import json
import hashlib
import urllib.error
from urllib.parse import urljoin
from urllib.parse import quote, urljoin, urlparse
from urllib.request import urlopen

from tqdm.auto import tqdm

__all__ = ["download_from_doi"]


def strip_control_characters(url_str) -> str:
    """Percent-encode characters that are not valid in the path or query of a URL

    Despite its name, this function does not remove anything: characters such as spaces
    or `>` are replaced by their percent-encoded form (e.g. a space becomes `%20`). The
    scheme and net location are copied unchanged; only the path and the query are
    re-encoded. The `;params` and `#fragment` components reported by `urlparse` are not
    included in the returned URL.

    Parameters
    ----------
    url_str : str
        URL to encode

    Returns
    -------
    str
        URL consisting of the original scheme and net location followed by the
        percent-encoded path and, when present, the percent-encoded query.

    Raises
    ------
    ValueError
        if the URL does not contain a scheme (e.g. https) or net location
    """
    url = urlparse(url_str)

    if not url.scheme or not url.netloc:
        raise ValueError(f"Invalid URL provided: {url}")

    base_url = url.scheme + "://" + url.netloc + quote(url.path)
    if not url.query:
        return base_url

    # `=` and `&` separate the keys and values of a query string. With the default
    # `safe="/"` they would be escaped to `%3D` / `%26`, collapsing `a=1&b=2` into a
    # single opaque token, so they are kept literal here.
    return base_url + "?" + quote(url.query, safe="/=&")


def get_url_from_doi(doi):
"""Obtains a Zenodo record from the DOI (e.g. 10.5281/zenodo.#)"""
url = doi if doi.startswith("http") else urljoin("https://doi.org/", doi)

url = strip_control_characters(url)

try:
with urlopen(url) as response:
return response.url
Expand All @@ -23,7 +48,8 @@ def get_url_from_doi(doi):
def download_record_metadata(record_number):
"""Download specific Zenodo record metadata"""
zenodo_url = "https://zenodo.org/api/records/"
with urlopen(urljoin(zenodo_url, str(record_number))) as response:

with urlopen(strip_control_characters(urljoin(zenodo_url, str(record_number)))) as response:
if response.status == 200:
return json.loads(response.read())
else:
Expand All @@ -46,6 +72,8 @@ def download_file(url, target_path, download_path, show_progress=True, block_siz
block_size : int
Block size to use when downloading
"""
url = strip_control_characters(url)

with urlopen(url) as response:
size_bytes = int(response.headers.get("Content-Length", 0))

Expand Down
28 changes: 28 additions & 0 deletions lumicks/pylake/tests/test_file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,37 @@
get_url_from_doi,
download_from_doi,
download_record_metadata,
strip_control_characters,
)


@pytest.mark.parametrize(
    ("url", "url_ref"),
    [
        # URLs that need no encoding must come back unchanged
        ("https://zenodo.org/api/records/13880274", "https://zenodo.org/api/records/13880274"),
        ("https://zenodo.org/api/records/test?yes", "https://zenodo.org/api/records/test?yes"),
        # Characters in the query that need encoding
        ("https://zenodo.org/api/test?yes no", "https://zenodo.org/api/test?yes%20no"),
        ("https://zenodo.org/api/test?yes>no", "https://zenodo.org/api/test?yes%3Eno"),
        # Characters that need encoding in both the path and the query
        (
            "https://zenodo.org/space bar/test?yes>no",
            "https://zenodo.org/space%20bar/test?yes%3Eno",
        ),
        # File name containing spaces
        (
            "https://zenodo.org/api/records/13880274/files/20220203-165412 Marker 0.85_NotOscillate.h5/content",
            "https://zenodo.org/api/records/13880274/files/20220203-165412%20Marker%200.85_NotOscillate.h5/content",
        ),
    ],
)
def test_strip_control_characters_url(url, url_ref):
    """Check the encoded form of each URL against its reference string"""
    encoded = strip_control_characters(url)
    assert encoded == url_ref


@pytest.mark.parametrize(
    "invalid_url",
    [
        "https:://",
        "zenodo.org",
    ],
)
def test_invalid_url(invalid_url):
    """Inputs that `strip_control_characters` is expected to reject with a `ValueError`"""
    expected_error = pytest.raises(ValueError, match="Invalid URL provided")
    with expected_error:
        strip_control_characters(invalid_url)


@pytest.mark.preflight
def test_grab_record():
assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/records/4247279"
Expand Down

0 comments on commit 0166d9f

Please sign in to comment.