Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bboxes sample data #231

Merged
merged 12 commits into from
Jun 28, 2024
7 changes: 5 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,17 +322,20 @@ To add a new file, you will need to:
2. Ask to be added as a collaborator on the [movement data repository](gin:neuroinformatics/movement-test-data) (if not already)
3. Download the [GIN CLI](gin:G-Node/Info/wiki/GIN+CLI+Setup#quickstart) and set it up with your GIN credentials, by running `gin login` in a terminal.
4. Clone the movement data repository to your local machine, by running `gin get neuroinformatics/movement-test-data` in a terminal.
5. Add your new files to the `poses`, `videos`, and/or `frames` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file by running `sha256sum <filename>` in a terminal. For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](gin:neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, and `frames` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, and `frames_hashes.txt`, respectively.
5. Add your new files to the `poses`, `videos`, `frames` and/or `bboxes` folders as appropriate. Follow the existing file naming conventions as closely as possible.
6. Determine the sha256 checksum hash of each new file. You can do this in a terminal by running `sha256sum <filename>` on Ubuntu, or `shasum -a 256 <filename>` on macOS. For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](gin:neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, and `frames` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, and `frames_hashes.txt`, respectively.
7. Add metadata for your new files to `metadata.yaml`, including their sha256 hashes you've calculated. See the example entry below for guidance.
8. Commit a specific file with `gin commit -m <message> <filename>`, or `gin commit -m <message> .` to commit all changes.
9. Upload the committed changes to the GIN repository by running `gin upload`. Latest changes to the repository can be pulled via `gin download`. `gin sync` will synchronise the latest changes bidirectionally.



### `metadata.yaml` example entry
```yaml
"SLEAP_three-mice_Aeon_proofread.analysis.h5":
sha256sum: "82ebd281c406a61536092863bc51d1a5c7c10316275119f7daf01c1ff33eac2a"
source_software: "SLEAP"
type: "poses" # "poses" or "bboxes" depending on the type of tracked data
fps: 50
species: "mouse"
number_of_individuals: 3
Expand Down
70 changes: 51 additions & 19 deletions movement/sample_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@
def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]:
"""Generate a file registry based on the contents of the metadata.

This includes files containing poses, frames, or entire videos.
This includes files containing poses, frames, videos or bounding boxes
data.

Parameters
----------
Expand All @@ -131,7 +132,7 @@
"""
file_registry = {}
for ds, val in metadata.items():
file_registry[f"poses/{ds}"] = val["sha256sum"]
file_registry[f"{val['type']}/{ds}"] = val["sha256sum"]
for key in ["video", "frame"]:
file_name = val[key]["file_name"]
if file_name:
Expand Down Expand Up @@ -178,14 +179,15 @@
-------
paths : dict
Dictionary mapping file types to their respective paths. The possible
file types are: "poses", "frame", "video". If "frame" or "video" are
not available, the corresponding value is None.
file types are: "poses", "frame", "video", or "bboxes". If "frame" or
"video" are not available, the corresponding value is None.

Examples
--------
>>> from movement.sample_data import fetch_dataset_paths
>>> paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5")
>>> poses_path = paths["poses"]
>>> poses_path = paths["poses"] # if the data is "poses" data
>>> bboxes_path = paths["bboxes"] # if the data is "bboxes" data
>>> frame_path = paths["frame"]
>>> video_path = paths["video"]

Expand All @@ -205,21 +207,42 @@
frame_file_name = metadata[filename]["frame"]["file_name"]
video_file_name = metadata[filename]["video"]["file_name"]

return {
"poses": Path(
SAMPLE_DATA.fetch(f"poses/{filename}", progressbar=True)
),
"frame": None
# add trajectory data
# if no type is specified, assume "poses" for backwards compatibility
paths_dict: dict[str, Path | None] = {}
if ("type" not in metadata[filename]) or (
metadata[filename]["type"] == "poses"
):
paths_dict = {
"poses": Path(
SAMPLE_DATA.fetch(f"poses/{filename}", progressbar=True)
)
}
elif metadata[filename]["type"] == "bboxes":
paths_dict = {
"bboxes": Path(
SAMPLE_DATA.fetch(f"bboxes/{filename}", progressbar=True)
)
}

# add frame and video data if available
paths_dict["frame"] = (
None
if not frame_file_name
else Path(
SAMPLE_DATA.fetch(f"frames/{frame_file_name}", progressbar=True)
),
"video": None
)
)

paths_dict["video"] = (
None
if not video_file_name
else Path(
SAMPLE_DATA.fetch(f"videos/{video_file_name}", progressbar=True)
),
}
)
)

return paths_dict


def fetch_dataset(
Expand Down Expand Up @@ -257,11 +280,20 @@
"""
file_paths = fetch_dataset_paths(filename)

ds = load_poses.from_file(
file_paths["poses"],
source_software=metadata[filename]["source_software"],
fps=metadata[filename]["fps"],
)
if "poses" in file_paths:
ds = load_poses.from_file(
file_paths["poses"],
source_software=metadata[filename]["source_software"],
fps=metadata[filename]["fps"],
)
elif "bboxes" in file_paths:
pass

Check warning on line 290 in movement/sample_data.py

View check run for this annotation

Codecov / codecov/patch

movement/sample_data.py#L289-L290

Added lines #L289 - L290 were not covered by tests
# TO BE IMPLEMENTED IN PR 229: https://github.com/neuroinformatics-unit/movement/pull/229
# ds = load_bboxes.from_file(
# file_paths["bboxes"],
# source_software=metadata[filename]["source_software"],
# fps=metadata[filename]["fps"],
# )
ds.attrs["frame_path"] = file_paths["frame"]
ds.attrs["video_path"] = file_paths["video"]

Expand Down
17 changes: 9 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ def pytest_configure():
"""Perform initial configuration for pytest.
Fetches sample data file paths (poses or bboxes) as a dictionary for tests.
"""
pytest.POSE_DATA_PATHS = {
file_name: fetch_dataset_paths(file_name)["poses"]
for file_name in list_datasets()
}
pytest.DATA_PATHS = {}
for file_name in list_datasets():
paths_dict = fetch_dataset_paths(file_name)
if "poses" in paths_dict:
pytest.DATA_PATHS[file_name] = paths_dict["poses"]
elif "bboxes" in paths_dict:
pytest.DATA_PATHS[file_name] = paths_dict["bboxes"]


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -194,9 +197,7 @@ def new_csv_file(tmp_path):
@pytest.fixture
def dlc_style_df():
"""Return a valid DLC-style DataFrame."""
return pd.read_hdf(
pytest.POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
)
return pd.read_hdf(pytest.DATA_PATHS.get("DLC_single-wasp.predictions.h5"))


@pytest.fixture(
Expand All @@ -211,7 +212,7 @@ def dlc_style_df():
)
def sleap_file(request):
"""Return the file path for a SLEAP .h5 or .slp file."""
return pytest.POSE_DATA_PATHS.get(request.param)
return pytest.DATA_PATHS.get(request.param)


@pytest.fixture
Expand Down
6 changes: 3 additions & 3 deletions tests/test_integration/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS

from movement.io import load_poses, save_poses

Expand Down Expand Up @@ -62,7 +62,7 @@ def test_to_sleap_analysis_file_returns_same_h5_file_content(
file) to a SLEAP-style .h5 analysis file returns the same file
contents.
"""
sleap_h5_file_path = POSE_DATA_PATHS.get(sleap_h5_file)
sleap_h5_file_path = DATA_PATHS.get(sleap_h5_file)
ds = load_poses.from_sleap_file(sleap_h5_file_path, fps=fps)
save_poses.to_sleap_analysis_file(ds, new_h5_file)

Expand Down Expand Up @@ -93,7 +93,7 @@ def test_to_sleap_analysis_file_source_file(self, file, new_h5_file):
to a SLEAP-style .h5 analysis file stores the .slp labels path
only when the source file is a .slp file.
"""
file_path = POSE_DATA_PATHS.get(file)
file_path = DATA_PATHS.get(file)
if file.startswith("DLC"):
ds = load_poses.from_dlc_file(file_path)
else:
Expand Down
28 changes: 13 additions & 15 deletions tests/test_unit/test_load_poses.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS
from sleap_io.io.slp import read_labels, write_labels
from sleap_io.model.labels import LabeledFrame, Labels

Expand All @@ -18,9 +18,7 @@ class TestLoadPoses:
@pytest.fixture
def sleap_slp_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .slp file without tracks."""
sleap_file = POSE_DATA_PATHS.get(
"SLEAP_single-mouse_EPM.predictions.slp"
)
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.predictions.slp")
labels = read_labels(sleap_file)
file_path = tmp_path / "track_is_none.slp"
lfs = []
Expand Down Expand Up @@ -48,7 +46,7 @@ def sleap_slp_file_without_tracks(self, tmp_path):
@pytest.fixture
def sleap_h5_file_without_tracks(self, tmp_path):
"""Mock and return the path to a SLEAP .h5 file without tracks."""
sleap_file = POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
file_path = tmp_path / "track_is_none.h5"
with h5py.File(sleap_file, "r") as f1, h5py.File(file_path, "w") as f2:
for key in list(f1.keys()):
Expand Down Expand Up @@ -120,7 +118,7 @@ def test_load_from_sleap_file_without_tracks(
sleap_file_without_tracks
)
ds_from_tracked = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
)
# Check if the "individuals" coordinate matches
# the assigned default "individuals_0"
Expand Down Expand Up @@ -153,8 +151,8 @@ def test_load_from_sleap_slp_file_or_h5_file_returns_same(
"""Test that loading pose tracks from SLEAP .slp and .h5 files
return the same Dataset.
"""
slp_file_path = POSE_DATA_PATHS.get(slp_file)
h5_file_path = POSE_DATA_PATHS.get(h5_file)
slp_file_path = DATA_PATHS.get(slp_file)
h5_file_path = DATA_PATHS.get(h5_file)
ds_from_slp = load_poses.from_sleap_file(slp_file_path)
ds_from_h5 = load_poses.from_sleap_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_slp)
Expand All @@ -171,7 +169,7 @@ def test_load_from_dlc_file(self, file_name):
"""Test that loading pose tracks from valid DLC files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_dlc_file(file_path)
self.assert_dataset(ds, file_path, "DeepLabCut")

Expand All @@ -191,8 +189,8 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
"""Test that loading pose tracks from DLC .csv and .h5 files
return the same Dataset.
"""
csv_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
csv_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.csv")
h5_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.h5")
ds_from_csv = load_poses.from_dlc_file(csv_file_path)
ds_from_h5 = load_poses.from_dlc_file(h5_file_path)
xr.testing.assert_allclose(ds_from_h5, ds_from_csv)
Expand All @@ -210,7 +208,7 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self):
def test_fps_and_time_coords(self, fps, expected_fps, expected_time_unit):
"""Test that time coordinates are set according to the provided fps."""
ds = load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"),
fps=fps,
)
assert ds.time_unit == expected_time_unit
Expand All @@ -234,7 +232,7 @@ def test_load_from_lp_file(self, file_name):
"""Test that loading pose tracks from valid LightningPose (LP) files
returns a proper Dataset.
"""
file_path = POSE_DATA_PATHS.get(file_name)
file_path = DATA_PATHS.get(file_name)
ds = load_poses.from_lp_file(file_path)
self.assert_dataset(ds, file_path, "LightningPose")

Expand All @@ -243,7 +241,7 @@ def test_load_from_lp_or_dlc_file_returns_same(self):
using either the `from_lp_file` or `from_dlc_file` function
returns the same Dataset (except for the source_software).
"""
file_path = POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
file_path = DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
ds_drom_lp = load_poses.from_lp_file(file_path)
ds_from_dlc = load_poses.from_dlc_file(file_path)
xr.testing.assert_allclose(ds_from_dlc, ds_drom_lp)
Expand All @@ -254,7 +252,7 @@ def test_load_multi_individual_from_lp_file_raises(self):
"""Test that loading a multi-individual .csv file using the
`from_lp_file` function raises a ValueError.
"""
file_path = POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv")
file_path = DATA_PATHS.get("DLC_two-mice.predictions.csv")
with pytest.raises(ValueError):
load_poses.from_lp_file(file_path)

Expand Down
3 changes: 2 additions & 1 deletion tests/test_unit/test_sample_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
"""Assert that the metadata is in the expected format."""
metadata_fields = [
"sha256sum",
"type",
"source_software",
"fps",
"species",
Expand All @@ -61,7 +62,7 @@ def validate_metadata(metadata: dict[str, dict]) -> None:
set(val.keys()) == set(metadata_fields) for val in metadata.values()
), f"Found issues with the names of medatada fields. {check_yaml_msg}"

# check that metadata keys (pose file names) are unique
# check that metadata keys (file names) are unique
assert len(metadata.keys()) == len(set(metadata.keys()))

# check that the first 2 fields are present and are strings
Expand Down
12 changes: 6 additions & 6 deletions tests/test_unit/test_save_poses.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
import pytest
import xarray as xr
from pytest import POSE_DATA_PATHS
from pytest import DATA_PATHS

from movement.io import load_poses, save_poses

Expand Down Expand Up @@ -66,33 +66,33 @@ def output_file_params(self, request):
(np.array([1, 2, 3]), pytest.raises(ValueError)), # incorrect type
(
load_poses.from_dlc_file(
POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
DATA_PATHS.get("DLC_single-wasp.predictions.h5")
),
does_not_raise(),
), # valid dataset
(
load_poses.from_dlc_file(
POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv")
DATA_PATHS.get("DLC_two-mice.predictions.csv")
),
does_not_raise(),
), # valid dataset
(
load_poses.from_sleap_file(
POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
),
does_not_raise(),
), # valid dataset
(
load_poses.from_sleap_file(
POSE_DATA_PATHS.get(
DATA_PATHS.get(
"SLEAP_three-mice_Aeon_proofread.predictions.slp"
)
),
does_not_raise(),
), # valid dataset
(
load_poses.from_lp_file(
POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
),
does_not_raise(),
), # valid dataset
Expand Down