diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 384638c4d..10d3bab27 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -322,17 +322,44 @@ To add a new file, you will need to:
 2. Ask to be added as a collaborator on the [movement data repository](gin:neuroinformatics/movement-test-data) (if not already)
 3. Download the [GIN CLI](gin:G-Node/Info/wiki/GIN+CLI+Setup#quickstart) and set it up with your GIN credentials, by running `gin login` in a terminal.
 4. Clone the movement data repository to your local machine, by running `gin get neuroinformatics/movement-test-data` in a terminal.
-5. Add your new files to the `poses`, `videos`, and/or `frames` folders as appropriate. Follow the existing file naming conventions as closely as possible.
-6. Determine the sha256 checksum hash of each new file by running `sha256sum <filename>` in a terminal. For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](gin:neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, and `frames` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, and `frames_hashes.txt`, respectively.
+5. Add your new files to the `poses`, `videos`, `frames`, and/or `bboxes` folders as appropriate. Follow the existing file naming conventions as closely as possible.
+6. Determine the sha256 checksum hash of each new file. You can do this in a terminal by running:
+   ::::{tab-set}
+
+   :::{tab-item} Ubuntu
+   ```bash
+   sha256sum <filename>
+   ```
+   :::
+
+   :::{tab-item} MacOS
+   ```bash
+   shasum -a 256 <filename>
+   ```
+   :::
+
+   :::{tab-item} Windows
+   ```bash
+   certutil -hashfile <filename> SHA256
+   ```
+   :::
+   ::::
+   For convenience, we've included a `get_sha256_hashes.py` script in the [movement data repository](gin:neuroinformatics/movement-test-data). If you run this from the root of the data repository, within a Python environment with `movement` installed, it will calculate the sha256 hashes for all files in the `poses`, `videos`, `frames`, and `bboxes` folders and write them to files named `poses_hashes.txt`, `videos_hashes.txt`, `frames_hashes.txt`, and `bboxes_hashes.txt`, respectively.
+
 7. Add metadata for your new files to `metadata.yaml`, including their sha256 hashes you've calculated. See the example entry below for guidance.
+
 8. Commit a specific file with `gin commit -m <message> <filename>`, or `gin commit -m <message> .` to commit all changes.
+
 9. Upload the committed changes to the GIN repository by running `gin upload`. Latest changes to the repository can be pulled via `gin download`. `gin sync` will synchronise the latest changes bidirectionally.
+
+
 
 ### `metadata.yaml` example entry
 ```yaml
 "SLEAP_three-mice_Aeon_proofread.analysis.h5":
   sha256sum: "82ebd281c406a61536092863bc51d1a5c7c10316275119f7daf01c1ff33eac2a"
   source_software: "SLEAP"
+  type: "poses" # "poses" or "bboxes" depending on the type of tracked data
   fps: 50
   species: "mouse"
   number_of_individuals: 3
diff --git a/movement/sample_data.py b/movement/sample_data.py
index bfbf3234d..900a352a0 100644
--- a/movement/sample_data.py
+++ b/movement/sample_data.py
@@ -87,7 +87,7 @@ def _fetch_metadata(
     -------
     dict
         A dictionary containing metadata for each sample dataset, with the
-        dataset name (pose file name) as the key.
+        dataset file name as the key.
 
""" local_file_path = Path(data_dir / file_name) @@ -116,7 +116,8 @@ def _fetch_metadata( def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]: """Generate a file registry based on the contents of the metadata. - This includes files containing poses, frames, or entire videos. + This includes files containing poses, frames, videos, or bounding boxes + data. Parameters ---------- @@ -131,7 +132,7 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]: """ file_registry = {} for ds, val in metadata.items(): - file_registry[f"poses/{ds}"] = val["sha256sum"] + file_registry[f"{val['type']}/{ds}"] = val["sha256sum"] for key in ["video", "frame"]: file_name = val[key]["file_name"] if file_name: @@ -139,7 +140,7 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]: return file_registry -# Create a download manager for the pose data +# Create a download manager for the sample data metadata = _fetch_metadata(METADATA_FILE, DATA_DIR) file_registry = _generate_file_registry(metadata) SAMPLE_DATA = pooch.create( @@ -151,19 +152,19 @@ def _generate_file_registry(metadata: dict[str, dict]) -> dict[str, str]: def list_datasets() -> list[str]: - """Find available sample datasets. + """List available sample datasets. Returns ------- filenames : list of str - List of filenames for available pose data. + List of filenames for available sample datasets. """ return list(metadata.keys()) def fetch_dataset_paths(filename: str) -> dict: - """Get paths to sample pose data and any associated frames or videos. + """Get paths to sample dataset and any associated frames or videos. The data are downloaded from the ``movement`` data repository to the user's local machine upon first use and are stored in a local cache directory. @@ -172,20 +173,21 @@ def fetch_dataset_paths(filename: str) -> dict: Parameters ---------- filename : str - Name of the pose file to fetch. + Name of the sample data file to fetch. Returns ------- paths : dict Dictionary mapping file types to their respective paths. The possible - file types are: "poses", "frame", "video". If "frame" or "video" are - not available, the corresponding value is None. + file types are: "poses", "frame", "video" or "bboxes". If "frame" or + "video" is not available, the corresponding value is None. Examples -------- >>> from movement.sample_data import fetch_dataset_paths >>> paths = fetch_dataset_paths("DLC_single-mouse_EPM.predictions.h5") - >>> poses_path = paths["poses"] + >>> poses_path = paths["poses"] # if the data is "pose" data + >>> bboxes_path = paths["bboxes"] # if the data is "bboxes" data >>> frame_path = paths["frame"] >>> video_path = paths["video"] @@ -194,21 +196,17 @@ def fetch_dataset_paths(filename: str) -> dict: fetch_dataset """ - available_pose_files = list_datasets() - if filename not in available_pose_files: + available_data_files = list_datasets() + if filename not in available_data_files: raise log_error( ValueError, f"File '{filename}' is not in the registry. 
" - f"Valid filenames are: {available_pose_files}", + f"Valid filenames are: {available_data_files}", ) frame_file_name = metadata[filename]["frame"]["file_name"] video_file_name = metadata[filename]["video"]["file_name"] - - return { - "poses": Path( - SAMPLE_DATA.fetch(f"poses/{filename}", progressbar=True) - ), + paths_dict = { "frame": None if not frame_file_name else Path( @@ -220,16 +218,23 @@ def fetch_dataset_paths(filename: str) -> dict: SAMPLE_DATA.fetch(f"videos/{video_file_name}", progressbar=True) ), } + # Add trajectory data + # Assume "poses" if not of type "bboxes" + data_type = "bboxes" if metadata[filename]["type"] == "bboxes" else "poses" + paths_dict[data_type] = Path( + SAMPLE_DATA.fetch(f"{data_type}/{filename}", progressbar=True) + ) + return paths_dict def fetch_dataset( filename: str, ) -> xarray.Dataset: - """Load a sample dataset containing pose data. + """Load a sample dataset. The data are downloaded from the ``movement`` data repository to the user's local machine upon first use and are stored in a local cache directory. - This function returns the pose data as an xarray Dataset. + This function returns the data as an xarray Dataset. If there are any associated frames or videos, these files are also downloaded and the paths are stored as dataset attributes. @@ -241,7 +246,7 @@ def fetch_dataset( Returns ------- ds : xarray.Dataset - Pose data contained in the fetched sample file. + Data contained in the fetched sample file. Examples -------- @@ -262,6 +267,10 @@ def fetch_dataset( source_software=metadata[filename]["source_software"], fps=metadata[filename]["fps"], ) + + # TODO: Add support for loading bounding boxes data. + # Implemented in PR 229: https://github.com/neuroinformatics-unit/movement/pull/229 + ds.attrs["frame_path"] = file_paths["frame"] ds.attrs["video_path"] = file_paths["video"] diff --git a/tests/conftest.py b/tests/conftest.py index d3b3f9f19..bb3e77d6a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,10 +20,11 @@ def pytest_configure(): """Perform initial configuration for pytest. Fetches pose data file paths as a dictionary for tests. 
""" - pytest.POSE_DATA_PATHS = { - file_name: fetch_dataset_paths(file_name)["poses"] - for file_name in list_datasets() - } + pytest.DATA_PATHS = {} + for file_name in list_datasets(): + paths_dict = fetch_dataset_paths(file_name) + data_path = paths_dict.get("poses") or paths_dict.get("bboxes") + pytest.DATA_PATHS[file_name] = data_path @pytest.fixture(autouse=True) @@ -194,9 +195,7 @@ def new_csv_file(tmp_path): @pytest.fixture def dlc_style_df(): """Return a valid DLC-style DataFrame.""" - return pd.read_hdf( - pytest.POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5") - ) + return pd.read_hdf(pytest.DATA_PATHS.get("DLC_single-wasp.predictions.h5")) @pytest.fixture( @@ -211,7 +210,7 @@ def dlc_style_df(): ) def sleap_file(request): """Return the file path for a SLEAP .h5 or .slp file.""" - return pytest.POSE_DATA_PATHS.get(request.param) + return pytest.DATA_PATHS.get(request.param) @pytest.fixture diff --git a/tests/test_integration/test_io.py b/tests/test_integration/test_io.py index e8820ad26..50f039335 100644 --- a/tests/test_integration/test_io.py +++ b/tests/test_integration/test_io.py @@ -2,7 +2,7 @@ import numpy as np import pytest import xarray as xr -from pytest import POSE_DATA_PATHS +from pytest import DATA_PATHS from movement.io import load_poses, save_poses @@ -62,7 +62,7 @@ def test_to_sleap_analysis_file_returns_same_h5_file_content( file) to a SLEAP-style .h5 analysis file returns the same file contents. """ - sleap_h5_file_path = POSE_DATA_PATHS.get(sleap_h5_file) + sleap_h5_file_path = DATA_PATHS.get(sleap_h5_file) ds = load_poses.from_sleap_file(sleap_h5_file_path, fps=fps) save_poses.to_sleap_analysis_file(ds, new_h5_file) @@ -93,7 +93,7 @@ def test_to_sleap_analysis_file_source_file(self, file, new_h5_file): to a SLEAP-style .h5 analysis file stores the .slp labels path only when the source file is a .slp file. 
""" - file_path = POSE_DATA_PATHS.get(file) + file_path = DATA_PATHS.get(file) if file.startswith("DLC"): ds = load_poses.from_dlc_file(file_path) else: diff --git a/tests/test_unit/test_load_poses.py b/tests/test_unit/test_load_poses.py index 58bfa2371..2c63500c9 100644 --- a/tests/test_unit/test_load_poses.py +++ b/tests/test_unit/test_load_poses.py @@ -4,7 +4,7 @@ import numpy as np import pytest import xarray as xr -from pytest import POSE_DATA_PATHS +from pytest import DATA_PATHS from sleap_io.io.slp import read_labels, write_labels from sleap_io.model.labels import LabeledFrame, Labels @@ -18,9 +18,7 @@ class TestLoadPoses: @pytest.fixture def sleap_slp_file_without_tracks(self, tmp_path): """Mock and return the path to a SLEAP .slp file without tracks.""" - sleap_file = POSE_DATA_PATHS.get( - "SLEAP_single-mouse_EPM.predictions.slp" - ) + sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.predictions.slp") labels = read_labels(sleap_file) file_path = tmp_path / "track_is_none.slp" lfs = [] @@ -48,7 +46,7 @@ def sleap_slp_file_without_tracks(self, tmp_path): @pytest.fixture def sleap_h5_file_without_tracks(self, tmp_path): """Mock and return the path to a SLEAP .h5 file without tracks.""" - sleap_file = POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5") + sleap_file = DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5") file_path = tmp_path / "track_is_none.h5" with h5py.File(sleap_file, "r") as f1, h5py.File(file_path, "w") as f2: for key in list(f1.keys()): @@ -120,7 +118,7 @@ def test_load_from_sleap_file_without_tracks( sleap_file_without_tracks ) ds_from_tracked = load_poses.from_sleap_file( - POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5") + DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5") ) # Check if the "individuals" coordinate matches # the assigned default "individuals_0" @@ -153,8 +151,8 @@ def test_load_from_sleap_slp_file_or_h5_file_returns_same( """Test that loading pose tracks from SLEAP .slp and .h5 files return the same Dataset. """ - slp_file_path = POSE_DATA_PATHS.get(slp_file) - h5_file_path = POSE_DATA_PATHS.get(h5_file) + slp_file_path = DATA_PATHS.get(slp_file) + h5_file_path = DATA_PATHS.get(h5_file) ds_from_slp = load_poses.from_sleap_file(slp_file_path) ds_from_h5 = load_poses.from_sleap_file(h5_file_path) xr.testing.assert_allclose(ds_from_h5, ds_from_slp) @@ -171,7 +169,7 @@ def test_load_from_dlc_file(self, file_name): """Test that loading pose tracks from valid DLC files returns a proper Dataset. """ - file_path = POSE_DATA_PATHS.get(file_name) + file_path = DATA_PATHS.get(file_name) ds = load_poses.from_dlc_file(file_path) self.assert_dataset(ds, file_path, "DeepLabCut") @@ -191,8 +189,8 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self): """Test that loading pose tracks from DLC .csv and .h5 files return the same Dataset. 
""" - csv_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.csv") - h5_file_path = POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5") + csv_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.csv") + h5_file_path = DATA_PATHS.get("DLC_single-wasp.predictions.h5") ds_from_csv = load_poses.from_dlc_file(csv_file_path) ds_from_h5 = load_poses.from_dlc_file(h5_file_path) xr.testing.assert_allclose(ds_from_h5, ds_from_csv) @@ -210,7 +208,7 @@ def test_load_from_dlc_file_csv_or_h5_file_returns_same(self): def test_fps_and_time_coords(self, fps, expected_fps, expected_time_unit): """Test that time coordinates are set according to the provided fps.""" ds = load_poses.from_sleap_file( - POSE_DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"), + DATA_PATHS.get("SLEAP_three-mice_Aeon_proofread.analysis.h5"), fps=fps, ) assert ds.time_unit == expected_time_unit @@ -234,7 +232,7 @@ def test_load_from_lp_file(self, file_name): """Test that loading pose tracks from valid LightningPose (LP) files returns a proper Dataset. """ - file_path = POSE_DATA_PATHS.get(file_name) + file_path = DATA_PATHS.get(file_name) ds = load_poses.from_lp_file(file_path) self.assert_dataset(ds, file_path, "LightningPose") @@ -243,7 +241,7 @@ def test_load_from_lp_or_dlc_file_returns_same(self): using either the `from_lp_file` or `from_dlc_file` function returns the same Dataset (except for the source_software). """ - file_path = POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv") + file_path = DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv") ds_drom_lp = load_poses.from_lp_file(file_path) ds_from_dlc = load_poses.from_dlc_file(file_path) xr.testing.assert_allclose(ds_from_dlc, ds_drom_lp) @@ -254,7 +252,7 @@ def test_load_multi_individual_from_lp_file_raises(self): """Test that loading a multi-individual .csv file using the `from_lp_file` function raises a ValueError. """ - file_path = POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv") + file_path = DATA_PATHS.get("DLC_two-mice.predictions.csv") with pytest.raises(ValueError): load_poses.from_lp_file(file_path) diff --git a/tests/test_unit/test_sample_data.py b/tests/test_unit/test_sample_data.py index 50967d626..ce94db19a 100644 --- a/tests/test_unit/test_sample_data.py +++ b/tests/test_unit/test_sample_data.py @@ -38,6 +38,7 @@ def validate_metadata(metadata: dict[str, dict]) -> None: """Assert that the metadata is in the expected format.""" metadata_fields = [ "sha256sum", + "type", "source_software", "fps", "species", @@ -59,9 +60,9 @@ def validate_metadata(metadata: dict[str, dict]) -> None: ), f"Expected metadata values to be dicts. {check_yaml_msg}" assert all( set(val.keys()) == set(metadata_fields) for val in metadata.values() - ), f"Found issues with the names of medatada fields. {check_yaml_msg}" + ), f"Found issues with the names of metadata fields. 
 
-    # check that metadata keys (pose file names) are unique
+    # check that metadata keys (file names) are unique
     assert len(metadata.keys()) == len(set(metadata.keys()))
 
     # check that the first 2 fields are present and are strings
diff --git a/tests/test_unit/test_save_poses.py b/tests/test_unit/test_save_poses.py
index ed3baf9c3..1efd3a477 100644
--- a/tests/test_unit/test_save_poses.py
+++ b/tests/test_unit/test_save_poses.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import pytest
 import xarray as xr
-from pytest import POSE_DATA_PATHS
+from pytest import DATA_PATHS
 
 from movement.io import load_poses, save_poses
 
@@ -66,25 +66,25 @@ def output_file_params(self, request):
             (np.array([1, 2, 3]), pytest.raises(ValueError)),  # incorrect type
             (
                 load_poses.from_dlc_file(
-                    POSE_DATA_PATHS.get("DLC_single-wasp.predictions.h5")
+                    DATA_PATHS.get("DLC_single-wasp.predictions.h5")
                 ),
                 does_not_raise(),
             ),  # valid dataset
             (
                 load_poses.from_dlc_file(
-                    POSE_DATA_PATHS.get("DLC_two-mice.predictions.csv")
+                    DATA_PATHS.get("DLC_two-mice.predictions.csv")
                 ),
                 does_not_raise(),
             ),  # valid dataset
             (
                 load_poses.from_sleap_file(
-                    POSE_DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
+                    DATA_PATHS.get("SLEAP_single-mouse_EPM.analysis.h5")
                 ),
                 does_not_raise(),
            ),  # valid dataset
             (
                 load_poses.from_sleap_file(
-                    POSE_DATA_PATHS.get(
+                    DATA_PATHS.get(
                         "SLEAP_three-mice_Aeon_proofread.predictions.slp"
                     )
                 ),
@@ -92,7 +92,7 @@ def output_file_params(self, request):
             ),  # valid dataset
             (
                 load_poses.from_lp_file(
-                    POSE_DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
+                    DATA_PATHS.get("LP_mouse-face_AIND.predictions.csv")
                 ),
                 does_not_raise(),
             ),  # valid dataset
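
---

For reviewers: a minimal usage sketch of the reworked sample-data API, not part of the diff. The filename below is purely illustrative (a real call must use a name registered in `metadata.yaml`, otherwise `fetch_dataset_paths` raises a `ValueError`), and the standalone `sha256` helper simply mirrors, under that assumption, what `get_sha256_hashes.py` is described as doing for a single new file.

```python
import hashlib
from pathlib import Path

from movement.sample_data import fetch_dataset_paths, list_datasets

# All registered sample files, whether they contain poses or bounding boxes
print(list_datasets())

# Hypothetical entry with `type: "bboxes"` in metadata.yaml: the file is fetched
# from the `bboxes/` folder, so the returned dict carries a "bboxes" key
# instead of "poses" (illustrative filename, not a real registry key)
paths = fetch_dataset_paths("VIA_some-bboxes-example.csv")
bboxes_path = paths["bboxes"]  # local path to the downloaded bboxes file
frame_path = paths["frame"]  # None if no associated frame is registered
video_path = paths["video"]  # None if no associated video is registered


def sha256(file_path: Path) -> str:
    """Return the sha256 checksum of a file, read in 64 KiB chunks."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


print(sha256(bboxes_path))  # value to paste into the sha256sum metadata field
```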