From 321e53d1fc9a2abc427eb6c60d473db8ea06fc87 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 26 Aug 2022 17:02:28 -0700 Subject: [PATCH 01/89] start creating the structure for lazy echodata combine --- echopype/echodata/combine_lazily.py | 46 +++++++++++++++++++++++++ echopype/echodata/combine_preprocess.py | 39 +++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 echopype/echodata/combine_lazily.py create mode 100644 echopype/echodata/combine_preprocess.py diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py new file mode 100644 index 000000000..89b0b987e --- /dev/null +++ b/echopype/echodata/combine_lazily.py @@ -0,0 +1,46 @@ +from .combine_preprocess import ProvenancePreprocess +from echopype.echodata import EchoData +from datatree import DataTree +import xarray as xr + +group_preprocess = {'provenance': ProvenancePreprocess} + + +# desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') + + + +# initial strucuture for lazy combine +# tree_dict = {} +# result = EchoData() +# +# # for group, value in EchoData.group_map.items()[:2]: +# for group, value in list(EchoData.group_map.items())[:3]: +# +# print(value["ep_group"]) +# +# obj = ProvenancePreprocess(desired_raw_file_paths) +# +# combined_group = xr.open_mfdataset(desired_raw_file_paths, +# engine='zarr', coords='minimal', preprocess=obj, +# combine="nested", group=value["ep_group"], concat_dim=None) +# +# if value["ep_group"] is None: +# tree_dict["/"] = combined_group +# else: +# tree_dict[value["ep_group"]] = combined_group +# +# # Set tree into echodata object +# result._set_tree(tree=DataTree.from_dict(tree_dict, name="root")) +# result._load_tree() + + + +# How to construct Provenance Group +# obj = ProvenancePreprocess(desired_raw_file_paths) +# +# out = xr.open_mfdataset(desired_raw_file_paths[:2], +# engine='zarr', coords='minimal', +# combine="nested", group='Provenance', +# preprocess=obj, concat_dim=None) +# TODO: to be identical to in-memory combine remove filenames as coordinate (keep as dim) \ No newline at end of file diff --git a/echopype/echodata/combine_preprocess.py b/echopype/echodata/combine_preprocess.py new file mode 100644 index 000000000..f0ed9fd1a --- /dev/null +++ b/echopype/echodata/combine_preprocess.py @@ -0,0 +1,39 @@ +import numpy as np +from pathlib import Path +import xarray as xr + + +class ProvenancePreprocess: + def __init__(self, file_paths): + self.file_paths = file_paths + + def __call__(self, ds): + self.assign_file_index(ds) + self.store_attrs(ds) + + return ds + + def assign_file_index(self, ds): + + ind_file = self.file_paths.index(ds.encoding["source"]) + ds['filenames'] = (['filenames'], np.array([ind_file])) + + def store_attrs(self, ds): + + file_name = Path(ds.encoding["source"]).name + + attrs_var = xr.DataArray(data=np.array([list(ds.attrs.values())]), + coords={'echodata_filename': (['echodata_filename'], np.array([file_name])), + 'provenance_attr_key': (['provenance_attr_key'], + np.array(['conversion_software_name', + 'conversion_software_version', + 'conversion_time', + 'duplicate_ping_times']))}) + + ds['provenance_attrs'] = attrs_var + + + + + + From c1426f299ac1429dfca0d84a3d2c24d26c166e9f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 30 Aug 2022 11:45:55 -0700 Subject: [PATCH 02/89] create PreprocessCallable class and add functionality to laze_combine --- echopype/echodata/combine_lazily.py | 89 +++++++++++++++++-------- echopype/echodata/combine_preprocess.py | 44 +++++++----- 2 files changed, 89 insertions(+), 44 deletions(-) diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py index 89b0b987e..da7327c2c 100644 --- a/echopype/echodata/combine_lazily.py +++ b/echopype/echodata/combine_lazily.py @@ -1,38 +1,75 @@ -from .combine_preprocess import ProvenancePreprocess +from .combine_preprocess import PreprocessCallable from echopype.echodata import EchoData from datatree import DataTree import xarray as xr -group_preprocess = {'provenance': ProvenancePreprocess} +# desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') -# desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') +def reassign_attrs(ed_comb: EchoData): + """ + Reassigns stored group attributes to the Provenance group. + """ + for group, value in EchoData.group_map.items(): + if value["ep_group"] not in ['Sonar/Beam_group2', 'Sonar/Beam_group3', 'Sonar/Beam_group4']: -# initial strucuture for lazy combine -# tree_dict = {} -# result = EchoData() -# -# # for group, value in EchoData.group_map.items()[:2]: -# for group, value in list(EchoData.group_map.items())[:3]: -# -# print(value["ep_group"]) -# -# obj = ProvenancePreprocess(desired_raw_file_paths) -# -# combined_group = xr.open_mfdataset(desired_raw_file_paths, -# engine='zarr', coords='minimal', preprocess=obj, -# combine="nested", group=value["ep_group"], concat_dim=None) -# -# if value["ep_group"] is None: -# tree_dict["/"] = combined_group -# else: -# tree_dict[value["ep_group"]] = combined_group -# -# # Set tree into echodata object -# result._set_tree(tree=DataTree.from_dict(tree_dict, name="root")) -# result._load_tree() + if value["ep_group"] != "Provenance": + + attr_var_name = group + '_attrs' + attr_coord_name = group + '_attr_key' + + if value["ep_group"]: + ed_grp = value["ep_group"] + else: + ed_grp = "Top-level" + + # move attribute variable to Provenance + ed_comb["Provenance"][attr_var_name] = ed_comb[ed_grp][attr_var_name] + + # remove attribute variable and coords from group + ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars([attr_var_name, attr_coord_name, + 'echodata_filename']) + + +def lazy_combine(desired_raw_file_paths): + + # initial strucuture for lazy combine + tree_dict = {} + result = EchoData() + + # grab object that does pre-processing + preprocess_obj = PreprocessCallable(desired_raw_file_paths) + + for group, value in EchoData.group_map.items(): + + print(value["ep_group"]) + + if value["ep_group"] not in ['Sonar/Beam_group2', 'Sonar/Beam_group3', 'Sonar/Beam_group4']: + + preprocess_obj.update_ed_group(group) + + combined_group = xr.open_mfdataset(desired_raw_file_paths, + engine='zarr', coords='minimal', preprocess=preprocess_obj, + combine="nested", group=value["ep_group"], concat_dim=None) + + if value["ep_group"] is None: + tree_dict["/"] = combined_group + else: + tree_dict[value["ep_group"]] = combined_group + + # Set tree into echodata object + result._set_tree(tree=DataTree.from_dict(tree_dict, name="root")) + result._load_tree() + + # reassign stored group attributes to the provenance group + reassign_attrs(result) + + # TODO: modify Provenance conversion_time attribute + # dt.utcnow().isoformat(timespec="seconds") + "Z", # use UTC time + + return result diff --git a/echopype/echodata/combine_preprocess.py b/echopype/echodata/combine_preprocess.py index f0ed9fd1a..1f2afe49d 100644 --- a/echopype/echodata/combine_preprocess.py +++ b/echopype/echodata/combine_preprocess.py @@ -1,39 +1,47 @@ import numpy as np from pathlib import Path import xarray as xr +from typing import List -class ProvenancePreprocess: - def __init__(self, file_paths): +class PreprocessCallable: + """ + Class that has all preprocessing functions and is callable. + """ + def __init__(self, file_paths: List[str]): self.file_paths = file_paths + self.ed_group = None def __call__(self, ds): - self.assign_file_index(ds) - self.store_attrs(ds) - return ds + if self.ed_group == "provenance": + self._assign_file_index(ds) - def assign_file_index(self, ds): + self._store_attrs(ds) - ind_file = self.file_paths.index(ds.encoding["source"]) - ds['filenames'] = (['filenames'], np.array([ind_file])) + # TODO: add method to check and correct reversed times - def store_attrs(self, ds): + return ds - file_name = Path(ds.encoding["source"]).name + def update_ed_group(self, group: str): + self.ed_group = group - attrs_var = xr.DataArray(data=np.array([list(ds.attrs.values())]), - coords={'echodata_filename': (['echodata_filename'], np.array([file_name])), - 'provenance_attr_key': (['provenance_attr_key'], - np.array(['conversion_software_name', - 'conversion_software_version', - 'conversion_time', - 'duplicate_ping_times']))}) + def _assign_file_index(self, ds): - ds['provenance_attrs'] = attrs_var + ind_file = self.file_paths.index(ds.encoding["source"]) + ds['filenames'] = (['filenames'], np.array([ind_file])) + # TODO: add method to check and correct reversed times + def _store_attrs(self, ds): + file_name = Path(ds.encoding["source"]).name + grp_key_name = self.ed_group + '_attr_key' + grp_attr_names = np.array(list(ds.attrs.keys())) + attrs_var = xr.DataArray(data=np.array([list(ds.attrs.values())]), + coords={'echodata_filename': (['echodata_filename'], np.array([file_name])), + grp_key_name: ([grp_key_name], grp_attr_names)}) + ds[self.ed_group + '_attrs'] = attrs_var From bb59291695e8f2f1cd5bebdc25732cda37abba7f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 30 Aug 2022 17:04:18 -0700 Subject: [PATCH 03/89] finish creating a working version of lazy_combine --- echopype/echodata/combine_lazily.py | 85 +++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py index da7327c2c..bcacf4cf1 100644 --- a/echopype/echodata/combine_lazily.py +++ b/echopype/echodata/combine_lazily.py @@ -2,38 +2,75 @@ from echopype.echodata import EchoData from datatree import DataTree import xarray as xr +from fsspec.implementations.local import LocalFileSystem # desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') -def reassign_attrs(ed_comb: EchoData): +def get_ed_path_from_str(zarr_path: str, path: str): + """ + + Parameters + ---------- + zarr_path: str + Full path to zarr file + path: str + Full path to ``.zgroup`` + """ + + # the names of the groups that are needed to get to path + all_grp_names = [elm for elm in path.split('/') if (elm not in zarr_path.split('/')) and (elm != '.zgroup')] + + return '/'.join(all_grp_names) + + +def get_zarr_grp_names(path: str, fs: LocalFileSystem) -> set: + """ + Identifies the zarr group names using the path + """ + + # grab all paths that have .zgroup + info = fs.glob(path + '/**.zgroup') + + # infer the group name based on the path + ed_grp_name = {get_ed_path_from_str(path, entry) for entry in info} + + # remove the zarr file name and replace it with Top-level + if '' in ed_grp_name: + ed_grp_name.remove('') + ed_grp_name.add(None) + + return ed_grp_name + + +def reassign_attrs(ed_comb: EchoData, common_grps: set): """ Reassigns stored group attributes to the Provenance group. """ for group, value in EchoData.group_map.items(): - if value["ep_group"] not in ['Sonar/Beam_group2', 'Sonar/Beam_group3', 'Sonar/Beam_group4']: + if (value["ep_group"] != "Provenance") and (value["ep_group"] in common_grps) and (value["ep_group"] != 'Sonar/Beam_group1'): - if value["ep_group"] != "Provenance": + attr_var_name = group + '_attrs' + attr_coord_name = group + '_attr_key' - attr_var_name = group + '_attrs' - attr_coord_name = group + '_attr_key' + if value["ep_group"]: + ed_grp = value["ep_group"] + else: + ed_grp = "Top-level" - if value["ep_group"]: - ed_grp = value["ep_group"] - else: - ed_grp = "Top-level" + # move attribute variable to Provenance + ed_comb["Provenance"][attr_var_name] = ed_comb[ed_grp][attr_var_name] - # move attribute variable to Provenance - ed_comb["Provenance"][attr_var_name] = ed_comb[ed_grp][attr_var_name] + # remove attribute variable and coords from group + ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars([attr_var_name, attr_coord_name, + 'echodata_filename']) - # remove attribute variable and coords from group - ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars([attr_var_name, attr_coord_name, - 'echodata_filename']) +def lazy_combine(desired_raw_file_paths, fs): -def lazy_combine(desired_raw_file_paths): + # TODO: test code when we have to do an expansion in range_sample # initial strucuture for lazy combine tree_dict = {} @@ -42,12 +79,24 @@ def lazy_combine(desired_raw_file_paths): # grab object that does pre-processing preprocess_obj = PreprocessCallable(desired_raw_file_paths) + # TODO: the subsequent line is zarr specific!! Account for nc in the future + # determine each zarr's group names + file_grps = [get_zarr_grp_names(path, fs) for path in desired_raw_file_paths] + + # get the group names that all files share + common_grps = set.intersection(*file_grps) + + # check that all zarrs have the same groups + if any([common_grps.symmetric_difference(s) for s in file_grps]): + raise RuntimeError('All input files must have the same groups!') + for group, value in EchoData.group_map.items(): - print(value["ep_group"]) + if (value["ep_group"] in common_grps) and (value["ep_group"] != 'Sonar/Beam_group1'): - if value["ep_group"] not in ['Sonar/Beam_group2', 'Sonar/Beam_group3', 'Sonar/Beam_group4']: + print(f"ed group = {value['ep_group']}") + convention_name = EchoData.group_map preprocess_obj.update_ed_group(group) combined_group = xr.open_mfdataset(desired_raw_file_paths, @@ -64,7 +113,7 @@ def lazy_combine(desired_raw_file_paths): result._load_tree() # reassign stored group attributes to the provenance group - reassign_attrs(result) + reassign_attrs(result, common_grps) # TODO: modify Provenance conversion_time attribute # dt.utcnow().isoformat(timespec="seconds") + "Z", # use UTC time From e58df72e916d7e5520f45fb7c7acc215411f7eea Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 31 Aug 2022 15:47:31 -0700 Subject: [PATCH 04/89] start working on v2 of combine_lazily --- echopype/echodata/combine_lazily.py | 5 +- echopype/echodata/combine_lazily_v2.py | 95 +++++++++++++++++++++++++ echopype/echodata/combine_preprocess.py | 16 +++++ 3 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 echopype/echodata/combine_lazily_v2.py diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py index bcacf4cf1..0ab7d35b6 100644 --- a/echopype/echodata/combine_lazily.py +++ b/echopype/echodata/combine_lazily.py @@ -50,7 +50,7 @@ def reassign_attrs(ed_comb: EchoData, common_grps: set): for group, value in EchoData.group_map.items(): - if (value["ep_group"] != "Provenance") and (value["ep_group"] in common_grps) and (value["ep_group"] != 'Sonar/Beam_group1'): + if (value["ep_group"] != "Provenance") and (value["ep_group"] in common_grps): attr_var_name = group + '_attrs' attr_coord_name = group + '_attr_key' @@ -92,11 +92,10 @@ def lazy_combine(desired_raw_file_paths, fs): for group, value in EchoData.group_map.items(): - if (value["ep_group"] in common_grps) and (value["ep_group"] != 'Sonar/Beam_group1'): + if (value["ep_group"] in common_grps): print(f"ed group = {value['ep_group']}") - convention_name = EchoData.group_map preprocess_obj.update_ed_group(group) combined_group = xr.open_mfdataset(desired_raw_file_paths, diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py new file mode 100644 index 000000000..19d84e616 --- /dev/null +++ b/echopype/echodata/combine_lazily_v2.py @@ -0,0 +1,95 @@ +import xarray as xr +import pandas as pd +import dask.array +import dask + +const_dims = ['channel'] + +def get_ds_dims_info(ds_list): + + ds_dims = [] + for ds in ds_list: + ds_dims.append(ds.dims) + + dims_df = pd.DataFrame(ds_dims) + dims_sum = dims_df.sum(axis=0).to_dict() + dims_max = dims_df.max(axis=0).to_dict() + dims_csum = dims_df.cumsum(axis=0).to_dict() + + return dims_sum, dims_csum, dims_max + + +def get_temp_arr_vals(dims, dims_max, dims_sum): + + shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] + + chnk_shape = [None if dim in const_dims else dims_max[dim] for dim in dims] + + return shape, chnk_shape + + +def constuct_lazy_ds(ds_model, dims_sum, dims_max): + + xr_dict = dict() + + unwritten_vars = [] + for name, val in ds_model.variables.items(): + + if ('channel',) != val.dims: + shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum) + temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape) + + xr_dict[name] = (val.dims, temp_arr, val.attrs) + + else: + unwritten_vars.append(name) + + ds = xr.Dataset(xr_dict) + + return ds, unwritten_vars + + +def get_region(ds_ind, dims_csum): + + print([csum[ds_ind] for dim, csum in dims_csum.items()]) + + if ds_ind == 0: + region = {dim: slice(0, csum[ds_ind]) for dim, csum in dims_csum.items() if dim not in const_dims} + + else: + region = {dim: slice(csum[ds_ind-1], csum[ds_ind]) for dim, csum in dims_csum.items() if dim not in const_dims} + + return region + + + +def direct_write(path, ds_list): + + dims_sum, dims_csum, dims_max = get_ds_dims_info(ds_list) + + ds_lazy, unwritten_vars = constuct_lazy_ds(ds_list[0], dims_sum, dims_max) + + # ds_lazy.to_zarr(path, compute=False) + + for i in range(len(ds_list)): + + print(get_region(i, dims_csum)) + + + # + # eds_lazy[0] = eds_lazy[0].drop(['time1', 'channel', 'frequency_nominal']) + # eds_lazy[0].to_zarr(path, region={"time1": slice(0, var_cumulative_sum["time1"].loc[0])}) + # + # for i in range(1, len(eds_lazy)): + # print(i) + # eds_lazy[i] = eds_lazy[i].drop(['time1', 'channel', 'frequency_nominal']) + # + # print(var_cumulative_sum["time1"].loc[i - 1], var_cumulative_sum["time1"].loc[i]) + # slc = slice(var_cumulative_sum["time1"].loc[i - 1], var_cumulative_sum["time1"].loc[i]) + # eds_lazy[i].to_zarr(path, region={"time1": slc}) + + +# def lazy_combine(path, eds): +# +# # TODO: do direct_write(path, ds_list) for each group in eds + diff --git a/echopype/echodata/combine_preprocess.py b/echopype/echodata/combine_preprocess.py index 1f2afe49d..acccb6530 100644 --- a/echopype/echodata/combine_preprocess.py +++ b/echopype/echodata/combine_preprocess.py @@ -19,6 +19,8 @@ def __call__(self, ds): self._store_attrs(ds) + ds = self.re_chunk(ds) + # TODO: add method to check and correct reversed times return ds @@ -26,6 +28,20 @@ def __call__(self, ds): def update_ed_group(self, group: str): self.ed_group = group + def re_chunk(self, ds): + + # chunk_dict = {'time2': 1000, 'time3': 1000} + # chunk_dict = {'ping_time': 100, 'range_sample': 100} + + # ds = ds.chunk(chunk_dict) + + for drop_var in ['backscatter_r', 'angle_athwartship', 'angle_alongship']: + + if drop_var in ds: + ds = ds.drop_vars(drop_var) + + return ds + def _assign_file_index(self, ds): ind_file = self.file_paths.index(ds.encoding["source"]) From e2b9ec664a2b01003b56c9683c569b11bd9706a5 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 31 Aug 2022 17:37:42 -0700 Subject: [PATCH 05/89] get a working version of direct_write in combine_lazily_v2 --- echopype/echodata/combine_lazily_v2.py | 54 +++++++++++++++----------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 19d84e616..0370642d5 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -2,6 +2,7 @@ import pandas as pd import dask.array import dask +import numpy as np const_dims = ['channel'] @@ -16,19 +17,19 @@ def get_ds_dims_info(ds_list): dims_max = dims_df.max(axis=0).to_dict() dims_csum = dims_df.cumsum(axis=0).to_dict() - return dims_sum, dims_csum, dims_max + return dims_sum, dims_csum, dims_max, dims_df -def get_temp_arr_vals(dims, dims_max, dims_sum): +def get_temp_arr_vals(dims, dims_max, dims_sum, dims_df): shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] - chnk_shape = [None if dim in const_dims else dims_max[dim] for dim in dims] + chnk_shape = [None if dim in const_dims else tuple(dims_df[dim].to_list()) for dim in dims] return shape, chnk_shape -def constuct_lazy_ds(ds_model, dims_sum, dims_max): +def constuct_lazy_ds(ds_model, dims_sum, dims_max, dims_df): xr_dict = dict() @@ -36,8 +37,9 @@ def constuct_lazy_ds(ds_model, dims_sum, dims_max): for name, val in ds_model.variables.items(): if ('channel',) != val.dims: - shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum) - temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape) + shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum, dims_df) + + temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=val.dtype) xr_dict[name] = (val.dims, temp_arr, val.attrs) @@ -51,8 +53,6 @@ def constuct_lazy_ds(ds_model, dims_sum, dims_max): def get_region(ds_ind, dims_csum): - print([csum[ds_ind] for dim, csum in dims_csum.items()]) - if ds_ind == 0: region = {dim: slice(0, csum[ds_ind]) for dim, csum in dims_csum.items() if dim not in const_dims} @@ -62,31 +62,39 @@ def get_region(ds_ind, dims_csum): return region +def get_fill_dict(ds_lazy): + + fill_vals = dict() + for var, val in ds_lazy.variables.items(): + + if val.dtype == np.float64: + fill_vals[var] = {'_FillValue': np.nan} + elif val.dtype == np.dtype(' Date: Thu, 1 Sep 2022 16:58:04 -0700 Subject: [PATCH 06/89] make construct_lazy_ds return ds_unwritten --- echopype/echodata/combine_lazily_v2.py | 63 +++++++++++++++++++------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 0370642d5..2e70713fc 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -4,7 +4,10 @@ import dask import numpy as np -const_dims = ['channel'] +const_dims = ['channel'] # those dimensions that should not be chunked +time_dims = ['time1', 'time2', 'time3'] # those dimensions associated with time +possible_dims = [] #const_dims + time_dims # all possible dimensions we can encounter + def get_ds_dims_info(ds_list): @@ -20,35 +23,36 @@ def get_ds_dims_info(ds_list): return dims_sum, dims_csum, dims_max, dims_df -def get_temp_arr_vals(dims, dims_max, dims_sum, dims_df): +def get_temp_arr_vals(dims, dims_max, dims_sum): shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] - chnk_shape = [None if dim in const_dims else tuple(dims_df[dim].to_list()) for dim in dims] + chnk_shape = [None if dim in const_dims else dims_max[dim] for dim in dims] return shape, chnk_shape -def constuct_lazy_ds(ds_model, dims_sum, dims_max, dims_df): +def construct_lazy_ds(ds_model, dims_sum, dims_max): xr_dict = dict() - unwritten_vars = [] + unwritten_dict = dict() for name, val in ds_model.variables.items(): - if ('channel',) != val.dims: - shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum, dims_df) + if (name not in possible_dims) and (val.dims != ('channel',)): # TODO: hard coded, can we avoid it? + shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum) temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=val.dtype) xr_dict[name] = (val.dims, temp_arr, val.attrs) else: - unwritten_vars.append(name) + unwritten_dict[name] = val ds = xr.Dataset(xr_dict) + ds_unwritten = xr.Dataset(unwritten_dict) - return ds, unwritten_vars + return ds, ds_unwritten def get_region(ds_ind, dims_csum): @@ -77,25 +81,52 @@ def get_fill_dict(ds_lazy): return fill_vals -def direct_write(path, ds_list): +def direct_write(path, ds_list, group): dims_sum, dims_csum, dims_max, dims_df = get_ds_dims_info(ds_list) - ds_lazy, unwritten_vars = constuct_lazy_ds(ds_list[0], dims_sum, dims_max, dims_df) + # TODO: Do check that all of the channels are the same and times don't overlap and they increase + + ds_lazy, ds_unwritten = construct_lazy_ds(ds_list[0], dims_sum, dims_max) # set fill value for each of the arrays fill_vals = get_fill_dict(ds_lazy) - ds_lazy.to_zarr(path, compute=False, encoding=fill_vals) + print("group") + ds_lazy.to_zarr(path, compute=False, group=group, encoding=fill_vals, consolidated=True) + + # variables to drop from each ds and write in later + drop_vars = list(ds_unwritten) + list(ds_unwritten.dims) + + for i in range(len(ds_list)): # TODO: parallelize this loop + + region = get_region(i, dims_csum) + ds_list[i].drop(drop_vars).to_zarr(path, group=group, region=region) + + + # TODO: maybe this will work for time: + # ds_lazy[0][["time1"]].to_zarr(path, group=grp_name, region={'time1': slice(0, 5923)}) - for i in range(len(ds_list)): + # ds_opened = xr.open_zarr(path, group=group) + # + # dims_drop = set(ds_unwritten.dims).intersection(set(time_dims)) + # for name, val in ds_unwritten.drop(dims_drop).items(): + # ds_opened[name] = val + # + # def func(ds): + # + # return ds[time_dims] + # + # times = xr.concat(list(map(func, ds_lazy)), dim=time_dims, coords='all').drop("concat_dim") + # + # for time, val in times.coords.items(): + # ds_opened[time] = val - ds_list[i] = ds_list[i].drop(unwritten_vars) - ds_list[i].to_zarr(path, region=get_region(i, dims_csum)) - #TODO: figure out why time1 is not being correctly written to zarr + # TODO: add back in coordinates and attributes for dataset + # TODO: re-chunk the zarr store after everything has been added # def lazy_combine(path, eds): # From b4d9a13a96f4ec329f76115490c7e5f9d424f683 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 2 Sep 2022 10:30:22 -0700 Subject: [PATCH 07/89] correctly write all variables and dimensions for the Environment group using combine_lazily_v2 --- echopype/echodata/combine_lazily_v2.py | 103 +++++++++++-------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 2e70713fc..baa1b25b8 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -2,11 +2,12 @@ import pandas as pd import dask.array import dask -import numpy as np + const_dims = ['channel'] # those dimensions that should not be chunked time_dims = ['time1', 'time2', 'time3'] # those dimensions associated with time -possible_dims = [] #const_dims + time_dims # all possible dimensions we can encounter +possible_dims = const_dims + time_dims # all possible dimensions we can encounter +lazy_encodings = ["chunks", "preferred_chunks", "compressor"] def get_ds_dims_info(ds_list): @@ -23,62 +24,54 @@ def get_ds_dims_info(ds_list): return dims_sum, dims_csum, dims_max, dims_df -def get_temp_arr_vals(dims, dims_max, dims_sum): +def get_temp_arr(dims, dtype, dims_max, dims_sum): shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] - chnk_shape = [None if dim in const_dims else dims_max[dim] for dim in dims] + chnk_shape = [dims_max[dim] for dim in dims] - return shape, chnk_shape + return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) def construct_lazy_ds(ds_model, dims_sum, dims_max): - xr_dict = dict() - - unwritten_dict = dict() + xr_vars_dict = dict() + xr_coords_dict = dict() for name, val in ds_model.variables.items(): - - if (name not in possible_dims) and (val.dims != ('channel',)): # TODO: hard coded, can we avoid it? - shape, chnk_shape = get_temp_arr_vals(val.dims, dims_max, dims_sum) - - temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=val.dtype) - - xr_dict[name] = (val.dims, temp_arr, val.attrs) + if name not in possible_dims: + temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) + xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) else: - unwritten_dict[name] = val + temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) + xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) - ds = xr.Dataset(xr_dict) - ds_unwritten = xr.Dataset(unwritten_dict) + ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict) - return ds, ds_unwritten + # TODO: add ds attributes here? + return ds -def get_region(ds_ind, dims_csum): + +def get_region(ds_ind, dims_csum, ds_dims): if ds_ind == 0: - region = {dim: slice(0, csum[ds_ind]) for dim, csum in dims_csum.items() if dim not in const_dims} + region = {dim: slice(0, dims_csum[dim][ds_ind]) for dim in ds_dims} else: - region = {dim: slice(csum[ds_ind-1], csum[ds_ind]) for dim, csum in dims_csum.items() if dim not in const_dims} + region = {dim: slice(dims_csum[dim][ds_ind-1], dims_csum[dim][ds_ind]) for dim in ds_dims} return region -def get_fill_dict(ds_lazy): - - fill_vals = dict() - for var, val in ds_lazy.variables.items(): +def get_ds_encodings(ds_model): - if val.dtype == np.float64: - fill_vals[var] = {'_FillValue': np.nan} - elif val.dtype == np.dtype(' Date: Fri, 2 Sep 2022 11:50:35 -0700 Subject: [PATCH 08/89] account for the rest of the constant dimensions --- echopype/echodata/combine_lazily_v2.py | 52 +++++++++++++++++++++----- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index baa1b25b8..04329248f 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -4,9 +4,16 @@ import dask -const_dims = ['channel'] # those dimensions that should not be chunked -time_dims = ['time1', 'time2', 'time3'] # those dimensions associated with time -possible_dims = const_dims + time_dims # all possible dimensions we can encounter +# those dimensions that should not be chunked +const_dims = ['channel', 'beam_group', 'beam', 'range_sample', 'pulse_length_bin'] + +# those dimensions associated with time +time_dims = ['time1', 'time2', 'time3', 'ping_time'] + +# all possible dimensions we can encounter +possible_dims = const_dims + time_dims + +# encodings associated with lazy loaded variables lazy_encodings = ["chunks", "preferred_chunks", "compressor"] @@ -74,7 +81,23 @@ def get_ds_encodings(ds_model): return encodings -def direct_write(path, ds_list, group): +def get_constant_vars(ds_model): + + dim_form = [(dim,) for dim in const_dims] + + # account for Vendor_specific vars + dim_form.append(('channel', 'pulse_length_bin')) # TODO: is there a better way? + + const_vars = [] + for name, val in ds_model.variables.items(): + + if val.dims in dim_form: + const_vars.append(name) + + return const_vars + + +def direct_write(path, ds_list, group, storage_options): dims_sum, dims_csum, dims_max, dims_df = get_ds_dims_info(ds_list) @@ -86,10 +109,11 @@ def direct_write(path, ds_list, group): # get encodings for each of the arrays encodings = get_ds_encodings(ds_list[0]) - ds_lazy.to_zarr(path, compute=False, group=group, encoding=encodings, consolidated=True) + ds_lazy.to_zarr(path, compute=False, group=group, encoding=encodings, + consolidated=True, storage_options=storage_options) # constant variables that will be written in later - const_vars = ["frequency_nominal", "channel"] # TODO: generalize this! + const_vars = get_constant_vars(ds_list[0]) print(f"const_vars = {const_vars}") @@ -98,20 +122,28 @@ def direct_write(path, ds_list, group): ds_dims = set(ds_list[i].dims) - set(const_vars) region = get_region(i, dims_csum, ds_dims) - ds_list[i].drop(const_vars).to_zarr(path, group=group, region=region) + ds_list[i].drop(const_vars).to_zarr(path, group=group, region=region, + storage_options=storage_options) # write constant vars to zarr using the first element of ds_list for var in const_vars: # TODO: one should not parallelize this loop?? - if var not in possible_dims: # dims will be automatically filled in + # dims will be automatically filled when they occur in a variable + if (var not in possible_dims) or (var in ['beam', 'range_sample']): region = get_region(0, dims_csum, list(ds_list[0][var].dims)) - ds_list[0][[var]].to_zarr(path, group=group, region=region) + ds_list[0][[var]].to_zarr(path, group=group, region=region, + storage_options=storage_options) # TODO: add back in attributes for dataset + # TODO: correctly add attribute keys for Provenance group + + # TODO: need to consider the case where range_sample needs to be padded + + # TODO: re-chunk the zarr store after everything has been added? - # TODO: re-chunk the zarr store after everything has been added + # TODO: is there a way we can preserve order in variables with writing? # def lazy_combine(path, eds): # From 44faf4db72672a8e26acd9b5aea67e2488f11233 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 2 Sep 2022 14:40:40 -0700 Subject: [PATCH 09/89] add comments and documentation to code in combine_lazily_v2 --- echopype/echodata/combine_lazily_v2.py | 252 +++++++++++++++++++++---- 1 file changed, 217 insertions(+), 35 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 04329248f..40c7cdd8f 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -2,6 +2,10 @@ import pandas as pd import dask.array import dask +from typing import List, Tuple, Dict, Hashable, Optional, Set + + +# TODO: make this a class and have dims info/below lists as a class variable # those dimensions that should not be chunked @@ -17,111 +21,285 @@ lazy_encodings = ["chunks", "preferred_chunks", "compressor"] -def get_ds_dims_info(ds_list): - - ds_dims = [] - for ds in ds_list: - ds_dims.append(ds.dims) - - dims_df = pd.DataFrame(ds_dims) +def get_ds_dims_info(ds_list: List[xr.Dataset]) -> Tuple[dict, dict, dict]: + """ + Constructs useful dictionaries that contain information + about the dimensions of the Dataset + + Parameters + ---------- + ds_list: List[xr.Dataset] + The Datasets that will be combined + + Returns + ------- + dims_sum: dict + Keys as the dimension name and values as the corresponding + sum of the lengths across all Datasets + dims_csum: dict + Keys as the dimension name and values as a dictionary of + the corresponding cumulative sum of the lengths across + all Datasets + dims_max: dict + Keys as the dimension name and values as the corresponding + maximum length across all Datasets + """ + + # Dataframe with column as dim names and rows as the different Datasets + dims_df = pd.DataFrame([ds.dims for ds in ds_list]) + + # calculate useful information about the dimensions dims_sum = dims_df.sum(axis=0).to_dict() - dims_max = dims_df.max(axis=0).to_dict() dims_csum = dims_df.cumsum(axis=0).to_dict() + dims_max = dims_df.max(axis=0).to_dict() - return dims_sum, dims_csum, dims_max, dims_df - - -def get_temp_arr(dims, dtype, dims_max, dims_sum): - + return dims_sum, dims_csum, dims_max + + +def get_temp_arr(dims: List[str], dtype: type, + dims_max: dict, dims_sum: dict) -> dask.array: + """ + Constructs a temporary (or dummy) array representing a + variable in its final combined form. + + Parameters + ---------- + dims: List[str] + A list of the dimension names + dtype: type + The data type of the variable + dims_max: dict + Keys as the dimension name and values as the corresponding + maximum length across all Datasets + dims_sum: dict + Keys as the dimension name and values as the corresponding + sum of the lengths across all Datasets + + Returns + ------- + dask.array + a temporary (or dummy) array representing a + variable in its final combined form. + + Notes + ----- + This array is never interacted with in a traditional sense. + Its sole purpose is to construct metadata for the zarr store. + """ + + # Create the shape of the variable in its final combined form (padding occurs here) # TODO: make sure this is true shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] + # Create the chunk shape of the variable chnk_shape = [dims_max[dim] for dim in dims] return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) -def construct_lazy_ds(ds_model, dims_sum, dims_max): +def construct_lazy_ds(ds_model: xr.Dataset, dims_sum: dict, + dims_max: dict) -> xr.Dataset: + """ + Constructs a lazy Dataset representing the EchoData group + Dataset in its final combined form. + + Parameters + ---------- + ds_model: xr.Dataset + A Dataset that we will model our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + dims_sum: dict + Keys as the dimension name and values as the corresponding + sum of the lengths across all Datasets + dims_max: dict + Keys as the dimension name and values as the corresponding + maximum length across all Datasets + + Returns + ------- + xr.Dataset + A lazy Dataset representing the EchoData group Dataset in + its final combined form + + Notes + ----- + The sole purpose of the Dataset created is to construct metadata + for the zarr store. + """ xr_vars_dict = dict() xr_coords_dict = dict() for name, val in ds_model.variables.items(): if name not in possible_dims: + + # create lazy DataArray representations corresponding to the variables temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) else: + + # create lazy DataArray representations corresponding to the coordinates temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) + # construct lazy Dataset form ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict) - # TODO: add ds attributes here? + # TODO: add ds attributes here and store all dataset attributes? return ds -def get_region(ds_ind, dims_csum, ds_dims): - - if ds_ind == 0: - region = {dim: slice(0, dims_csum[dim][ds_ind]) for dim in ds_dims} - - else: - region = {dim: slice(dims_csum[dim][ds_ind-1], dims_csum[dim][ds_ind]) for dim in ds_dims} +def get_ds_encodings(ds_model: xr.Dataset) -> Dict[Hashable, dict]: + """ + Obtains the encodings needed for each variable + of the lazy Dataset form. - return region + Parameters + ---------- + ds_model: xr.Dataset + The Dataset that we modelled our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + Returns + ------- + encodings: Dict[Hashable, dict] + The keys are a string representing the variable name and the + values are a dictionary of the corresponding encodings -def get_ds_encodings(ds_model): + Notes + ----- + The encodings corresponding to the lazy encodings (e.g. compressor) + should not be included here, these will be generated by `to_zarr`. + """ encodings = dict() for name, val in ds_model.variables.items(): + + # get all encodings except the lazy encodings encodings[name] = {key: encod for key, encod in val.encoding.items() if key not in lazy_encodings} return encodings -def get_constant_vars(ds_model): +def get_constant_vars(ds_model: xr.Dataset) -> list: + """ + Obtains all variable and dimension names that will + be the same across all Datasets that will be combined. + + Parameters + ---------- + ds_model: xr.Dataset + The Dataset that we modelled our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + + Returns + ------- + const_vars: list + Variable and dimension names that will be the same across all + Datasets that will be combined. + """ + # obtain the form of the dimensions for each constant variable dim_form = [(dim,) for dim in const_dims] - # account for Vendor_specific vars + # account for Vendor_specific variables dim_form.append(('channel', 'pulse_length_bin')) # TODO: is there a better way? + # obtain all constant variables and dimensions const_vars = [] for name, val in ds_model.variables.items(): - if val.dims in dim_form: const_vars.append(name) return const_vars -def direct_write(path, ds_list, group, storage_options): +def get_region(ds_ind: int, dims_csum: dict, + ds_dims: Set[Hashable]) -> Dict[str, slice]: + """ + Returns the region of the zarr file to write to. This region + corresponds to the input set of dimensions. + + Parameters + ---------- + ds_ind: int + The key of the values of ``dims_csum`` to use for each + dimension name + dims_csum: dict + Keys as the dimension name and values as a dictionary of + the corresponding cumulative sum of the lengths across + all Datasets + ds_dims: Set[Hashable] + The names of the dimensions used in the region creation + + Returns + ------- + region: Dict[str, slice] + Keys set as the dimension name and values as + the slice of the zarr portion to write to + """ - dims_sum, dims_csum, dims_max, dims_df = get_ds_dims_info(ds_list) + if ds_ind == 0: + + # get the initial region + region = {dim: slice(0, dims_csum[dim][ds_ind]) for dim in ds_dims} + + else: - # TODO: Do check that all of the channels are the same and times don't overlap and they increase + # get all other regions + region = {dim: slice(dims_csum[dim][ds_ind - 1], dims_csum[dim][ds_ind]) for dim in ds_dims} + + return region + + +def direct_write(path: str, ds_list: List[xr.Dataset], + group: str, storage_options: Optional[dict] = {}) -> None: + """ + Creates a zarr store and then appends each Dataset + in ``ds_list`` to it. The final result is a combined + Dataset along the time dimensions. + + Parameters + ---------- + path: str + The full path of the final combined zarr store + ds_list: List[xr.Dataset] + The Datasets that will be combined + group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + """ + + dims_sum, dims_csum, dims_max = get_ds_dims_info(ds_list) + + # TODO: Check that all of the channels are the same and times don't overlap and they increase # may have an issue with time1 and NaT ds_lazy = construct_lazy_ds(ds_list[0], dims_sum, dims_max) - # get encodings for each of the arrays encodings = get_ds_encodings(ds_list[0]) + # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr(path, compute=False, group=group, encoding=encodings, consolidated=True, storage_options=storage_options) - # constant variables that will be written in later + # constant variables that will be written later const_vars = get_constant_vars(ds_list[0]) print(f"const_vars = {const_vars}") + # write each non-constant variable in ds_list to the zarr store for i in range(len(ds_list)): # TODO: parallelize this loop + # obtain the names of all ds dimensions that are not constant ds_dims = set(ds_list[i].dims) - set(const_vars) region = get_region(i, dims_csum, ds_dims) + ds_list[i].drop(const_vars).to_zarr(path, group=group, region=region, storage_options=storage_options) @@ -131,7 +309,8 @@ def direct_write(path, ds_list, group, storage_options): # dims will be automatically filled when they occur in a variable if (var not in possible_dims) or (var in ['beam', 'range_sample']): - region = get_region(0, dims_csum, list(ds_list[0][var].dims)) + region = get_region(0, dims_csum, set(ds_list[0][var].dims)) + ds_list[0][[var]].to_zarr(path, group=group, region=region, storage_options=storage_options) @@ -139,7 +318,7 @@ def direct_write(path, ds_list, group, storage_options): # TODO: add back in attributes for dataset # TODO: correctly add attribute keys for Provenance group - # TODO: need to consider the case where range_sample needs to be padded + # TODO: need to consider the case where range_sample needs to be padded? # TODO: re-chunk the zarr store after everything has been added? @@ -147,5 +326,8 @@ def direct_write(path, ds_list, group, storage_options): # def lazy_combine(path, eds): # -# # TODO: do direct_write(path, ds_list) for each group in eds + +# TODO: do direct_write(path, ds_list) for each group in eds +# then do open_converted(path) --> here we could re-chunk? + From 2a89e6d52c920752aec8bc07a23e1ac75ff7f8fe Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 6 Sep 2022 09:04:45 -0700 Subject: [PATCH 10/89] make combine_lazily_v2 into a class --- echopype/echodata/combine_lazily.py | 50 ++- echopype/echodata/combine_lazily_v2.py | 573 ++++++++++++------------- 2 files changed, 315 insertions(+), 308 deletions(-) diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py index 0ab7d35b6..fec7f90ee 100644 --- a/echopype/echodata/combine_lazily.py +++ b/echopype/echodata/combine_lazily.py @@ -1,9 +1,11 @@ -from .combine_preprocess import PreprocessCallable -from echopype.echodata import EchoData -from datatree import DataTree import xarray as xr +from datatree import DataTree from fsspec.implementations.local import LocalFileSystem +from echopype.echodata import EchoData + +from .combine_preprocess import PreprocessCallable + # desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') @@ -19,9 +21,11 @@ def get_ed_path_from_str(zarr_path: str, path: str): """ # the names of the groups that are needed to get to path - all_grp_names = [elm for elm in path.split('/') if (elm not in zarr_path.split('/')) and (elm != '.zgroup')] + all_grp_names = [ + elm for elm in path.split("/") if (elm not in zarr_path.split("/")) and (elm != ".zgroup") + ] - return '/'.join(all_grp_names) + return "/".join(all_grp_names) def get_zarr_grp_names(path: str, fs: LocalFileSystem) -> set: @@ -30,14 +34,14 @@ def get_zarr_grp_names(path: str, fs: LocalFileSystem) -> set: """ # grab all paths that have .zgroup - info = fs.glob(path + '/**.zgroup') + info = fs.glob(path + "/**.zgroup") # infer the group name based on the path ed_grp_name = {get_ed_path_from_str(path, entry) for entry in info} # remove the zarr file name and replace it with Top-level - if '' in ed_grp_name: - ed_grp_name.remove('') + if "" in ed_grp_name: + ed_grp_name.remove("") ed_grp_name.add(None) return ed_grp_name @@ -52,8 +56,8 @@ def reassign_attrs(ed_comb: EchoData, common_grps: set): if (value["ep_group"] != "Provenance") and (value["ep_group"] in common_grps): - attr_var_name = group + '_attrs' - attr_coord_name = group + '_attr_key' + attr_var_name = group + "_attrs" + attr_coord_name = group + "_attr_key" if value["ep_group"]: ed_grp = value["ep_group"] @@ -64,15 +68,16 @@ def reassign_attrs(ed_comb: EchoData, common_grps: set): ed_comb["Provenance"][attr_var_name] = ed_comb[ed_grp][attr_var_name] # remove attribute variable and coords from group - ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars([attr_var_name, attr_coord_name, - 'echodata_filename']) + ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars( + [attr_var_name, attr_coord_name, "echodata_filename"] + ) def lazy_combine(desired_raw_file_paths, fs): # TODO: test code when we have to do an expansion in range_sample - # initial strucuture for lazy combine + # initial structure for lazy combine tree_dict = {} result = EchoData() @@ -88,19 +93,25 @@ def lazy_combine(desired_raw_file_paths, fs): # check that all zarrs have the same groups if any([common_grps.symmetric_difference(s) for s in file_grps]): - raise RuntimeError('All input files must have the same groups!') + raise RuntimeError("All input files must have the same groups!") for group, value in EchoData.group_map.items(): - if (value["ep_group"] in common_grps): + if value["ep_group"] in common_grps: print(f"ed group = {value['ep_group']}") preprocess_obj.update_ed_group(group) - combined_group = xr.open_mfdataset(desired_raw_file_paths, - engine='zarr', coords='minimal', preprocess=preprocess_obj, - combine="nested", group=value["ep_group"], concat_dim=None) + combined_group = xr.open_mfdataset( + desired_raw_file_paths, + engine="zarr", + coords="minimal", + preprocess=preprocess_obj, + combine="nested", + group=value["ep_group"], + concat_dim=None, + ) if value["ep_group"] is None: tree_dict["/"] = combined_group @@ -120,7 +131,6 @@ def lazy_combine(desired_raw_file_paths, fs): return result - # How to construct Provenance Group # obj = ProvenancePreprocess(desired_raw_file_paths) # @@ -128,4 +138,4 @@ def lazy_combine(desired_raw_file_paths, fs): # engine='zarr', coords='minimal', # combine="nested", group='Provenance', # preprocess=obj, concat_dim=None) -# TODO: to be identical to in-memory combine remove filenames as coordinate (keep as dim) \ No newline at end of file +# TODO: to be identical to in-memory combine remove filenames as coordinate (keep as dim) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 40c7cdd8f..2af793eba 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -1,333 +1,330 @@ -import xarray as xr -import pandas as pd -import dask.array -import dask -from typing import List, Tuple, Dict, Hashable, Optional, Set +from typing import Dict, Hashable, List, Optional, Set +import dask +import dask.array +import pandas as pd +import xarray as xr # TODO: make this a class and have dims info/below lists as a class variable -# those dimensions that should not be chunked -const_dims = ['channel', 'beam_group', 'beam', 'range_sample', 'pulse_length_bin'] - -# those dimensions associated with time -time_dims = ['time1', 'time2', 'time3', 'ping_time'] - -# all possible dimensions we can encounter -possible_dims = const_dims + time_dims - -# encodings associated with lazy loaded variables -lazy_encodings = ["chunks", "preferred_chunks", "compressor"] - - -def get_ds_dims_info(ds_list: List[xr.Dataset]) -> Tuple[dict, dict, dict]: - """ - Constructs useful dictionaries that contain information - about the dimensions of the Dataset - - Parameters - ---------- - ds_list: List[xr.Dataset] - The Datasets that will be combined - - Returns - ------- - dims_sum: dict - Keys as the dimension name and values as the corresponding - sum of the lengths across all Datasets - dims_csum: dict - Keys as the dimension name and values as a dictionary of - the corresponding cumulative sum of the lengths across - all Datasets - dims_max: dict - Keys as the dimension name and values as the corresponding - maximum length across all Datasets - """ - - # Dataframe with column as dim names and rows as the different Datasets - dims_df = pd.DataFrame([ds.dims for ds in ds_list]) - - # calculate useful information about the dimensions - dims_sum = dims_df.sum(axis=0).to_dict() - dims_csum = dims_df.cumsum(axis=0).to_dict() - dims_max = dims_df.max(axis=0).to_dict() - - return dims_sum, dims_csum, dims_max - - -def get_temp_arr(dims: List[str], dtype: type, - dims_max: dict, dims_sum: dict) -> dask.array: - """ - Constructs a temporary (or dummy) array representing a - variable in its final combined form. - - Parameters - ---------- - dims: List[str] - A list of the dimension names - dtype: type - The data type of the variable - dims_max: dict - Keys as the dimension name and values as the corresponding - maximum length across all Datasets - dims_sum: dict - Keys as the dimension name and values as the corresponding - sum of the lengths across all Datasets - - Returns - ------- - dask.array - a temporary (or dummy) array representing a - variable in its final combined form. - - Notes - ----- - This array is never interacted with in a traditional sense. - Its sole purpose is to construct metadata for the zarr store. - """ - - # Create the shape of the variable in its final combined form (padding occurs here) # TODO: make sure this is true - shape = [dims_max[dim] if dim in const_dims else dims_sum[dim] for dim in dims] - - # Create the chunk shape of the variable - chnk_shape = [dims_max[dim] for dim in dims] - - return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) - - -def construct_lazy_ds(ds_model: xr.Dataset, dims_sum: dict, - dims_max: dict) -> xr.Dataset: - """ - Constructs a lazy Dataset representing the EchoData group - Dataset in its final combined form. - - Parameters - ---------- - ds_model: xr.Dataset - A Dataset that we will model our lazy Dataset after. In practice, - this is the first element in the list of Datasets to be combined. - dims_sum: dict - Keys as the dimension name and values as the corresponding - sum of the lengths across all Datasets - dims_max: dict - Keys as the dimension name and values as the corresponding - maximum length across all Datasets - - Returns - ------- - xr.Dataset - A lazy Dataset representing the EchoData group Dataset in - its final combined form - - Notes - ----- - The sole purpose of the Dataset created is to construct metadata - for the zarr store. - """ - - xr_vars_dict = dict() - xr_coords_dict = dict() - for name, val in ds_model.variables.items(): - if name not in possible_dims: - - # create lazy DataArray representations corresponding to the variables - temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) - xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) - - else: - - # create lazy DataArray representations corresponding to the coordinates - temp_arr = get_temp_arr(list(val.dims), val.dtype, dims_max, dims_sum) - xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) - - # construct lazy Dataset form - ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict) - - # TODO: add ds attributes here and store all dataset attributes? - - return ds - - -def get_ds_encodings(ds_model: xr.Dataset) -> Dict[Hashable, dict]: - """ - Obtains the encodings needed for each variable - of the lazy Dataset form. - - Parameters - ---------- - ds_model: xr.Dataset - The Dataset that we modelled our lazy Dataset after. In practice, - this is the first element in the list of Datasets to be combined. - - Returns - ------- - encodings: Dict[Hashable, dict] - The keys are a string representing the variable name and the - values are a dictionary of the corresponding encodings - - Notes - ----- - The encodings corresponding to the lazy encodings (e.g. compressor) - should not be included here, these will be generated by `to_zarr`. - """ - - encodings = dict() - for name, val in ds_model.variables.items(): - - # get all encodings except the lazy encodings - encodings[name] = {key: encod for key, encod in val.encoding.items() if - key not in lazy_encodings} - - return encodings +class LazyCombine: + def __init__(self): + # those dimensions that should not be chunked + self.const_dims = ["channel", "beam_group", "beam", "range_sample", "pulse_length_bin"] -def get_constant_vars(ds_model: xr.Dataset) -> list: - """ - Obtains all variable and dimension names that will - be the same across all Datasets that will be combined. + # those dimensions associated with time + self.time_dims = ["time1", "time2", "time3", "ping_time"] - Parameters - ---------- - ds_model: xr.Dataset - The Dataset that we modelled our lazy Dataset after. In practice, - this is the first element in the list of Datasets to be combined. + # all possible dimensions we can encounter + self.possible_dims = self.const_dims + self.time_dims - Returns - ------- - const_vars: list - Variable and dimension names that will be the same across all - Datasets that will be combined. - """ + # encodings associated with lazy loaded variables + self.lazy_encodings = ["chunks", "preferred_chunks", "compressor"] - # obtain the form of the dimensions for each constant variable - dim_form = [(dim,) for dim in const_dims] + # dictionary to hold every group's attributes + self.group_attrs = dict() - # account for Vendor_specific variables - dim_form.append(('channel', 'pulse_length_bin')) # TODO: is there a better way? + def _get_ds_dims_info(self, ds_list: List[xr.Dataset]) -> None: + """ + Constructs useful dictionaries that contain information + about the dimensions of the Dataset - # obtain all constant variables and dimensions - const_vars = [] - for name, val in ds_model.variables.items(): - if val.dims in dim_form: - const_vars.append(name) + Parameters + ---------- + ds_list: List[xr.Dataset] + The Datasets that will be combined - return const_vars + Notes + ----- + This method creates the following class variables: + dims_sum: dict + Keys as the dimension name and values as the corresponding + sum of the lengths across all Datasets + dims_csum: dict + Keys as the dimension name and values as a dictionary of + the corresponding cumulative sum of the lengths across + all Datasets + dims_max: dict + Keys as the dimension name and values as the corresponding + maximum length across all Datasets + """ + # Dataframe with column as dim names and rows as the different Datasets + dims_df = pd.DataFrame([ds.dims for ds in ds_list]) -def get_region(ds_ind: int, dims_csum: dict, - ds_dims: Set[Hashable]) -> Dict[str, slice]: - """ - Returns the region of the zarr file to write to. This region - corresponds to the input set of dimensions. + # calculate useful information about the dimensions + self.dims_sum = dims_df.sum(axis=0).to_dict() + self.dims_csum = dims_df.cumsum(axis=0).to_dict() + self.dims_max = dims_df.max(axis=0).to_dict() - Parameters - ---------- - ds_ind: int - The key of the values of ``dims_csum`` to use for each - dimension name - dims_csum: dict - Keys as the dimension name and values as a dictionary of - the corresponding cumulative sum of the lengths across - all Datasets - ds_dims: Set[Hashable] - The names of the dimensions used in the region creation + # collect Dataset attributes + # [ds.attrs for count, ds in enumerate(ds_list)] - Returns - ------- - region: Dict[str, slice] - Keys set as the dimension name and values as - the slice of the zarr portion to write to - """ - - if ds_ind == 0: - - # get the initial region - region = {dim: slice(0, dims_csum[dim][ds_ind]) for dim in ds_dims} - - else: - - # get all other regions - region = {dim: slice(dims_csum[dim][ds_ind - 1], dims_csum[dim][ds_ind]) for dim in ds_dims} + def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: + """ + Constructs a temporary (or dummy) array representing a + variable in its final combined form. - return region + Parameters + ---------- + dims: List[str] + A list of the dimension names + dtype: type + The data type of the variable + + Returns + ------- + dask.array + a temporary (or dummy) array representing a + variable in its final combined form. + + Notes + ----- + This array is never interacted with in a traditional sense. + Its sole purpose is to construct metadata for the zarr store. + """ + + # Create the shape of the variable in its final combined + # form (padding occurs here) # TODO: make sure this is true + shape = [ + self.dims_max[dim] if dim in self.const_dims else self.dims_sum[dim] for dim in dims + ] + + # Create the chunk shape of the variable + chnk_shape = [self.dims_max[dim] for dim in dims] + + return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) + + def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: + """ + Constructs a lazy Dataset representing the EchoData group + Dataset in its final combined form. + + Parameters + ---------- + ds_model: xr.Dataset + A Dataset that we will model our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + + Returns + ------- + xr.Dataset + A lazy Dataset representing the EchoData group Dataset in + its final combined form + + Notes + ----- + The sole purpose of the Dataset created is to construct metadata + for the zarr store. + """ + + xr_vars_dict = dict() + xr_coords_dict = dict() + for name, val in ds_model.variables.items(): + if name not in self.possible_dims: + + # create lazy DataArray representations corresponding to the variables + temp_arr = self._get_temp_arr(list(val.dims), val.dtype) + xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) + + else: + + # create lazy DataArray representations corresponding to the coordinates + temp_arr = self._get_temp_arr(list(val.dims), val.dtype) + xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) + + # construct lazy Dataset form + ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict) + + # TODO: add ds attributes here and store all dataset attributes? + + return ds + + def _get_ds_encodings(self, ds_model: xr.Dataset) -> Dict[Hashable, dict]: + """ + Obtains the encodings needed for each variable + of the lazy Dataset form. + + Parameters + ---------- + ds_model: xr.Dataset + The Dataset that we modelled our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + + Returns + ------- + encodings: Dict[Hashable, dict] + The keys are a string representing the variable name and the + values are a dictionary of the corresponding encodings + + Notes + ----- + The encodings corresponding to the lazy encodings (e.g. compressor) + should not be included here, these will be generated by `to_zarr`. + """ + + encodings = dict() + for name, val in ds_model.variables.items(): + + # get all encodings except the lazy encodings + encodings[name] = { + key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings + } + + return encodings + + def _get_constant_vars(self, ds_model: xr.Dataset) -> list: + """ + Obtains all variable and dimension names that will + be the same across all Datasets that will be combined. + + Parameters + ---------- + ds_model: xr.Dataset + The Dataset that we modelled our lazy Dataset after. In practice, + this is the first element in the list of Datasets to be combined. + + Returns + ------- + const_vars: list + Variable and dimension names that will be the same across all + Datasets that will be combined. + """ + + # obtain the form of the dimensions for each constant variable + dim_form = [(dim,) for dim in self.const_dims] + + # account for Vendor_specific variables + dim_form.append(("channel", "pulse_length_bin")) # TODO: is there a better way? + + # obtain all constant variables and dimensions + const_vars = [] + for name, val in ds_model.variables.items(): + if val.dims in dim_form: + const_vars.append(name) + + return const_vars + + def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: + """ + Returns the region of the zarr file to write to. This region + corresponds to the input set of dimensions. + + Parameters + ---------- + ds_ind: int + The key of the values of ``dims_csum`` to use for each + dimension name + ds_dims: Set[Hashable] + The names of the dimensions used in the region creation + + Returns + ------- + region: Dict[str, slice] + Keys set as the dimension name and values as + the slice of the zarr portion to write to + """ + + if ds_ind == 0: + + # get the initial region + region = {dim: slice(0, self.dims_csum[dim][ds_ind]) for dim in ds_dims} + else: -def direct_write(path: str, ds_list: List[xr.Dataset], - group: str, storage_options: Optional[dict] = {}) -> None: - """ - Creates a zarr store and then appends each Dataset - in ``ds_list`` to it. The final result is a combined - Dataset along the time dimensions. + # get all other regions + region = { + dim: slice(self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind]) + for dim in ds_dims + } - Parameters - ---------- - path: str - The full path of the final combined zarr store - ds_list: List[xr.Dataset] - The Datasets that will be combined - group: str - The name of the group of the zarr store - corresponding to the Datasets in ``ds_list`` - storage_options: Optional[dict] - Any additional parameters for the storage - backend (ignored for local paths) - """ + return region - dims_sum, dims_csum, dims_max = get_ds_dims_info(ds_list) + def direct_write( + self, path: str, ds_list: List[xr.Dataset], group: str, storage_options: Optional[dict] = {} + ) -> None: + """ + Creates a zarr store and then appends each Dataset + in ``ds_list`` to it. The final result is a combined + Dataset along the time dimensions. - # TODO: Check that all of the channels are the same and times don't overlap and they increase - # may have an issue with time1 and NaT + Parameters + ---------- + path: str + The full path of the final combined zarr store + ds_list: List[xr.Dataset] + The Datasets that will be combined + group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + """ - ds_lazy = construct_lazy_ds(ds_list[0], dims_sum, dims_max) + self._get_ds_dims_info(ds_list) - encodings = get_ds_encodings(ds_list[0]) + # TODO: Check that all of the channels are the same and times + # don't overlap and they increase may have an issue with time1 and NaT - # create zarr file and all associated metadata (this is delayed) - ds_lazy.to_zarr(path, compute=False, group=group, encoding=encodings, - consolidated=True, storage_options=storage_options) + ds_lazy = self._construct_lazy_ds(ds_list[0]) - # constant variables that will be written later - const_vars = get_constant_vars(ds_list[0]) + encodings = self._get_ds_encodings(ds_list[0]) - print(f"const_vars = {const_vars}") + # create zarr file and all associated metadata (this is delayed) + ds_lazy.to_zarr( + path, + compute=False, + group=group, + encoding=encodings, + consolidated=True, + storage_options=storage_options, + ) - # write each non-constant variable in ds_list to the zarr store - for i in range(len(ds_list)): # TODO: parallelize this loop + # constant variables that will be written later + const_vars = self._get_constant_vars(ds_list[0]) - # obtain the names of all ds dimensions that are not constant - ds_dims = set(ds_list[i].dims) - set(const_vars) + print(f"const_vars = {const_vars}") - region = get_region(i, dims_csum, ds_dims) + # write each non-constant variable in ds_list to the zarr store + for ind, ds in enumerate(ds_list): # TODO: parallelize this loop - ds_list[i].drop(const_vars).to_zarr(path, group=group, region=region, - storage_options=storage_options) + # obtain the names of all ds dimensions that are not constant + ds_dims = set(ds.dims) - set(const_vars) - # write constant vars to zarr using the first element of ds_list - for var in const_vars: # TODO: one should not parallelize this loop?? + region = self._get_region(ind, ds_dims) - # dims will be automatically filled when they occur in a variable - if (var not in possible_dims) or (var in ['beam', 'range_sample']): + ds.drop(const_vars).to_zarr( + path, group=group, region=region, storage_options=storage_options + ) - region = get_region(0, dims_csum, set(ds_list[0][var].dims)) + # TODO: do a blocking call here, once we parallelize - ds_list[0][[var]].to_zarr(path, group=group, region=region, - storage_options=storage_options) + # write constant vars to zarr using the first element of ds_list + for var in const_vars: # TODO: one should not parallelize this loop?? + # dims will be automatically filled when they occur in a variable + if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): - # TODO: add back in attributes for dataset - # TODO: correctly add attribute keys for Provenance group + region = self._get_region(0, set(ds_list[0][var].dims)) - # TODO: need to consider the case where range_sample needs to be padded? + ds_list[0][[var]].to_zarr( + path, group=group, region=region, storage_options=storage_options + ) - # TODO: re-chunk the zarr store after everything has been added? + # TODO: add back in attributes for dataset + # TODO: correctly add attribute keys for Provenance group - # TODO: is there a way we can preserve order in variables with writing? + # TODO: need to consider the case where range_sample needs to be padded? -# def lazy_combine(path, eds): -# + # TODO: re-chunk the zarr store after everything has been added? -# TODO: do direct_write(path, ds_list) for each group in eds -# then do open_converted(path) --> here we could re-chunk? + # TODO: is there a way we can preserve order in variables with writing? + # def lazy_combine(path, eds): + # + # TODO: do direct_write(path, ds_list) for each group in eds + # then do open_converted(path) --> here we could re-chunk? From 71fc731f9091c2f06ae641e5390dfd7ba9de38f8 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 6 Sep 2022 11:47:41 -0700 Subject: [PATCH 11/89] add mechanism to strore dataset attributes and make first attempt at a full EchoData combine --- echopype/echodata/combine_lazily_v2.py | 87 +++++++++++++++++++------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 2af793eba..02a44a141 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -1,9 +1,10 @@ from typing import Dict, Hashable, List, Optional, Set - +from collections import defaultdict import dask import dask.array import pandas as pd import xarray as xr +from .echodata import EchoData # TODO: make this a class and have dims info/below lists as a class variable @@ -23,18 +24,24 @@ def __init__(self): # encodings associated with lazy loaded variables self.lazy_encodings = ["chunks", "preferred_chunks", "compressor"] - # dictionary to hold every group's attributes - self.group_attrs = dict() + # defaultdict of defaultdicts that holds every group's attributes + self.group_attrs = defaultdict(lambda: defaultdict(list)) - def _get_ds_dims_info(self, ds_list: List[xr.Dataset]) -> None: + def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: """ Constructs useful dictionaries that contain information - about the dimensions of the Dataset + about the dimensions of the Dataset. Additionally, collects + the attributes from each Dataset in ``ds_list`` and saves + this group specific information to the class variable + ``group_attrs``. Parameters ---------- ds_list: List[xr.Dataset] The Datasets that will be combined + ed_name: str + The name of the EchoData group corresponding to the + Datasets in ``ds_list`` Notes ----- @@ -60,7 +67,10 @@ def _get_ds_dims_info(self, ds_list: List[xr.Dataset]) -> None: self.dims_max = dims_df.max(axis=0).to_dict() # collect Dataset attributes - # [ds.attrs for count, ds in enumerate(ds_list)] + for count, ds in enumerate(ds_list): + if count == 0: + self.group_attrs[ed_name]['attr_key'].extend(ds.attrs.keys()) + self.group_attrs[ed_name]['attrs'].append(list(ds.attrs.values())) def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: """ @@ -136,9 +146,11 @@ def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) # construct lazy Dataset form - ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict) + ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) + + # TODO: add ds attributes here? - # TODO: add ds attributes here and store all dataset attributes? + # TODO: do special case for Provenance, where we create attr variables return ds @@ -243,7 +255,8 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region def direct_write( - self, path: str, ds_list: List[xr.Dataset], group: str, storage_options: Optional[dict] = {} + self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, + storage_options: Optional[dict] = {} ) -> None: """ Creates a zarr store and then appends each Dataset @@ -256,19 +269,24 @@ def direct_write( The full path of the final combined zarr store ds_list: List[xr.Dataset] The Datasets that will be combined - group: str + zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` + ed_name: str + The name of the EchoData group corresponding to the + Datasets in ``ds_list`` storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) """ - self._get_ds_dims_info(ds_list) + self._get_ds_info(ds_list, ed_name) # TODO: Check that all of the channels are the same and times # don't overlap and they increase may have an issue with time1 and NaT + # TODO: check for and correct reversed time + ds_lazy = self._construct_lazy_ds(ds_list[0]) encodings = self._get_ds_encodings(ds_list[0]) @@ -277,7 +295,7 @@ def direct_write( ds_lazy.to_zarr( path, compute=False, - group=group, + group=zarr_group, encoding=encodings, consolidated=True, storage_options=storage_options, @@ -286,7 +304,7 @@ def direct_write( # constant variables that will be written later const_vars = self._get_constant_vars(ds_list[0]) - print(f"const_vars = {const_vars}") + # print(f"const_vars = {const_vars}") # write each non-constant variable in ds_list to the zarr store for ind, ds in enumerate(ds_list): # TODO: parallelize this loop @@ -297,7 +315,7 @@ def direct_write( region = self._get_region(ind, ds_dims) ds.drop(const_vars).to_zarr( - path, group=group, region=region, storage_options=storage_options + path, group=zarr_group, region=region, storage_options=storage_options ) # TODO: do a blocking call here, once we parallelize @@ -311,20 +329,41 @@ def direct_write( region = self._get_region(0, set(ds_list[0][var].dims)) ds_list[0][[var]].to_zarr( - path, group=group, region=region, storage_options=storage_options + path, group=zarr_group, region=region, storage_options=storage_options ) - # TODO: add back in attributes for dataset - # TODO: correctly add attribute keys for Provenance group - # TODO: need to consider the case where range_sample needs to be padded? + # TODO: is there a way we can preserve order in variables with writing? - # TODO: re-chunk the zarr store after everything has been added? + def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict] = {}): - # TODO: is there a way we can preserve order in variables with writing? + for grp_info in EchoData.group_map.values(): + + print(grp_info) + + if grp_info['ep_group']: + ed_group = grp_info['ep_group'] + else: + ed_group = "Top-level" + + zarr_group = grp_info['ep_group'] + + ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] + + if ds_list: + print(ed_group, zarr_group) + + self.direct_write(path, + ds_list=ds_list, + zarr_group=zarr_group, ed_name=ed_group, storage_options=storage_options) + + # TODO: add back in attributes for dataset + # TODO: correctly add attribute keys for Provenance group + # TODO: re-chunk the zarr store after everything has been added? - # def lazy_combine(path, eds): - # + # TODO: do provenance group last + # temp = {key: {"dims": ["echodata_filename"], "data": val} for key, val in self.group_attrs.items()} + # xr.Dataset.from_dict(temp) - # TODO: do direct_write(path, ds_list) for each group in eds - # then do open_converted(path) --> here we could re-chunk? + # TODO: do direct_write(path, ds_list) for each group in eds + # then do open_converted(path) --> here we could re-chunk? From 6be4dc040db61516b42b7efb47a92cc710c389b3 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 6 Sep 2022 17:05:24 -0700 Subject: [PATCH 12/89] delay region write in direct_write --- echopype/echodata/combine_lazily_v2.py | 40 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index 02a44a141..d9709e0ff 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -24,8 +24,8 @@ def __init__(self): # encodings associated with lazy loaded variables self.lazy_encodings = ["chunks", "preferred_chunks", "compressor"] - # defaultdict of defaultdicts that holds every group's attributes - self.group_attrs = defaultdict(lambda: defaultdict(list)) + # defaultdict that holds every group's attributes + self.group_attrs = defaultdict(list) def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: """ @@ -66,11 +66,16 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_csum = dims_df.cumsum(axis=0).to_dict() self.dims_max = dims_df.max(axis=0).to_dict() + # format ed_name appropriately + ed_name = ed_name.replace('-', '_').replace('/', '_').lower() + # collect Dataset attributes for count, ds in enumerate(ds_list): if count == 0: - self.group_attrs[ed_name]['attr_key'].extend(ds.attrs.keys()) - self.group_attrs[ed_name]['attrs'].append(list(ds.attrs.values())) + self.group_attrs[ed_name + '_attr_key'].extend(ds.attrs.keys()) + self.group_attrs[ed_name + '_attrs'].append(list(ds.attrs.values())) + + # TODO: document/bring up that I changed naming scheme of attributes def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: """ @@ -304,21 +309,26 @@ def direct_write( # constant variables that will be written later const_vars = self._get_constant_vars(ds_list[0]) - # print(f"const_vars = {const_vars}") + to_zarr_compute = True + + print(f"to_zarr_compute = {to_zarr_compute}") # write each non-constant variable in ds_list to the zarr store - for ind, ds in enumerate(ds_list): # TODO: parallelize this loop + delayed_to_zarr = [] + for ind, ds in enumerate(ds_list): # obtain the names of all ds dimensions that are not constant ds_dims = set(ds.dims) - set(const_vars) region = self._get_region(ind, ds_dims) - ds.drop(const_vars).to_zarr( - path, group=zarr_group, region=region, storage_options=storage_options - ) + delayed_to_zarr.append(ds.drop(const_vars).to_zarr( + path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute + )) + # TODO: see if compression is occurring, maybe mess with encoding. - # TODO: do a blocking call here, once we parallelize + if not to_zarr_compute: + dask.compute(*delayed_to_zarr) # write constant vars to zarr using the first element of ds_list for var in const_vars: # TODO: one should not parallelize this loop?? @@ -339,23 +349,23 @@ def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict for grp_info in EchoData.group_map.values(): - print(grp_info) + # print(grp_info) if grp_info['ep_group']: ed_group = grp_info['ep_group'] else: ed_group = "Top-level" - zarr_group = grp_info['ep_group'] - ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] if ds_list: - print(ed_group, zarr_group) + + print(f"ed_group = {ed_group}") self.direct_write(path, ds_list=ds_list, - zarr_group=zarr_group, ed_name=ed_group, storage_options=storage_options) + zarr_group=grp_info['ep_group'], + ed_name=ed_group, storage_options=storage_options) # TODO: add back in attributes for dataset # TODO: correctly add attribute keys for Provenance group From ce62334898340543ecd2d3c083d0b7a498251cf0 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 7 Sep 2022 17:07:56 -0700 Subject: [PATCH 13/89] add sychronizer for to_zarr and turn off blosc threads when using combine --- echopype/echodata/combine_lazily_v2.py | 28 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index d9709e0ff..e4c952c5e 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -5,6 +5,8 @@ import pandas as pd import xarray as xr from .echodata import EchoData +import zarr +from numcodecs import blosc # TODO: make this a class and have dims info/below lists as a class variable @@ -261,7 +263,7 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: def direct_write( self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, - storage_options: Optional[dict] = {} + storage_options: Optional[dict] = {}, to_zarr_compute: bool = True ) -> None: """ Creates a zarr store and then appends each Dataset @@ -303,16 +305,12 @@ def direct_write( group=zarr_group, encoding=encodings, consolidated=True, - storage_options=storage_options, + storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer() ) # constant variables that will be written later const_vars = self._get_constant_vars(ds_list[0]) - to_zarr_compute = True - - print(f"to_zarr_compute = {to_zarr_compute}") - # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): @@ -323,7 +321,8 @@ def direct_write( region = self._get_region(ind, ds_dims) delayed_to_zarr.append(ds.drop(const_vars).to_zarr( - path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute + path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute, + synchronizer=zarr.ThreadSynchronizer() )) # TODO: see if compression is occurring, maybe mess with encoding. @@ -339,7 +338,8 @@ def direct_write( region = self._get_region(0, set(ds_list[0][var].dims)) ds_list[0][[var]].to_zarr( - path, group=zarr_group, region=region, storage_options=storage_options + path, group=zarr_group, region=region, storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer() ) # TODO: need to consider the case where range_sample needs to be padded? @@ -347,6 +347,13 @@ def direct_write( def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict] = {}): + to_zarr_compute = False + + print(f"to_zarr_compute = {to_zarr_compute}") + + # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) + blosc.use_threads = False + for grp_info in EchoData.group_map.values(): # print(grp_info) @@ -365,7 +372,7 @@ def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict self.direct_write(path, ds_list=ds_list, zarr_group=grp_info['ep_group'], - ed_name=ed_group, storage_options=storage_options) + ed_name=ed_group, storage_options=storage_options, to_zarr_compute=to_zarr_compute) # TODO: add back in attributes for dataset # TODO: correctly add attribute keys for Provenance group @@ -377,3 +384,6 @@ def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict # TODO: do direct_write(path, ds_list) for each group in eds # then do open_converted(path) --> here we could re-chunk? + + # re-enable automatic switching (the default behavior) + blosc.use_threads = None From 36afe2b110b6adc261a2a5c33a28ae8b508de274 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 8 Sep 2022 15:28:05 -0700 Subject: [PATCH 14/89] Rename class and add attributes from all datasets to the Provenance group --- echopype/echodata/combine_lazily_v2.py | 96 ++++++++++++++++++-------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index e4c952c5e..d347c8673 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -5,13 +5,20 @@ import pandas as pd import xarray as xr from .echodata import EchoData +from .api import open_converted import zarr from numcodecs import blosc +from ..utils.prov import echopype_prov_attrs +from warnings import warn -# TODO: make this a class and have dims info/below lists as a class variable +class ZarrCombine: + """ + A class that combines a list of EchoData objects by + creating a Zarr store and appending each group's + Dataset to the store. + """ -class LazyCombine: def __init__(self): # those dimensions that should not be chunked @@ -77,8 +84,6 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.group_attrs[ed_name + '_attr_key'].extend(ds.attrs.keys()) self.group_attrs[ed_name + '_attrs'].append(list(ds.attrs.values())) - # TODO: document/bring up that I changed naming scheme of attributes - def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: """ Constructs a temporary (or dummy) array representing a @@ -155,10 +160,6 @@ def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: # construct lazy Dataset form ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) - # TODO: add ds attributes here? - - # TODO: do special case for Provenance, where we create attr variables - return ds def _get_ds_encodings(self, ds_model: xr.Dataset) -> Dict[Hashable, dict]: @@ -261,7 +262,7 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region - def direct_write( + def _append_ds_list_to_zarr( self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, storage_options: Optional[dict] = {}, to_zarr_compute: bool = True ) -> None: @@ -327,7 +328,7 @@ def direct_write( # TODO: see if compression is occurring, maybe mess with encoding. if not to_zarr_compute: - dask.compute(*delayed_to_zarr) + dask.compute(*delayed_to_zarr) # TODO: maybe use persist in the future? # write constant vars to zarr using the first element of ds_list for var in const_vars: # TODO: one should not parallelize this loop?? @@ -343,9 +344,54 @@ def direct_write( ) # TODO: need to consider the case where range_sample needs to be padded? - # TODO: is there a way we can preserve order in variables with writing? - def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict] = {}): + def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict] = {}) -> None: + """ + Creates an xarray Dataset with variables set as the attributes + from all groups before the combination. Additionally, appends + this Dataset to the ``Provenance`` group located in the zarr + store specified by ``path``. + + Parameters + ---------- + path: str + The full path of the final combined zarr store + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + """ + + xr_dict = dict() + for name, val in self.group_attrs.items(): + + if "attrs" in name: + + # create Dataset variables + coord_name = name[:-1] + "_key" + xr_dict[name] = {"dims": ["echodata_filename", coord_name], "data": val} + + else: + + # create Dataset coordinates + xr_dict[name] = {"dims": [name], "data": val} + + # construct Dataset and assign Provenance attributes + all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("conversion")) + + # append Dataset to zarr + all_ds_attrs.to_zarr(path, group="Provenance", mode="a", + storage_options=storage_options, consolidated=True) + + def combine(self, path: str, eds: List[EchoData] = None, + storage_options: Optional[dict] = {}) -> EchoData: + + # return empty EchoData object, if no EchoData objects are provided + if (isinstance(eds, list) and len(eds) == 0) or (not eds): + warn("No EchoData objects were provided, returning an empty EchoData object.") + return EchoData() + + # collect filenames associated with EchoData objects + self.group_attrs["echodata_filename"].extend([str(ed.source_file) if ed.source_file is not None else str(ed.converted_raw_path) for ed in eds]) to_zarr_compute = False @@ -356,8 +402,6 @@ def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict for grp_info in EchoData.group_map.values(): - # print(grp_info) - if grp_info['ep_group']: ed_group = grp_info['ep_group'] else: @@ -369,21 +413,19 @@ def combine(self, path: str, eds: List[EchoData], storage_options: Optional[dict print(f"ed_group = {ed_group}") - self.direct_write(path, - ds_list=ds_list, - zarr_group=grp_info['ep_group'], - ed_name=ed_group, storage_options=storage_options, to_zarr_compute=to_zarr_compute) - - # TODO: add back in attributes for dataset - # TODO: correctly add attribute keys for Provenance group - # TODO: re-chunk the zarr store after everything has been added? + self._append_ds_list_to_zarr(path, ds_list=ds_list, zarr_group=grp_info['ep_group'], + ed_name=ed_group, storage_options=storage_options, + to_zarr_compute=to_zarr_compute) - # TODO: do provenance group last - # temp = {key: {"dims": ["echodata_filename"], "data": val} for key, val in self.group_attrs.items()} - # xr.Dataset.from_dict(temp) + # append all group attributes before combination to zarr store + self._append_provenance_attr_vars(path, storage_options=storage_options) - # TODO: do direct_write(path, ds_list) for each group in eds - # then do open_converted(path) --> here we could re-chunk? + # TODO: re-chunk the zarr store after everything has been added? # re-enable automatic switching (the default behavior) blosc.use_threads = None + + # open lazy loaded combined EchoData object + ed_combined = open_converted(path) + + return ed_combined From 8e95644841191cfa0b36c4a3246ea2850e22472e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 8 Sep 2022 15:36:52 -0700 Subject: [PATCH 15/89] add additional type checks to combine --- echopype/echodata/combine_lazily_v2.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/combine_lazily_v2.py index d347c8673..e13ccf14e 100644 --- a/echopype/echodata/combine_lazily_v2.py +++ b/echopype/echodata/combine_lazily_v2.py @@ -382,11 +382,17 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict all_ds_attrs.to_zarr(path, group="Provenance", mode="a", storage_options=storage_options, consolidated=True) - def combine(self, path: str, eds: List[EchoData] = None, + def combine(self, path: str, eds: List[EchoData] = [], storage_options: Optional[dict] = {}) -> EchoData: + if not isinstance(eds, list): + raise TypeError("The input, eds, must be a list of EchoData objects!") + + if not isinstance(path, str): + raise TypeError("The input, path, must be a string!") + # return empty EchoData object, if no EchoData objects are provided - if (isinstance(eds, list) and len(eds) == 0) or (not eds): + if not eds: warn("No EchoData objects were provided, returning an empty EchoData object.") return EchoData() From a7b51e7bcfda597012c6d17c56da9cc637e480fb Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 8 Sep 2022 15:39:19 -0700 Subject: [PATCH 16/89] rename combine_lazily_v2.py to zarr_combine.py --- echopype/echodata/{combine_lazily_v2.py => zarr_combine.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename echopype/echodata/{combine_lazily_v2.py => zarr_combine.py} (100%) diff --git a/echopype/echodata/combine_lazily_v2.py b/echopype/echodata/zarr_combine.py similarity index 100% rename from echopype/echodata/combine_lazily_v2.py rename to echopype/echodata/zarr_combine.py From 932355e23ae06eaee9874725b3541e2b83e4ed85 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 9 Sep 2022 12:03:49 -0700 Subject: [PATCH 17/89] start simplifying the logic needed to append data and removal of parallel write of coords --- echopype/echodata/zarr_combine.py | 232 +++++++++++++++--------------- 1 file changed, 115 insertions(+), 117 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index e13ccf14e..3d8906e8d 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,13 +1,15 @@ -from typing import Dict, Hashable, List, Optional, Set +from typing import Dict, Hashable, List, Optional, Set, Tuple, Any from collections import defaultdict import dask import dask.array +import dask.distributed import pandas as pd import xarray as xr from .echodata import EchoData from .api import open_converted import zarr from numcodecs import blosc +from numcodecs import Zstd from ..utils.prov import echopype_prov_attrs from warnings import warn @@ -21,14 +23,8 @@ class ZarrCombine: def __init__(self): - # those dimensions that should not be chunked - self.const_dims = ["channel", "beam_group", "beam", "range_sample", "pulse_length_bin"] - - # those dimensions associated with time - self.time_dims = ["time1", "time2", "time3", "ping_time"] - - # all possible dimensions we can encounter - self.possible_dims = self.const_dims + self.time_dims + # all possible dimensions that we will append to (mainly time dims) + self.append_dims = {"time1", "time2", "time3", "ping_time", "filenames"} # encodings associated with lazy loaded variables self.lazy_encodings = ["chunks", "preferred_chunks", "compressor"] @@ -111,7 +107,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: # Create the shape of the variable in its final combined # form (padding occurs here) # TODO: make sure this is true shape = [ - self.dims_max[dim] if dim in self.const_dims else self.dims_sum[dim] for dim in dims + self.dims_sum[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims ] # Create the chunk shape of the variable @@ -119,10 +115,14 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) - def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: + def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Dataset, List[str], Dict[str, dict]]: """ Constructs a lazy Dataset representing the EchoData group - Dataset in its final combined form. + Dataset in its final combined form. Additionally, collects + all variable and dimension names that are constant across + the Datasets to be combined, and collects the encodings for + all variables and dimensions that will be written to the + zarr store by regions Parameters ---------- @@ -132,9 +132,15 @@ def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: Returns ------- - xr.Dataset + ds: xr.Dataset A lazy Dataset representing the EchoData group Dataset in its final combined form + const_names: List[str] + The names of all variables and dimensions that are constant + across all Datasets to be combined + encodings: Dict[str, dict] + The encodings for all variables and dimensions that will be + written to the zarr store by regions Notes ----- @@ -144,88 +150,46 @@ def _construct_lazy_ds(self, ds_model: xr.Dataset) -> xr.Dataset: xr_vars_dict = dict() xr_coords_dict = dict() - for name, val in ds_model.variables.items(): - if name not in self.possible_dims: - - # create lazy DataArray representations corresponding to the variables - temp_arr = self._get_temp_arr(list(val.dims), val.dtype) - xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) - - else: - - # create lazy DataArray representations corresponding to the coordinates - temp_arr = self._get_temp_arr(list(val.dims), val.dtype) - xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) - - # construct lazy Dataset form - ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) - - return ds - - def _get_ds_encodings(self, ds_model: xr.Dataset) -> Dict[Hashable, dict]: - """ - Obtains the encodings needed for each variable - of the lazy Dataset form. - - Parameters - ---------- - ds_model: xr.Dataset - The Dataset that we modelled our lazy Dataset after. In practice, - this is the first element in the list of Datasets to be combined. - - Returns - ------- - encodings: Dict[Hashable, dict] - The keys are a string representing the variable name and the - values are a dictionary of the corresponding encodings - - Notes - ----- - The encodings corresponding to the lazy encodings (e.g. compressor) - should not be included here, these will be generated by `to_zarr`. - """ - encodings = dict() + const_names = [] for name, val in ds_model.variables.items(): - # get all encodings except the lazy encodings - encodings[name] = { - key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings - } + # get all dimensions of val that are also append dimensions + append_dims_in_val = set(val.dims).intersection(self.append_dims) - return encodings + if (not append_dims_in_val) or (name in ds_model.dims): - def _get_constant_vars(self, ds_model: xr.Dataset) -> list: - """ - Obtains all variable and dimension names that will - be the same across all Datasets that will be combined. + # collect the names of all constant variables/dimensions + const_names.append(str(name)) - Parameters - ---------- - ds_model: xr.Dataset - The Dataset that we modelled our lazy Dataset after. In practice, - this is the first element in the list of Datasets to be combined. - - Returns - ------- - const_vars: list - Variable and dimension names that will be the same across all - Datasets that will be combined. - """ + elif name not in ds_model.dims: - # obtain the form of the dimensions for each constant variable - dim_form = [(dim,) for dim in self.const_dims] + # create lazy DataArray representations corresponding to the variables + temp_arr = self._get_temp_arr(list(val.dims), val.dtype) + xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) - # account for Vendor_specific variables - dim_form.append(("channel", "pulse_length_bin")) # TODO: is there a better way? + encodings[str(name)] = { + key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings + } + encodings[str(name)]["compressor"] = Zstd(level=1) + + # elif name in self.append_dims: + # + # # create lazy DataArray for those coordinates that can be appended to + # temp_arr = self._get_temp_arr(list(val.dims), val.dtype) + # xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) + # + # encodings[str(name)] = { + # key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings + # } + # + # encodings[str(name)]["compressor"] = Zstd(level=1) - # obtain all constant variables and dimensions - const_vars = [] - for name, val in ds_model.variables.items(): - if val.dims in dim_form: - const_vars.append(name) + # construct lazy Dataset form + # ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) + ds = xr.Dataset(xr_vars_dict, attrs=ds_model.attrs) - return const_vars + return ds, const_names, encodings def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: """ @@ -245,23 +209,43 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: region: Dict[str, slice] Keys set as the dimension name and values as the slice of the zarr portion to write to + + Notes + ----- + Only append dimensions should show up in the region result. """ if ds_ind == 0: # get the initial region - region = {dim: slice(0, self.dims_csum[dim][ds_ind]) for dim in ds_dims} + region = {dim: slice(0, self.dims_csum[dim][ds_ind]) for dim in ds_dims if dim in self.append_dims} else: # get all other regions region = { dim: slice(self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind]) - for dim in ds_dims + for dim in ds_dims if dim in self.append_dims } return region + @dask.delayed + def _append_const_vars_to_zarr(self, const_vars, ds_list, path, zarr_group, storage_options): + + # write constant vars to zarr using the first element of ds_list + for var in const_vars: + + print(f"writing constant vars = {var}") + + # # dims will be automatically filled when they occur in a variable + # if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): + # region = self._get_region(0, set(ds_list[0][var].dims)) + # + # ds_list[0][[var]].to_zarr( + # path, group=zarr_group, region=region, storage_options=storage_options + # ) + def _append_ds_list_to_zarr( self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, storage_options: Optional[dict] = {}, to_zarr_compute: bool = True @@ -295,9 +279,7 @@ def _append_ds_list_to_zarr( # TODO: check for and correct reversed time - ds_lazy = self._construct_lazy_ds(ds_list[0]) - - encodings = self._get_ds_encodings(ds_list[0]) + ds_lazy, const_names, encodings = self._construct_lazy_ds_and_var_info(ds_list[0]) # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr( @@ -309,19 +291,15 @@ def _append_ds_list_to_zarr( storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer() ) - # constant variables that will be written later - const_vars = self._get_constant_vars(ds_list[0]) + print(f"const_names = {const_names}") # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): - # obtain the names of all ds dimensions that are not constant - ds_dims = set(ds.dims) - set(const_vars) - - region = self._get_region(ind, ds_dims) + region = self._get_region(ind, set(ds.dims)) - delayed_to_zarr.append(ds.drop(const_vars).to_zarr( + delayed_to_zarr.append(ds.drop(const_names).to_zarr( path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute, synchronizer=zarr.ThreadSynchronizer() )) @@ -329,19 +307,30 @@ def _append_ds_list_to_zarr( if not to_zarr_compute: dask.compute(*delayed_to_zarr) # TODO: maybe use persist in the future? - - # write constant vars to zarr using the first element of ds_list - for var in const_vars: # TODO: one should not parallelize this loop?? - - # dims will be automatically filled when they occur in a variable - if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): - - region = self._get_region(0, set(ds_list[0][var].dims)) - - ds_list[0][[var]].to_zarr( - path, group=zarr_group, region=region, storage_options=storage_options, - synchronizer=zarr.ThreadSynchronizer() - ) + # futures = dask.distributed.get_client().submit() + # dask.distributed.get_client().wait_for_workers() + + # # write constant vars to zarr using the first element of ds_list + # for var in const_vars: + # + # print(f"writing constant vars = {var}") + # + # # dims will be automatically filled when they occur in a variable + # if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): + # + # region = self._get_region(0, set(ds_list[0][var].dims)) + # + # ds_list[0][[var]].to_zarr( + # path, group=zarr_group, region=region, storage_options=storage_options + # ) + + delayed_const_append = self._append_const_vars_to_zarr(const_names, ds_list, + path, zarr_group, storage_options) + + # TODO: figure things out when to_zarr_compute == True + + # if not to_zarr_compute: + # dask.compute(delayed_const_append) # TODO: need to consider the case where range_sample needs to be padded? @@ -403,8 +392,16 @@ def combine(self, path: str, eds: List[EchoData] = [], print(f"to_zarr_compute = {to_zarr_compute}") - # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) - blosc.use_threads = False + def set_blosc_thread_options(dask_worker, single_thread: bool): + + if single_thread: + # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) + blosc.use_threads = False + else: + # re-enable automatic switching (the default behavior) + blosc.use_threads = None + + # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=True) for grp_info in EchoData.group_map.values(): @@ -424,14 +421,15 @@ def combine(self, path: str, eds: List[EchoData] = [], to_zarr_compute=to_zarr_compute) # append all group attributes before combination to zarr store - self._append_provenance_attr_vars(path, storage_options=storage_options) + # self._append_provenance_attr_vars(path, storage_options=storage_options) # TODO: this should be delayed! # TODO: re-chunk the zarr store after everything has been added? # re-enable automatic switching (the default behavior) - blosc.use_threads = None + # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=False) + - # open lazy loaded combined EchoData object - ed_combined = open_converted(path) + # # open lazy loaded combined EchoData object + # ed_combined = open_converted(path) - return ed_combined + return #ed_combined From 36768c6bffcdd66a654a7500f3756e181ef9ef6d Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 9 Sep 2022 17:00:57 -0700 Subject: [PATCH 18/89] reorganize code and include original compressor in encodings --- echopype/echodata/zarr_combine.py | 157 ++++++++++++++---------------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 3d8906e8d..3cf54b2ae 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -27,7 +27,7 @@ def __init__(self): self.append_dims = {"time1", "time2", "time3", "ping_time", "filenames"} # encodings associated with lazy loaded variables - self.lazy_encodings = ["chunks", "preferred_chunks", "compressor"] + self.lazy_encodings = ["chunks", "preferred_chunks"] # defaultdict that holds every group's attributes self.group_attrs = defaultdict(list) @@ -64,12 +64,12 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non """ # Dataframe with column as dim names and rows as the different Datasets - dims_df = pd.DataFrame([ds.dims for ds in ds_list]) + self.dims_df = pd.DataFrame([ds.dims for ds in ds_list]) # calculate useful information about the dimensions - self.dims_sum = dims_df.sum(axis=0).to_dict() - self.dims_csum = dims_df.cumsum(axis=0).to_dict() - self.dims_max = dims_df.max(axis=0).to_dict() + self.dims_sum = self.dims_df.sum(axis=0).to_dict() + self.dims_csum = self.dims_df.cumsum(axis=0).to_dict() + self.dims_max = self.dims_df.max(axis=0).to_dict() # format ed_name appropriately ed_name = ed_name.replace('-', '_').replace('/', '_').lower() @@ -80,7 +80,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.group_attrs[ed_name + '_attr_key'].extend(ds.attrs.keys()) self.group_attrs[ed_name + '_attrs'].append(list(ds.attrs.values())) - def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: + def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), list]: """ Constructs a temporary (or dummy) array representing a variable in its final combined form. @@ -94,9 +94,11 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: Returns ------- - dask.array + temp_arr: dask.array a temporary (or dummy) array representing a variable in its final combined form. + chnk_shape: List[int] + The chunk shape used to construct ``temp_arr`` Notes ----- @@ -113,7 +115,18 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> dask.array: # Create the chunk shape of the variable chnk_shape = [self.dims_max[dim] for dim in dims] - return dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) + temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) + + return temp_arr, chnk_shape + + def _get_encodings(self, encodings, name, val, chnk_shape): + + # TODO: document!! + + encodings[str(name)] = { + key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings + } + encodings[str(name)]["chunks"] = chnk_shape def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Dataset, List[str], Dict[str, dict]]: """ @@ -157,7 +170,7 @@ def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Data # get all dimensions of val that are also append dimensions append_dims_in_val = set(val.dims).intersection(self.append_dims) - if (not append_dims_in_val) or (name in ds_model.dims): + if not append_dims_in_val: # collect the names of all constant variables/dimensions const_names.append(str(name)) @@ -165,29 +178,21 @@ def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Data elif name not in ds_model.dims: # create lazy DataArray representations corresponding to the variables - temp_arr = self._get_temp_arr(list(val.dims), val.dtype) + temp_arr, chnk_shape = self._get_temp_arr(list(val.dims), val.dtype) xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) - encodings[str(name)] = { - key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings - } - encodings[str(name)]["compressor"] = Zstd(level=1) - - # elif name in self.append_dims: - # - # # create lazy DataArray for those coordinates that can be appended to - # temp_arr = self._get_temp_arr(list(val.dims), val.dtype) - # xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) - # - # encodings[str(name)] = { - # key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings - # } - # - # encodings[str(name)]["compressor"] = Zstd(level=1) + self._get_encodings(encodings, name, val, chnk_shape) + + elif name in self.append_dims: + + # create lazy DataArray for those coordinates that can be appended to + temp_arr, chnk_shape = self._get_temp_arr(list(val.dims), val.dtype) + xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) + + self._get_encodings(encodings, name, val, chnk_shape) # construct lazy Dataset form - # ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) - ds = xr.Dataset(xr_vars_dict, attrs=ds_model.attrs) + ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) return ds, const_names, encodings @@ -230,26 +235,28 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region - @dask.delayed - def _append_const_vars_to_zarr(self, const_vars, ds_list, path, zarr_group, storage_options): + @staticmethod + def _append_const_vars_to_zarr(const_vars, ds_list, path, zarr_group, storage_options): + + # TODO: document this! # write constant vars to zarr using the first element of ds_list for var in const_vars: - print(f"writing constant vars = {var}") + # dims will be automatically filled when they occur in a variable + if (var not in list(ds_list[0].dims)) or (var in ["beam", "range_sample"]): + + # TODO: when range_sample needs to be padded, here we will + # need to pick the dataset with the max size for range_sample - # # dims will be automatically filled when they occur in a variable - # if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): - # region = self._get_region(0, set(ds_list[0][var].dims)) - # - # ds_list[0][[var]].to_zarr( - # path, group=zarr_group, region=region, storage_options=storage_options - # ) + ds_list[0][[var]].to_zarr( + path, group=zarr_group, mode='a', storage_options=storage_options + ) def _append_ds_list_to_zarr( self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, storage_options: Optional[dict] = {}, to_zarr_compute: bool = True - ) -> None: + ) -> List[str]: """ Creates a zarr store and then appends each Dataset in ``ds_list`` to it. The final result is a combined @@ -288,52 +295,30 @@ def _append_ds_list_to_zarr( group=zarr_group, encoding=encodings, consolidated=True, - storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer() + storage_options=storage_options#, synchronizer=zarr.ThreadSynchronizer() ) - print(f"const_names = {const_names}") - # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): region = self._get_region(ind, set(ds.dims)) - delayed_to_zarr.append(ds.drop(const_names).to_zarr( + ds_drop = ds.drop(const_names) + + delayed_to_zarr.append(ds_drop.to_zarr( path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute, - synchronizer=zarr.ThreadSynchronizer() + # synchronizer=zarr.ThreadSynchronizer() )) # TODO: see if compression is occurring, maybe mess with encoding. if not to_zarr_compute: dask.compute(*delayed_to_zarr) # TODO: maybe use persist in the future? - # futures = dask.distributed.get_client().submit() - # dask.distributed.get_client().wait_for_workers() - - # # write constant vars to zarr using the first element of ds_list - # for var in const_vars: - # - # print(f"writing constant vars = {var}") - # - # # dims will be automatically filled when they occur in a variable - # if (var not in self.possible_dims) or (var in ["beam", "range_sample"]): - # - # region = self._get_region(0, set(ds_list[0][var].dims)) - # - # ds_list[0][[var]].to_zarr( - # path, group=zarr_group, region=region, storage_options=storage_options - # ) - - delayed_const_append = self._append_const_vars_to_zarr(const_names, ds_list, - path, zarr_group, storage_options) - - # TODO: figure things out when to_zarr_compute == True - - # if not to_zarr_compute: - # dask.compute(delayed_const_append) # TODO: need to consider the case where range_sample needs to be padded? + return const_names + def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict] = {}) -> None: """ Creates an xarray Dataset with variables set as the attributes @@ -392,15 +377,15 @@ def combine(self, path: str, eds: List[EchoData] = [], print(f"to_zarr_compute = {to_zarr_compute}") - def set_blosc_thread_options(dask_worker, single_thread: bool): - - if single_thread: - # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) - blosc.use_threads = False - else: - # re-enable automatic switching (the default behavior) - blosc.use_threads = None - + # def set_blosc_thread_options(dask_worker, single_thread: bool): + # + # if single_thread: + # # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) + # blosc.use_threads = False + # else: + # # re-enable automatic switching (the default behavior) + # blosc.use_threads = None + # # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=True) for grp_info in EchoData.group_map.values(): @@ -416,20 +401,22 @@ def set_blosc_thread_options(dask_worker, single_thread: bool): print(f"ed_group = {ed_group}") - self._append_ds_list_to_zarr(path, ds_list=ds_list, zarr_group=grp_info['ep_group'], - ed_name=ed_group, storage_options=storage_options, - to_zarr_compute=to_zarr_compute) + const_names = self._append_ds_list_to_zarr(path, ds_list=ds_list, zarr_group=grp_info['ep_group'], + ed_name=ed_group, storage_options=storage_options, + to_zarr_compute=to_zarr_compute) + + self._append_const_vars_to_zarr(const_names, ds_list, + path, grp_info['ep_group'], storage_options) # append all group attributes before combination to zarr store - # self._append_provenance_attr_vars(path, storage_options=storage_options) # TODO: this should be delayed! + self._append_provenance_attr_vars(path, storage_options=storage_options) # TODO: re-chunk the zarr store after everything has been added? # re-enable automatic switching (the default behavior) # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=False) + # open lazy loaded combined EchoData object + ed_combined = open_converted(path) - # # open lazy loaded combined EchoData object - # ed_combined = open_converted(path) - - return #ed_combined + return ed_combined From 3d87f0e0d34ff5cb5b04c6ac3642478fb3cf1f5f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 12 Sep 2022 14:35:07 -0700 Subject: [PATCH 19/89] document functions and add retries in compute --- echopype/echodata/zarr_combine.py | 100 +++++++++++++++++++----------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 3cf54b2ae..9a136df40 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -8,8 +8,6 @@ from .echodata import EchoData from .api import open_converted import zarr -from numcodecs import blosc -from numcodecs import Zstd from ..utils.prov import echopype_prov_attrs from warnings import warn @@ -119,13 +117,36 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), return temp_arr, chnk_shape - def _get_encodings(self, encodings, name, val, chnk_shape): + def _set_encodings(self, encodings: Dict[str, dict], name: Hashable, + val: xr.Variable, chnk_shape: list) -> None: + """ + Sets the encodings for the variable ``name`` by including all + encodings in ``val``, except those encodings that are deemed + lazy encodings. - # TODO: document!! + Parameters + ---------- + encodings: Dict[str, dict] + The dictionary to set the encodings for + name: Hashable + The name of the variable we are setting the encodings for + val: xr.Variable + The variable that contains the encodings we want to assign + to ``name`` + chnk_shape: list + The shape of the chunks for ``name`` (used in encodings) + Notes + ----- + The input ``encodings`` is directly modified + """ + + # gather all encodings, except the lazy encodings encodings[str(name)] = { key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings } + + # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Dataset, List[str], Dict[str, dict]]: @@ -181,7 +202,7 @@ def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Data temp_arr, chnk_shape = self._get_temp_arr(list(val.dims), val.dtype) xr_vars_dict[name] = (val.dims, temp_arr, val.attrs) - self._get_encodings(encodings, name, val, chnk_shape) + self._set_encodings(encodings, name, val, chnk_shape) elif name in self.append_dims: @@ -189,7 +210,7 @@ def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Data temp_arr, chnk_shape = self._get_temp_arr(list(val.dims), val.dtype) xr_coords_dict[name] = (val.dims, temp_arr, val.attrs) - self._get_encodings(encodings, name, val, chnk_shape) + self._set_encodings(encodings, name, val, chnk_shape) # construct lazy Dataset form ds = xr.Dataset(xr_vars_dict, coords=xr_coords_dict, attrs=ds_model.attrs) @@ -235,10 +256,32 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region - @staticmethod - def _append_const_vars_to_zarr(const_vars, ds_list, path, zarr_group, storage_options): + def _append_const_to_zarr(self, const_vars: List[str], ds_list: List[xr.Dataset], + path: str, zarr_group: str, storage_options: dict): + """ + Appends all constant (i.e. not chunked) variables and dimensions to the + zarr group. + + Parameters + ---------- + const_vars: List[str] + The names of all variables/dimensions that are not chunked + ds_list: List[xr.Dataset] + The Datasets that will be combined + path: str + The full path of the final combined zarr store + zarr_group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + storage_options: dict + Any additional parameters for the storage + backend (ignored for local paths) - # TODO: document this! + Notes + ----- + Those variables/dimensions that are in ``self.append_dims`` + should not be appended here. + """ # write constant vars to zarr using the first element of ds_list for var in const_vars: @@ -249,7 +292,13 @@ def _append_const_vars_to_zarr(const_vars, ds_list, path, zarr_group, storage_op # TODO: when range_sample needs to be padded, here we will # need to pick the dataset with the max size for range_sample - ds_list[0][[var]].to_zarr( + # make sure to choose the dataset with the largest size for variable + if var in self.dims_df: + ds_list_ind = int(self.dims_df[var].argmax()) + else: + ds_list_ind = int(0) + + ds_list[ds_list_ind][[var]].to_zarr( path, group=zarr_group, mode='a', storage_options=storage_options ) @@ -295,7 +344,7 @@ def _append_ds_list_to_zarr( group=zarr_group, encoding=encodings, consolidated=True, - storage_options=storage_options#, synchronizer=zarr.ThreadSynchronizer() + storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer() ) # write each non-constant variable in ds_list to the zarr store @@ -308,12 +357,11 @@ def _append_ds_list_to_zarr( delayed_to_zarr.append(ds_drop.to_zarr( path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute, - # synchronizer=zarr.ThreadSynchronizer() + synchronizer=zarr.ThreadSynchronizer() )) - # TODO: see if compression is occurring, maybe mess with encoding. if not to_zarr_compute: - dask.compute(*delayed_to_zarr) # TODO: maybe use persist in the future? + dask.compute(*delayed_to_zarr, retries=1) # TODO: maybe use persist in the future? # TODO: need to consider the case where range_sample needs to be padded? @@ -375,19 +423,6 @@ def combine(self, path: str, eds: List[EchoData] = [], to_zarr_compute = False - print(f"to_zarr_compute = {to_zarr_compute}") - - # def set_blosc_thread_options(dask_worker, single_thread: bool): - # - # if single_thread: - # # tell Blosc to runs in single-threaded contextual mode (necessary for parallel) - # blosc.use_threads = False - # else: - # # re-enable automatic switching (the default behavior) - # blosc.use_threads = None - # - # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=True) - for grp_info in EchoData.group_map.values(): if grp_info['ep_group']: @@ -405,18 +440,13 @@ def combine(self, path: str, eds: List[EchoData] = [], ed_name=ed_group, storage_options=storage_options, to_zarr_compute=to_zarr_compute) - self._append_const_vars_to_zarr(const_names, ds_list, - path, grp_info['ep_group'], storage_options) + self._append_const_to_zarr(const_names, ds_list, + path, grp_info['ep_group'], storage_options) # append all group attributes before combination to zarr store self._append_provenance_attr_vars(path, storage_options=storage_options) - # TODO: re-chunk the zarr store after everything has been added? - - # re-enable automatic switching (the default behavior) - # dask.distributed.get_client().run(set_blosc_thread_options, single_thread=False) - # open lazy loaded combined EchoData object ed_combined = open_converted(path) - + # return ed_combined From 339ce72d4f5c7e805f8387d0eebb527ab27cfdc5 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 12 Sep 2022 16:59:40 -0700 Subject: [PATCH 20/89] start implementing checks for time and channel coordinates --- echopype/echodata/zarr_combine.py | 67 +++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 9a136df40..653b9d57c 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -8,8 +8,10 @@ from .echodata import EchoData from .api import open_converted import zarr +import numpy as np from ..utils.prov import echopype_prov_attrs from warnings import warn +from .combine import check_echodatas_input, check_and_correct_reversed_time class ZarrCombine: @@ -21,8 +23,11 @@ class ZarrCombine: def __init__(self): + # all possible time dimensions + self.possible_time_dims = {"time1", "time2", "time3", "ping_time"} + # all possible dimensions that we will append to (mainly time dims) - self.append_dims = {"time1", "time2", "time3", "ping_time", "filenames"} + self.append_dims = {"filenames"}.union(self.possible_time_dims) # encodings associated with lazy loaded variables self.lazy_encodings = ["chunks", "preferred_chunks"] @@ -30,6 +35,58 @@ def __init__(self): # defaultdict that holds every group's attributes self.group_attrs = defaultdict(list) + self.sonar_model = None + + def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): + + ed_time_dim = set(ds_list[0].dims).intersection(self.possible_time_dims) + + for time in ed_time_dim: + + max_time = [ds[time].max().values for ds in ds_list] + min_time = [ds[time].min().values for ds in ds_list] + + max_all_nan = all(np.isnan(max_time)) + min_all_nan = all(np.isnan(min_time)) + + # checks to see that times are in ascending order + if max_time[:-1] > min_time[1:] and (not max_all_nan) and (not min_all_nan): + + raise RuntimeError(f"The coordinate {time} is not in ascending order for group {ed_name}, combine cannot be used!") + + + # TODO: check and store time values + for ds in ds_list: + old_time = check_and_correct_reversed_time(ds, time_str=str(time), sonar_model=self.sonar_model) + + print(f"old_time = {old_time}, group = {ed_name}") + + def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): + """ + Makes sure that each Dataset in ``ds_list`` has the + same number of channels and the same name for each + of these channels. + + """ + + # TODO: document this! + + if "channel" in ds_list[0].dims: + + # check to make sure we have the same number of channels in each ds + if np.unique([len(ds["channel"].values) for ds in ds_list]).size == 1: + + # make each array an element of a numpy array + channel_arrays = np.array([ds["channel"].values for ds in ds_list]) + + # check for unique rows + if np.unique(channel_arrays, axis=0).shape[0] > 1: + + raise RuntimeError(f"All {ed_name} groups do not have that same channel coordinate, combine cannot be used!") + + else: + raise RuntimeError(f"All {ed_name} groups do not have that same number of channel coordinates, combine cannot be used!") + def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: """ Constructs useful dictionaries that contain information @@ -61,6 +118,9 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non maximum length across all Datasets """ + self._check_ds_times(ds_list, ed_name) + self._check_channels(ds_list, ed_name) + # Dataframe with column as dim names and rows as the different Datasets self.dims_df = pd.DataFrame([ds.dims for ds in ds_list]) @@ -418,8 +478,7 @@ def combine(self, path: str, eds: List[EchoData] = [], warn("No EchoData objects were provided, returning an empty EchoData object.") return EchoData() - # collect filenames associated with EchoData objects - self.group_attrs["echodata_filename"].extend([str(ed.source_file) if ed.source_file is not None else str(ed.converted_raw_path) for ed in eds]) + self.sonar_model, self.group_attrs["echodata_filename"] = check_echodatas_input(eds) to_zarr_compute = False @@ -448,5 +507,5 @@ def combine(self, path: str, eds: List[EchoData] = [], # open lazy loaded combined EchoData object ed_combined = open_converted(path) - # + return ed_combined From c2af831856ce4de553f3d7f95cdd725430b2cc66 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 13 Sep 2022 15:13:05 -0700 Subject: [PATCH 21/89] add TODO statements --- echopype/echodata/zarr_combine.py | 37 ++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 653b9d57c..b9f310997 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -39,6 +39,8 @@ def __init__(self): def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): + # TODO: document this! + ed_time_dim = set(ds_list[0].dims).intersection(self.possible_time_dims) for time in ed_time_dim: @@ -54,12 +56,15 @@ def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): raise RuntimeError(f"The coordinate {time} is not in ascending order for group {ed_name}, combine cannot be used!") - # TODO: check and store time values + + # TODO: do this first [exist_reversed_time(ds, time_str) for ds in ds_list] + # if any are True, then continue by creating an old time variable in each ds + for ds in ds_list: old_time = check_and_correct_reversed_time(ds, time_str=str(time), sonar_model=self.sonar_model) - print(f"old_time = {old_time}, group = {ed_name}") + # print(f"old_time = {old_time}, group = {ed_name}") def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): """ @@ -206,6 +211,10 @@ def _set_encodings(self, encodings: Dict[str, dict], name: Hashable, key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings } + # TODO: if 'compressor' or 'filters' or '_FillValue' or 'dtype' do not exist, then + # assign them to a default value + # 'compressor': Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) + # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape @@ -346,21 +355,19 @@ def _append_const_to_zarr(self, const_vars: List[str], ds_list: List[xr.Dataset] # write constant vars to zarr using the first element of ds_list for var in const_vars: - # dims will be automatically filled when they occur in a variable - if (var not in list(ds_list[0].dims)) or (var in ["beam", "range_sample"]): + # TODO: when range_sample needs to be padded, here we will + # need to pick the dataset with the max size for range_sample + # (might be done with change below) - # TODO: when range_sample needs to be padded, here we will - # need to pick the dataset with the max size for range_sample - - # make sure to choose the dataset with the largest size for variable - if var in self.dims_df: - ds_list_ind = int(self.dims_df[var].argmax()) - else: - ds_list_ind = int(0) + # make sure to choose the dataset with the largest size for variable + if var in self.dims_df: + ds_list_ind = int(self.dims_df[var].argmax()) + else: + ds_list_ind = int(0) - ds_list[ds_list_ind][[var]].to_zarr( - path, group=zarr_group, mode='a', storage_options=storage_options - ) + ds_list[ds_list_ind][[var]].to_zarr( + path, group=zarr_group, mode='a', storage_options=storage_options + ) def _append_ds_list_to_zarr( self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, From 3665a56f1a0bea6b4034332f24250cad1cee7e07 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 13 Sep 2022 15:45:02 -0700 Subject: [PATCH 22/89] fix pre-commit issues --- echopype/echodata/combine_preprocess.py | 24 +++-- echopype/echodata/zarr_combine.py | 131 ++++++++++++++++-------- 2 files changed, 104 insertions(+), 51 deletions(-) diff --git a/echopype/echodata/combine_preprocess.py b/echopype/echodata/combine_preprocess.py index acccb6530..ea659bc69 100644 --- a/echopype/echodata/combine_preprocess.py +++ b/echopype/echodata/combine_preprocess.py @@ -1,13 +1,15 @@ -import numpy as np from pathlib import Path -import xarray as xr from typing import List +import numpy as np +import xarray as xr + class PreprocessCallable: """ Class that has all preprocessing functions and is callable. """ + def __init__(self, file_paths: List[str]): self.file_paths = file_paths self.ed_group = None @@ -35,7 +37,7 @@ def re_chunk(self, ds): # ds = ds.chunk(chunk_dict) - for drop_var in ['backscatter_r', 'angle_athwartship', 'angle_alongship']: + for drop_var in ["backscatter_r", "angle_athwartship", "angle_alongship"]: if drop_var in ds: ds = ds.drop_vars(drop_var) @@ -45,7 +47,7 @@ def re_chunk(self, ds): def _assign_file_index(self, ds): ind_file = self.file_paths.index(ds.encoding["source"]) - ds['filenames'] = (['filenames'], np.array([ind_file])) + ds["filenames"] = (["filenames"], np.array([ind_file])) # TODO: add method to check and correct reversed times @@ -53,11 +55,15 @@ def _store_attrs(self, ds): file_name = Path(ds.encoding["source"]).name - grp_key_name = self.ed_group + '_attr_key' + grp_key_name = self.ed_group + "_attr_key" grp_attr_names = np.array(list(ds.attrs.keys())) - attrs_var = xr.DataArray(data=np.array([list(ds.attrs.values())]), - coords={'echodata_filename': (['echodata_filename'], np.array([file_name])), - grp_key_name: ([grp_key_name], grp_attr_names)}) + attrs_var = xr.DataArray( + data=np.array([list(ds.attrs.values())]), + coords={ + "echodata_filename": (["echodata_filename"], np.array([file_name])), + grp_key_name: ([grp_key_name], grp_attr_names), + }, + ) - ds[self.ed_group + '_attrs'] = attrs_var + ds[self.ed_group + "_attrs"] = attrs_var diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index b9f310997..27b44efac 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,17 +1,19 @@ -from typing import Dict, Hashable, List, Optional, Set, Tuple, Any from collections import defaultdict +from typing import Dict, Hashable, List, Optional, Set, Tuple +from warnings import warn + import dask import dask.array import dask.distributed +import numpy as np import pandas as pd import xarray as xr -from .echodata import EchoData -from .api import open_converted import zarr -import numpy as np + from ..utils.prov import echopype_prov_attrs -from warnings import warn -from .combine import check_echodatas_input, check_and_correct_reversed_time +from .api import open_converted +from .combine import check_echodatas_input # , check_and_correct_reversed_time +from .echodata import EchoData class ZarrCombine: @@ -54,17 +56,22 @@ def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): # checks to see that times are in ascending order if max_time[:-1] > min_time[1:] and (not max_all_nan) and (not min_all_nan): - raise RuntimeError(f"The coordinate {time} is not in ascending order for group {ed_name}, combine cannot be used!") + raise RuntimeError( + f"The coordinate {time} is not in ascending order for group {ed_name}, " + f"combine cannot be used!" + ) # TODO: check and store time values # TODO: do this first [exist_reversed_time(ds, time_str) for ds in ds_list] # if any are True, then continue by creating an old time variable in each ds - for ds in ds_list: - old_time = check_and_correct_reversed_time(ds, time_str=str(time), sonar_model=self.sonar_model) + # for ds in ds_list: + # old_time = check_and_correct_reversed_time( + # ds, time_str=str(time), sonar_model=self.sonar_model + # ) - # print(f"old_time = {old_time}, group = {ed_name}") + # print(f"old_time = {old_time}, group = {ed_name}") def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): """ @@ -87,10 +94,16 @@ def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): # check for unique rows if np.unique(channel_arrays, axis=0).shape[0] > 1: - raise RuntimeError(f"All {ed_name} groups do not have that same channel coordinate, combine cannot be used!") + raise RuntimeError( + f"All {ed_name} groups do not have that same channel coordinate, " + f"combine cannot be used!" + ) else: - raise RuntimeError(f"All {ed_name} groups do not have that same number of channel coordinates, combine cannot be used!") + raise RuntimeError( + f"All {ed_name} groups do not have that same number of channel coordinates, " + f"combine cannot be used!" + ) def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: """ @@ -135,13 +148,13 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_max = self.dims_df.max(axis=0).to_dict() # format ed_name appropriately - ed_name = ed_name.replace('-', '_').replace('/', '_').lower() + ed_name = ed_name.replace("-", "_").replace("/", "_").lower() # collect Dataset attributes for count, ds in enumerate(ds_list): if count == 0: - self.group_attrs[ed_name + '_attr_key'].extend(ds.attrs.keys()) - self.group_attrs[ed_name + '_attrs'].append(list(ds.attrs.values())) + self.group_attrs[ed_name + "_attr_key"].extend(ds.attrs.keys()) + self.group_attrs[ed_name + "_attrs"].append(list(ds.attrs.values())) def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), list]: """ @@ -182,8 +195,9 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), return temp_arr, chnk_shape - def _set_encodings(self, encodings: Dict[str, dict], name: Hashable, - val: xr.Variable, chnk_shape: list) -> None: + def _set_encodings( + self, encodings: Dict[str, dict], name: Hashable, val: xr.Variable, chnk_shape: list + ) -> None: """ Sets the encodings for the variable ``name`` by including all encodings in ``val``, except those encodings that are deemed @@ -218,7 +232,9 @@ def _set_encodings(self, encodings: Dict[str, dict], name: Hashable, # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape - def _construct_lazy_ds_and_var_info(self, ds_model: xr.Dataset) -> Tuple[xr.Dataset, List[str], Dict[str, dict]]: + def _construct_lazy_ds_and_var_info( + self, ds_model: xr.Dataset + ) -> Tuple[xr.Dataset, List[str], Dict[str, dict]]: """ Constructs a lazy Dataset representing the EchoData group Dataset in its final combined form. Additionally, collects @@ -313,20 +329,31 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: if ds_ind == 0: # get the initial region - region = {dim: slice(0, self.dims_csum[dim][ds_ind]) for dim in ds_dims if dim in self.append_dims} + region = { + dim: slice(0, self.dims_csum[dim][ds_ind]) + for dim in ds_dims + if dim in self.append_dims + } else: # get all other regions region = { dim: slice(self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind]) - for dim in ds_dims if dim in self.append_dims + for dim in ds_dims + if dim in self.append_dims } return region - def _append_const_to_zarr(self, const_vars: List[str], ds_list: List[xr.Dataset], - path: str, zarr_group: str, storage_options: dict): + def _append_const_to_zarr( + self, + const_vars: List[str], + ds_list: List[xr.Dataset], + path: str, + zarr_group: str, + storage_options: dict, + ): """ Appends all constant (i.e. not chunked) variables and dimensions to the zarr group. @@ -366,12 +393,17 @@ def _append_const_to_zarr(self, const_vars: List[str], ds_list: List[xr.Dataset] ds_list_ind = int(0) ds_list[ds_list_ind][[var]].to_zarr( - path, group=zarr_group, mode='a', storage_options=storage_options + path, group=zarr_group, mode="a", storage_options=storage_options ) def _append_ds_list_to_zarr( - self, path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, - storage_options: Optional[dict] = {}, to_zarr_compute: bool = True + self, + path: str, + ds_list: List[xr.Dataset], + zarr_group: str, + ed_name: str, + storage_options: Optional[dict] = {}, + to_zarr_compute: bool = True, ) -> List[str]: """ Creates a zarr store and then appends each Dataset @@ -411,7 +443,8 @@ def _append_ds_list_to_zarr( group=zarr_group, encoding=encodings, consolidated=True, - storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer() + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer(), ) # write each non-constant variable in ds_list to the zarr store @@ -422,10 +455,16 @@ def _append_ds_list_to_zarr( ds_drop = ds.drop(const_names) - delayed_to_zarr.append(ds_drop.to_zarr( - path, group=zarr_group, region=region, storage_options=storage_options, compute=to_zarr_compute, - synchronizer=zarr.ThreadSynchronizer() - )) + delayed_to_zarr.append( + ds_drop.to_zarr( + path, + group=zarr_group, + region=region, + storage_options=storage_options, + compute=to_zarr_compute, + synchronizer=zarr.ThreadSynchronizer(), + ) + ) if not to_zarr_compute: dask.compute(*delayed_to_zarr, retries=1) # TODO: maybe use persist in the future? @@ -468,11 +507,13 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("conversion")) # append Dataset to zarr - all_ds_attrs.to_zarr(path, group="Provenance", mode="a", - storage_options=storage_options, consolidated=True) + all_ds_attrs.to_zarr( + path, group="Provenance", mode="a", storage_options=storage_options, consolidated=True + ) - def combine(self, path: str, eds: List[EchoData] = [], - storage_options: Optional[dict] = {}) -> EchoData: + def combine( + self, path: str, eds: List[EchoData] = [], storage_options: Optional[dict] = {} + ) -> EchoData: if not isinstance(eds, list): raise TypeError("The input, eds, must be a list of EchoData objects!") @@ -491,8 +532,8 @@ def combine(self, path: str, eds: List[EchoData] = [], for grp_info in EchoData.group_map.values(): - if grp_info['ep_group']: - ed_group = grp_info['ep_group'] + if grp_info["ep_group"]: + ed_group = grp_info["ep_group"] else: ed_group = "Top-level" @@ -502,12 +543,18 @@ def combine(self, path: str, eds: List[EchoData] = [], print(f"ed_group = {ed_group}") - const_names = self._append_ds_list_to_zarr(path, ds_list=ds_list, zarr_group=grp_info['ep_group'], - ed_name=ed_group, storage_options=storage_options, - to_zarr_compute=to_zarr_compute) - - self._append_const_to_zarr(const_names, ds_list, - path, grp_info['ep_group'], storage_options) + const_names = self._append_ds_list_to_zarr( + path, + ds_list=ds_list, + zarr_group=grp_info["ep_group"], + ed_name=ed_group, + storage_options=storage_options, + to_zarr_compute=to_zarr_compute, + ) + + self._append_const_to_zarr( + const_names, ds_list, path, grp_info["ep_group"], storage_options + ) # append all group attributes before combination to zarr store self._append_provenance_attr_vars(path, storage_options=storage_options) From 8eaed23542588f6529d6ef5374e7ddcc18854a62 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 15 Sep 2022 12:01:51 -0700 Subject: [PATCH 23/89] add routine to check Dataset attributes and drop them if they are numpy arrays --- echopype/echodata/zarr_combine.py | 95 +++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 27b44efac..2f4068355 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Hashable, List, Optional, Set, Tuple +from typing import Dict, Hashable, List, Optional, Set, Tuple, Any from warnings import warn import dask @@ -105,6 +105,67 @@ def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): f"combine cannot be used!" ) + @staticmethod + def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: + """ + Compares two attribute dictionaries to ensure that they + are acceptably identical. + + Parameters + ---------- + attr1: dict + Attributes from Dataset 1 + attr2: dict + Attributes from Dataset 2 + + Returns + ------- + numpy_keys: List[str] + All keys that have numpy arrays as values + + Raises + ------ + RuntimeError + - If the keys are not the same + - If the values are not identical + - If the keys ``date_created``, ``conversion_time`` + do not have the same types + + Notes + ----- + For the keys ``date_created``, ``conversion_time`` the values + are not required to be identical, rather their type must be identical. + """ + + # make sure all keys are identical (this should never be triggered) + if attr1.keys() != attr2.keys(): + raise RuntimeError("The attribute keys amongst the ds lists are not the same, combine cannot be used!") + + # make sure that all values are identical + numpy_keys = [] + for key in attr1.keys(): + + if isinstance(attr1[key], np.ndarray): + + numpy_keys.append(key) + + if not np.allclose(attr1[key], attr2[key], rtol=1e-12, atol=1e-12, equal_nan=True): + raise RuntimeError( + f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!") + elif key in ["date_created", "conversion_time"]: + + if not isinstance(attr1[key], type(attr2[key])): + raise RuntimeError(f"The attribute {key}'s type amongst the ds lists " + f"are not the same, combine cannot be used!") + + else: + + if attr1[key] != attr2[key]: + raise RuntimeError( + f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!") + + return numpy_keys + def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: """ Constructs useful dictionaries that contain information @@ -134,6 +195,12 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non dims_max: dict Keys as the dimension name and values as the corresponding maximum length across all Datasets + + Notes + ----- + If attribute values are numpy arrays, then they will not be included + in the ``self.group_attrs``. Instead, these values will only appear + in the attributes of the combined ``EchoData`` object. """ self._check_ds_times(ds_list, ed_name) @@ -150,11 +217,24 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non # format ed_name appropriately ed_name = ed_name.replace("-", "_").replace("/", "_").lower() + if len(ds_list) == 1: + # get numpy keys if we only have one Dataset + numpy_keys = self._compare_attrs(ds_list[0].attrs, ds_list[0].attrs) + else: + # compare attributes and get numpy keys, if they exist + for ind in range(len(ds_list) - 1): + numpy_keys = self._compare_attrs(ds_list[ind].attrs, + ds_list[ind + 1].attrs) + # collect Dataset attributes for count, ds in enumerate(ds_list): + + # get reduced attributes that do not include numpy keys + red_attrs = {key: val for key, val in ds.attrs.items() if key not in numpy_keys} + if count == 0: - self.group_attrs[ed_name + "_attr_key"].extend(ds.attrs.keys()) - self.group_attrs[ed_name + "_attrs"].append(list(ds.attrs.values())) + self.group_attrs[ed_name + "_attr_key"].extend(red_attrs.keys()) + self.group_attrs[ed_name + "_attrs"].append(list(red_attrs.values())) def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), list]: """ @@ -439,17 +519,22 @@ def _append_ds_list_to_zarr( # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr( path, + mode='w-', compute=False, group=zarr_group, encoding=encodings, - consolidated=True, + consolidated=None, storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer(), ) + # print("computing ds_lazy") + # dask.compute(out) + # # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): + print(f"ind = {ind}") region = self._get_region(ind, set(ds.dims)) @@ -560,6 +645,6 @@ def combine( self._append_provenance_attr_vars(path, storage_options=storage_options) # open lazy loaded combined EchoData object - ed_combined = open_converted(path) + ed_combined = open_converted(path, chunks={}) # TODO: is this appropriate for chunks? return ed_combined From 7ff0ea1cfadb23cdcd96e9c5708ca6201c9d9022 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:02:23 +0000 Subject: [PATCH 24/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/zarr_combine.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 2f4068355..88cb1a7e3 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Hashable, List, Optional, Set, Tuple, Any +from typing import Any, Dict, Hashable, List, Optional, Set, Tuple from warnings import warn import dask @@ -139,7 +139,9 @@ def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: # make sure all keys are identical (this should never be triggered) if attr1.keys() != attr2.keys(): - raise RuntimeError("The attribute keys amongst the ds lists are not the same, combine cannot be used!") + raise RuntimeError( + "The attribute keys amongst the ds lists are not the same, combine cannot be used!" + ) # make sure that all values are identical numpy_keys = [] @@ -151,18 +153,22 @@ def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: if not np.allclose(attr1[key], attr2[key], rtol=1e-12, atol=1e-12, equal_nan=True): raise RuntimeError( - f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!") + f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!" + ) elif key in ["date_created", "conversion_time"]: if not isinstance(attr1[key], type(attr2[key])): - raise RuntimeError(f"The attribute {key}'s type amongst the ds lists " - f"are not the same, combine cannot be used!") + raise RuntimeError( + f"The attribute {key}'s type amongst the ds lists " + f"are not the same, combine cannot be used!" + ) else: if attr1[key] != attr2[key]: raise RuntimeError( - f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!") + f"The attribute {key}'s value amongst the ds lists are not the same, combine cannot be used!" + ) return numpy_keys @@ -223,8 +229,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non else: # compare attributes and get numpy keys, if they exist for ind in range(len(ds_list) - 1): - numpy_keys = self._compare_attrs(ds_list[ind].attrs, - ds_list[ind + 1].attrs) + numpy_keys = self._compare_attrs(ds_list[ind].attrs, ds_list[ind + 1].attrs) # collect Dataset attributes for count, ds in enumerate(ds_list): @@ -519,7 +524,7 @@ def _append_ds_list_to_zarr( # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr( path, - mode='w-', + mode="w-", compute=False, group=zarr_group, encoding=encodings, From b7fd81ec63a1ecc3d5d15faae7bee00895968617 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 15 Sep 2022 17:09:35 -0700 Subject: [PATCH 25/89] set all variables and dims compressor to be the same in io.py and default compressor in zarr_combine.py --- echopype/echodata/zarr_combine.py | 14 ++++++++++---- echopype/utils/io.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 2f4068355..ab52bb43a 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Hashable, List, Optional, Set, Tuple, Any +from typing import Dict, Hashable, List, Optional, Set, Tuple from warnings import warn import dask @@ -14,6 +14,7 @@ from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData +from ..convert.api import COMPRESSION_SETTINGS class ZarrCombine: @@ -309,6 +310,9 @@ def _set_encodings( # assign them to a default value # 'compressor': Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) + if 'compressor' not in encodings[str(name)]: + encodings[str(name)]['compressor'] = COMPRESSION_SETTINGS['zarr']['compressor'] + # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape @@ -519,7 +523,6 @@ def _append_ds_list_to_zarr( # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr( path, - mode='w-', compute=False, group=zarr_group, encoding=encodings, @@ -534,7 +537,6 @@ def _append_ds_list_to_zarr( # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): - print(f"ind = {ind}") region = self._get_region(ind, set(ds.dims)) @@ -611,6 +613,8 @@ def combine( warn("No EchoData objects were provided, returning an empty EchoData object.") return EchoData() + # blosc.use_threads = False + self.sonar_model, self.group_attrs["echodata_filename"] = check_echodatas_input(eds) to_zarr_compute = False @@ -644,7 +648,9 @@ def combine( # append all group attributes before combination to zarr store self._append_provenance_attr_vars(path, storage_options=storage_options) + # blosc.use_threads = None + # open lazy loaded combined EchoData object - ed_combined = open_converted(path, chunks={}) # TODO: is this appropriate for chunks? + ed_combined = open_converted(path, chunks={}, synchronizer=zarr.ThreadSynchronizer()) # TODO: is this appropriate for chunks? return ed_combined diff --git a/echopype/utils/io.py b/echopype/utils/io.py index aea21eb92..6d1000413 100644 --- a/echopype/utils/io.py +++ b/echopype/utils/io.py @@ -36,7 +36,7 @@ def save_file(ds, path, mode, engine, group=None, compression_settings=None): """Saves a dataset to netcdf or zarr depending on the engine If ``compression_settings`` are set, compress all variables with those settings""" encoding = ( - {var: compression_settings for var in ds.data_vars} + {var: compression_settings for var in ds.variables} if compression_settings is not None else {} ) From 14ccb8458ac6a7955e8798642546c3e7bdc869d5 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 10:03:58 -0700 Subject: [PATCH 26/89] change conversion to combination --- echopype/echodata/zarr_combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 22b712421..004ebe13a 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -598,7 +598,7 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict xr_dict[name] = {"dims": [name], "data": val} # construct Dataset and assign Provenance attributes - all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("conversion")) + all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("combination")) # append Dataset to zarr all_ds_attrs.to_zarr( From 3c7ad86c3d1670eaa1297d966c45468130d00f33 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Sep 2022 17:05:29 +0000 Subject: [PATCH 27/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/zarr_combine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 004ebe13a..db8399299 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -598,7 +598,9 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict xr_dict[name] = {"dims": [name], "data": val} # construct Dataset and assign Provenance attributes - all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("combination")) + all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs( + echopype_prov_attrs("combination") + ) # append Dataset to zarr all_ds_attrs.to_zarr( From a34e6c3efde57486f8bc64a12e8149134072477c Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 14:54:55 -0700 Subject: [PATCH 28/89] set compressor encoding for all types of zarr variables --- echopype/convert/api.py | 72 +++++++++++-------------------- echopype/echodata/zarr_combine.py | 28 ++++++------ echopype/utils/coding.py | 15 ++++--- echopype/utils/io.py | 54 ++++++++++++++++++++--- 4 files changed, 100 insertions(+), 69 deletions(-) diff --git a/echopype/convert/api.py b/echopype/convert/api.py index 3c510f5fd..7fdce7858 100644 --- a/echopype/convert/api.py +++ b/echopype/convert/api.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, Dict, Optional, Tuple import fsspec -import zarr from datatree import DataTree # fmt: off @@ -18,10 +17,7 @@ from ..utils import io from ..utils.log import _init_logger -COMPRESSION_SETTINGS = { - "netcdf4": {"zlib": True, "complevel": 4}, - "zarr": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)}, -} +from ..utils.coding import COMPRESSION_SETTINGS DEFAULT_CHUNK_SIZE = {"range_sample": 25000, "ping_time": 2500} @@ -106,28 +102,23 @@ def _save_groups_to_file(echodata, output_path, engine, compress=True): # TODO: in terms of chunking, would using rechunker at the end be faster and more convenient? # Top-level group - io.save_file(echodata["Top-level"], path=output_path, mode="w", engine=engine) + io.save_file( + echodata["Top-level"], + path=output_path, + mode="w", + engine=engine, + compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, + ) # Environment group - if "time1" in echodata["Environment"]: - io.save_file( - # echodata["Environment"].chunk( - # {"time1": DEFAULT_CHUNK_SIZE["ping_time"]} - # ), # TODO: chunking necessary? - echodata["Environment"], - path=output_path, - mode="a", - engine=engine, - group="Environment", - ) - else: - io.save_file( - echodata["Environment"], - path=output_path, - mode="a", - engine=engine, - group="Environment", - ) + io.save_file( + echodata["Environment"], # TODO: chunking necessary? + path=output_path, + mode="a", + engine=engine, + group="Environment", + compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, + ) # Platform group io.save_file( @@ -157,6 +148,7 @@ def _save_groups_to_file(echodata, output_path, engine, compress=True): group="Provenance", mode="a", engine=engine, + compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, ) # Sonar group @@ -166,6 +158,7 @@ def _save_groups_to_file(echodata, output_path, engine, compress=True): group="Sonar", mode="a", engine=engine, + compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, ) # /Sonar/Beam_groupX group @@ -217,27 +210,14 @@ def _save_groups_to_file(echodata, output_path, engine, compress=True): ) # Vendor_specific group - if "ping_time" in echodata["Vendor_specific"]: - io.save_file( - # echodata["Vendor_specific"].chunk( - # {"ping_time": DEFAULT_CHUNK_SIZE["ping_time"]} - # ), # TODO: chunking necessary? - echodata["Vendor_specific"], - path=output_path, - mode="a", - engine=engine, - group="Vendor_specific", - compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, - ) - else: - io.save_file( - echodata["Vendor_specific"], # TODO: chunking necessary? - path=output_path, - mode="a", - engine=engine, - group="Vendor_specific", - compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, - ) + io.save_file( + echodata["Vendor_specific"], # TODO: chunking necessary? + path=output_path, + mode="a", + engine=engine, + group="Vendor_specific", + compression_settings=COMPRESSION_SETTINGS[engine] if compress else None, + ) def _set_convert_params(param_dict: Dict[str, str]) -> Dict[str, str]: diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 004ebe13a..78940ca63 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -548,6 +548,8 @@ def _append_ds_list_to_zarr( region = self._get_region(ind, set(ds.dims)) ds_drop = ds.drop(const_names) + print(ds_drop) + print(" ") delayed_to_zarr.append( ds_drop.to_zarr( @@ -561,7 +563,7 @@ def _append_ds_list_to_zarr( ) if not to_zarr_compute: - dask.compute(*delayed_to_zarr, retries=1) # TODO: maybe use persist in the future? + dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? # TODO: need to consider the case where range_sample needs to be padded? @@ -635,7 +637,7 @@ def combine( ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list: + if ds_list and grp_info["ep_group"] == "Platform/NMEA": #"Environment": #"Top-level": #"Platform" print(f"ed_group = {ed_group}") @@ -648,18 +650,18 @@ def combine( to_zarr_compute=to_zarr_compute, ) - self._append_const_to_zarr( - const_names, ds_list, path, grp_info["ep_group"], storage_options - ) - - # append all group attributes before combination to zarr store - self._append_provenance_attr_vars(path, storage_options=storage_options) + # self._append_const_to_zarr( + # const_names, ds_list, path, grp_info["ep_group"], storage_options + # ) + # + # # append all group attributes before combination to zarr store + # self._append_provenance_attr_vars(path, storage_options=storage_options) # blosc.use_threads = None # open lazy loaded combined EchoData object - ed_combined = open_converted( - path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - ) # TODO: is this appropriate for chunks? - - return ed_combined + # ed_combined = open_converted( + # path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + # ) # TODO: is this appropriate for chunks? + # + # return ed_combined diff --git a/echopype/utils/coding.py b/echopype/utils/coding.py index ca1339ed5..7adb3a21f 100644 --- a/echopype/utils/coding.py +++ b/echopype/utils/coding.py @@ -5,11 +5,6 @@ import zarr from xarray import coding -COMPRESSION_SETTINGS = { - "netcdf4": {"zlib": True, "complevel": 4}, - "zarr": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)}, -} - DEFAULT_TIME_ENCODING = { "units": "seconds since 1900-01-01T00:00:00+00:00", "calendar": "gregorian", @@ -17,6 +12,16 @@ "dtype": np.dtype("float64"), } +COMPRESSION_SETTINGS = { + "netcdf4": {"zlib": True, "complevel": 4}, + + # zarr compressors were chosen based on xarray results + "zarr": {"float": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)}, + "int": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}, + "string": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}, + "time": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}}, +} + DEFAULT_ENCODINGS = { "ping_time": DEFAULT_TIME_ENCODING, diff --git a/echopype/utils/io.py b/echopype/utils/io.py index 6d1000413..cefc84082 100644 --- a/echopype/utils/io.py +++ b/echopype/utils/io.py @@ -5,6 +5,8 @@ import sys from pathlib import Path from typing import TYPE_CHECKING, Dict, Union +import numpy as np +import xarray as xr import fsspec from fsspec import FSMap @@ -32,14 +34,56 @@ def get_files_from_dir(folder): return [f for f in os.listdir(folder) if os.path.splitext(f)[1] in valid_ext] +def set_zarr_encodings(ds: xr.Dataset, compression_settings: dict): + """ + Sets all variable encodings based on zarr default values + """ + + # create zarr specific encoding + encoding = dict() + for name, val in ds.variables.items(): + + val_encoding = val.encoding + if np.issubdtype(val.dtype, np.floating): + val_encoding.update(compression_settings['float']) + encoding[name] = val_encoding + elif np.issubdtype(val.dtype, np.integer): + val_encoding.update(compression_settings['int']) + encoding[name] = val_encoding + elif np.issubdtype(val.dtype, np.str_): + val_encoding.update(compression_settings['string']) + encoding[name] = val_encoding + elif np.issubdtype(val.dtype, np.datetime64): + val_encoding.update(compression_settings['time']) + encoding[name] = val_encoding + else: + raise NotImplementedError(f"Zarr Encoding for dtype = {val.dtype} has not been set!") + + return encoding + + def save_file(ds, path, mode, engine, group=None, compression_settings=None): """Saves a dataset to netcdf or zarr depending on the engine If ``compression_settings`` are set, compress all variables with those settings""" - encoding = ( - {var: compression_settings for var in ds.variables} - if compression_settings is not None - else {} - ) + + if compression_settings is not None: + + if "float" in compression_settings: # only zarr has this key in it + + encoding = set_zarr_encodings(ds, compression_settings) + + else: + + # TODO: below is the encoding we were using for netcdf, we need to make + # sure that the encoding is appropriate for all data variables + encoding = ( + {var: compression_settings for var in ds.data_vars} + if compression_settings is not None + else {} + ) + else: + encoding = {} + # Allows saving both NetCDF and Zarr files from an xarray dataset if engine == "netcdf4": ds.to_netcdf(path=path, mode=mode, group=group, encoding=encoding) From aaedf5a0e1863ab07c02de2b398d2cfffe5d1d40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Sep 2022 21:57:55 +0000 Subject: [PATCH 29/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/convert/api.py | 3 +-- echopype/echodata/zarr_combine.py | 6 ++++-- echopype/utils/coding.py | 11 ++++++----- echopype/utils/io.py | 12 ++++++------ 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/echopype/convert/api.py b/echopype/convert/api.py index 7fdce7858..0bd0af623 100644 --- a/echopype/convert/api.py +++ b/echopype/convert/api.py @@ -15,9 +15,8 @@ # fmt: on from ..echodata.echodata import XARRAY_ENGINE_MAP, EchoData from ..utils import io -from ..utils.log import _init_logger - from ..utils.coding import COMPRESSION_SETTINGS +from ..utils.log import _init_logger DEFAULT_CHUNK_SIZE = {"range_sample": 25000, "ping_time": 2500} diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 727343af3..0ab47985b 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -563,7 +563,7 @@ def _append_ds_list_to_zarr( ) if not to_zarr_compute: - dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? + dask.compute(*delayed_to_zarr) # , retries=1) # TODO: maybe use persist in the future? # TODO: need to consider the case where range_sample needs to be padded? @@ -639,7 +639,9 @@ def combine( ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list and grp_info["ep_group"] == "Platform/NMEA": #"Environment": #"Top-level": #"Platform" + if ( + ds_list and grp_info["ep_group"] == "Platform/NMEA" + ): # "Environment": #"Top-level": #"Platform" print(f"ed_group = {ed_group}") diff --git a/echopype/utils/coding.py b/echopype/utils/coding.py index 7adb3a21f..6093a7716 100644 --- a/echopype/utils/coding.py +++ b/echopype/utils/coding.py @@ -14,12 +14,13 @@ COMPRESSION_SETTINGS = { "netcdf4": {"zlib": True, "complevel": 4}, - # zarr compressors were chosen based on xarray results - "zarr": {"float": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)}, - "int": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}, - "string": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}, - "time": {"compressor": zarr.Blosc(cname='lz4', clevel=5, shuffle=1, blocksize=0)}}, + "zarr": { + "float": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)}, + "int": {"compressor": zarr.Blosc(cname="lz4", clevel=5, shuffle=1, blocksize=0)}, + "string": {"compressor": zarr.Blosc(cname="lz4", clevel=5, shuffle=1, blocksize=0)}, + "time": {"compressor": zarr.Blosc(cname="lz4", clevel=5, shuffle=1, blocksize=0)}, + }, } diff --git a/echopype/utils/io.py b/echopype/utils/io.py index cefc84082..f6c667384 100644 --- a/echopype/utils/io.py +++ b/echopype/utils/io.py @@ -5,10 +5,10 @@ import sys from pathlib import Path from typing import TYPE_CHECKING, Dict, Union -import numpy as np -import xarray as xr import fsspec +import numpy as np +import xarray as xr from fsspec import FSMap from fsspec.implementations.local import LocalFileSystem @@ -45,16 +45,16 @@ def set_zarr_encodings(ds: xr.Dataset, compression_settings: dict): val_encoding = val.encoding if np.issubdtype(val.dtype, np.floating): - val_encoding.update(compression_settings['float']) + val_encoding.update(compression_settings["float"]) encoding[name] = val_encoding elif np.issubdtype(val.dtype, np.integer): - val_encoding.update(compression_settings['int']) + val_encoding.update(compression_settings["int"]) encoding[name] = val_encoding elif np.issubdtype(val.dtype, np.str_): - val_encoding.update(compression_settings['string']) + val_encoding.update(compression_settings["string"]) encoding[name] = val_encoding elif np.issubdtype(val.dtype, np.datetime64): - val_encoding.update(compression_settings['time']) + val_encoding.update(compression_settings["time"]) encoding[name] = val_encoding else: raise NotImplementedError(f"Zarr Encoding for dtype = {val.dtype} has not been set!") From 64d52424c19770c7ee66b84f894fd2e848afcaf7 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 15:07:21 -0700 Subject: [PATCH 30/89] change Provenance attribute name back to conversion and add zarr compression defaults --- echopype/echodata/zarr_combine.py | 34 +++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 727343af3..b93cfef0f 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -15,6 +15,7 @@ from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData +from ..utils.io import set_zarr_encodings class ZarrCombine: @@ -317,9 +318,21 @@ def _set_encodings( # assign them to a default value # 'compressor': Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) + # TODO: we should probably use ..utils.io function to reduce repetition if "compressor" not in encodings[str(name)]: encodings[str(name)]["compressor"] = COMPRESSION_SETTINGS["zarr"]["compressor"] + if np.issubdtype(val.dtype, np.floating): + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['float']) + elif np.issubdtype(val.dtype, np.integer): + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['int']) + elif np.issubdtype(val.dtype, np.str_): + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['string']) + elif np.issubdtype(val.dtype, np.datetime64): + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['time']) + else: + raise NotImplementedError(f"Zarr Encoding for dtype = {val.dtype} has not been set!") + # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape @@ -538,9 +551,6 @@ def _append_ds_list_to_zarr( synchronizer=zarr.ThreadSynchronizer(), ) - # print("computing ds_lazy") - # dask.compute(out) - # # write each non-constant variable in ds_list to the zarr store delayed_to_zarr = [] for ind, ds in enumerate(ds_list): @@ -548,8 +558,6 @@ def _append_ds_list_to_zarr( region = self._get_region(ind, set(ds.dims)) ds_drop = ds.drop(const_names) - print(ds_drop) - print(" ") delayed_to_zarr.append( ds_drop.to_zarr( @@ -601,7 +609,7 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict # construct Dataset and assign Provenance attributes all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs( - echopype_prov_attrs("combination") + echopype_prov_attrs("conversion") ) # append Dataset to zarr @@ -639,7 +647,7 @@ def combine( ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list and grp_info["ep_group"] == "Platform/NMEA": #"Environment": #"Top-level": #"Platform" + if ds_list: print(f"ed_group = {ed_group}") @@ -652,12 +660,12 @@ def combine( to_zarr_compute=to_zarr_compute, ) - # self._append_const_to_zarr( - # const_names, ds_list, path, grp_info["ep_group"], storage_options - # ) - # - # # append all group attributes before combination to zarr store - # self._append_provenance_attr_vars(path, storage_options=storage_options) + self._append_const_to_zarr( + const_names, ds_list, path, grp_info["ep_group"], storage_options + ) + + # append all group attributes before combination to zarr store + self._append_provenance_attr_vars(path, storage_options=storage_options) # blosc.use_threads = None From b3993bcc679ef42bc9b5c7edeac98a41ad7f855b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Sep 2022 22:09:15 +0000 Subject: [PATCH 31/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/zarr_combine.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 86034f128..b58d4957a 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -11,11 +11,11 @@ import zarr from ..convert.api import COMPRESSION_SETTINGS +from ..utils.io import set_zarr_encodings from ..utils.prov import echopype_prov_attrs from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData -from ..utils.io import set_zarr_encodings class ZarrCombine: @@ -323,15 +323,17 @@ def _set_encodings( encodings[str(name)]["compressor"] = COMPRESSION_SETTINGS["zarr"]["compressor"] if np.issubdtype(val.dtype, np.floating): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['float']) + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["float"]) elif np.issubdtype(val.dtype, np.integer): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['int']) + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["int"]) elif np.issubdtype(val.dtype, np.str_): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['string']) + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["string"]) elif np.issubdtype(val.dtype, np.datetime64): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]['time']) + encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["time"]) else: - raise NotImplementedError(f"Zarr Encoding for dtype = {val.dtype} has not been set!") + raise NotImplementedError( + f"Zarr Encoding for dtype = {val.dtype} has not been set!" + ) # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape @@ -608,9 +610,7 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict xr_dict[name] = {"dims": [name], "data": val} # construct Dataset and assign Provenance attributes - all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs( - echopype_prov_attrs("conversion") - ) + all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("conversion")) # append Dataset to zarr all_ds_attrs.to_zarr( From 496c4703469f0d04195c663954822c359880f5ee Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 15:11:28 -0700 Subject: [PATCH 32/89] remove unnecessary import --- echopype/echodata/zarr_combine.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index b58d4957a..66c6eab8a 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -11,7 +11,6 @@ import zarr from ..convert.api import COMPRESSION_SETTINGS -from ..utils.io import set_zarr_encodings from ..utils.prov import echopype_prov_attrs from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time @@ -670,8 +669,8 @@ def combine( # blosc.use_threads = None # open lazy loaded combined EchoData object - # ed_combined = open_converted( - # path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - # ) # TODO: is this appropriate for chunks? - # - # return ed_combined + ed_combined = open_converted( + path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + ) # TODO: is this appropriate for chunks? + + return ed_combined From 2735334a7ef1b1b50a09840acfc8f9f32dac2aa4 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 15:17:54 -0700 Subject: [PATCH 33/89] remove chunking in Platform group for EK60 set_groups --- echopype/convert/set_groups_ek60.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/echopype/convert/set_groups_ek60.py b/echopype/convert/set_groups_ek60.py index aa88cbc3f..435218ea8 100644 --- a/echopype/convert/set_groups_ek60.py +++ b/echopype/convert/set_groups_ek60.py @@ -9,7 +9,7 @@ from ..utils.prov import echopype_prov_attrs, source_files_vars # fmt: off -from .set_groups_base import DEFAULT_CHUNK_SIZE, SetGroupsBase +from .set_groups_base import SetGroupsBase # fmt: on @@ -250,7 +250,6 @@ def set_platform(self, NMEA_only=False) -> xr.Dataset: ) }, ) - ds = ds.chunk({"time1": DEFAULT_CHUNK_SIZE["ping_time"]}) if not NMEA_only: ch_ids = list(self.parser_obj.config_datagram["transceivers"].keys()) @@ -385,8 +384,6 @@ def set_platform(self, NMEA_only=False) -> xr.Dataset: # Merge with NMEA data ds = xr.merge([ds, ds_plat], combine_attrs="override") - ds = ds.chunk({"time2": DEFAULT_CHUNK_SIZE["ping_time"]}) - return set_encodings(ds) def _set_beam_group1_zarr_vars(self, ds: xr.Dataset) -> xr.Dataset: From ace66dc97476c15748c2b72ebbb6c17c9bd22973 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 16 Sep 2022 16:45:23 -0700 Subject: [PATCH 34/89] add todo about filename variable write --- echopype/echodata/zarr_combine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 66c6eab8a..1de8b726b 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -319,8 +319,6 @@ def _set_encodings( # TODO: we should probably use ..utils.io function to reduce repetition if "compressor" not in encodings[str(name)]: - encodings[str(name)]["compressor"] = COMPRESSION_SETTINGS["zarr"]["compressor"] - if np.issubdtype(val.dtype, np.floating): encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["float"]) elif np.issubdtype(val.dtype, np.integer): @@ -572,7 +570,7 @@ def _append_ds_list_to_zarr( ) if not to_zarr_compute: - dask.compute(*delayed_to_zarr) # , retries=1) # TODO: maybe use persist in the future? + dask.compute(*delayed_to_zarr, retries=1) # TODO: maybe use persist in the future? # TODO: need to consider the case where range_sample needs to be padded? @@ -666,6 +664,8 @@ def combine( # append all group attributes before combination to zarr store self._append_provenance_attr_vars(path, storage_options=storage_options) + # TODO: change filenames numbering to range(len(filenames)) + # blosc.use_threads = None # open lazy loaded combined EchoData object From efe940dd59acba12b478dcae48e175c91c40d3ea Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 20 Sep 2022 17:06:48 -0700 Subject: [PATCH 35/89] allow for variables with different sized dims to be written (primarily focused on different sized range_sample dims) --- echopype/echodata/zarr_combine.py | 40 ++++++++++++++----------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 1de8b726b..d1427c3b1 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -413,8 +413,8 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: Parameters ---------- ds_ind: int - The key of the values of ``dims_csum`` to use for each - dimension name + The key of the values of ``dims_csum`` or index of + ``self.dims_df`` to use for each dimension name ds_dims: Set[Hashable] The names of the dimensions used in the region creation @@ -423,29 +423,25 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: region: Dict[str, slice] Keys set as the dimension name and values as the slice of the zarr portion to write to - - Notes - ----- - Only append dimensions should show up in the region result. """ - if ds_ind == 0: + # get the initial region + region = dict() + for dim in ds_dims: - # get the initial region - region = { - dim: slice(0, self.dims_csum[dim][ds_ind]) - for dim in ds_dims - if dim in self.append_dims - } + if dim in self.append_dims: - else: + if ds_ind == 0: + # get the initial region + region[dim] = slice(0, self.dims_csum[dim][ds_ind]) + else: + # get all other regions + region[dim] = slice( + self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind] + ) - # get all other regions - region = { - dim: slice(self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind]) - for dim in ds_dims - if dim in self.append_dims - } + else: + region[dim] = slice(0, self.dims_df.loc[ds_ind][dim]) return region @@ -554,10 +550,10 @@ def _append_ds_list_to_zarr( delayed_to_zarr = [] for ind, ds in enumerate(ds_list): - region = self._get_region(ind, set(ds.dims)) - ds_drop = ds.drop(const_names) + region = self._get_region(ind, set(ds_drop.dims)) + delayed_to_zarr.append( ds_drop.to_zarr( path, From 5e806d1e5e9a43bd059c475ee2370298c59615ed Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 22 Sep 2022 14:44:18 -0700 Subject: [PATCH 36/89] document and finalize check_channels --- echopype/echodata/zarr_combine.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index d1427c3b1..72f0745db 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -38,6 +38,7 @@ def __init__(self): # defaultdict that holds every group's attributes self.group_attrs = defaultdict(list) + # The sonar_model for the new combined EchoData object self.sonar_model = None def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): @@ -74,16 +75,21 @@ def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): # print(f"old_time = {old_time}, group = {ed_name}") - def _check_channels(self, ds_list: List[xr.Dataset], ed_name: str): + @staticmethod + def _check_channels(ds_list: List[xr.Dataset], ed_name: str) -> None: """ Makes sure that each Dataset in ``ds_list`` has the same number of channels and the same name for each of these channels. + Parameters + ---------- + ds_list: List[xr.Dataset] + List of Datasets to be combined + ed_name: str + The name of the ``EchoData`` group being combined """ - # TODO: document this! - if "channel" in ds_list[0].dims: # check to make sure we have the same number of channels in each ds From 7e5553164d1b50f7d68d9ddc66341f8ed2dbb35e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 22 Sep 2022 15:11:03 -0700 Subject: [PATCH 37/89] document and finalize check_ascending_ds_times --- echopype/echodata/zarr_combine.py | 45 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 72f0745db..6c52373fb 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -41,39 +41,60 @@ def __init__(self): # The sonar_model for the new combined EchoData object self.sonar_model = None - def _check_ds_times(self, ds_list: List[xr.Dataset], ed_name: str): + def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> None: + """ + Ensures that the time dimensions are in ascending order + across all Datasets being combined. For example, the + maximum time of the first Dataset must be less than the + minimum time of the second Dataset. - # TODO: document this! + Parameters + ---------- + ds_list: List[xr.Dataset] + List of Datasets to be combined + ed_name: str + The name of the ``EchoData`` group being combined + """ + # get all time dimensions of the input Datasets ed_time_dim = set(ds_list[0].dims).intersection(self.possible_time_dims) for time in ed_time_dim: + # get maximum and minimum time of all Datasets max_time = [ds[time].max().values for ds in ds_list] min_time = [ds[time].min().values for ds in ds_list] + # see if all Datasets have NaN for time max_all_nan = all(np.isnan(max_time)) min_all_nan = all(np.isnan(min_time)) + # True means our time is not filled with NaNs + # This is necessary because some time dims can be filled with NaNs + nan_time_cond = (not max_all_nan) and (not min_all_nan) + # checks to see that times are in ascending order - if max_time[:-1] > min_time[1:] and (not max_all_nan) and (not min_all_nan): + if nan_time_cond and max_time[:-1] > min_time[1:]: raise RuntimeError( f"The coordinate {time} is not in ascending order for group {ed_name}, " f"combine cannot be used!" ) - # TODO: check and store time values + def _reverse_time_check_and_storage(self, ds_list: List[xr.Dataset], ed_name: str): + + # TODO: check and store time values - # TODO: do this first [exist_reversed_time(ds, time_str) for ds in ds_list] - # if any are True, then continue by creating an old time variable in each ds + # TODO: do this first [exist_reversed_time(ds, time_str) for ds in ds_list] + # if any are True, then continue by creating an old time variable in each ds - # for ds in ds_list: - # old_time = check_and_correct_reversed_time( - # ds, time_str=str(time), sonar_model=self.sonar_model - # ) + # for ds in ds_list: + # old_time = check_and_correct_reversed_time( + # ds, time_str=str(time), sonar_model=self.sonar_model + # ) - # print(f"old_time = {old_time}, group = {ed_name}") + old_time = None + print(f"old_time = {old_time}, group = {ed_name}") @staticmethod def _check_channels(ds_list: List[xr.Dataset], ed_name: str) -> None: @@ -218,7 +239,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non in the attributes of the combined ``EchoData`` object. """ - self._check_ds_times(ds_list, ed_name) + self._check_ascending_ds_times(ds_list, ed_name) self._check_channels(ds_list, ed_name) # Dataframe with column as dim names and rows as the different Datasets From 41bccab23785c3b949877fd0ea2bc83b9337a6ba Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 23 Sep 2022 17:28:36 -0700 Subject: [PATCH 38/89] investigate decompression error and create routines that can identify when the same chunk is being written to --- echopype/echodata/zarr_combine.py | 186 +++- echopype/test_data/README.md | 3 - .../test_cluster_dump/test_zarr_combine.yaml | 873 ++++++++++++++++++ .../tests/echodata/test_echodata_combine.py | 35 + 4 files changed, 1043 insertions(+), 54 deletions(-) create mode 100644 echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 6c52373fb..afc81fe66 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -82,6 +82,13 @@ def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> ) def _reverse_time_check_and_storage(self, ds_list: List[xr.Dataset], ed_name: str): + """ + Determine if there exist reversed time dimensions in each + of the Datasets individually. Additionally, if there are + reversed times correct them and store the old time dimension + as a variable of + + """ # TODO: check and store time values @@ -249,6 +256,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_sum = self.dims_df.sum(axis=0).to_dict() self.dims_csum = self.dims_df.cumsum(axis=0).to_dict() self.dims_max = self.dims_df.max(axis=0).to_dict() + self.dims_min = self.dims_df.min(axis=0).to_dict() # format ed_name appropriately ed_name = ed_name.replace("-", "_").replace("/", "_").lower() @@ -304,9 +312,13 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), ] # Create the chunk shape of the variable - chnk_shape = [self.dims_max[dim] for dim in dims] + # TODO: investigate which of the two chunk shapes is best + # chnk_shape = [self.dims_max[dim] for dim in dims] + chnk_shape = [ + self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims + ] - temp_arr = dask.array.zeros(shape=shape, chunks=chnk_shape, dtype=dtype) + temp_arr = dask.array.zeros(shape=shape, dtype=dtype, chunks=chnk_shape) return temp_arr, chnk_shape @@ -472,54 +484,48 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region - def _append_const_to_zarr( - self, - const_vars: List[str], - ds_list: List[xr.Dataset], - path: str, - zarr_group: str, - storage_options: dict, - ): + @staticmethod + def get_intervals(csum): + """creates a list of intervals from a cumulative sum + + use case: cumulative sum of max append dimensions or + self.dims_csum """ - Appends all constant (i.e. not chunked) variables and dimensions to the - zarr group. - Parameters - ---------- - const_vars: List[str] - The names of all variables/dimensions that are not chunked - ds_list: List[xr.Dataset] - The Datasets that will be combined - path: str - The full path of the final combined zarr store - zarr_group: str - The name of the group of the zarr store - corresponding to the Datasets in ``ds_list`` - storage_options: dict - Any additional parameters for the storage - backend (ignored for local paths) + # TODO: Document this - Notes - ----- - Those variables/dimensions that are in ``self.append_dims`` - should not be appended here. + intervals = [] + for count, val in enumerate(csum): + + if count == 0: + # get the initial region + intervals.append(pd.Interval(left=0, right=val, closed="left")) + + else: + # get all other regions + intervals.append(pd.Interval(left=csum[count - 1], right=val, closed="left")) + + return intervals + + @staticmethod + def get_common_chunks(interval_list_dim, interval_list_max): """ + determines what intervals overlap - # write constant vars to zarr using the first element of ds_list - for var in const_vars: + use case: makes it so we can determine which to_zarr calls will + write to the same chunk, we can use this result to do dask locking - # TODO: when range_sample needs to be padded, here we will - # need to pick the dataset with the max size for range_sample - # (might be done with change below) + """ - # make sure to choose the dataset with the largest size for variable - if var in self.dims_df: - ds_list_ind = int(self.dims_df[var].argmax()) - else: - ds_list_ind = int(0) + chunks = defaultdict(list) - ds_list[ds_list_ind][[var]].to_zarr( - path, group=zarr_group, mode="a", storage_options=storage_options + for i in range(len(interval_list_max)): + chunks[i].extend( + [ + count + for count, interval in enumerate(interval_list_dim) + if interval_list_max[i].overlaps(interval) + ] ) def _append_ds_list_to_zarr( @@ -573,32 +579,110 @@ def _append_ds_list_to_zarr( synchronizer=zarr.ThreadSynchronizer(), ) + def ds_to_zarr(dataset, write_path, zarr_grp, rgn, storage_opts, synch): + + dataset.to_zarr( + write_path, + group=zarr_grp, + region=rgn, + storage_options=storage_opts, + compute=True, + synchronizer=synch, + ) + # write each non-constant variable in ds_list to the zarr store - delayed_to_zarr = [] + # delayed_to_zarr = [] + to_zarr_futures = [] for ind, ds in enumerate(ds_list): + # TODO: may need to write ds in stages of append dimension + # e.g. split ds into a ds with time1 dim and a ds with + # time2 dim, then write them using the locking. + ds_drop = ds.drop(const_names) region = self._get_region(ind, set(ds_drop.dims)) - delayed_to_zarr.append( - ds_drop.to_zarr( + # delayed_to_zarr.append( + # ds_drop.to_zarr( + # path, + # group=zarr_group, + # region=region, + # storage_options=storage_options, + # compute=to_zarr_compute, + # synchronizer=zarr.ThreadSynchronizer(), + # ) + # ) + to_zarr_futures.append( + dask.distributed.get_client().submit( + ds_to_zarr, + ds_drop, path, - group=zarr_group, - region=region, - storage_options=storage_options, - compute=to_zarr_compute, - synchronizer=zarr.ThreadSynchronizer(), + zarr_group, + region, + storage_options, + zarr.ThreadSynchronizer(), ) ) if not to_zarr_compute: - dask.compute(*delayed_to_zarr, retries=1) # TODO: maybe use persist in the future? + # dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? + [f.result() for f in to_zarr_futures] # TODO: need to consider the case where range_sample needs to be padded? return const_names + def _append_const_to_zarr( + self, + const_vars: List[str], + ds_list: List[xr.Dataset], + path: str, + zarr_group: str, + storage_options: dict, + ): + """ + Appends all constant (i.e. not chunked) variables and dimensions to the + zarr group. + + Parameters + ---------- + const_vars: List[str] + The names of all variables/dimensions that are not chunked + ds_list: List[xr.Dataset] + The Datasets that will be combined + path: str + The full path of the final combined zarr store + zarr_group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + storage_options: dict + Any additional parameters for the storage + backend (ignored for local paths) + + Notes + ----- + Those variables/dimensions that are in ``self.append_dims`` + should not be appended here. + """ + + # write constant vars to zarr using the first element of ds_list + for var in const_vars: + + # TODO: when range_sample needs to be padded, here we will + # need to pick the dataset with the max size for range_sample + # (might be done with change below) + + # make sure to choose the dataset with the largest size for variable + if var in self.dims_df: + ds_list_ind = int(self.dims_df[var].argmax()) + else: + ds_list_ind = int(0) + + ds_list[ds_list_ind][[var]].to_zarr( + path, group=zarr_group, mode="a", storage_options=storage_options + ) + def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict] = {}) -> None: """ Creates an xarray Dataset with variables set as the attributes diff --git a/echopype/test_data/README.md b/echopype/test_data/README.md index d3295604e..c79ad71f3 100644 --- a/echopype/test_data/README.md +++ b/echopype/test_data/README.md @@ -11,8 +11,6 @@ Most of these files are stored on Git LFS but the ones that aren't (due to file - 2019118 group2survey-D20191214-T081342.raw: Contains 6 channels but only 2 of those channels collect ping data - D20200528-T125932.raw: Data collected from WBT mini (instead of WBT), from @emlynjdavies - Green2.Survey2.FM.short.slow.-D20191004-T211557.raw: Contains 2-in-1 transducer, from @FletcherFT (reduced from 104.9 MB to 765 KB in test data updates) -- raw4-D20220514-T172704.raw: Contains RAW4 datagram, 1 channel only, from @cornejotux -- D20210330-T123857.raw: do not contain filter coefficients ### EA640 @@ -24,7 +22,6 @@ Most of these files are stored on Git LFS but the ones that aren't (due to file - Winter2017-D20170115-T150122.raw: Contains a change of recording length in the middle of the file - 2015843-D20151023-T190636.raw: Not used in tests but contains ranges are not constant across ping times - SH1701_consecutive_files_w_range_change: Not used in tests. [Folder](https://drive.google.com/drive/u/1/folders/1PaDtL-xnG5EK3N3P1kGlXa5ub16Yic0f) on shared drive that contains sequential files with ranges that are not constant across ping times. -- NBP_B050N-D20180118-T090228.raw: split-beam setup without angle data ### AZFP diff --git a/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml b/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml new file mode 100644 index 000000000..7a89549a4 --- /dev/null +++ b/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml @@ -0,0 +1,873 @@ +scheduler: + address: tcp://127.0.0.1:50971 + clients: + Client-854fe396-3b63-11ed-b660-7aef93c2516e: + client_key: Client-854fe396-3b63-11ed-b660-7aef93c2516e + last_seen: 1663953414.2823439 + wants_what: [] + fire-and-forget: + client_key: fire-and-forget + last_seen: 1663953414.2209349 + wants_what: [] + events: + Client-854fe396-3b63-11ed-b660-7aef93c2516e: + - - 1663953414.282331 + - action: add-client + client: Client-854fe396-3b63-11ed-b660-7aef93c2516e + all: + - - 1663953414.261132 + - action: add-worker + worker: tcp://127.0.0.1:50972 + - - 1663953414.262537 + - action: add-worker + worker: tcp://127.0.0.1:50974 + - - 1663953414.282331 + - action: add-client + client: Client-854fe396-3b63-11ed-b660-7aef93c2516e + stealing: [] + tcp://127.0.0.1:50972: + - - 1663953414.261111 + - action: add-worker + - - 1663953414.265065 + - action: worker-status-change + prev-status: init + status: running + tcp://127.0.0.1:50974: + - - 1663953414.262531 + - action: add-worker + - - 1663953414.2653031 + - action: worker-status-change + prev-status: init + status: running + extensions: + amm: + events: + locks: + memory_sampler: + multi_locks: + publish: + pubsub: + queues: + replay-tasks: + semaphores: + shuffle: + stealing: + cost_multipliers: + - 1.0 + - 1.03125 + - 1.0625 + - 1.125 + - 1.25 + - 1.5 + - 2 + - 3 + - 5 + - 9 + - 17 + - 33 + - 65 + - 129 + - 257 + count: 0 + in_flight: {} + in_flight_occupancy: {} + key_stealable: {} + scheduler: + address: tcp://127.0.0.1:50971 + clients: + Client-854fe396-3b63-11ed-b660-7aef93c2516e: + fire-and-forget: + events: + Client-854fe396-3b63-11ed-b660-7aef93c2516e: + - - 1663953414.282331 + - action: add-client + client: Client-854fe396-3b63-11ed-b660-7aef93c2516e + all: + - - 1663953414.261132 + - action: add-worker + worker: tcp://127.0.0.1:50972 + - - 1663953414.262537 + - action: add-worker + worker: tcp://127.0.0.1:50974 + - - 1663953414.282331 + - action: add-client + client: Client-854fe396-3b63-11ed-b660-7aef93c2516e + stealing: [] + tcp://127.0.0.1:50972: + - - 1663953414.261111 + - action: add-worker + - - 1663953414.265065 + - action: worker-status-change + prev-status: init + status: running + tcp://127.0.0.1:50974: + - - 1663953414.262531 + - action: add-worker + - - 1663953414.2653031 + - action: worker-status-change + prev-status: init + status: running + extensions: '{''locks'': , + ''multi_locks'': , + ''publish'': , + ''replay-tasks'': , ''queues'': , ''variables'': , ''pubsub'': , ''semaphores'': , ''events'': , ''amm'': , ''memory_sampler'': , ''shuffle'': , ''stealing'': }' + id: Scheduler-6409852a-5ae7-46a1-956a-da3f1329a529 + log: [] + memory: + managed: 0 + managed_in_memory: 0 + managed_spilled: 0 + optimistic: 388939776 + process: 388939776 + unmanaged: 388939776 + unmanaged_old: 388939776 + unmanaged_recent: 0 + services: + dashboard: 50970 + started: 1663953414.037181 + status: running + task_groups: {} + tasks: {} + thread_id: 8633697792 + transition_counter: 0 + transition_log: [] + type: Scheduler + workers: + tcp://127.0.0.1:50972: '' + tcp://127.0.0.1:50974: '' + stealable: + tcp://127.0.0.1:50972: + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + tcp://127.0.0.1:50974: + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + stealable_all: + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + - [] + variables: + id: Scheduler-6409852a-5ae7-46a1-956a-da3f1329a529 + log: [] + memory: + managed: 0 + managed_in_memory: 0 + managed_spilled: 0 + optimistic: 388939776 + process: 388939776 + unmanaged: 388939776 + unmanaged_old: 388939776 + unmanaged_recent: 0 + services: + dashboard: 50970 + started: 1663953414.037181 + status: running + task_groups: {} + tasks: {} + thread_id: 8633697792 + transition_counter: 0 + transition_log: [] + type: Scheduler + workers: + tcp://127.0.0.1:50972: + actors: [] + address: tcp://127.0.0.1:50972 + bandwidth: 100000000 + executing: {} + extra: {} + has_what: [] + host: 127.0.0.1 + last_seen: 1663953414.261237 + local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T/dask-worker-space/worker-zep12oa1 + long_running: [] + memory: + managed: 0 + managed_in_memory: 0 + managed_spilled: 0 + optimistic: 194433024 + process: 194433024 + unmanaged: 194433024 + unmanaged_old: 194433024 + unmanaged_recent: 0 + memory_limit: 17179869184 + metrics: + bandwidth: + total: 100000000 + types: {} + workers: {} + cpu: 0.0 + event_loop_interval: 0.5 + executing: 0 + in_flight: 0 + in_memory: 0 + memory: 194433024 + num_fds: 25 + read_bytes: 0.0 + read_bytes_disk: 0.0 + ready: 0 + spilled_nbytes: + disk: 0 + memory: 0 + time: 1663953414.226922 + write_bytes: 0.0 + write_bytes_disk: 0.0 + name: 0 + nanny: null + nbytes: 0 + nthreads: 1 + occupancy: 0 + pid: 95840 + processing: {} + resources: {} + server_id: Worker-a9534fbd-86d5-428d-a260-ede2c284bea8 + services: + dashboard: 50973 + status: '' + time_delay: 0.022827863693237305 + used_resources: {} + tcp://127.0.0.1:50974: + actors: [] + address: tcp://127.0.0.1:50974 + bandwidth: 100000000 + executing: {} + extra: {} + has_what: [] + host: 127.0.0.1 + last_seen: 1663953414.2626 + local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T/dask-worker-space/worker-bqrcff5y + long_running: [] + memory: + managed: 0 + managed_in_memory: 0 + managed_spilled: 0 + optimistic: 194506752 + process: 194506752 + unmanaged: 194506752 + unmanaged_old: 194506752 + unmanaged_recent: 0 + memory_limit: 17179869184 + metrics: + bandwidth: + total: 100000000 + types: {} + workers: {} + cpu: 0.0 + event_loop_interval: 0.5 + executing: 0 + in_flight: 0 + in_memory: 0 + memory: 194506752 + num_fds: 26 + read_bytes: 0.0 + read_bytes_disk: 0.0 + ready: 0 + spilled_nbytes: + disk: 0 + memory: 0 + time: 1663953414.229425 + write_bytes: 0.0 + write_bytes_disk: 0.0 + name: 1 + nanny: null + nbytes: 0 + nthreads: 2 + occupancy: 0 + pid: 95840 + processing: {} + resources: {} + server_id: Worker-44330199-ed16-48c6-b2c0-3554d0acd9c0 + services: + dashboard: 50975 + status: '' + time_delay: 0.003255128860473633 + used_resources: {} +versions: + host: + LANG: None + LC_ALL: None + OS: Darwin + OS-release: 21.5.0 + byteorder: little + machine: x86_64 + processor: i386 + python: 3.9.12.final.0 + python-bits: 64 + packages: + cloudpickle: 2.1.0 + dask: 2022.8.0 + distributed: 2022.8.0 + lz4: 4.0.0 + msgpack: 1.0.4 + numpy: 1.23.1 + pandas: 1.4.3 + python: 3.9.12.final.0 + toolz: 0.12.0 + tornado: '6.1' +workers: + tcp://127.0.0.1:50972: + address: tcp://127.0.0.1:50972 + busy_workers: [] + config: + array: + chunk-size: 128MiB + rechunk-threshold: 4 + slicing: + split-large-chunks: null + svg: + size: 120 + dataframe: + parquet: + metadata-task-size-local: 512 + metadata-task-size-remote: 16 + shuffle-compression: null + distributed: + adaptive: + interval: 1s + maximum: .inf + minimum: 0 + target-duration: 5s + wait-count: 3 + admin: + event-loop: tornado + log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + log-length: 10000 + max-error-length: 10000 + pdb-on-err: false + system-monitor: + disk: true + interval: 500ms + tick: + cycle: 1s + interval: 500 ms + limit: 3s + client: + heartbeat: 5s + preload: [] + preload-argv: [] + scheduler-info-interval: 2s + security-loader: null + comm: + compression: auto + default-scheme: tcp + offload: 10MiB + recent-messages-log-length: 0 + require-encryption: null + retry: + count: 0 + delay: + max: 20s + min: 1s + shard: 64MiB + socket-backlog: 2048 + tcp: + backend: tornado + timeouts: + connect: 5s + tcp: 30s + tls: + ca-file: null + ciphers: null + client: + cert: null + key: null + max-version: null + min-version: 1.2 + scheduler: + cert: null + key: null + worker: + cert: null + key: null + ucx: + create-cuda-context: null + cuda-copy: null + infiniband: null + nvlink: null + rdmacm: null + tcp: null + websockets: + shard: 8MiB + zstd: + level: 3 + threads: 0 + dashboard: + export-tool: false + graph-max-items: 5000 + link: '{scheme}://{host}:{port}/status' + prometheus: + namespace: dask + deploy: + cluster-repr-interval: 500ms + lost-worker-timeout: 15s + diagnostics: + computations: + ignore-modules: + - distributed + - dask + - xarray + - cudf + - cuml + - prefect + - xgboost + max-history: 100 + erred-tasks: + max-history: 100 + nvml: true + nanny: + environ: + MALLOC_TRIM_THRESHOLD_: 65536 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + preload: [] + preload-argv: [] + rmm: + pool-size: null + scheduler: + active-memory-manager: + interval: 2s + policies: + - class: distributed.active_memory_manager.ReduceReplicas + start: false + allowed-failures: 3 + allowed-imports: + - dask + - distributed + bandwidth: 100000000 + blocked-handlers: [] + contact-address: null + dashboard: + bokeh-application: + allow_websocket_origin: + - '*' + check_unused_sessions_milliseconds: 500 + keep_alive_milliseconds: 500 + status: + task-stream-length: 1000 + tasks: + task-stream-length: 100000 + tls: + ca-file: null + cert: null + key: null + default-data-size: 1kiB + default-task-durations: + rechunk-split: 1us + split-shuffle: 1us + events-cleanup-delay: 1h + events-log-length: 100000 + http: + routes: + - distributed.http.scheduler.prometheus + - distributed.http.scheduler.info + - distributed.http.scheduler.json + - distributed.http.health + - distributed.http.proxy + - distributed.http.statistics + idle-timeout: null + locks: + lease-timeout: 30s + lease-validation-interval: 10s + pickle: true + preload: [] + preload-argv: [] + transition-log-length: 100000 + unknown-task-duration: 500ms + validate: false + work-stealing: true + work-stealing-interval: 100ms + worker-ttl: 5 minutes + version: 2 + worker: + blocked-handlers: [] + connections: + incoming: 10 + outgoing: 50 + daemon: true + http: + routes: + - distributed.http.worker.prometheus + - distributed.http.health + - distributed.http.statistics + lifetime: + duration: null + restart: false + stagger: 0 seconds + memory: + max-spill: false + monitor-interval: 100ms + pause: 0.8 + rebalance: + measure: optimistic + recipient-max: 0.6 + sender-min: 0.3 + sender-recipient-gap: 0.1 + recent-to-old-time: 30s + spill: 0.7 + target: 0.6 + terminate: 0.95 + multiprocessing-method: spawn + preload: [] + preload-argv: [] + profile: + cycle: 1000ms + enabled: false + interval: 10ms + low-level: false + resources: {} + use-file-locking: true + validate: false + local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T + optimization: + fuse: + active: null + ave-width: 1 + max-depth-new-edges: null + max-height: .inf + max-width: null + rename-keys: true + subgraphs: null + scheduler: dask.distributed + shuffle: tasks + temporary-directory: null + tokenize: + ensure-deterministic: false + visualization: + engine: null + constrained: [] + data: {} + data_needed: {} + executing: [] + id: Worker-a9534fbd-86d5-428d-a260-ede2c284bea8 + in_flight_tasks: [] + in_flight_workers: {} + incoming_transfer_log: [] + log: [] + logs: [] + long_running: [] + max_spill: false + memory_limit: 17179869184 + memory_monitor_interval: 0.1 + memory_pause_fraction: 0.8 + memory_spill_fraction: 0.7 + memory_target_fraction: 0.6 + nthreads: 1 + outgoing_transfer_log: [] + ready: [] + running: true + scheduler: tcp://127.0.0.1:50971 + status: '' + stimulus_log: [] + tasks: {} + thread_id: 8633697792 + transition_counter: 0 + type: Worker + tcp://127.0.0.1:50974: + address: tcp://127.0.0.1:50974 + busy_workers: [] + config: + array: + chunk-size: 128MiB + rechunk-threshold: 4 + slicing: + split-large-chunks: null + svg: + size: 120 + dataframe: + parquet: + metadata-task-size-local: 512 + metadata-task-size-remote: 16 + shuffle-compression: null + distributed: + adaptive: + interval: 1s + maximum: .inf + minimum: 0 + target-duration: 5s + wait-count: 3 + admin: + event-loop: tornado + log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + log-length: 10000 + max-error-length: 10000 + pdb-on-err: false + system-monitor: + disk: true + interval: 500ms + tick: + cycle: 1s + interval: 500 ms + limit: 3s + client: + heartbeat: 5s + preload: [] + preload-argv: [] + scheduler-info-interval: 2s + security-loader: null + comm: + compression: auto + default-scheme: tcp + offload: 10MiB + recent-messages-log-length: 0 + require-encryption: null + retry: + count: 0 + delay: + max: 20s + min: 1s + shard: 64MiB + socket-backlog: 2048 + tcp: + backend: tornado + timeouts: + connect: 5s + tcp: 30s + tls: + ca-file: null + ciphers: null + client: + cert: null + key: null + max-version: null + min-version: 1.2 + scheduler: + cert: null + key: null + worker: + cert: null + key: null + ucx: + create-cuda-context: null + cuda-copy: null + infiniband: null + nvlink: null + rdmacm: null + tcp: null + websockets: + shard: 8MiB + zstd: + level: 3 + threads: 0 + dashboard: + export-tool: false + graph-max-items: 5000 + link: '{scheme}://{host}:{port}/status' + prometheus: + namespace: dask + deploy: + cluster-repr-interval: 500ms + lost-worker-timeout: 15s + diagnostics: + computations: + ignore-modules: + - distributed + - dask + - xarray + - cudf + - cuml + - prefect + - xgboost + max-history: 100 + erred-tasks: + max-history: 100 + nvml: true + nanny: + environ: + MALLOC_TRIM_THRESHOLD_: 65536 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + preload: [] + preload-argv: [] + rmm: + pool-size: null + scheduler: + active-memory-manager: + interval: 2s + policies: + - class: distributed.active_memory_manager.ReduceReplicas + start: false + allowed-failures: 3 + allowed-imports: + - dask + - distributed + bandwidth: 100000000 + blocked-handlers: [] + contact-address: null + dashboard: + bokeh-application: + allow_websocket_origin: + - '*' + check_unused_sessions_milliseconds: 500 + keep_alive_milliseconds: 500 + status: + task-stream-length: 1000 + tasks: + task-stream-length: 100000 + tls: + ca-file: null + cert: null + key: null + default-data-size: 1kiB + default-task-durations: + rechunk-split: 1us + split-shuffle: 1us + events-cleanup-delay: 1h + events-log-length: 100000 + http: + routes: + - distributed.http.scheduler.prometheus + - distributed.http.scheduler.info + - distributed.http.scheduler.json + - distributed.http.health + - distributed.http.proxy + - distributed.http.statistics + idle-timeout: null + locks: + lease-timeout: 30s + lease-validation-interval: 10s + pickle: true + preload: [] + preload-argv: [] + transition-log-length: 100000 + unknown-task-duration: 500ms + validate: false + work-stealing: true + work-stealing-interval: 100ms + worker-ttl: 5 minutes + version: 2 + worker: + blocked-handlers: [] + connections: + incoming: 10 + outgoing: 50 + daemon: true + http: + routes: + - distributed.http.worker.prometheus + - distributed.http.health + - distributed.http.statistics + lifetime: + duration: null + restart: false + stagger: 0 seconds + memory: + max-spill: false + monitor-interval: 100ms + pause: 0.8 + rebalance: + measure: optimistic + recipient-max: 0.6 + sender-min: 0.3 + sender-recipient-gap: 0.1 + recent-to-old-time: 30s + spill: 0.7 + target: 0.6 + terminate: 0.95 + multiprocessing-method: spawn + preload: [] + preload-argv: [] + profile: + cycle: 1000ms + enabled: false + interval: 10ms + low-level: false + resources: {} + use-file-locking: true + validate: false + local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T + optimization: + fuse: + active: null + ave-width: 1 + max-depth-new-edges: null + max-height: .inf + max-width: null + rename-keys: true + subgraphs: null + scheduler: dask.distributed + shuffle: tasks + temporary-directory: null + tokenize: + ensure-deterministic: false + visualization: + engine: null + constrained: [] + data: {} + data_needed: {} + executing: [] + id: Worker-44330199-ed16-48c6-b2c0-3554d0acd9c0 + in_flight_tasks: [] + in_flight_workers: {} + incoming_transfer_log: [] + log: [] + logs: [] + long_running: [] + max_spill: false + memory_limit: 17179869184 + memory_monitor_interval: 0.1 + memory_pause_fraction: 0.8 + memory_spill_fraction: 0.7 + memory_target_fraction: 0.6 + nthreads: 2 + outgoing_transfer_log: [] + ready: [] + running: true + scheduler: tcp://127.0.0.1:50971 + status: '' + stimulus_log: [] + tasks: {} + thread_id: 8633697792 + transition_counter: 0 + type: Worker diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index 229e3178e..394560bc2 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -12,6 +12,8 @@ from echopype.qc import exist_reversed_time from echopype.core import SONAR_MODELS +import zarr + @pytest.fixture def ek60_test_data(test_path): @@ -319,3 +321,36 @@ def test_combined_echodata_repr(ek60_test_data): actual = "\n".join(x.rstrip() for x in repr(combined).split("\n")) assert actual == expected_repr + + +# TODO: consider the following test structures +# from distributed.utils_test import client +# @gen_cluster(client=True) +# async def test_zarr_combine(client, scheduler, w1, w2): +# from distributed.utils_test import gen_cluster, inc +# from distributed.utils_test import client, loop, cluster_fixture, loop_in_thread, cleanup + +# from dask.distributed import Client +# +# # @pytest.fixture(scope="session") +# def test_zarr_combine(): +# +# client = Client() # n_workers=1) +# +# from fsspec.implementations.local import LocalFileSystem +# fs = LocalFileSystem() +# +# desired_raw_file_paths = fs.glob('/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/OOI_zarrs_ep_ex/temp/*.zarr') +# +# ed_lazy = [] +# for ed_path in desired_raw_file_paths: +# print(ed_path) +# ed_lazy.append(echopype.open_converted(ed_path, chunks='auto', +# synchronizer=zarr.ThreadSynchronizer())) +# +# from echopype.echodata.zarr_combine import ZarrCombine +# +# path = '/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/test.zarr' +# comb = ZarrCombine() +# +# ed_combined = comb.combine(path, ed_lazy, storage_options={}) \ No newline at end of file From b5f2acd6116c6a313ba300a34dfb72e3f807213b Mon Sep 17 00:00:00 2001 From: b-reyes Date: Sun, 25 Sep 2022 12:17:00 -0700 Subject: [PATCH 39/89] start working on locking writes to zarr --- echopype/echodata/zarr_combine.py | 137 ++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 43 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index afc81fe66..a40176027 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -10,9 +10,10 @@ import xarray as xr import zarr -from ..convert.api import COMPRESSION_SETTINGS +from ..utils.coding import COMPRESSION_SETTINGS from ..utils.prov import echopype_prov_attrs -from .api import open_converted + +# from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData @@ -313,10 +314,10 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), # Create the chunk shape of the variable # TODO: investigate which of the two chunk shapes is best - # chnk_shape = [self.dims_max[dim] for dim in dims] - chnk_shape = [ - self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims - ] + chnk_shape = [self.dims_max[dim] for dim in dims] + # chnk_shape = [ + # self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims + # ] temp_arr = dask.array.zeros(shape=shape, dtype=dtype, chunks=chnk_shape) @@ -492,7 +493,7 @@ def get_intervals(csum): self.dims_csum """ - # TODO: Document this + # TODO: Document this! intervals = [] for count, val in enumerate(csum): @@ -517,6 +518,8 @@ def get_common_chunks(interval_list_dim, interval_list_max): """ + # TODO: Document this! + chunks = defaultdict(list) for i in range(len(interval_list_max)): @@ -528,6 +531,38 @@ def get_common_chunks(interval_list_dim, interval_list_max): ] ) + return chunks + + @staticmethod + def get_common_chunks_key(common_chunks, ind): + """ + Obtains the key in common chunk whose value + contains ind + + """ + + # TODO: Document this! + + for key, val in common_chunks.items(): + + if ind in val: + return key + + @dask.delayed + def write_ds_to_zarr(self, ds_in, path, group, rgn, name, storage_opts, sync): + + # TODO: document this! + + with dask.distributed.Lock(name): + ds_in.to_zarr( + path, + group=group, + region=rgn, + compute=True, + storage_options=storage_opts, + synchronizer=sync, + ) + def _append_ds_list_to_zarr( self, path: str, @@ -579,20 +614,8 @@ def _append_ds_list_to_zarr( synchronizer=zarr.ThreadSynchronizer(), ) - def ds_to_zarr(dataset, write_path, zarr_grp, rgn, storage_opts, synch): - - dataset.to_zarr( - write_path, - group=zarr_grp, - region=rgn, - storage_options=storage_opts, - compute=True, - synchronizer=synch, - ) - # write each non-constant variable in ds_list to the zarr store # delayed_to_zarr = [] - to_zarr_futures = [] for ind, ds in enumerate(ds_list): # TODO: may need to write ds in stages of append dimension @@ -601,7 +624,46 @@ def ds_to_zarr(dataset, write_path, zarr_grp, rgn, storage_opts, synch): ds_drop = ds.drop(const_names) - region = self._get_region(ind, set(ds_drop.dims)) + append_dims_in_ds = set(ds_drop.dims).intersection(self.append_dims) + + # TODO: there may be a better way to obtain the common chunks! + for dim in append_dims_in_ds: + # print(f"dim = {dim}") + # get all of those variables with dim in their dimensions + vars_w_dim = [val.name for val in ds_drop.values() if dim in val.dims] + + ds_drop_dim = ds_drop[vars_w_dim] + + region = self._get_region(ind, set(ds_drop_dim.dims)) + print(region) + + csum_dim = np.array(list(self.dims_csum[dim].values())) + + # print(f"csum_dim {csum_dim}") + + csum_max = np.cumsum(np.array([self.dims_max[dim]] * len(csum_dim))) + + # print(f"csum_max = {csum_max}") + + interval_list_max = self.get_intervals(csum_max) + interval_list_dim = self.get_intervals(csum_dim) + + com_chunks = self.get_common_chunks(interval_list_dim, interval_list_max) + + chunk = self.get_common_chunks_key(com_chunks, ind) + lock_name = dim + "_" + str(chunk) + print(f"lock_name = {lock_name}") + print(f"interval_list_max = {interval_list_max}") + print(f"interval_list_dim = {interval_list_dim} \n") + + # TODO: multiple locks can exist for the same region, we may need + # to split up the region + + # + # delayed_to_zarr.append(self.write_ds_to_zarr(ds_drop, path, + # zarr_group, region, + # lock_name, storage_options, + # zarr.ThreadSynchronizer())) # delayed_to_zarr.append( # ds_drop.to_zarr( @@ -613,21 +675,10 @@ def ds_to_zarr(dataset, write_path, zarr_grp, rgn, storage_opts, synch): # synchronizer=zarr.ThreadSynchronizer(), # ) # ) - to_zarr_futures.append( - dask.distributed.get_client().submit( - ds_to_zarr, - ds_drop, - path, - zarr_group, - region, - storage_options, - zarr.ThreadSynchronizer(), - ) - ) - if not to_zarr_compute: - # dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? - [f.result() for f in to_zarr_futures] + # if not to_zarr_compute: + # dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? + # # [f.result() for f in to_zarr_futures] # TODO: need to consider the case where range_sample needs to be padded? @@ -763,21 +814,21 @@ def combine( storage_options=storage_options, to_zarr_compute=to_zarr_compute, ) - - self._append_const_to_zarr( - const_names, ds_list, path, grp_info["ep_group"], storage_options - ) + print(const_names) + # self._append_const_to_zarr( + # const_names, ds_list, path, grp_info["ep_group"], storage_options + # ) # append all group attributes before combination to zarr store - self._append_provenance_attr_vars(path, storage_options=storage_options) + # self._append_provenance_attr_vars(path, storage_options=storage_options) # TODO: change filenames numbering to range(len(filenames)) # blosc.use_threads = None # open lazy loaded combined EchoData object - ed_combined = open_converted( - path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - ) # TODO: is this appropriate for chunks? + # ed_combined = open_converted( + # path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + # ) # TODO: is this appropriate for chunks? - return ed_combined + return # ed_combined From 321c82085522f4f10f530c6e19e64382739e8f63 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 26 Sep 2022 10:58:40 -0700 Subject: [PATCH 40/89] remove locking scheme attempt and return to corrupted approach, place all locking scheme code as a comment --- echopype/echodata/zarr_combine.py | 303 +++++++++++++----------------- 1 file changed, 132 insertions(+), 171 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index a40176027..cc61aeb66 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -12,8 +12,7 @@ from ..utils.coding import COMPRESSION_SETTINGS from ..utils.prov import echopype_prov_attrs - -# from .api import open_converted +from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData @@ -250,6 +249,8 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self._check_ascending_ds_times(ds_list, ed_name) self._check_channels(ds_list, ed_name) + # TODO: check for and correct reversed time + # Dataframe with column as dim names and rows as the different Datasets self.dims_df = pd.DataFrame([ds.dims for ds in ds_list]) @@ -257,7 +258,6 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_sum = self.dims_df.sum(axis=0).to_dict() self.dims_csum = self.dims_df.cumsum(axis=0).to_dict() self.dims_max = self.dims_df.max(axis=0).to_dict() - self.dims_min = self.dims_df.min(axis=0).to_dict() # format ed_name appropriately ed_name = ed_name.replace("-", "_").replace("/", "_").lower() @@ -306,18 +306,14 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), Its sole purpose is to construct metadata for the zarr store. """ - # Create the shape of the variable in its final combined - # form (padding occurs here) # TODO: make sure this is true + # Create the shape of the variable in its final combined form shape = [ self.dims_sum[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims ] # Create the chunk shape of the variable - # TODO: investigate which of the two chunk shapes is best + # TODO: investigate if this is the best chunking chnk_shape = [self.dims_max[dim] for dim in dims] - # chnk_shape = [ - # self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims - # ] temp_arr = dask.array.zeros(shape=shape, dtype=dtype, chunks=chnk_shape) @@ -485,84 +481,6 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region - @staticmethod - def get_intervals(csum): - """creates a list of intervals from a cumulative sum - - use case: cumulative sum of max append dimensions or - self.dims_csum - """ - - # TODO: Document this! - - intervals = [] - for count, val in enumerate(csum): - - if count == 0: - # get the initial region - intervals.append(pd.Interval(left=0, right=val, closed="left")) - - else: - # get all other regions - intervals.append(pd.Interval(left=csum[count - 1], right=val, closed="left")) - - return intervals - - @staticmethod - def get_common_chunks(interval_list_dim, interval_list_max): - """ - determines what intervals overlap - - use case: makes it so we can determine which to_zarr calls will - write to the same chunk, we can use this result to do dask locking - - """ - - # TODO: Document this! - - chunks = defaultdict(list) - - for i in range(len(interval_list_max)): - chunks[i].extend( - [ - count - for count, interval in enumerate(interval_list_dim) - if interval_list_max[i].overlaps(interval) - ] - ) - - return chunks - - @staticmethod - def get_common_chunks_key(common_chunks, ind): - """ - Obtains the key in common chunk whose value - contains ind - - """ - - # TODO: Document this! - - for key, val in common_chunks.items(): - - if ind in val: - return key - - @dask.delayed - def write_ds_to_zarr(self, ds_in, path, group, rgn, name, storage_opts, sync): - - # TODO: document this! - - with dask.distributed.Lock(name): - ds_in.to_zarr( - path, - group=group, - region=rgn, - compute=True, - storage_options=storage_opts, - synchronizer=sync, - ) - def _append_ds_list_to_zarr( self, path: str, @@ -570,7 +488,6 @@ def _append_ds_list_to_zarr( zarr_group: str, ed_name: str, storage_options: Optional[dict] = {}, - to_zarr_compute: bool = True, ) -> List[str]: """ Creates a zarr store and then appends each Dataset @@ -596,11 +513,6 @@ def _append_ds_list_to_zarr( self._get_ds_info(ds_list, ed_name) - # TODO: Check that all of the channels are the same and times - # don't overlap and they increase may have an issue with time1 and NaT - - # TODO: check for and correct reversed time - ds_lazy, const_names, encodings = self._construct_lazy_ds_and_var_info(ds_list[0]) # create zarr file and all associated metadata (this is delayed) @@ -614,73 +526,29 @@ def _append_ds_list_to_zarr( synchronizer=zarr.ThreadSynchronizer(), ) - # write each non-constant variable in ds_list to the zarr store - # delayed_to_zarr = [] + # collect delayed functions that write each non-constant variable + # in ds_list to the zarr store + delayed_to_zarr = [] for ind, ds in enumerate(ds_list): - # TODO: may need to write ds in stages of append dimension - # e.g. split ds into a ds with time1 dim and a ds with - # time2 dim, then write them using the locking. - ds_drop = ds.drop(const_names) - append_dims_in_ds = set(ds_drop.dims).intersection(self.append_dims) - - # TODO: there may be a better way to obtain the common chunks! - for dim in append_dims_in_ds: - # print(f"dim = {dim}") - # get all of those variables with dim in their dimensions - vars_w_dim = [val.name for val in ds_drop.values() if dim in val.dims] - - ds_drop_dim = ds_drop[vars_w_dim] - - region = self._get_region(ind, set(ds_drop_dim.dims)) - print(region) - - csum_dim = np.array(list(self.dims_csum[dim].values())) - - # print(f"csum_dim {csum_dim}") - - csum_max = np.cumsum(np.array([self.dims_max[dim]] * len(csum_dim))) - - # print(f"csum_max = {csum_max}") + region = self._get_region(ind, set(ds_drop.dims)) - interval_list_max = self.get_intervals(csum_max) - interval_list_dim = self.get_intervals(csum_dim) - - com_chunks = self.get_common_chunks(interval_list_dim, interval_list_max) - - chunk = self.get_common_chunks_key(com_chunks, ind) - lock_name = dim + "_" + str(chunk) - print(f"lock_name = {lock_name}") - print(f"interval_list_max = {interval_list_max}") - print(f"interval_list_dim = {interval_list_dim} \n") - - # TODO: multiple locks can exist for the same region, we may need - # to split up the region - - # - # delayed_to_zarr.append(self.write_ds_to_zarr(ds_drop, path, - # zarr_group, region, - # lock_name, storage_options, - # zarr.ThreadSynchronizer())) - - # delayed_to_zarr.append( - # ds_drop.to_zarr( - # path, - # group=zarr_group, - # region=region, - # storage_options=storage_options, - # compute=to_zarr_compute, - # synchronizer=zarr.ThreadSynchronizer(), - # ) - # ) - - # if not to_zarr_compute: - # dask.compute(*delayed_to_zarr) #, retries=1) # TODO: maybe use persist in the future? - # # [f.result() for f in to_zarr_futures] + # TODO: below is an xarray delayed approach, however, data will be corrupted, + # we can remove data corruption by implementing a locking scheme + delayed_to_zarr.append( + ds_drop.to_zarr( + path, + group=zarr_group, + region=region, + compute=False, + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer(), + ) + ) - # TODO: need to consider the case where range_sample needs to be padded? + dask.compute(*delayed_to_zarr) return const_names @@ -720,10 +588,6 @@ def _append_const_to_zarr( # write constant vars to zarr using the first element of ds_list for var in const_vars: - # TODO: when range_sample needs to be padded, here we will - # need to pick the dataset with the max size for range_sample - # (might be done with change below) - # make sure to choose the dataset with the largest size for variable if var in self.dims_df: ds_list_ind = int(self.dims_df[var].argmax()) @@ -791,8 +655,6 @@ def combine( self.sonar_model, self.group_attrs["echodata_filename"] = check_echodatas_input(eds) - to_zarr_compute = False - for grp_info in EchoData.group_map.values(): if grp_info["ep_group"]: @@ -812,23 +674,122 @@ def combine( zarr_group=grp_info["ep_group"], ed_name=ed_group, storage_options=storage_options, - to_zarr_compute=to_zarr_compute, ) - print(const_names) - # self._append_const_to_zarr( - # const_names, ds_list, path, grp_info["ep_group"], storage_options - # ) + + self._append_const_to_zarr( + const_names, ds_list, path, grp_info["ep_group"], storage_options + ) # append all group attributes before combination to zarr store - # self._append_provenance_attr_vars(path, storage_options=storage_options) + self._append_provenance_attr_vars(path, storage_options=storage_options) # TODO: change filenames numbering to range(len(filenames)) # blosc.use_threads = None # open lazy loaded combined EchoData object - # ed_combined = open_converted( - # path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - # ) # TODO: is this appropriate for chunks? - - return # ed_combined + ed_combined = open_converted( + path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + ) # TODO: is this appropriate for chunks? + + return ed_combined + + +# Below are functions that may be useful when generating a locking scheme +# I am currently removing them until we implement this scheme +# TODO: this lock is extremely inefficient, it makes +# it so that the group is written sequentially, However, +# no data corruption will occur +# lock_name = zarr_group +# TODO: may need to write ds in stages of append dimension +# e.g. split ds into a ds with time1 dim and a ds with +# time2 dim, then write them using the locking. +# TODO: multiple locks can exist for the same region, we will need +# to split up the region +# @dask.delayed +# def write_ds_to_zarr(self, ds_in, path, group, rgn, name, storage_opts, sync): +# """ +# +# +# +# """ +# +# # TODO: document this! +# +# with dask.distributed.Lock(name): +# ds_in.to_zarr( +# path, +# group=group, +# region=rgn, +# compute=True, +# storage_options=storage_opts, +# synchronizer=sync, +# ) + +# code to include in loop to call above function +# delayed_to_zarr.append(self.write_ds_to_zarr(ds_drop, path, +# zarr_group, region, +# lock_name, storage_options, +# zarr.ThreadSynchronizer())) +# @staticmethod +# def get_intervals(csum): +# """creates a list of intervals from a cumulative sum +# +# use case: cumulative sum of max append dimensions or +# self.dims_csum +# """ +# +# # TODO: Document this! +# +# intervals = [] +# for count, val in enumerate(csum): +# +# if count == 0: +# # get the initial region +# intervals.append(pd.Interval(left=0, right=val, closed="left")) +# +# else: +# # get all other regions +# intervals.append(pd.Interval(left=csum[count - 1], right=val, closed="left")) +# +# return intervals +# +# @staticmethod +# def get_common_chunks(interval_list_dim, interval_list_max): +# """ +# determines what intervals overlap +# +# use case: makes it so we can determine which to_zarr calls will +# write to the same chunk, we can use this result to do dask locking +# +# """ +# +# # TODO: Document this! +# +# chunks = defaultdict(list) +# +# for i in range(len(interval_list_max)): +# chunks[i].extend( +# [ +# count +# for count, interval in enumerate(interval_list_dim) +# if interval_list_max[i].overlaps(interval) +# ] +# ) +# +# return chunks +# +# @staticmethod +# def get_common_chunks_key(common_chunks, ind): +# """ +# Obtains the key in common chunk whose value +# contains ind +# +# """ +# +# # TODO: Document this! +# +# for key, val in common_chunks.items(): +# +# if ind in val: +# return key From 12f5829bce8278018a51940523afbacd2e90bac7 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 26 Sep 2022 11:29:03 -0700 Subject: [PATCH 41/89] create general get_zarr_compression function in io so that it can be used elsewhere, in zarr_combine set default compressor if one does not exist --- echopype/echodata/zarr_combine.py | 23 +++++------------------ echopype/utils/io.py | 31 +++++++++++++++++-------------- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index cc61aeb66..24de33266 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -11,6 +11,7 @@ import zarr from ..utils.coding import COMPRESSION_SETTINGS +from ..utils.io import get_zarr_compression from ..utils.prov import echopype_prov_attrs from .api import open_converted from .combine import check_echodatas_input # , check_and_correct_reversed_time @@ -325,7 +326,8 @@ def _set_encodings( """ Sets the encodings for the variable ``name`` by including all encodings in ``val``, except those encodings that are deemed - lazy encodings. + lazy encodings. Additionally, if a compressor is not found, + a default compressor will be assigned. Parameters ---------- @@ -349,24 +351,9 @@ def _set_encodings( key: encod for key, encod in val.encoding.items() if key not in self.lazy_encodings } - # TODO: if 'compressor' or 'filters' or '_FillValue' or 'dtype' do not exist, then - # assign them to a default value - # 'compressor': Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) - - # TODO: we should probably use ..utils.io function to reduce repetition + # assign default compressor, if one does not exist if "compressor" not in encodings[str(name)]: - if np.issubdtype(val.dtype, np.floating): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["float"]) - elif np.issubdtype(val.dtype, np.integer): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["int"]) - elif np.issubdtype(val.dtype, np.str_): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["string"]) - elif np.issubdtype(val.dtype, np.datetime64): - encodings[str(name)].update(COMPRESSION_SETTINGS["zarr"]["time"]) - else: - raise NotImplementedError( - f"Zarr Encoding for dtype = {val.dtype} has not been set!" - ) + encodings[str(name)].update(get_zarr_compression(val, COMPRESSION_SETTINGS["zarr"])) # set the chunk encoding encodings[str(name)]["chunks"] = chnk_shape diff --git a/echopype/utils/io.py b/echopype/utils/io.py index f6c667384..97d207092 100644 --- a/echopype/utils/io.py +++ b/echopype/utils/io.py @@ -34,6 +34,21 @@ def get_files_from_dir(folder): return [f for f in os.listdir(folder) if os.path.splitext(f)[1] in valid_ext] +def get_zarr_compression(var: xr.Variable, compression_settings: dict) -> dict: + """Returns the proper zarr compressor for a given variable type""" + + if np.issubdtype(var.dtype, np.floating): + return compression_settings["float"] + elif np.issubdtype(var.dtype, np.integer): + return compression_settings["int"] + elif np.issubdtype(var.dtype, np.str_): + return compression_settings["string"] + elif np.issubdtype(var.dtype, np.datetime64): + return compression_settings["time"] + else: + raise NotImplementedError(f"Zarr Encoding for dtype = {var.dtype} has not been set!") + + def set_zarr_encodings(ds: xr.Dataset, compression_settings: dict): """ Sets all variable encodings based on zarr default values @@ -44,20 +59,8 @@ def set_zarr_encodings(ds: xr.Dataset, compression_settings: dict): for name, val in ds.variables.items(): val_encoding = val.encoding - if np.issubdtype(val.dtype, np.floating): - val_encoding.update(compression_settings["float"]) - encoding[name] = val_encoding - elif np.issubdtype(val.dtype, np.integer): - val_encoding.update(compression_settings["int"]) - encoding[name] = val_encoding - elif np.issubdtype(val.dtype, np.str_): - val_encoding.update(compression_settings["string"]) - encoding[name] = val_encoding - elif np.issubdtype(val.dtype, np.datetime64): - val_encoding.update(compression_settings["time"]) - encoding[name] = val_encoding - else: - raise NotImplementedError(f"Zarr Encoding for dtype = {val.dtype} has not been set!") + val_encoding.update(get_zarr_compression(val, compression_settings)) + encoding[name] = val_encoding return encoding From f4922b0dd80ecce65cc8e0c608be0621179ca51a Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 26 Sep 2022 12:00:09 -0700 Subject: [PATCH 42/89] change filenames numbering to range(len(eds)) --- echopype/echodata/zarr_combine.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 24de33266..85dbaf5ad 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -623,6 +623,27 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict path, group="Provenance", mode="a", storage_options=storage_options, consolidated=True ) + @staticmethod + def _modify_prov_filenames(path: str, len_eds: int) -> None: + """ + After the ``Provenance`` group has been constructed, the + coordinate ``filenames`` will be filled with zeros. This + function fills ``filenames`` with the appropriate values + by directly overwriting the zarr array. + + Parameters + ---------- + path: str + The full path of the final combined zarr store + len_eds: int + The number of ``EchoData`` objects being combined + """ + + # obtain the filenames zarr array + zarr_filenames = zarr.open_array(path + "/Provenance/filenames", mode="r+") + + zarr_filenames[:] = np.arange(len_eds) + def combine( self, path: str, eds: List[EchoData] = [], storage_options: Optional[dict] = {} ) -> EchoData: @@ -670,7 +691,8 @@ def combine( # append all group attributes before combination to zarr store self._append_provenance_attr_vars(path, storage_options=storage_options) - # TODO: change filenames numbering to range(len(filenames)) + # change filenames numbering to range(len(eds)) + self._modify_prov_filenames(path, len_eds=len(eds)) # blosc.use_threads = None From 449f4e57c948217bdfa4e3ac269c73c786b7a7a2 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 26 Sep 2022 17:19:25 -0700 Subject: [PATCH 43/89] remove old time checks and replace with new time check for combined datasets, start working on correcting reversed time, and start working on the new combine api --- echopype/echodata/combine.py | 141 +++++++++++++----- echopype/echodata/zarr_combine.py | 88 ++++------- .../tests/echodata/test_echodata_combine.py | 2 +- 3 files changed, 134 insertions(+), 97 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 7bde42793..a0e50c5ae 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +from warnings import warn import xarray as xr from datatree import DataTree @@ -10,6 +11,7 @@ from ..utils.log import _init_logger from ..utils.prov import echopype_prov_attrs, source_files_vars from .echodata import EchoData +from .zarr_combine import ZarrCombine logger = _init_logger(__name__) @@ -65,7 +67,7 @@ def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: def check_and_correct_reversed_time( - combined_group: xr.Dataset, time_str: str, sonar_model: str + combined_group: xr.Dataset, time_str: str, ed_group: str ) -> Optional[xr.DataArray]: """ Makes sure that the time coordinate ``time_str`` in @@ -79,8 +81,8 @@ def check_and_correct_reversed_time( Dataset representing a combined EchoData group time_str : str Name of time coordinate to be checked and corrected - sonar_model : str - Name of sonar model + ed_group : str + Name of ``EchoData`` group name Returns ------- @@ -92,7 +94,7 @@ def check_and_correct_reversed_time( if time_str in combined_group and exist_reversed_time(combined_group, time_str): logger.warning( - f"{sonar_model} {time_str} reversal detected; {time_str} will be corrected" # noqa + f"{ed_group} {time_str} reversal detected; {time_str} will be corrected" # noqa " (see https://github.com/OSOceanAcoustics/echopype/pull/297)" ) old_time = combined_group[time_str] @@ -343,9 +345,7 @@ def in_memory_combine( return result -def combine_echodata( - echodatas: List[EchoData], combine_attrs: str = "override", in_memory: bool = True -) -> EchoData: +def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options={}) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. @@ -415,40 +415,103 @@ def combine_echodata( >>> combined = echopype.combine_echodata([ed1, ed2]) """ - if len(echodatas) == 0: - return EchoData() + if zarr_store is None: + zarr_store = "/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/test.zarr" + raise RuntimeError("You need to provide a path!") # TODO: use Don's path - sonar_model, echodata_filenames = check_echodatas_input(echodatas) - - # all attributes before combination - # { group1: [echodata1 attrs, echodata2 attrs, ...], ... } - old_attrs: Dict[str, List[Dict[str, Any]]] = dict() + if not isinstance(echodatas, list): + raise TypeError("The input, eds, must be a list of EchoData objects!") - # dict that holds times before they are corrected - old_times: Dict[str, Optional[xr.DataArray]] = { - "old_ping_time": None, - "old_time1": None, - "old_time2": None, - "old_time3": None, - } + if not isinstance(zarr_store, str): # TODO: change this in the future + raise TypeError("The input, store, must be a string!") - if in_memory: - result = in_memory_combine(echodatas, sonar_model, combine_attrs, old_attrs, old_times) - else: - raise NotImplementedError( - "Lazy representation of combined EchoData object has not been implemented yet." - ) - - # save times before reversal correction - for key, val in old_times.items(): - if val is not None: - result["Provenance"][key] = val - result["Provenance"].attrs["reversed_ping_times"] = 1 - - # save attrs from before combination - store_old_attrs(result, old_attrs, echodata_filenames, sonar_model) + # return empty EchoData object, if no EchoData objects are provided + if not echodatas: + warn("No EchoData objects were provided, returning an empty EchoData object.") + return EchoData() - # TODO: possible parameter to disable original attributes and original ping_time storage - # in provenance group? + sonar_model, echodata_filenames = check_echodatas_input(echodatas) - return result + comb = ZarrCombine() + ed_comb = comb.combine( + zarr_store, + echodatas, + storage_options=storage_options, + sonar_model=sonar_model, + echodata_filenames=echodata_filenames, + ) + + # TODO: perform time check, put this in its own function + for group in ed_comb.group_paths: + + if group != "Platform/NMEA": + # Platform/NMEA is skipped because we found that the times correspond to other + # messages besides GPS. This causes multiple times to be out of order and + # correcting them is not possible with the current implementation of + # _clean_ping_time in qc.api + + # get all time dimensions of the group + ed_comb_time_dims = set(ed_comb[group].dims).intersection(comb.possible_time_dims) + + for time in ed_comb_time_dims: + + old_time = check_and_correct_reversed_time( + combined_group=ed_comb[group], time_str=time, ed_group=group + ) + + if old_time is not None: + + # get name of old time and dim for Provenance group + ed_name = group.replace("-", "_").replace("/", "_").lower() + old_time_name = ed_name + "_old_" + time + old_time_name_dim = old_time_name + "_dim" + + # put old times in Provenance and modify attribute + # TODO: should we give old time a long name? + old_time_array = xr.DataArray(data=old_time.values, dims=[old_time_name_dim]) + ed_comb["Provenance"][old_time_name] = old_time_array + ed_comb["Provenance"].attrs["reversed_ping_times"] = 1 + + # TODO: save new time and old time to zarr store + + return ed_comb + + # TODO: below is old combine code that will be removed + + # if len(echodatas) == 0: + # return EchoData() + # + # sonar_model, echodata_filenames = check_echodatas_input(echodatas) + # + # # all attributes before combination + # # { group1: [echodata1 attrs, echodata2 attrs, ...], ... } + # old_attrs: Dict[str, List[Dict[str, Any]]] = dict() + # + # # dict that holds times before they are corrected + # old_times: Dict[str, Optional[xr.DataArray]] = { + # "old_ping_time": None, + # "old_time1": None, + # "old_time2": None, + # "old_time3": None, + # } + # + # if in_memory: + # result = in_memory_combine(echodatas, sonar_model, combine_attrs, old_attrs, old_times) + # else: + # raise NotImplementedError( + # "Lazy representation of combined EchoData object has not been implemented yet." + # ) + # + # # save times before reversal correction + # for key, val in old_times.items(): + # if val is not None: + # result["Provenance"][key] = val + # result["Provenance"].attrs["reversed_ping_times"] = 1 + # + # # save attrs from before combination + # store_old_attrs(result, old_attrs, echodata_filenames, sonar_model) + # + # # TODO: possible parameter to disable original attributes and original ping_time storage + # # in provenance group? + # + # return result diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 85dbaf5ad..77a311907 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,6 +1,5 @@ from collections import defaultdict from typing import Dict, Hashable, List, Optional, Set, Tuple -from warnings import warn import dask import dask.array @@ -14,7 +13,6 @@ from ..utils.io import get_zarr_compression from ..utils.prov import echopype_prov_attrs from .api import open_converted -from .combine import check_echodatas_input # , check_and_correct_reversed_time from .echodata import EchoData @@ -44,10 +42,9 @@ def __init__(self): def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> None: """ - Ensures that the time dimensions are in ascending order - across all Datasets being combined. For example, the - maximum time of the first Dataset must be less than the - minimum time of the second Dataset. + A minimal check that the first time value of each Dataset is less than + the first time value of the subsequent Dataset. If each first time value + is NaT, then this check is skipped. Parameters ---------- @@ -62,47 +59,30 @@ def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> for time in ed_time_dim: - # get maximum and minimum time of all Datasets - max_time = [ds[time].max().values for ds in ds_list] - min_time = [ds[time].min().values for ds in ds_list] + # gather the first time of each Dataset + first_times = [] + for ds in ds_list: - # see if all Datasets have NaN for time - max_all_nan = all(np.isnan(max_time)) - min_all_nan = all(np.isnan(min_time)) - - # True means our time is not filled with NaNs - # This is necessary because some time dims can be filled with NaNs - nan_time_cond = (not max_all_nan) and (not min_all_nan) - - # checks to see that times are in ascending order - if nan_time_cond and max_time[:-1] > min_time[1:]: - - raise RuntimeError( - f"The coordinate {time} is not in ascending order for group {ed_name}, " - f"combine cannot be used!" - ) - - def _reverse_time_check_and_storage(self, ds_list: List[xr.Dataset], ed_name: str): - """ - Determine if there exist reversed time dimensions in each - of the Datasets individually. Additionally, if there are - reversed times correct them and store the old time dimension - as a variable of - - """ + times = ds[time].values + if isinstance(times, np.ndarray): + # store first time if we have an array + first_times.append(times[0]) + else: + # store first time if we have a single value + first_times.append(times) - # TODO: check and store time values + first_times = np.array(first_times) - # TODO: do this first [exist_reversed_time(ds, time_str) for ds in ds_list] - # if any are True, then continue by creating an old time variable in each ds + # skip check if all first times are NaT + if not np.isnan(first_times).all(): - # for ds in ds_list: - # old_time = check_and_correct_reversed_time( - # ds, time_str=str(time), sonar_model=self.sonar_model - # ) + is_descending = (np.diff(first_times) < np.timedelta64(0, "ns")).any() - old_time = None - print(f"old_time = {old_time}, group = {ed_name}") + if is_descending: + raise RuntimeError( + f"The coordinate {time} is not in ascending order for " + f"group {ed_name}, combine cannot be used!" + ) @staticmethod def _check_channels(ds_list: List[xr.Dataset], ed_name: str) -> None: @@ -250,8 +230,6 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self._check_ascending_ds_times(ds_list, ed_name) self._check_channels(ds_list, ed_name) - # TODO: check for and correct reversed time - # Dataframe with column as dim names and rows as the different Datasets self.dims_df = pd.DataFrame([ds.dims for ds in ds_list]) @@ -645,23 +623,19 @@ def _modify_prov_filenames(path: str, len_eds: int) -> None: zarr_filenames[:] = np.arange(len_eds) def combine( - self, path: str, eds: List[EchoData] = [], storage_options: Optional[dict] = {} + self, + path: str, + eds: List[EchoData] = [], + storage_options: Optional[dict] = {}, + sonar_model: str = None, + echodata_filenames: List[str] = [], ) -> EchoData: - if not isinstance(eds, list): - raise TypeError("The input, eds, must be a list of EchoData objects!") - - if not isinstance(path, str): - raise TypeError("The input, path, must be a string!") - - # return empty EchoData object, if no EchoData objects are provided - if not eds: - warn("No EchoData objects were provided, returning an empty EchoData object.") - return EchoData() - # blosc.use_threads = False - self.sonar_model, self.group_attrs["echodata_filename"] = check_echodatas_input(eds) + self.sonar_model = sonar_model + + self.group_attrs["echodata_filename"] = echodata_filenames for grp_info in EchoData.group_map.values(): diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index 394560bc2..f5309fbde 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -175,7 +175,7 @@ def test_ping_time_reversal(ek60_reversed_ping_time_test_data): echopype.open_raw(file, "EK60") for file in ek60_reversed_ping_time_test_data ] - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore + combined = echopype.combine_echodata(eds) #, "overwrite_conflicts") # type: ignore for group_name, value in combined.group_map.items(): if value['ep_group'] is None: From 419b17426d0090867c10adac763d2a67f0084cd4 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 08:30:13 -0700 Subject: [PATCH 44/89] remove unused old combine code --- echopype/echodata/combine.py | 285 +---------------------------------- 1 file changed, 1 insertion(+), 284 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index a0e50c5ae..9d4f8851f 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,15 +1,11 @@ from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import List, Optional, Tuple from warnings import warn import xarray as xr -from datatree import DataTree -from ..core import SONAR_MODELS from ..qc import coerce_increasing_time, exist_reversed_time -from ..utils.coding import set_encodings from ..utils.log import _init_logger -from ..utils.prov import echopype_prov_attrs, source_files_vars from .echodata import EchoData from .zarr_combine import ZarrCombine @@ -106,245 +102,6 @@ def check_and_correct_reversed_time( return old_time -def assemble_combined_provenance(input_paths): - prov_dict = echopype_prov_attrs(process_type="conversion") - source_files_var, source_files_coord = source_files_vars(input_paths) - ds = xr.Dataset(data_vars=source_files_var, coords=source_files_coord, attrs=prov_dict) - return ds - - -def union_attrs(datasets: List[xr.Dataset]) -> Dict[str, Any]: - """ - Merges attrs from a list of datasets. - Prioritizes keys from later datasets. - """ - - total_attrs = dict() - for ds in datasets: - total_attrs.update(ds.attrs) - return total_attrs - - -def examine_group_time_coords( - combined_group: xr.Dataset, - group: str, - sonar_model: str, - old_times: Dict[str, Optional[xr.DataArray]], -) -> None: - """ - Ensures that the time coords for each group are in the - correct order. - - Parameters - ---------- - combined_group: xr.Dataset - Dataset representing a combined ``EchoData`` group - group: str - Group name of ``combined_group`` obtained from ``EchoData.group_map`` - sonar_model: str - Name of sonar model - old_times: Dict[str, Optional[xr.DataArray]] - Dictionary that holds times before they are corrected - - Notes - ----- - If old time coordinates need to be stored, the input variable ``old_times`` - will be directly modified. - - This does not check the AD2CP time coordinates! - """ - - if sonar_model != "AD2CP": - - old_times["old_ping_time"] = check_and_correct_reversed_time( - combined_group, "ping_time", sonar_model - ) - - if group != "nmea": - old_times["old_time1"] = check_and_correct_reversed_time( - combined_group, "time1", sonar_model - ) - - old_times["old_time2"] = check_and_correct_reversed_time( - combined_group, "time2", sonar_model - ) - old_times["old_time3"] = check_and_correct_reversed_time( - combined_group, "time3", sonar_model - ) - - -def store_old_attrs( - result: EchoData, - old_attrs: Dict[str, List[Dict[str, Any]]], - echodata_filenames: List[str], - sonar_model: str, -) -> None: - """ - Stores all attributes of the groups in ``echodatas`` before - they were combined in the ``Provenance`` group of ``result`` - and specifies the sonar model of the new combined data. - - Parameters - ---------- - result: EchoData - The final ``EchoData`` object representing the combined data - old_attrs: Dict[str, List[Dict[str, Any]]] - All attributes before combination - echodata_filenames : List[str] - The source files names for all values in ``echodatas`` - sonar_model : str - The sonar model used for all values in ``echodatas`` - - Notes - ----- - The input ``result`` will be directly modified. - """ - - # store all old attributes - for group in old_attrs: - all_group_attrs = set() - for group_attrs in old_attrs[group]: - for attr in group_attrs: - all_group_attrs.add(attr) - attrs = xr.DataArray( - [ - [group_attrs.get(attr) for attr in all_group_attrs] - for group_attrs in old_attrs[group] - ], - coords={ - "echodata_filename": echodata_filenames, - f"{group}_attr_key": list(all_group_attrs), - }, - dims=["echodata_filename", f"{group}_attr_key"], - ) - result["Provenance"] = result["Provenance"].assign({f"{group}_attrs": attrs}) - - # Add back sonar model - result.sonar_model = sonar_model - - -def in_memory_combine( - echodatas: List[EchoData], - sonar_model: str, - combine_attrs: str, - old_attrs: Dict[str, List[Dict[str, Any]]], - old_times: Dict[str, Optional[xr.DataArray]], -) -> EchoData: - """ - Creates an in-memory (i.e. in RAM) combined ``EchoData`` - object from the values in ``echodatas``. - - Parameters - ---------- - echodatas : List[EchoData] - The list of ``EchoData`` objects to be combined. - sonar_model: str - The sonar model used for all values in ``echodatas`` - combine_attrs : str - String indicating how to combine attrs of the ``EchoData`` objects being merged. - old_attrs: Dict[str, List[Dict[str, Any]]] - All attributes before combination - old_times: Dict[str, Optional[xr.DataArray]] - Dictionary that holds times before they are corrected - - Returns - ------- - result : EchoData - An in-memory ``EchoData`` object with all data from the input - ``EchoData`` objects combined. - - Notes - ----- - If necessary, the input variables ``old_attrs`` and ``old_times`` - will be directly modified. - """ - - # initialize EchoData object and tree that will store the final result - tree_dict = {} - result = EchoData() - - # assign EchoData class variables - result.source_file = echodatas[0].source_file - result.converted_raw_path = echodatas[0].converted_raw_path - - # Specification for Echodata.group_map can be found in - # echopype/echodata/convention/1.0.yml - for group, value in EchoData.group_map.items(): - group_datasets = [] - group_path = value["ep_group"] - if group_path is None: - group_path = "Top-level" - - for echodata in echodatas: - if echodata[group_path] is not None: - group_datasets.append(echodata[group_path]) - - if group in ("top", "sonar"): - combined_group = echodatas[0][group_path] - elif group == "provenance": - combined_group = assemble_combined_provenance( - [ - echodata.source_file - if echodata.source_file is not None - else echodata.converted_raw_path - for echodata in echodatas - ] - ) - else: - if len(group_datasets) == 0: - continue - - concat_dim = SONAR_MODELS[sonar_model]["concat_dims"].get( - group, SONAR_MODELS[sonar_model]["concat_dims"]["default"] - ) - concat_data_vars = SONAR_MODELS[sonar_model]["concat_data_vars"].get( - group, SONAR_MODELS[sonar_model]["concat_data_vars"]["default"] - ) - combined_group = xr.combine_nested( - group_datasets, - [concat_dim], - data_vars=concat_data_vars, - coords="minimal", - combine_attrs="drop" if combine_attrs == "overwrite_conflicts" else combine_attrs, - ) - if combine_attrs == "overwrite_conflicts": - combined_group.attrs.update(union_attrs(group_datasets)) - - if group == "beam": - if sonar_model == "EK80": - combined_group["transceiver_software_version"] = combined_group[ - "transceiver_software_version" - ].astype(" 1: - old_attrs[group] = [group_dataset.attrs for group_dataset in group_datasets] - if combined_group is not None: - # xarray inserts this dimension when concatenating along multiple dimensions - combined_group = combined_group.drop_dims("concat_dim", errors="ignore") - - combined_group = set_encodings(combined_group) - if value["ep_group"] is None: - tree_dict["/"] = combined_group - else: - tree_dict[value["ep_group"]] = combined_group - - # Set tree into echodata object - result._set_tree(tree=DataTree.from_dict(tree_dict, name="root")) - result._load_tree() - - return result - - def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options={}) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. @@ -475,43 +232,3 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options # TODO: save new time and old time to zarr store return ed_comb - - # TODO: below is old combine code that will be removed - - # if len(echodatas) == 0: - # return EchoData() - # - # sonar_model, echodata_filenames = check_echodatas_input(echodatas) - # - # # all attributes before combination - # # { group1: [echodata1 attrs, echodata2 attrs, ...], ... } - # old_attrs: Dict[str, List[Dict[str, Any]]] = dict() - # - # # dict that holds times before they are corrected - # old_times: Dict[str, Optional[xr.DataArray]] = { - # "old_ping_time": None, - # "old_time1": None, - # "old_time2": None, - # "old_time3": None, - # } - # - # if in_memory: - # result = in_memory_combine(echodatas, sonar_model, combine_attrs, old_attrs, old_times) - # else: - # raise NotImplementedError( - # "Lazy representation of combined EchoData object has not been implemented yet." - # ) - # - # # save times before reversal correction - # for key, val in old_times.items(): - # if val is not None: - # result["Provenance"][key] = val - # result["Provenance"].attrs["reversed_ping_times"] = 1 - # - # # save attrs from before combination - # store_old_attrs(result, old_attrs, echodata_filenames, sonar_model) - # - # # TODO: possible parameter to disable original attributes and original ping_time storage - # # in provenance group? - # - # return result From 74920ecb783263aafb0b9aaec65a0af250fec157 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 11:11:30 -0700 Subject: [PATCH 45/89] implement a reverse time check and update zarr and ed_comb appropriately, if a reversed time is detected --- echopype/echodata/combine.py | 154 +++++++++++++++++++++++++++-------- echopype/qc/api.py | 3 +- 2 files changed, 123 insertions(+), 34 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 9d4f8851f..7946a6fe3 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -93,15 +93,127 @@ def check_and_correct_reversed_time( f"{ed_group} {time_str} reversal detected; {time_str} will be corrected" # noqa " (see https://github.com/OSOceanAcoustics/echopype/pull/297)" ) - old_time = combined_group[time_str] + old_time = combined_group[time_str].copy() coerce_increasing_time(combined_group, time_name=time_str) - else: old_time = None return old_time +def create_old_time_array(group: str, old_time_in: xr.DataArray) -> xr.DataArray: + """ + Creates an old time array with the appropriate values, name, + attributes, and encoding. + + Parameters + ---------- + group: str + The name of the ``EchoData`` group that contained + the old time + old_time_in: xr.DataArray + The uncorrected old time + + Returns + ------- + old_time_array: xr.DataArray + The newly created old time array + """ + + # make a copy, so we don't change the source array + old_time = old_time_in.copy() + + # get name of old time and dim for Provenance group + ed_name = group.replace("-", "_").replace("/", "_").lower() + old_time_name = ed_name + "_old_" + old_time.name + + old_time_name_dim = old_time_name + "_dim" + + # construct old time attributes + attributes = old_time.attrs + attributes["comment"] = f"Uncorrected {old_time.name} from the combined group {group}." + + # create old time array + old_time_array = xr.DataArray( + data=old_time.values, dims=[old_time_name_dim], attrs=attributes, name=old_time_name + ) + + # set encodings + old_time_array.encoding = old_time.encoding + + return old_time_array + + +def orchestrate_reverse_time_check( + ed_comb: EchoData, zarr_store: str, possible_time_dims: List[str], storage_options: dict +) -> None: + """ + Performs a reverse time check of all groups and + each time dimension within the group. If a reversed + time is found it will be corrected in ``ed_comb``, + updated in the zarr store, the old time will be + added to the ``Provenance`` group in ``ed_comb``, + the old time will be written to the zarr store, + and the attribute ``reversed_ping_times`` in the + ``Provenance`` group will be set to ``1``. + + Parameters + ---------- + ed_comb: EchoData + ``EchoData`` object that has been constructed from + combined ``EchoData`` objects + zarr_store: str + The zarr store containing the ``ed_comb`` data + possible_time_dims: List[str] + All possible time dimensions that can occur within + ``ed_comb``, which should be checked + storage_options: dict + Additional keywords to pass to the filesystem class. + + Notes + ----- + If correction is necessary, ``ed_comb`` will be + directly modified. + """ + + for group in ed_comb.group_paths: + + if group != "Platform/NMEA": + # Platform/NMEA is skipped because we found that the times correspond to other + # messages besides GPS. This causes multiple times to be out of order and + # correcting them is not possible with the current implementation of + # _clean_ping_time in qc.api + + # get all time dimensions of the group + ed_comb_time_dims = set(ed_comb[group].dims).intersection(possible_time_dims) + + for time in ed_comb_time_dims: + + old_time = check_and_correct_reversed_time( + combined_group=ed_comb[group], time_str=time, ed_group=group + ) + + if old_time is not None: + + old_time_array = create_old_time_array(group, old_time) + + # put old times in Provenance and modify attribute + ed_comb["Provenance"][old_time_array.name] = old_time_array + ed_comb["Provenance"].attrs["reversed_ping_times"] = 1 + + # save old time to zarr store + old_time_ds = old_time_array.to_dataset() + old_time_ds.attrs = ed_comb["Provenance"].attrs + old_time_ds.to_zarr( + zarr_store, group="Provenance", mode="a", storage_options=storage_options + ) + + # save corrected time to zarr store + ed_comb[group][[time]].to_zarr( + zarr_store, group=group, mode="r+", storage_options=storage_options + ) + + def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options={}) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. @@ -174,7 +286,7 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options if zarr_store is None: zarr_store = "/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/test.zarr" - raise RuntimeError("You need to provide a path!") # TODO: use Don's path + # raise RuntimeError("You need to provide a path!") # TODO: use Don's path if not isinstance(echodatas, list): raise TypeError("The input, eds, must be a list of EchoData objects!") @@ -198,37 +310,13 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options echodata_filenames=echodata_filenames, ) - # TODO: perform time check, put this in its own function - for group in ed_comb.group_paths: - - if group != "Platform/NMEA": - # Platform/NMEA is skipped because we found that the times correspond to other - # messages besides GPS. This causes multiple times to be out of order and - # correcting them is not possible with the current implementation of - # _clean_ping_time in qc.api - - # get all time dimensions of the group - ed_comb_time_dims = set(ed_comb[group].dims).intersection(comb.possible_time_dims) - - for time in ed_comb_time_dims: + # set Provenance attribute to zero in ed_comb + ed_comb["Provenance"].attrs["reversed_ping_times"] = 0 - old_time = check_and_correct_reversed_time( - combined_group=ed_comb[group], time_str=time, ed_group=group - ) - - if old_time is not None: - - # get name of old time and dim for Provenance group - ed_name = group.replace("-", "_").replace("/", "_").lower() - old_time_name = ed_name + "_old_" + time - old_time_name_dim = old_time_name + "_dim" - - # put old times in Provenance and modify attribute - # TODO: should we give old time a long name? - old_time_array = xr.DataArray(data=old_time.values, dims=[old_time_name_dim]) - ed_comb["Provenance"][old_time_name] = old_time_array - ed_comb["Provenance"].attrs["reversed_ping_times"] = 1 + # set Provenance attribute to zero in zarr (Dataset needed for metadata creation) + only_attrs_ds = xr.Dataset(attrs=ed_comb["Provenance"].attrs) + only_attrs_ds.to_zarr(zarr_store, group="Provenance", mode="a", storage_options=storage_options) - # TODO: save new time and old time to zarr store + orchestrate_reverse_time_check(ed_comb, zarr_store, comb.possible_time_dims, storage_options) return ed_comb diff --git a/echopype/qc/api.py b/echopype/qc/api.py index 1b5344ce1..52f965207 100644 --- a/echopype/qc/api.py +++ b/echopype/qc/api.py @@ -55,7 +55,8 @@ def coerce_increasing_time( would remain undisturbed. """ - ds[time_name] = _clean_ping_time(ds[time_name].values, local_win_len=local_win_len) + ping_time_new = _clean_ping_time(ds[time_name].values, local_win_len=local_win_len) + ds[time_name].values[:] = ping_time_new def exist_reversed_time(ds, time_name): From 2b3b0afb6af28e4dcdc0e9d3382117fe5829872c Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 14:08:41 -0700 Subject: [PATCH 46/89] remove alternative combine .py scripts --- echopype/echodata/combine_lazily.py | 141 ------------------------ echopype/echodata/combine_preprocess.py | 69 ------------ 2 files changed, 210 deletions(-) delete mode 100644 echopype/echodata/combine_lazily.py delete mode 100644 echopype/echodata/combine_preprocess.py diff --git a/echopype/echodata/combine_lazily.py b/echopype/echodata/combine_lazily.py deleted file mode 100644 index fec7f90ee..000000000 --- a/echopype/echodata/combine_lazily.py +++ /dev/null @@ -1,141 +0,0 @@ -import xarray as xr -from datatree import DataTree -from fsspec.implementations.local import LocalFileSystem - -from echopype.echodata import EchoData - -from .combine_preprocess import PreprocessCallable - -# desired_raw_file_paths = fs.glob('OOI_zarrs_ep_ex/temp/*.zarr') - - -def get_ed_path_from_str(zarr_path: str, path: str): - """ - - Parameters - ---------- - zarr_path: str - Full path to zarr file - path: str - Full path to ``.zgroup`` - """ - - # the names of the groups that are needed to get to path - all_grp_names = [ - elm for elm in path.split("/") if (elm not in zarr_path.split("/")) and (elm != ".zgroup") - ] - - return "/".join(all_grp_names) - - -def get_zarr_grp_names(path: str, fs: LocalFileSystem) -> set: - """ - Identifies the zarr group names using the path - """ - - # grab all paths that have .zgroup - info = fs.glob(path + "/**.zgroup") - - # infer the group name based on the path - ed_grp_name = {get_ed_path_from_str(path, entry) for entry in info} - - # remove the zarr file name and replace it with Top-level - if "" in ed_grp_name: - ed_grp_name.remove("") - ed_grp_name.add(None) - - return ed_grp_name - - -def reassign_attrs(ed_comb: EchoData, common_grps: set): - """ - Reassigns stored group attributes to the Provenance group. - """ - - for group, value in EchoData.group_map.items(): - - if (value["ep_group"] != "Provenance") and (value["ep_group"] in common_grps): - - attr_var_name = group + "_attrs" - attr_coord_name = group + "_attr_key" - - if value["ep_group"]: - ed_grp = value["ep_group"] - else: - ed_grp = "Top-level" - - # move attribute variable to Provenance - ed_comb["Provenance"][attr_var_name] = ed_comb[ed_grp][attr_var_name] - - # remove attribute variable and coords from group - ed_comb[ed_grp] = ed_comb[ed_grp].drop_vars( - [attr_var_name, attr_coord_name, "echodata_filename"] - ) - - -def lazy_combine(desired_raw_file_paths, fs): - - # TODO: test code when we have to do an expansion in range_sample - - # initial structure for lazy combine - tree_dict = {} - result = EchoData() - - # grab object that does pre-processing - preprocess_obj = PreprocessCallable(desired_raw_file_paths) - - # TODO: the subsequent line is zarr specific!! Account for nc in the future - # determine each zarr's group names - file_grps = [get_zarr_grp_names(path, fs) for path in desired_raw_file_paths] - - # get the group names that all files share - common_grps = set.intersection(*file_grps) - - # check that all zarrs have the same groups - if any([common_grps.symmetric_difference(s) for s in file_grps]): - raise RuntimeError("All input files must have the same groups!") - - for group, value in EchoData.group_map.items(): - - if value["ep_group"] in common_grps: - - print(f"ed group = {value['ep_group']}") - - preprocess_obj.update_ed_group(group) - - combined_group = xr.open_mfdataset( - desired_raw_file_paths, - engine="zarr", - coords="minimal", - preprocess=preprocess_obj, - combine="nested", - group=value["ep_group"], - concat_dim=None, - ) - - if value["ep_group"] is None: - tree_dict["/"] = combined_group - else: - tree_dict[value["ep_group"]] = combined_group - - # Set tree into echodata object - result._set_tree(tree=DataTree.from_dict(tree_dict, name="root")) - result._load_tree() - - # reassign stored group attributes to the provenance group - reassign_attrs(result, common_grps) - - # TODO: modify Provenance conversion_time attribute - # dt.utcnow().isoformat(timespec="seconds") + "Z", # use UTC time - - return result - - -# How to construct Provenance Group -# obj = ProvenancePreprocess(desired_raw_file_paths) -# -# out = xr.open_mfdataset(desired_raw_file_paths[:2], -# engine='zarr', coords='minimal', -# combine="nested", group='Provenance', -# preprocess=obj, concat_dim=None) -# TODO: to be identical to in-memory combine remove filenames as coordinate (keep as dim) diff --git a/echopype/echodata/combine_preprocess.py b/echopype/echodata/combine_preprocess.py deleted file mode 100644 index ea659bc69..000000000 --- a/echopype/echodata/combine_preprocess.py +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path -from typing import List - -import numpy as np -import xarray as xr - - -class PreprocessCallable: - """ - Class that has all preprocessing functions and is callable. - """ - - def __init__(self, file_paths: List[str]): - self.file_paths = file_paths - self.ed_group = None - - def __call__(self, ds): - - if self.ed_group == "provenance": - self._assign_file_index(ds) - - self._store_attrs(ds) - - ds = self.re_chunk(ds) - - # TODO: add method to check and correct reversed times - - return ds - - def update_ed_group(self, group: str): - self.ed_group = group - - def re_chunk(self, ds): - - # chunk_dict = {'time2': 1000, 'time3': 1000} - # chunk_dict = {'ping_time': 100, 'range_sample': 100} - - # ds = ds.chunk(chunk_dict) - - for drop_var in ["backscatter_r", "angle_athwartship", "angle_alongship"]: - - if drop_var in ds: - ds = ds.drop_vars(drop_var) - - return ds - - def _assign_file_index(self, ds): - - ind_file = self.file_paths.index(ds.encoding["source"]) - ds["filenames"] = (["filenames"], np.array([ind_file])) - - # TODO: add method to check and correct reversed times - - def _store_attrs(self, ds): - - file_name = Path(ds.encoding["source"]).name - - grp_key_name = self.ed_group + "_attr_key" - grp_attr_names = np.array(list(ds.attrs.keys())) - - attrs_var = xr.DataArray( - data=np.array([list(ds.attrs.values())]), - coords={ - "echodata_filename": (["echodata_filename"], np.array([file_name])), - grp_key_name: ([grp_key_name], grp_attr_names), - }, - ) - - ds[self.ed_group + "_attrs"] = attrs_var From 5636f528b7a2f3b07d37a6e4e7872da44d6ebb98 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 14:51:55 -0700 Subject: [PATCH 47/89] finish documenting zarr_combine --- echopype/echodata/combine.py | 14 ++-- echopype/echodata/zarr_combine.py | 131 ++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 40 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 7946a6fe3..0c64c6989 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -176,6 +176,13 @@ def orchestrate_reverse_time_check( directly modified. """ + # set Provenance attribute to zero in ed_comb + ed_comb["Provenance"].attrs["reversed_ping_times"] = 0 + + # set Provenance attribute to zero in zarr (Dataset needed for metadata creation) + only_attrs_ds = xr.Dataset(attrs=ed_comb["Provenance"].attrs) + only_attrs_ds.to_zarr(zarr_store, group="Provenance", mode="a", storage_options=storage_options) + for group in ed_comb.group_paths: if group != "Platform/NMEA": @@ -310,13 +317,6 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options echodata_filenames=echodata_filenames, ) - # set Provenance attribute to zero in ed_comb - ed_comb["Provenance"].attrs["reversed_ping_times"] = 0 - - # set Provenance attribute to zero in zarr (Dataset needed for metadata creation) - only_attrs_ds = xr.Dataset(attrs=ed_comb["Provenance"].attrs) - only_attrs_ds.to_zarr(zarr_store, group="Provenance", mode="a", storage_options=storage_options) - orchestrate_reverse_time_check(ed_comb, zarr_store, comb.possible_time_dims, storage_options) return ed_comb diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 77a311907..54f63ad69 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -144,7 +144,7 @@ def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: RuntimeError - If the keys are not the same - If the values are not identical - - If the keys ``date_created``, ``conversion_time`` + - If the keys ``date_created`` or ``conversion_time`` do not have the same types Notes @@ -209,6 +209,9 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non Notes ----- This method creates the following class variables: + dims_df: pd.DataFrame + Dataframe with column as dim names, rows as the + different Datasets, and values as the length of the dimension dims_sum: dict Keys as the dimension name and values as the corresponding sum of the lengths across all Datasets @@ -220,8 +223,6 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non Keys as the dimension name and values as the corresponding maximum length across all Datasets - Notes - ----- If attribute values are numpy arrays, then they will not be included in the ``self.group_attrs``. Instead, these values will only appear in the attributes of the combined ``EchoData`` object. @@ -360,7 +361,7 @@ def _construct_lazy_ds_and_var_info( its final combined form const_names: List[str] The names of all variables and dimensions that are constant - across all Datasets to be combined + (with respect to chunking) across all Datasets to be combined encodings: Dict[str, dict] The encodings for all variables and dimensions that will be written to the zarr store by regions @@ -414,7 +415,7 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: Parameters ---------- ds_ind: int - The key of the values of ``dims_csum`` or index of + The key of the values of ``self.dims_csum`` or index of ``self.dims_df`` to use for each dimension name ds_dims: Set[Hashable] The names of the dimensions used in the region creation @@ -448,7 +449,7 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: def _append_ds_list_to_zarr( self, - path: str, + zarr_path: str, ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, @@ -461,7 +462,7 @@ def _append_ds_list_to_zarr( Parameters ---------- - path: str + zarr_path: str The full path of the final combined zarr store ds_list: List[xr.Dataset] The Datasets that will be combined @@ -474,6 +475,12 @@ def _append_ds_list_to_zarr( storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) + + Returns + ------- + const_names: List[str] + The names of all variables and dimensions that are constant + (with respect to chunking) across all Datasets to be combined """ self._get_ds_info(ds_list, ed_name) @@ -482,7 +489,7 @@ def _append_ds_list_to_zarr( # create zarr file and all associated metadata (this is delayed) ds_lazy.to_zarr( - path, + zarr_path, compute=False, group=zarr_group, encoding=encodings, @@ -504,7 +511,7 @@ def _append_ds_list_to_zarr( # we can remove data corruption by implementing a locking scheme delayed_to_zarr.append( ds_drop.to_zarr( - path, + zarr_path, group=zarr_group, region=region, compute=False, @@ -513,6 +520,7 @@ def _append_ds_list_to_zarr( ) ) + # compute all delayed writes to the zarr store dask.compute(*delayed_to_zarr) return const_names @@ -521,10 +529,10 @@ def _append_const_to_zarr( self, const_vars: List[str], ds_list: List[xr.Dataset], - path: str, + zarr_path: str, zarr_group: str, - storage_options: dict, - ): + storage_options: Optional[dict], + ) -> None: """ Appends all constant (i.e. not chunked) variables and dimensions to the zarr group. @@ -535,12 +543,12 @@ def _append_const_to_zarr( The names of all variables/dimensions that are not chunked ds_list: List[xr.Dataset] The Datasets that will be combined - path: str + zarr_path: str The full path of the final combined zarr store zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - storage_options: dict + storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) @@ -560,19 +568,21 @@ def _append_const_to_zarr( ds_list_ind = int(0) ds_list[ds_list_ind][[var]].to_zarr( - path, group=zarr_group, mode="a", storage_options=storage_options + zarr_path, group=zarr_group, mode="a", storage_options=storage_options ) - def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict] = {}) -> None: + def _append_provenance_attr_vars( + self, zarr_path: str, storage_options: Optional[dict] = {} + ) -> None: """ Creates an xarray Dataset with variables set as the attributes from all groups before the combination. Additionally, appends this Dataset to the ``Provenance`` group located in the zarr - store specified by ``path``. + store specified by ``zarr_path``. Parameters ---------- - path: str + zarr_path: str The full path of the final combined zarr store storage_options: Optional[dict] Any additional parameters for the storage @@ -598,11 +608,15 @@ def _append_provenance_attr_vars(self, path: str, storage_options: Optional[dict # append Dataset to zarr all_ds_attrs.to_zarr( - path, group="Provenance", mode="a", storage_options=storage_options, consolidated=True + zarr_path, + group="Provenance", + mode="a", + storage_options=storage_options, + consolidated=True, ) @staticmethod - def _modify_prov_filenames(path: str, len_eds: int) -> None: + def _modify_prov_filenames(zarr_path: str, len_eds: int) -> None: """ After the ``Provenance`` group has been constructed, the coordinate ``filenames`` will be filled with zeros. This @@ -611,47 +625,97 @@ def _modify_prov_filenames(path: str, len_eds: int) -> None: Parameters ---------- - path: str + zarr_path: str The full path of the final combined zarr store len_eds: int The number of ``EchoData`` objects being combined """ # obtain the filenames zarr array - zarr_filenames = zarr.open_array(path + "/Provenance/filenames", mode="r+") + zarr_filenames = zarr.open_array(zarr_path + "/Provenance/filenames", mode="r+") zarr_filenames[:] = np.arange(len_eds) def combine( self, - path: str, + zarr_path: str, eds: List[EchoData] = [], storage_options: Optional[dict] = {}, sonar_model: str = None, echodata_filenames: List[str] = [], ) -> EchoData: + """ + Combines all ``EchoData`` objects in ``eds`` by + writing each element in parallel to the zarr store + specified by ``zarr_path``. + Parameters + ---------- + zarr_path: str + The full path of the final combined zarr store + eds: List[EchoData] + The list of ``EchoData`` objects to be combined + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + sonar_model : str + The sonar model used for all elements in ``eds`` + echodata_filenames : List[str] + The source files names for all elements in ``eds`` + + Returns + ------- + ed_combined: EchoData + The final combined form of the input ``eds`` before + a reversed time check has been run + + Raises + ------ + RuntimeError + If the first time value of each Dataset is not less than + the first time value of the subsequent Dataset + RuntimeError + If each Dataset in ``ds_list`` does not have the + same number of channels and the same name for each + of these channels. + RuntimeError + If any of the following attribute checks are not met + amongst the combined Datasets + - the keys are not the same + - the values are not identical + - the keys ``date_created`` or ``conversion_time`` + do not have the same types + + Notes + ----- + All attributes that are not arrays will be made into + variables and their result will be stored in the + ``Provenance`` group. + """ + + # TODO: the below line should be uncommented, if blosc issues persist # blosc.use_threads = False + # set class variables from input self.sonar_model = sonar_model - self.group_attrs["echodata_filename"] = echodata_filenames + # loop through all possible group and write them to a zarr store for grp_info in EchoData.group_map.values(): + # obtain the appropriate group name if grp_info["ep_group"]: ed_group = grp_info["ep_group"] else: ed_group = "Top-level" + # collect the group Dataset from all eds ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list: - - print(f"ed_group = {ed_group}") + if ds_list: # necessary because a group may not be present const_names = self._append_ds_list_to_zarr( - path, + zarr_path, ds_list=ds_list, zarr_group=grp_info["ep_group"], ed_name=ed_group, @@ -659,27 +723,28 @@ def combine( ) self._append_const_to_zarr( - const_names, ds_list, path, grp_info["ep_group"], storage_options + const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options ) # append all group attributes before combination to zarr store - self._append_provenance_attr_vars(path, storage_options=storage_options) + self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) # change filenames numbering to range(len(eds)) - self._modify_prov_filenames(path, len_eds=len(eds)) + self._modify_prov_filenames(zarr_path, len_eds=len(eds)) + # TODO: the below line should be uncommented, if blosc issues persist # blosc.use_threads = None # open lazy loaded combined EchoData object ed_combined = open_converted( - path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() ) # TODO: is this appropriate for chunks? return ed_combined # Below are functions that may be useful when generating a locking scheme -# I am currently removing them until we implement this scheme +# I am currently removing/commenting out them until we implement this scheme # TODO: this lock is extremely inefficient, it makes # it so that the group is written sequentially, However, # no data corruption will occur From ddfb5fc2f40d2b8003d2b63ccd65bd22969cdb1f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 17:14:13 -0700 Subject: [PATCH 48/89] begin documenting the new combine api, create code section to validate the provided path input to combine api, and modify tests in test_echodata_combine.py --- echopype/echodata/combine.py | 58 ++++---- .../tests/echodata/test_echodata_combine.py | 134 +++++++----------- 2 files changed, 87 insertions(+), 105 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 0c64c6989..e180b650d 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -5,6 +5,7 @@ import xarray as xr from ..qc import coerce_increasing_time, exist_reversed_time +from ..utils.io import validate_output_path from ..utils.log import _init_logger from .echodata import EchoData from .zarr_combine import ZarrCombine @@ -221,7 +222,9 @@ def orchestrate_reverse_time_check( ) -def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options={}) -> EchoData: +def combine_echodata( + echodatas: List[EchoData], zarr_path: Optional[str] = None, storage_options: Optional[dict] = {} +) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. @@ -229,22 +232,11 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options ---------- echodatas : List[EchoData] The list of ``EchoData`` objects to be combined. - combine_attrs : str - String indicating how to combine attrs of the ``EchoData`` objects being merged. - This parameter matches the identically named xarray parameter - (see https://xarray.pydata.org/en/latest/generated/xarray.combine_nested.html) - with the exception of the "overwrite_conflicts" value. Possible options: - * ``"override"``: Default. skip comparing and copy attrs from the first ``EchoData`` - object to the result. - * ``"drop"``: empty attrs on returned ``EchoData`` object. - * ``"identical"``: all attrs must be the same on every object. - * ``"no_conflicts"``: attrs from all objects are combined, - any that have the same name must also have the same value. - * ``"overwrite_conflicts"``: attrs from all ``EchoData`` objects are combined, - attrs with conflicting keys will be overwritten by later ``EchoData`` objects. - in_memory : bool - If True, creates an in-memory form of the combined ``EchoData`` object, otherwise - a lazy ``EchoData`` object will be created (not currently implemented). + zarr_path: str + The full save path to the final combined zarr store + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) Returns ------- @@ -284,6 +276,8 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options * The ``source_file`` and ``converted_raw_path`` attributes will be copied from the first ``EchoData`` object in the given list, but this may change in future versions. + TODO: if no path is provided blah blah + Examples -------- >>> ed1 = echopype.open_converted("file1.nc") @@ -291,16 +285,30 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options >>> combined = echopype.combine_echodata([ed1, ed2]) """ - if zarr_store is None: - zarr_store = "/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/test.zarr" - # raise RuntimeError("You need to provide a path!") # TODO: use Don's path + if zarr_path is None: + source_file = "combined_echodatas.zarr" + save_path = None + else: + path_obj = Path(zarr_path) + + if path_obj.suffix != ".zarr": + raise ValueError( + "The provided zarr_path input must point to a zarr file!" + ) # TODO: put in docs + else: + source_file = path_obj.parts[-1] + save_path = path_obj.parent + + zarr_path = validate_output_path( + source_file=source_file, + engine="zarr", + output_storage_options=storage_options, + save_path=save_path, + ) if not isinstance(echodatas, list): raise TypeError("The input, eds, must be a list of EchoData objects!") - if not isinstance(zarr_store, str): # TODO: change this in the future - raise TypeError("The input, store, must be a string!") - # return empty EchoData object, if no EchoData objects are provided if not echodatas: warn("No EchoData objects were provided, returning an empty EchoData object.") @@ -310,13 +318,13 @@ def combine_echodata(echodatas: List[EchoData], zarr_store=None, storage_options comb = ZarrCombine() ed_comb = comb.combine( - zarr_store, + zarr_path, echodatas, storage_options=storage_options, sonar_model=sonar_model, echodata_filenames=echodata_filenames, ) - orchestrate_reverse_time_check(ed_comb, zarr_store, comb.possible_time_dims, storage_options) + orchestrate_reverse_time_check(ed_comb, zarr_path, comb.possible_time_dims, storage_options) return ed_comb diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index f5309fbde..741ad4dea 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -12,7 +12,7 @@ from echopype.qc import exist_reversed_time from echopype.core import SONAR_MODELS -import zarr +import tempfile @pytest.fixture @@ -106,8 +106,16 @@ def test_combine_echodata(raw_datasets): concat_dims, concat_data_vars, ) = raw_datasets + + pytest.xfail("test_combine_echodata will be reviewed and corrected later.") + eds = [echopype.open_raw(file, sonar_model, xml_file) for file in files] - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore + + # create temporary directory for zarr store + temp_zarr_dir = tempfile.TemporaryDirectory() + zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" + + combined = echopype.combine_echodata(eds, zarr_file_name) for group_name, value in combined.group_map.items(): if group_name in ("top", "sonar", "provenance"): @@ -169,13 +177,20 @@ def union_attrs(datasets: List[xr.Dataset]) -> Dict[str, Any]: ) ) + temp_zarr_dir.cleanup() + def test_ping_time_reversal(ek60_reversed_ping_time_test_data): eds = [ echopype.open_raw(file, "EK60") for file in ek60_reversed_ping_time_test_data ] - combined = echopype.combine_echodata(eds) #, "overwrite_conflicts") # type: ignore + + # create temporary directory for zarr store + temp_zarr_dir = tempfile.TemporaryDirectory() + zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" + + combined = echopype.combine_echodata(eds, zarr_file_name) for group_name, value in combined.group_map.items(): if value['ep_group'] is None: @@ -200,11 +215,19 @@ def test_ping_time_reversal(ek60_reversed_ping_time_test_data): if "old_time2" in combined_group: assert exist_reversed_time(combined_group, "old_time2") + temp_zarr_dir.cleanup() + def test_attr_storage(ek60_test_data): # check storage of attributes before combination in provenance group eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore + + # create temporary directory for zarr store + temp_zarr_dir = tempfile.TemporaryDirectory() + zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" + + combined = echopype.combine_echodata(eds, zarr_file_name) + for group, value in combined.group_map.items(): if value['ep_group'] is None: group_path = 'Top-level' @@ -217,7 +240,7 @@ def test_attr_storage(ek60_test_data): assert str( group_attrs.isel(echodata_filename=i) .sel({f"{group}_attr_key": attr}) - .data[()] + .values[()] ) == str(value) # check selection by echodata_filename @@ -233,51 +256,19 @@ def test_attr_storage(ek60_test_data): group_attrs.isel(echodata_filename=0), ) + temp_zarr_dir.cleanup() -def test_combine_attrs(ek60_test_data): - # check parameter passed to combine_echodata that controls behavior of attribute combination - eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] - eds[0]["Sonar/Beam_group1"].attrs.update({"foo": 1}) - eds[1]["Sonar/Beam_group1"].attrs.update({"foo": 2}) - eds[2]["Sonar/Beam_group1"].attrs.update({"foo": 3}) - - combined = echopype.combine_echodata(eds, "override") # type: ignore - assert combined["Sonar/Beam_group1"].attrs["foo"] == 1 - - combined = echopype.combine_echodata(eds, "drop") # type: ignore - assert "foo" not in combined["Sonar/Beam_group1"].attrs - - try: - combined = echopype.combine_echodata(eds, "identical") # type: ignore - except MergeError: - pass - else: - raise AssertionError - try: - combined = echopype.combine_echodata(eds, "no_conflicts") # type: ignore - except MergeError: - pass - else: - raise AssertionError - - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore - assert combined["Sonar/Beam_group1"].attrs["foo"] == 3 - - eds[0]["Sonar/Beam_group1"].attrs.update({"foo": 1}) - eds[1]["Sonar/Beam_group1"].attrs.update({"foo": 1}) - eds[2]["Sonar/Beam_group1"].attrs.update({"foo": 1}) - - combined = echopype.combine_echodata(eds, "identical") # type: ignore - assert combined["Sonar/Beam_group1"].attrs["foo"] == 1 +def test_combined_encodings(ek60_test_data): + eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] - combined = echopype.combine_echodata(eds, "no_conflicts") # type: ignore - assert combined["Sonar/Beam_group1"].attrs["foo"] == 1 + # create temporary directory for zarr store + temp_zarr_dir = tempfile.TemporaryDirectory() + zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" + combined = echopype.combine_echodata(eds, zarr_file_name) -def test_combined_encodings(ek60_test_data): - eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore + encodings_to_drop = {'chunks', 'preferred_chunks', 'compressor', 'filters'} group_checks = [] for group, value in combined.group_map.items(): @@ -290,11 +281,19 @@ def test_combined_encodings(ek60_test_data): for k, v in ds.variables.items(): if k in DEFAULT_ENCODINGS: encoding = ds[k].encoding + + # remove any encoding relating to lazy loading + lazy_encodings = set(encoding.keys()).intersection(encodings_to_drop) + for encod_name in lazy_encodings: + del encoding[encod_name] + if encoding != DEFAULT_ENCODINGS[k]: group_checks.append( f" {value['name']}::{k}" ) + temp_zarr_dir.cleanup() + if len(group_checks) > 0: all_messages = ['Encoding mismatch found!'] + group_checks message_text = '\n'.join(all_messages) @@ -303,10 +302,16 @@ def test_combined_encodings(ek60_test_data): def test_combined_echodata_repr(ek60_test_data): eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] - combined = echopype.combine_echodata(eds, "overwrite_conflicts") # type: ignore + + # create temporary directory for zarr store + temp_zarr_dir = tempfile.TemporaryDirectory() + zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" + + combined = echopype.combine_echodata(eds, zarr_file_name) + expected_repr = dedent( - """\ - + f"""\ + Top-level: contains metadata about the SONAR-netCDF4 file format. ├── Environment: contains information relevant to acoustic propagation through water. ├── Platform: contains information about the platform on which the sonar is installed. @@ -322,35 +327,4 @@ def test_combined_echodata_repr(ek60_test_data): actual = "\n".join(x.rstrip() for x in repr(combined).split("\n")) assert actual == expected_repr - -# TODO: consider the following test structures -# from distributed.utils_test import client -# @gen_cluster(client=True) -# async def test_zarr_combine(client, scheduler, w1, w2): -# from distributed.utils_test import gen_cluster, inc -# from distributed.utils_test import client, loop, cluster_fixture, loop_in_thread, cleanup - -# from dask.distributed import Client -# -# # @pytest.fixture(scope="session") -# def test_zarr_combine(): -# -# client = Client() # n_workers=1) -# -# from fsspec.implementations.local import LocalFileSystem -# fs = LocalFileSystem() -# -# desired_raw_file_paths = fs.glob('/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/OOI_zarrs_ep_ex/temp/*.zarr') -# -# ed_lazy = [] -# for ed_path in desired_raw_file_paths: -# print(ed_path) -# ed_lazy.append(echopype.open_converted(ed_path, chunks='auto', -# synchronizer=zarr.ThreadSynchronizer())) -# -# from echopype.echodata.zarr_combine import ZarrCombine -# -# path = '/Users/brandonreyes/UW_work/Echopype_work/code_playing_around/test.zarr' -# comb = ZarrCombine() -# -# ed_combined = comb.combine(path, ed_lazy, storage_options={}) \ No newline at end of file + temp_zarr_dir.cleanup() From d495b14628225155f41f36fbfcca8c2d620365ab Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 27 Sep 2022 17:30:23 -0700 Subject: [PATCH 49/89] remove commented out lock code and remove reference to dask.distributed --- echopype/echodata/zarr_combine.py | 101 ------------------------------ 1 file changed, 101 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 54f63ad69..6d48d7bc3 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -3,7 +3,6 @@ import dask import dask.array -import dask.distributed import numpy as np import pandas as pd import xarray as xr @@ -741,103 +740,3 @@ def combine( ) # TODO: is this appropriate for chunks? return ed_combined - - -# Below are functions that may be useful when generating a locking scheme -# I am currently removing/commenting out them until we implement this scheme -# TODO: this lock is extremely inefficient, it makes -# it so that the group is written sequentially, However, -# no data corruption will occur -# lock_name = zarr_group -# TODO: may need to write ds in stages of append dimension -# e.g. split ds into a ds with time1 dim and a ds with -# time2 dim, then write them using the locking. -# TODO: multiple locks can exist for the same region, we will need -# to split up the region -# @dask.delayed -# def write_ds_to_zarr(self, ds_in, path, group, rgn, name, storage_opts, sync): -# """ -# -# -# -# """ -# -# # TODO: document this! -# -# with dask.distributed.Lock(name): -# ds_in.to_zarr( -# path, -# group=group, -# region=rgn, -# compute=True, -# storage_options=storage_opts, -# synchronizer=sync, -# ) - -# code to include in loop to call above function -# delayed_to_zarr.append(self.write_ds_to_zarr(ds_drop, path, -# zarr_group, region, -# lock_name, storage_options, -# zarr.ThreadSynchronizer())) -# @staticmethod -# def get_intervals(csum): -# """creates a list of intervals from a cumulative sum -# -# use case: cumulative sum of max append dimensions or -# self.dims_csum -# """ -# -# # TODO: Document this! -# -# intervals = [] -# for count, val in enumerate(csum): -# -# if count == 0: -# # get the initial region -# intervals.append(pd.Interval(left=0, right=val, closed="left")) -# -# else: -# # get all other regions -# intervals.append(pd.Interval(left=csum[count - 1], right=val, closed="left")) -# -# return intervals -# -# @staticmethod -# def get_common_chunks(interval_list_dim, interval_list_max): -# """ -# determines what intervals overlap -# -# use case: makes it so we can determine which to_zarr calls will -# write to the same chunk, we can use this result to do dask locking -# -# """ -# -# # TODO: Document this! -# -# chunks = defaultdict(list) -# -# for i in range(len(interval_list_max)): -# chunks[i].extend( -# [ -# count -# for count, interval in enumerate(interval_list_dim) -# if interval_list_max[i].overlaps(interval) -# ] -# ) -# -# return chunks -# -# @staticmethod -# def get_common_chunks_key(common_chunks, ind): -# """ -# Obtains the key in common chunk whose value -# contains ind -# -# """ -# -# # TODO: Document this! -# -# for key, val in common_chunks.items(): -# -# if ind in val: -# return key From df3fa1a4cc28928ec7e5af36cb7c94daf9c7e30d Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 28 Sep 2022 09:50:26 -0700 Subject: [PATCH 50/89] finalize docs and comments for the combine_echodata api --- echopype/echodata/combine.py | 178 +++++++++++++++++++++++++---------- 1 file changed, 128 insertions(+), 50 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index e180b650d..4bd9b30bf 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -13,6 +13,55 @@ logger = _init_logger(__name__) +def check_zarr_path(zarr_path: str, storage_options: Optional[dict]) -> str: + """ + Checks that the zarr path provided to ``combine`` + is valid. + + Parameters + ---------- + zarr_path: str + The full save path to the final combined zarr store + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + + Returns + ------- + str + The validated zarr path + + Raises + ------ + ValueError + If the provided zarr path does not point to a zarr file + """ + + if zarr_path is None: + + # assign values, if no zarr path has been provided + source_file = "combined_echodatas.zarr" + save_path = None + else: + + # turn string path into Path object + path_obj = Path(zarr_path) + if path_obj.suffix != ".zarr": + raise ValueError("The provided zarr_path input must point to a zarr file!") + else: + + # assign values based on zarr path + source_file = path_obj.parts[-1] + save_path = path_obj.parent + + return validate_output_path( + source_file=source_file, + engine="zarr", + output_storage_options=storage_options, + save_path=save_path, + ) + + def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: """ Ensures that the input ``echodatas`` for ``combine_echodata`` @@ -29,8 +78,23 @@ def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: The sonar model used for all values in ``echodatas`` echodata_filenames : List[str] The source files names for all values in ``echodatas`` + + Raises + ------ + TypeError + If a list of ``EchoData`` objects are not provided + ValueError + If any ``EchoData`` object's ``sonar_model`` is ``None`` + ValueError + If and ``EchoData`` object does not have a file path + ValueError + If the provided ``EchoData`` objects have the same filenames """ + # make sure that the input is a list of EchoData objects + if not isinstance(echodatas, list) and all([isinstance(ed, EchoData) for ed in echodatas]): + raise TypeError("The input, eds, must be a list of EchoData objects!") + # get the sonar model for the combined object if echodatas[0].sonar_model is None: raise ValueError("all EchoData objects must have non-None sonar_model values") @@ -86,6 +150,11 @@ def check_and_correct_reversed_time( old_time : Optional[xr.DataArray] If correction is necessary, returns the time before reversal correction, otherwise returns None + + Warns + ----- + UserWarning + If a time reversal is detected """ if time_str in combined_group and exist_reversed_time(combined_group, time_str): @@ -223,15 +292,19 @@ def orchestrate_reverse_time_check( def combine_echodata( - echodatas: List[EchoData], zarr_path: Optional[str] = None, storage_options: Optional[dict] = {} + echodatas: List[EchoData] = None, + zarr_path: Optional[str] = None, + storage_options: Optional[dict] = {}, ) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. + This is accomplished by writing each element of ``echodatas`` in parallel + (using dask) to the zarr store specified by ``zarr_path``. Parameters ---------- echodatas : List[EchoData] - The list of ``EchoData`` objects to be combined. + The list of ``EchoData`` objects to be combined zarr_path: str The full save path to the final combined zarr store storage_options: Optional[dict] @@ -241,82 +314,87 @@ def combine_echodata( Returns ------- EchoData - An ``EchoData`` object with all data from the input ``EchoData`` objects combined. + A lazy loaded ``EchoData`` object obtained from ``zarr_path``, + with all data from the input ``EchoData`` objects combined. Raises ------ ValueError - If ``echodatas`` contains ``EchoData`` objects with different or ``None`` - ``sonar_model`` values (i.e., all `EchoData` objects must have the same - non-None ``sonar_model`` value). + If the provided zarr path does not point to a zarr file + TypeError + If a list of ``EchoData`` objects are not provided + ValueError + If any ``EchoData`` object's ``sonar_model`` is ``None`` ValueError - If EchoData objects have conflicting source file names. + If and ``EchoData`` object does not have a file path + ValueError + If the provided ``EchoData`` objects have the same filenames + RuntimeError + If the first time value of each ``EchoData`` group is not less + than the first time value of the subsequent corresponding + ``EchoData`` group, with respect to the order in ``echodatas`` + RuntimeError + If each corresponding ``EchoData`` group in ``echodatas`` do not + have the same number of channels and the same name for each + of these channels. + RuntimeError + If any of the following attribute checks are not met + amongst the combined ``EchoData`` groups: + - the keys are not the same + - the values are not identical + - the keys ``date_created`` or ``conversion_time`` + do not have the same types Warns ----- UserWarning - If the ``sonar_model`` of the input ``EchoData`` objects is ``"EK60"`` and any - ``EchoData`` objects have non-monotonically increasing ``ping_time``, ``time1`` - or ``time2`` values, the corresponding values in the output ``EchoData`` object - will be increased starting at the timestamp where the reversal occurs such that - all values in the output are monotonically increasing. Additionally, the original - ``ping_time``, ``time1`` or ``time2`` values will be stored in the ``Provenance`` - group, although this behavior may change in future versions. - - Warnings - -------- - Changes in parameters between ``EchoData`` objects are not currently checked; - however, they may raise an error in future versions. + If any time coordinate in a final combined group is not + in ascending order (see Notes below for more details). Notes ----- - * ``EchoData`` objects are combined by combining their groups individually. - * Attributes from all groups before the combination will be stored in the provenance group, - although this behavior may change in future versions. + * ``EchoData`` objects are combined by appending their groups individually to a zarr store. + * All attributes (besides array attributes) from all groups before the combination will be + stored in the ``Provenance`` group. * The ``source_file`` and ``converted_raw_path`` attributes will be copied from the first - ``EchoData`` object in the given list, but this may change in future versions. - - TODO: if no path is provided blah blah + ``EchoData`` object in the given list. + * If any time coordinate in a final combined group is not in ascending order, then it will + be corrected according to `PR #297 `_. + Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as + a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. + * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current + working directory Examples -------- + Combine lazy loaded ``EchoData`` objects: >>> ed1 = echopype.open_converted("file1.nc") >>> ed2 = echopype.open_converted("file2.zarr") - >>> combined = echopype.combine_echodata([ed1, ed2]) + >>> combined = echopype.combine_echodata(echodatas=[ed1, ed2], + ... zarr_path="path/to/combined.zarr", + ... storage_options=my_storage_options) + + Combine in-memory ``EchoData`` objects: + >>> ed1 = echopype.open_raw(raw_file="EK60_file1.raw", sonar_model="EK60") + >>> ed2 = echopype.open_raw(raw_file="EK60_file2.raw", sonar_model="EK60") + >>> combined = echopype.combine_echodata(echodatas=[ed1, ed2], + ... zarr_path="path/to/combined.zarr", + ... storage_options=my_storage_options) """ - if zarr_path is None: - source_file = "combined_echodatas.zarr" - save_path = None - else: - path_obj = Path(zarr_path) - - if path_obj.suffix != ".zarr": - raise ValueError( - "The provided zarr_path input must point to a zarr file!" - ) # TODO: put in docs - else: - source_file = path_obj.parts[-1] - save_path = path_obj.parent - - zarr_path = validate_output_path( - source_file=source_file, - engine="zarr", - output_storage_options=storage_options, - save_path=save_path, - ) - - if not isinstance(echodatas, list): - raise TypeError("The input, eds, must be a list of EchoData objects!") + zarr_path = check_zarr_path(zarr_path, storage_options) # return empty EchoData object, if no EchoData objects are provided - if not echodatas: + if echodatas is None: warn("No EchoData objects were provided, returning an empty EchoData object.") return EchoData() sonar_model, echodata_filenames = check_echodatas_input(echodatas) + # initiate ZarrCombine object comb = ZarrCombine() + + # combine all elements in echodatas by writing to a zarr store ed_comb = comb.combine( zarr_path, echodatas, From bd01a28bd08b4883f03d22fe968a26e82da5f79f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 28 Sep 2022 10:00:33 -0700 Subject: [PATCH 51/89] revise combine_echodata bullet points and code section --- echopype/echodata/combine.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 4bd9b30bf..606deb312 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -315,7 +315,7 @@ def combine_echodata( ------- EchoData A lazy loaded ``EchoData`` object obtained from ``zarr_path``, - with all data from the input ``EchoData`` objects combined. + with all data from the input ``EchoData`` objects combined. Raises ------ @@ -340,9 +340,9 @@ def combine_echodata( RuntimeError If any of the following attribute checks are not met amongst the combined ``EchoData`` groups: - - the keys are not the same - - the values are not identical - - the keys ``date_created`` or ``conversion_time`` + * the keys are not the same + * the values are not identical + * the keys ``date_created`` or ``conversion_time`` do not have the same types Warns @@ -355,31 +355,33 @@ def combine_echodata( ----- * ``EchoData`` objects are combined by appending their groups individually to a zarr store. * All attributes (besides array attributes) from all groups before the combination will be - stored in the ``Provenance`` group. + stored in the ``Provenance`` group. * The ``source_file`` and ``converted_raw_path`` attributes will be copied from the first - ``EchoData`` object in the given list. + ``EchoData`` object in the given list. * If any time coordinate in a final combined group is not in ascending order, then it will - be corrected according to `PR #297 `_. - Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as - a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. - * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current - working directory + be corrected according to `#297 `_. + Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as + a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. + * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current + working directory Examples -------- Combine lazy loaded ``EchoData`` objects: + >>> ed1 = echopype.open_converted("file1.nc") >>> ed2 = echopype.open_converted("file2.zarr") >>> combined = echopype.combine_echodata(echodatas=[ed1, ed2], - ... zarr_path="path/to/combined.zarr", - ... storage_options=my_storage_options) + >>> zarr_path="path/to/combined.zarr", + >>> storage_options=my_storage_options) Combine in-memory ``EchoData`` objects: + >>> ed1 = echopype.open_raw(raw_file="EK60_file1.raw", sonar_model="EK60") >>> ed2 = echopype.open_raw(raw_file="EK60_file2.raw", sonar_model="EK60") >>> combined = echopype.combine_echodata(echodatas=[ed1, ed2], - ... zarr_path="path/to/combined.zarr", - ... storage_options=my_storage_options) + >>> zarr_path="path/to/combined.zarr", + >>> storage_options=my_storage_options) """ zarr_path = check_zarr_path(zarr_path, storage_options) From 6f9b16ade3c75f88a0986502ea1e4c6d28ec5b83 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 28 Sep 2022 10:08:50 -0700 Subject: [PATCH 52/89] modify Notes bullet points in combine_echodata docs --- echopype/echodata/combine.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 606deb312..28389ffff 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -340,10 +340,11 @@ def combine_echodata( RuntimeError If any of the following attribute checks are not met amongst the combined ``EchoData`` groups: - * the keys are not the same - * the values are not identical - * the keys ``date_created`` or ``conversion_time`` - do not have the same types + + - the keys are not the same + - the values are not identical + - the keys ``date_created`` or ``conversion_time`` + do not have the same types Warns ----- @@ -355,15 +356,15 @@ def combine_echodata( ----- * ``EchoData`` objects are combined by appending their groups individually to a zarr store. * All attributes (besides array attributes) from all groups before the combination will be - stored in the ``Provenance`` group. + stored in the ``Provenance`` group. * The ``source_file`` and ``converted_raw_path`` attributes will be copied from the first - ``EchoData`` object in the given list. + ``EchoData`` object in the given list. * If any time coordinate in a final combined group is not in ascending order, then it will - be corrected according to `#297 `_. - Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as - a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. - * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current - working directory + be corrected according to `#297 `_. + Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as + a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. + * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current + working directory Examples -------- From 4290c0229a509a2ff738cb8ca6ea2b8442debd2a Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 28 Sep 2022 10:17:29 -0700 Subject: [PATCH 53/89] correct and highlight the default zarr_path in combine_echodata docs --- echopype/echodata/combine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 28389ffff..78afdf9a5 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -363,8 +363,8 @@ def combine_echodata( be corrected according to `#297 `_. Additionally, the uncorrected time coordinate will be stored in the ``Provenace`` group as a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. - * If no ``zarr_path`` is provided, it will be set to 'temp_echopype_output/' in the current - working directory + * If no ``zarr_path`` is provided, it will be set to + ``'temp_echopype_output/combined_echodatas.zarr'`` in the current working directory. Examples -------- From e9f1ecd529db2e6741870f49209e589de6c7b2e3 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 28 Sep 2022 15:50:26 -0700 Subject: [PATCH 54/89] construct mapping for lock scheme --- echopype/echodata/zarr_combine.py | 190 ++++++++++++++++++++++-------- 1 file changed, 140 insertions(+), 50 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 6d48d7bc3..779ddcfd6 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -14,6 +14,8 @@ from .api import open_converted from .echodata import EchoData +from itertools import islice + class ZarrCombine: """ @@ -446,6 +448,81 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region + @staticmethod + def _uniform_chunks_as_np_array(array, chunk_size): + """ + Chunks + """ + + array_iter = iter(array) + + # construct chunks as an iterable of lists + chunks_iter = iter(lambda: list(islice(array_iter, chunk_size)), list()) + + # convert each element in the iterable to a numpy array + return list(map(np.array, chunks_iter)) + + def _get_chunk_dicts(self, dim): + + csum_og_chunks = np.array(list(self.dims_csum[dim].values())) + + x_no_chunk = np.arange(self.dims_sum[dim], dtype=np.int64) + + og_chunk = np.split(x_no_chunk, csum_og_chunks) + + og_chunk_dict = dict(zip(range(len(og_chunk)), og_chunk)) + + zarr_chunk_size = self.dims_max[dim] + uniform_chunk = self._uniform_chunks_as_np_array(x_no_chunk, zarr_chunk_size) + + uniform_chunk_dict = dict(zip(range(len(uniform_chunk)), uniform_chunk)) + + return og_chunk_dict, uniform_chunk_dict + + def _get_uniform_to_nonuniform_map(self, dim): + """ + Constructs a uniform to non-uniform mapping of chunks + for a dimension ``dim``. + + + Returns + ------- + final_mapping: Dict[int, dict] + Uniform to non-uniform mapping where the keys are + the chunk index in the uniform chunk and the values + are dictionaries with keys corresponding to the index + of the non-uniform chunk and the values are ``slice`` + objects for the non-uniform chunk values. + + """ + + og_chunk_dict, uniform_chunk_dict = self._get_chunk_dicts(dim) + + final_mapping = defaultdict(dict) + for u_key, u_val in uniform_chunk_dict.items(): + + for og_key, og_val in og_chunk_dict.items(): + + intersect = np.intersect1d(u_val, og_val) + + if len(intersect) > 0: + start = np.argwhere(og_val == intersect.min())[0, 0] + end = np.argwhere(og_val == intersect.max())[0, 0] + 1 + final_mapping[u_key].update({og_key: slice(start, end)}) + + return final_mapping + + def _get_all_append_dim_mappings(self, ds_dims: set): + + append_dim_mappings = defaultdict(dict) + + ds_append_dims = ds_dims.intersection(self.append_dims) + + for dim in ds_append_dims: + append_dim_mappings[dim] = self._get_uniform_to_nonuniform_map(dim) + + return ds_append_dims, append_dim_mappings + def _append_ds_list_to_zarr( self, zarr_path: str, @@ -484,45 +561,58 @@ def _append_ds_list_to_zarr( self._get_ds_info(ds_list, ed_name) - ds_lazy, const_names, encodings = self._construct_lazy_ds_and_var_info(ds_list[0]) - - # create zarr file and all associated metadata (this is delayed) - ds_lazy.to_zarr( - zarr_path, - compute=False, - group=zarr_group, - encoding=encodings, - consolidated=None, - storage_options=storage_options, - synchronizer=zarr.ThreadSynchronizer(), - ) + # ds_append_dims, append_dim_mappings = self._get_all_append_dim_mappings(set(ds_list[0].dims)) + ds_lazy, const_names, encodings = self._construct_lazy_ds_and_var_info(ds_list[0]) + # + # # create zarr file and all associated metadata (this is delayed) + # ds_lazy.to_zarr( + # zarr_path, + # compute=False, + # group=zarr_group, + # encoding=encodings, + # consolidated=None, + # storage_options=storage_options, + # synchronizer=zarr.ThreadSynchronizer(), + # ) + # # collect delayed functions that write each non-constant variable # in ds_list to the zarr store delayed_to_zarr = [] - for ind, ds in enumerate(ds_list): - ds_drop = ds.drop(const_names) + ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) + for dim in ds_append_dims: - region = self._get_region(ind, set(ds_drop.dims)) + chunk_mapping = self._get_uniform_to_nonuniform_map(dim) - # TODO: below is an xarray delayed approach, however, data will be corrupted, - # we can remove data corruption by implementing a locking scheme - delayed_to_zarr.append( - ds_drop.to_zarr( - zarr_path, - group=zarr_group, - region=region, - compute=False, - storage_options=storage_options, - synchronizer=zarr.ThreadSynchronizer(), - ) - ) + for uniform_ind, non_uniform_dict in chunk_mapping.items(): - # compute all delayed writes to the zarr store - dask.compute(*delayed_to_zarr) + for ds_list_ind, dim_slice in non_uniform_dict.items(): - return const_names + print(f"uniform_ind (lock), ds_list_ind, dim_slice = {uniform_ind, ds_list_ind, dim_slice}") + + # ds_drop = ds.drop(const_names) + + # + # region = self._get_region(ind, set(ds_drop.dims)) + # + # # TODO: below is an xarray delayed approach, however, data will be corrupted, + # # we can remove data corruption by implementing a locking scheme + # delayed_to_zarr.append( + # ds_drop.to_zarr( + # zarr_path, + # group=zarr_group, + # region=region, + # compute=False, + # storage_options=storage_options, + # synchronizer=zarr.ThreadSynchronizer(), + # ) + # ) + # + # # compute all delayed writes to the zarr store + # dask.compute(*delayed_to_zarr) + # + # return const_names def _append_const_to_zarr( self, @@ -711,7 +801,7 @@ def combine( # collect the group Dataset from all eds ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list: # necessary because a group may not be present + if ds_list and ed_group == "Environment": # necessary because a group may not be present const_names = self._append_ds_list_to_zarr( zarr_path, @@ -721,22 +811,22 @@ def combine( storage_options=storage_options, ) - self._append_const_to_zarr( - const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options - ) - - # append all group attributes before combination to zarr store - self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) - - # change filenames numbering to range(len(eds)) - self._modify_prov_filenames(zarr_path, len_eds=len(eds)) - - # TODO: the below line should be uncommented, if blosc issues persist - # blosc.use_threads = None - - # open lazy loaded combined EchoData object - ed_combined = open_converted( - zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - ) # TODO: is this appropriate for chunks? - - return ed_combined + # self._append_const_to_zarr( + # const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options + # ) + # + # # append all group attributes before combination to zarr store + # self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) + # + # # change filenames numbering to range(len(eds)) + # self._modify_prov_filenames(zarr_path, len_eds=len(eds)) + # + # # TODO: the below line should be uncommented, if blosc issues persist + # # blosc.use_threads = None + # + # # open lazy loaded combined EchoData object + # ed_combined = open_converted( + # zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + # ) # TODO: is this appropriate for chunks? + # + # return ed_combined From 757825e1257da7de58ed3eba227f0adafbf25a2d Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 29 Sep 2022 09:22:24 -0700 Subject: [PATCH 55/89] remove append dimensions when doing a prallel write to zarr files and consider alternative chunking for append dimension --- echopype/echodata/zarr_combine.py | 179 +++++++++--------- .../tests/echodata/test_echodata_combine.py | 5 + 2 files changed, 99 insertions(+), 85 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 779ddcfd6..38adf1bf0 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -3,6 +3,7 @@ import dask import dask.array +from dask.distributed import Lock import numpy as np import pandas as pd import xarray as xr @@ -15,6 +16,7 @@ from .echodata import EchoData from itertools import islice +from numcodecs import blosc class ZarrCombine: @@ -239,6 +241,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_sum = self.dims_df.sum(axis=0).to_dict() self.dims_csum = self.dims_df.cumsum(axis=0).to_dict() self.dims_max = self.dims_df.max(axis=0).to_dict() + self.dims_min = self.dims_df.min(axis=0).to_dict() # format ed_name appropriately ed_name = ed_name.replace("-", "_").replace("/", "_").lower() @@ -295,6 +298,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), # Create the chunk shape of the variable # TODO: investigate if this is the best chunking chnk_shape = [self.dims_max[dim] for dim in dims] + # chnk_shape = [self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims] temp_arr = dask.array.zeros(shape=shape, dtype=dtype, chunks=chnk_shape) @@ -411,7 +415,8 @@ def _construct_lazy_ds_and_var_info( def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: """ Returns the region of the zarr file to write to. This region - corresponds to the input set of dimensions. + corresponds to the input set of dimensions that do not + include append dimensions. Parameters ---------- @@ -432,18 +437,8 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: region = dict() for dim in ds_dims: - if dim in self.append_dims: + if dim not in self.append_dims: - if ds_ind == 0: - # get the initial region - region[dim] = slice(0, self.dims_csum[dim][ds_ind]) - else: - # get all other regions - region[dim] = slice( - self.dims_csum[dim][ds_ind - 1], self.dims_csum[dim][ds_ind] - ) - - else: region[dim] = slice(0, self.dims_df.loc[ds_ind][dim]) return region @@ -472,7 +467,9 @@ def _get_chunk_dicts(self, dim): og_chunk_dict = dict(zip(range(len(og_chunk)), og_chunk)) - zarr_chunk_size = self.dims_max[dim] + zarr_chunk_size = self.dims_max[dim] # TODO: investigate if this is the best chunking + # zarr_chunk_size = self.dims_min[dim] + uniform_chunk = self._uniform_chunks_as_np_array(x_no_chunk, zarr_chunk_size) uniform_chunk_dict = dict(zip(range(len(uniform_chunk)), uniform_chunk)) @@ -491,8 +488,9 @@ def _get_uniform_to_nonuniform_map(self, dim): Uniform to non-uniform mapping where the keys are the chunk index in the uniform chunk and the values are dictionaries with keys corresponding to the index - of the non-uniform chunk and the values are ``slice`` - objects for the non-uniform chunk values. + of the non-uniform chunk and the values are a tuple of + ``slice`` objects for the non-uniform chunk values and + region chunk values, respectively. """ @@ -506,22 +504,32 @@ def _get_uniform_to_nonuniform_map(self, dim): intersect = np.intersect1d(u_val, og_val) if len(intersect) > 0: - start = np.argwhere(og_val == intersect.min())[0, 0] - end = np.argwhere(og_val == intersect.max())[0, 0] + 1 - final_mapping[u_key].update({og_key: slice(start, end)}) - return final_mapping + min_val = intersect.min() + max_val = intersect.max() - def _get_all_append_dim_mappings(self, ds_dims: set): + start_og = np.argwhere(og_val == min_val)[0, 0] + end_og = np.argwhere(og_val == max_val)[0, 0] + 1 - append_dim_mappings = defaultdict(dict) + start_region = min_val + end_region = max_val + 1 - ds_append_dims = ds_dims.intersection(self.append_dims) + final_mapping[u_key].update({og_key: (slice(start_og, end_og), + slice(start_region, end_region))}) - for dim in ds_append_dims: - append_dim_mappings[dim] = self._get_uniform_to_nonuniform_map(dim) + return final_mapping - return ds_append_dims, append_dim_mappings + @dask.delayed + def write_to_file(self, ds_in, lock_name, zarr_path, zarr_group, region, storage_options): + + with Lock(lock_name): + ds_in.to_zarr(zarr_path, + group=zarr_group, + region=region, + compute=True, + # safe_chunks=False, + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer()) def _append_ds_list_to_zarr( self, @@ -561,58 +569,59 @@ def _append_ds_list_to_zarr( self._get_ds_info(ds_list, ed_name) - # ds_append_dims, append_dim_mappings = self._get_all_append_dim_mappings(set(ds_list[0].dims)) - ds_lazy, const_names, encodings = self._construct_lazy_ds_and_var_info(ds_list[0]) - # - # # create zarr file and all associated metadata (this is delayed) - # ds_lazy.to_zarr( - # zarr_path, - # compute=False, - # group=zarr_group, - # encoding=encodings, - # consolidated=None, - # storage_options=storage_options, - # synchronizer=zarr.ThreadSynchronizer(), - # ) - # + + # create zarr file and all associated metadata (this is delayed) + ds_lazy.to_zarr( + zarr_path, + compute=False, + group=zarr_group, + encoding=encodings, + consolidated=None, + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer(), + ) + # collect delayed functions that write each non-constant variable # in ds_list to the zarr store delayed_to_zarr = [] - + # futures = [] ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) for dim in ds_append_dims: + drop_names = [var_name for var_name, var_val in ds_list[0].variables.items() if dim not in var_val.dims] + + drop_names.append(dim) + chunk_mapping = self._get_uniform_to_nonuniform_map(dim) for uniform_ind, non_uniform_dict in chunk_mapping.items(): for ds_list_ind, dim_slice in non_uniform_dict.items(): - print(f"uniform_ind (lock), ds_list_ind, dim_slice = {uniform_ind, ds_list_ind, dim_slice}") - - # ds_drop = ds.drop(const_names) - - # - # region = self._get_region(ind, set(ds_drop.dims)) - # - # # TODO: below is an xarray delayed approach, however, data will be corrupted, - # # we can remove data corruption by implementing a locking scheme - # delayed_to_zarr.append( - # ds_drop.to_zarr( - # zarr_path, - # group=zarr_group, - # region=region, - # compute=False, - # storage_options=storage_options, - # synchronizer=zarr.ThreadSynchronizer(), - # ) - # ) - # - # # compute all delayed writes to the zarr store - # dask.compute(*delayed_to_zarr) - # - # return const_names + ds_drop = ds_list[ds_list_ind].copy().drop(drop_names) + + region = self._get_region(ds_list_ind, set(ds_drop.dims)) + region[dim] = dim_slice[1] + + ds_in = ds_drop.isel({dim: dim_slice[0]}) + + grp_name = zarr_group.replace("-", "_").replace("/", "_").lower() + lock_name = grp_name + "_" + str(dim) + "_" + str(uniform_ind) + + delayed_to_zarr.append(self.write_to_file(ds_in, lock_name, + zarr_path, zarr_group, + region, storage_options)) + + # futures.append(dask.distributed.get_client().submit(self.write_to_file, ds_in, lock_name, + # zarr_path, zarr_group, + # region, storage_options)) + + # compute all delayed writes to the zarr store + dask.compute(*delayed_to_zarr) + # results = dask.distributed.get_client().gather(futures) + + return const_names def _append_const_to_zarr( self, @@ -783,7 +792,7 @@ def combine( """ # TODO: the below line should be uncommented, if blosc issues persist - # blosc.use_threads = False + # blosc.use_threads = False # TODO: Run on each worker # set class variables from input self.sonar_model = sonar_model @@ -801,7 +810,7 @@ def combine( # collect the group Dataset from all eds ds_list = [ed[ed_group] for ed in eds if ed_group in ed.group_paths] - if ds_list and ed_group == "Environment": # necessary because a group may not be present + if ds_list: # necessary because a group may not be present const_names = self._append_ds_list_to_zarr( zarr_path, @@ -811,22 +820,22 @@ def combine( storage_options=storage_options, ) - # self._append_const_to_zarr( - # const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options - # ) - # - # # append all group attributes before combination to zarr store - # self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) - # - # # change filenames numbering to range(len(eds)) - # self._modify_prov_filenames(zarr_path, len_eds=len(eds)) - # - # # TODO: the below line should be uncommented, if blosc issues persist - # # blosc.use_threads = None - # - # # open lazy loaded combined EchoData object - # ed_combined = open_converted( - # zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() - # ) # TODO: is this appropriate for chunks? - # - # return ed_combined + self._append_const_to_zarr( + const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options + ) + + # append all group attributes before combination to zarr store + self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) + + # change filenames numbering to range(len(eds)) + self._modify_prov_filenames(zarr_path, len_eds=len(eds)) + + # TODO: the below line should be uncommented, if blosc issues persist + # blosc.use_threads = None # TODO: Run on each worker + + # open lazy loaded combined EchoData object + ed_combined = open_converted( + zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + ) # TODO: is this appropriate for chunks? + + return ed_combined diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index 741ad4dea..9ca8bd39b 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -12,6 +12,8 @@ from echopype.qc import exist_reversed_time from echopype.core import SONAR_MODELS +from dask.distributed import Client + import tempfile @@ -181,6 +183,9 @@ def union_attrs(datasets: List[xr.Dataset]) -> Dict[str, Any]: def test_ping_time_reversal(ek60_reversed_ping_time_test_data): + + client = Client() + eds = [ echopype.open_raw(file, "EK60") for file in ek60_reversed_ping_time_test_data From 531f5adb5e019cc3f2dff620b8e07153f0311a29 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 29 Sep 2022 15:00:38 -0700 Subject: [PATCH 56/89] remove append dimensions from dataset that will be written and add client as an input to combine_echodata --- echopype/echodata/combine.py | 16 ++++++++++++++++ echopype/tests/echodata/test_echodata_combine.py | 2 -- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 78afdf9a5..66d6cb820 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -10,6 +10,8 @@ from .echodata import EchoData from .zarr_combine import ZarrCombine +from dask.distributed import Client + logger = _init_logger(__name__) @@ -295,6 +297,7 @@ def combine_echodata( echodatas: List[EchoData] = None, zarr_path: Optional[str] = None, storage_options: Optional[dict] = {}, + client: Client = None ) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. @@ -310,6 +313,8 @@ def combine_echodata( storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) + client: dask.distributed.Client + TODO: document this! Returns ------- @@ -394,6 +399,17 @@ def combine_echodata( sonar_model, echodata_filenames = check_echodatas_input(echodatas) + # TODO: get client as input spit out client.dashboard_link + if client is None: + client = Client() # create local cluster + print(f"Client dashboard link: {client.dashboard_link}") + else: + + if isinstance(client, Client): + print(f"Client dashboard link: {client.dashboard_link}") + else: + raise TypeError("The input client is not of type dask.distributed.Client!") + # initiate ZarrCombine object comb = ZarrCombine() diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index 9ca8bd39b..d5f8e2095 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -184,8 +184,6 @@ def union_attrs(datasets: List[xr.Dataset]) -> Dict[str, Any]: def test_ping_time_reversal(ek60_reversed_ping_time_test_data): - client = Client() - eds = [ echopype.open_raw(file, "EK60") for file in ek60_reversed_ping_time_test_data From 940cc21ee07f696dc099aa79c8931f629e0d3b67 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 10:16:51 -0700 Subject: [PATCH 57/89] create class variable max_append_chunk_size that sets an upperbound on the chunk size of an append dimension --- echopype/echodata/zarr_combine.py | 64 ++++++++++++++++++------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 38adf1bf0..d957ec72f 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,13 +1,14 @@ from collections import defaultdict +from itertools import islice from typing import Dict, Hashable, List, Optional, Set, Tuple import dask import dask.array -from dask.distributed import Lock import numpy as np import pandas as pd import xarray as xr import zarr +from dask.distributed import Lock from ..utils.coding import COMPRESSION_SETTINGS from ..utils.io import get_zarr_compression @@ -15,9 +16,6 @@ from .api import open_converted from .echodata import EchoData -from itertools import islice -from numcodecs import blosc - class ZarrCombine: """ @@ -43,6 +41,10 @@ def __init__(self): # The sonar_model for the new combined EchoData object self.sonar_model = None + # The maximum chunk length allowed for every append dimension + # TODO: in the future we should investigate this value + self.max_append_chunk_size = 1000 + def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> None: """ A minimal check that the first time value of each Dataset is less than @@ -297,8 +299,12 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), # Create the chunk shape of the variable # TODO: investigate if this is the best chunking - chnk_shape = [self.dims_max[dim] for dim in dims] - # chnk_shape = [self.dims_min[dim] if dim in self.append_dims else self.dims_max[dim] for dim in dims] + chnk_shape = [ + min(self.dims_max[dim], self.max_append_chunk_size) + if dim in self.append_dims + else self.dims_max[dim] + for dim in dims + ] temp_arr = dask.array.zeros(shape=shape, dtype=dtype, chunks=chnk_shape) @@ -467,8 +473,7 @@ def _get_chunk_dicts(self, dim): og_chunk_dict = dict(zip(range(len(og_chunk)), og_chunk)) - zarr_chunk_size = self.dims_max[dim] # TODO: investigate if this is the best chunking - # zarr_chunk_size = self.dims_min[dim] + zarr_chunk_size = min(self.dims_max[dim], self.max_append_chunk_size) uniform_chunk = self._uniform_chunks_as_np_array(x_no_chunk, zarr_chunk_size) @@ -514,8 +519,9 @@ def _get_uniform_to_nonuniform_map(self, dim): start_region = min_val end_region = max_val + 1 - final_mapping[u_key].update({og_key: (slice(start_og, end_og), - slice(start_region, end_region))}) + final_mapping[u_key].update( + {og_key: (slice(start_og, end_og), slice(start_region, end_region))} + ) return final_mapping @@ -523,13 +529,15 @@ def _get_uniform_to_nonuniform_map(self, dim): def write_to_file(self, ds_in, lock_name, zarr_path, zarr_group, region, storage_options): with Lock(lock_name): - ds_in.to_zarr(zarr_path, - group=zarr_group, - region=region, - compute=True, - # safe_chunks=False, - storage_options=storage_options, - synchronizer=zarr.ThreadSynchronizer()) + ds_in.to_zarr( + zarr_path, + group=zarr_group, + region=region, + compute=True, + # safe_chunks=False, + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer(), + ) def _append_ds_list_to_zarr( self, @@ -589,7 +597,11 @@ def _append_ds_list_to_zarr( ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) for dim in ds_append_dims: - drop_names = [var_name for var_name, var_val in ds_list[0].variables.items() if dim not in var_val.dims] + drop_names = [ + var_name + for var_name, var_val in ds_list[0].variables.items() + if dim not in var_val.dims + ] drop_names.append(dim) @@ -599,7 +611,8 @@ def _append_ds_list_to_zarr( for ds_list_ind, dim_slice in non_uniform_dict.items(): - ds_drop = ds_list[ds_list_ind].copy().drop(drop_names) + # ds_drop = ds_list[ds_list_ind].copy().drop(drop_names) + ds_drop = ds_list[ds_list_ind].drop(drop_names) region = self._get_region(ds_list_ind, set(ds_drop.dims)) region[dim] = dim_slice[1] @@ -609,17 +622,14 @@ def _append_ds_list_to_zarr( grp_name = zarr_group.replace("-", "_").replace("/", "_").lower() lock_name = grp_name + "_" + str(dim) + "_" + str(uniform_ind) - delayed_to_zarr.append(self.write_to_file(ds_in, lock_name, - zarr_path, zarr_group, - region, storage_options)) - - # futures.append(dask.distributed.get_client().submit(self.write_to_file, ds_in, lock_name, - # zarr_path, zarr_group, - # region, storage_options)) + delayed_to_zarr.append( + self.write_to_file( + ds_in, lock_name, zarr_path, zarr_group, region, storage_options + ) + ) # compute all delayed writes to the zarr store dask.compute(*delayed_to_zarr) - # results = dask.distributed.get_client().gather(futures) return const_names From dc1abf7e209d1cd91d48a80e006ca4145ec756db Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 12:03:55 -0700 Subject: [PATCH 58/89] start documenting chunk mapping functions --- echopype/echodata/zarr_combine.py | 57 ++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 15d9cbe07..744f431ba 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -462,6 +462,7 @@ def _uniform_chunks_as_np_array(array, chunk_size): """ Chunks """ + # TODO: finish documentation! array_iter = iter(array) @@ -471,62 +472,100 @@ def _uniform_chunks_as_np_array(array, chunk_size): # convert each element in the iterable to a numpy array return list(map(np.array, chunks_iter)) - def _get_chunk_dicts(self, dim): + def _get_chunk_dicts(self, dim: str) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]: + """ + Obtains dictionaries specifying the chunk index and the + indices (with respect to the full combined length) that + are contained in that chunk, for both the uniform and + non-uniform chunks. - csum_og_chunks = np.array(list(self.dims_csum[dim].values())) + Parameters + ---------- + dim: str + The name of the dimension to create the chunk dicts for + + Returns + ------- + og_chunk_dict: Dict[int, np.ndarray] + The chunk dictionary corresponding to the original + non-uniform chunks + uniform_chunk_dict: Dict[int, np.ndarray] + The chunk dictionary corresponding to the uniform chunks + """ + # an array specifying the indices of the final combined array x_no_chunk = np.arange(self.dims_sum[dim], dtype=np.int64) + # get end indices for the non-uniform chunks + csum_og_chunks = np.array(list(self.dims_csum[dim].values())) + + # obtain the indices of the final combined array that are in each non-uniform chunk og_chunk = np.split(x_no_chunk, csum_og_chunks) + # construct a mapping between the non-uniform chunk and the indices og_chunk_dict = dict(zip(range(len(og_chunk)), og_chunk)) + # obtain the uniform chunk size zarr_chunk_size = min(self.dims_max[dim], self.max_append_chunk_size) + # get the indices of the final combined array that are in each uniform chunk uniform_chunk = self._uniform_chunks_as_np_array(x_no_chunk, zarr_chunk_size) + # construct a mapping between the uniform chunk and the indices uniform_chunk_dict = dict(zip(range(len(uniform_chunk)), uniform_chunk)) return og_chunk_dict, uniform_chunk_dict - def _get_uniform_to_nonuniform_map(self, dim): + def _get_uniform_to_nonuniform_map(self, dim: str) -> Dict[int, dict]: """ Constructs a uniform to non-uniform mapping of chunks for a dimension ``dim``. + Parameters + ---------- + dim: str + The name of the dimension to create a mapping for Returns ------- final_mapping: Dict[int, dict] Uniform to non-uniform mapping where the keys are the chunk index in the uniform chunk and the values - are dictionaries with keys corresponding to the index - of the non-uniform chunk and the values are a tuple of - ``slice`` objects for the non-uniform chunk values and - region chunk values, respectively. - + are dictionaries. The value dictionaries have keys + which correspond to the index of the non-uniform chunk + and the values are a tuple with the first element being + a ``slice`` object for the non-uniform chunk values and + the second element is a ``slice`` object for the region + chunk values. """ + # obtains dictionaries specifying the indices contained in each chunk og_chunk_dict, uniform_chunk_dict = self._get_chunk_dicts(dim) + # construct the uniform to non-uniform mapping final_mapping = defaultdict(dict) for u_key, u_val in uniform_chunk_dict.items(): for og_key, og_val in og_chunk_dict.items(): + # find the intersection of uniform and non-uniform chunk indices intersect = np.intersect1d(u_val, og_val) if len(intersect) > 0: + # get min and max indices in intersect min_val = intersect.min() max_val = intersect.max() + # determine the start and end index for the og_val start_og = np.argwhere(og_val == min_val)[0, 0] end_og = np.argwhere(og_val == max_val)[0, 0] + 1 + # determine the start and end index for the region start_region = min_val end_region = max_val + 1 + # add non-uniform specific information to final mapping final_mapping[u_key].update( {og_key: (slice(start_og, end_og), slice(start_region, end_region))} ) @@ -547,6 +586,8 @@ def write_to_file(self, ds_in, lock_name, zarr_path, zarr_group, region, storage synchronizer=zarr.ThreadSynchronizer(), ) + # TODO: put a check to make sure that the chunk has been written + def _append_ds_list_to_zarr( self, zarr_path: str, From 8d90363f927d6f2652f84582108e0f35a2326ed8 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 13:28:13 -0700 Subject: [PATCH 59/89] finish documenting current function that construct the uniform to non-uniform mapping --- echopype/echodata/zarr_combine.py | 78 ++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 744f431ba..2c3e52ac2 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -458,12 +458,31 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: return region @staticmethod - def _uniform_chunks_as_np_array(array, chunk_size): + def _uniform_chunks_as_np_array(array: np.ndarray, chunk_size: int) -> List[np.ndarray]: """ - Chunks + Split ``array`` into chunks with size ``chunk_size``, where the + last element in the split has length ``len(array) % chunk_size``. + + Parameters + ---------- + array: np.ndarray + Array to split up into chunks + chunk_size: int + The maximum chunk size + + Returns + ------- + List[np.ndarray] + The chunked input ``array`` + + Example + ------- + >>> arr = np.array([1, 2, 3, 4, 5]) + >>> _uniform_chunks_as_np_array(arr, 2) + [array([1, 2]), array([3, 4]), array([5])] """ - # TODO: finish documentation! + # get array iterable array_iter = iter(array) # construct chunks as an iterable of lists @@ -573,7 +592,38 @@ def _get_uniform_to_nonuniform_map(self, dim: str) -> Dict[int, dict]: return final_mapping @dask.delayed - def write_to_file(self, ds_in, lock_name, zarr_path, zarr_group, region, storage_options): + def write_to_file( + self, + ds_in: xr.Dataset, + lock_name: str, + zarr_path: str, + zarr_group: str, + region: Dict[str, slice], + storage_options: Optional[dict], + ) -> None: + """ + Constructs a delayed write of ``ds_in`` to the appropriate zarr + store position using a unique lock name. + + Parameters + ---------- + ds_in: xr.Dataset + Dataset subset with only one append dimension containing + variables with the append dimension in their dimensions + lock_name: str + A unique lock name for the chunk being written to + zarr_path: str + The full path of the final combined zarr store + zarr_group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + region: Dict[str, slice] + Keys set as the dimension name and values as + the slice of the zarr portion to write to + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + """ with Lock(lock_name): ds_in.to_zarr( @@ -639,38 +689,44 @@ def _append_ds_list_to_zarr( synchronizer=zarr.ThreadSynchronizer(), ) + # get all dimensions in ds that are append dimensions + ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) + # collect delayed functions that write each non-constant variable # in ds_list to the zarr store delayed_to_zarr = [] - # futures = [] - ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) for dim in ds_append_dims: + # collect all variables/coordinates that should be dropped drop_names = [ var_name for var_name, var_val in ds_list[0].variables.items() if dim not in var_val.dims ] - drop_names.append(dim) - chunk_mapping = self._get_uniform_to_nonuniform_map(dim) + chunk_mapping = self._get_uniform_to_nonuniform_map(str(dim)) for uniform_ind, non_uniform_dict in chunk_mapping.items(): - for ds_list_ind, dim_slice in non_uniform_dict.items(): - # ds_drop = ds_list[ds_list_ind].copy().drop(drop_names) + # get ds containing only variables who have dim in their dims ds_drop = ds_list[ds_list_ind].drop(drop_names) + # get xarray region for all dims, except dim region = self._get_region(ds_list_ind, set(ds_drop.dims)) - region[dim] = dim_slice[1] + # get xarray region for dim + region[str(dim)] = dim_slice[1] + + # select subset of dim corresponding to the region ds_in = ds_drop.isel({dim: dim_slice[0]}) + # construct the unique lock name for the uniform chunk grp_name = zarr_group.replace("-", "_").replace("/", "_").lower() lock_name = grp_name + "_" + str(dim) + "_" + str(uniform_ind) + # write the subset of each Dataset to a zarr file delayed_to_zarr.append( self.write_to_file( ds_in, lock_name, zarr_path, zarr_group, region, storage_options From 61c5265a6c148534c6f178a6bd6f49c9abc02871 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 16:35:52 -0700 Subject: [PATCH 60/89] add function that writes all append dimensions and finish documenting client input for combine_echodata --- echopype/echodata/combine.py | 17 ++++----- echopype/echodata/zarr_combine.py | 60 ++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 0cc730af4..5dda41dca 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -3,6 +3,7 @@ from warnings import warn import xarray as xr +from dask.distributed import Client from ..qc import coerce_increasing_time, exist_reversed_time from ..utils.io import validate_output_path @@ -10,8 +11,6 @@ from .echodata import EchoData from .zarr_combine import ZarrCombine -from dask.distributed import Client - logger = _init_logger(__name__) @@ -298,12 +297,12 @@ def combine_echodata( echodatas: List[EchoData] = None, zarr_path: Optional[str] = None, storage_options: Optional[dict] = {}, - client: Client = None + client: Optional[Client] = None, ) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. This is accomplished by writing each element of ``echodatas`` in parallel - (using dask) to the zarr store specified by ``zarr_path``. + (using Dask) to the zarr store specified by ``zarr_path``. Parameters ---------- @@ -314,8 +313,8 @@ def combine_echodata( storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) - client: dask.distributed.Client - TODO: document this! + client: Optional[dask.distributed.Client] + An initialized Dask distributed client Returns ------- @@ -371,6 +370,8 @@ def combine_echodata( a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. * If no ``zarr_path`` is provided, the combined zarr file will be ``'temp_echopype_output/combined_echodatas.zarr'`` under the current working directory. + * If no ``client`` is provided, then a client with a local scheduler will be used. + * For each run of this function, we print our the client dashboard link. Examples -------- @@ -392,9 +393,9 @@ def combine_echodata( """ # TODO: change PR #297 reference to a link in our documentation - # TODO: get client as input spit out client.dashboard_link + # check the client input and print dashboard link if client is None: - client = Client() # create local cluster + client = Client() # create client with local scheduler print(f"Client dashboard link: {client.dashboard_link}") else: diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 2c3e52ac2..5146c2f2c 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -249,7 +249,6 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> Non self.dims_sum = self.dims_df.sum(axis=0).to_dict() self.dims_csum = self.dims_df.cumsum(axis=0).to_dict() self.dims_max = self.dims_df.max(axis=0).to_dict() - self.dims_min = self.dims_df.min(axis=0).to_dict() # format ed_name appropriately ed_name = ed_name.replace("-", "_").replace("/", "_").lower() @@ -711,7 +710,7 @@ def _append_ds_list_to_zarr( for ds_list_ind, dim_slice in non_uniform_dict.items(): # get ds containing only variables who have dim in their dims - ds_drop = ds_list[ds_list_ind].drop(drop_names) + ds_drop = ds_list[ds_list_ind].drop_vars(drop_names) # get xarray region for all dims, except dim region = self._get_region(ds_list_ind, set(ds_drop.dims)) @@ -784,6 +783,55 @@ def _append_const_to_zarr( zarr_path, group=zarr_group, mode="a", storage_options=storage_options ) + def _write_append_dims( + self, + ds_list: List[xr.Dataset], + zarr_path: str, + zarr_group: str, + storage_options: Optional[dict], + ) -> None: + """ + Sequentially writes each Dataset's append dimension in ``ds_list`` to + the appropriate final combined zarr store. + + Parameters + ---------- + ds_list: List[xr.Dataset] + The Datasets that will be combined + zarr_path: str + The full path of the final combined zarr store + zarr_group: str + The name of the group of the zarr store + corresponding to the Datasets in ``ds_list`` + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) + """ + + # get all dimensions in ds that are append dimensions + ds_append_dims = set(ds_list[0].dims).intersection(self.append_dims) + + for dim in ds_append_dims: + + for count, ds in enumerate(ds_list): + + # obtain the appropriate region to write to + if count == 0: + region = {str(dim): slice(0, self.dims_csum[dim][count])} + else: + region = { + str(dim): slice(self.dims_csum[dim][count - 1], self.dims_csum[dim][count]) + } + + ds[[dim]].to_zarr( + zarr_path, + group=zarr_group, + region=region, + compute=True, + storage_options=storage_options, + synchronizer=zarr.ThreadSynchronizer(), + ) + def _append_provenance_attr_vars( self, zarr_path: str, storage_options: Optional[dict] = {} ) -> None: @@ -906,9 +954,6 @@ def combine( ``Provenance`` group. """ - # TODO: the below line should be uncommented, if blosc issues persist - # blosc.use_threads = False # TODO: Run on each worker - # set class variables from input self.sonar_model = sonar_model self.group_attrs["echodata_filename"] = echodata_filenames @@ -939,15 +984,14 @@ def combine( const_names, ds_list, zarr_path, grp_info["ep_group"], storage_options ) + self._write_append_dims(ds_list, zarr_path, grp_info["ep_group"], storage_options) + # append all group attributes before combination to zarr store self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) # change filenames numbering to range(len(eds)) self._modify_prov_filenames(zarr_path, len_eds=len(eds)) - # TODO: the below line should be uncommented, if blosc issues persist - # blosc.use_threads = None # TODO: Run on each worker - # open lazy loaded combined EchoData object ed_combined = open_converted( zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() From b9f2b28279a410e2817ddf2202be68a5422dc4d5 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 17:06:50 -0700 Subject: [PATCH 61/89] remove unnecessary test_cluster_dump folder --- .../test_cluster_dump/test_zarr_combine.yaml | 873 ------------------ 1 file changed, 873 deletions(-) delete mode 100644 echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml diff --git a/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml b/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml deleted file mode 100644 index 7a89549a4..000000000 --- a/echopype/tests/echodata/test_cluster_dump/test_zarr_combine.yaml +++ /dev/null @@ -1,873 +0,0 @@ -scheduler: - address: tcp://127.0.0.1:50971 - clients: - Client-854fe396-3b63-11ed-b660-7aef93c2516e: - client_key: Client-854fe396-3b63-11ed-b660-7aef93c2516e - last_seen: 1663953414.2823439 - wants_what: [] - fire-and-forget: - client_key: fire-and-forget - last_seen: 1663953414.2209349 - wants_what: [] - events: - Client-854fe396-3b63-11ed-b660-7aef93c2516e: - - - 1663953414.282331 - - action: add-client - client: Client-854fe396-3b63-11ed-b660-7aef93c2516e - all: - - - 1663953414.261132 - - action: add-worker - worker: tcp://127.0.0.1:50972 - - - 1663953414.262537 - - action: add-worker - worker: tcp://127.0.0.1:50974 - - - 1663953414.282331 - - action: add-client - client: Client-854fe396-3b63-11ed-b660-7aef93c2516e - stealing: [] - tcp://127.0.0.1:50972: - - - 1663953414.261111 - - action: add-worker - - - 1663953414.265065 - - action: worker-status-change - prev-status: init - status: running - tcp://127.0.0.1:50974: - - - 1663953414.262531 - - action: add-worker - - - 1663953414.2653031 - - action: worker-status-change - prev-status: init - status: running - extensions: - amm: - events: - locks: - memory_sampler: - multi_locks: - publish: - pubsub: - queues: - replay-tasks: - semaphores: - shuffle: - stealing: - cost_multipliers: - - 1.0 - - 1.03125 - - 1.0625 - - 1.125 - - 1.25 - - 1.5 - - 2 - - 3 - - 5 - - 9 - - 17 - - 33 - - 65 - - 129 - - 257 - count: 0 - in_flight: {} - in_flight_occupancy: {} - key_stealable: {} - scheduler: - address: tcp://127.0.0.1:50971 - clients: - Client-854fe396-3b63-11ed-b660-7aef93c2516e: - fire-and-forget: - events: - Client-854fe396-3b63-11ed-b660-7aef93c2516e: - - - 1663953414.282331 - - action: add-client - client: Client-854fe396-3b63-11ed-b660-7aef93c2516e - all: - - - 1663953414.261132 - - action: add-worker - worker: tcp://127.0.0.1:50972 - - - 1663953414.262537 - - action: add-worker - worker: tcp://127.0.0.1:50974 - - - 1663953414.282331 - - action: add-client - client: Client-854fe396-3b63-11ed-b660-7aef93c2516e - stealing: [] - tcp://127.0.0.1:50972: - - - 1663953414.261111 - - action: add-worker - - - 1663953414.265065 - - action: worker-status-change - prev-status: init - status: running - tcp://127.0.0.1:50974: - - - 1663953414.262531 - - action: add-worker - - - 1663953414.2653031 - - action: worker-status-change - prev-status: init - status: running - extensions: '{''locks'': , - ''multi_locks'': , - ''publish'': , - ''replay-tasks'': , ''queues'': , ''variables'': , ''pubsub'': , ''semaphores'': , ''events'': , ''amm'': , ''memory_sampler'': , ''shuffle'': , ''stealing'': }' - id: Scheduler-6409852a-5ae7-46a1-956a-da3f1329a529 - log: [] - memory: - managed: 0 - managed_in_memory: 0 - managed_spilled: 0 - optimistic: 388939776 - process: 388939776 - unmanaged: 388939776 - unmanaged_old: 388939776 - unmanaged_recent: 0 - services: - dashboard: 50970 - started: 1663953414.037181 - status: running - task_groups: {} - tasks: {} - thread_id: 8633697792 - transition_counter: 0 - transition_log: [] - type: Scheduler - workers: - tcp://127.0.0.1:50972: '' - tcp://127.0.0.1:50974: '' - stealable: - tcp://127.0.0.1:50972: - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - tcp://127.0.0.1:50974: - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - stealable_all: - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - - [] - variables: - id: Scheduler-6409852a-5ae7-46a1-956a-da3f1329a529 - log: [] - memory: - managed: 0 - managed_in_memory: 0 - managed_spilled: 0 - optimistic: 388939776 - process: 388939776 - unmanaged: 388939776 - unmanaged_old: 388939776 - unmanaged_recent: 0 - services: - dashboard: 50970 - started: 1663953414.037181 - status: running - task_groups: {} - tasks: {} - thread_id: 8633697792 - transition_counter: 0 - transition_log: [] - type: Scheduler - workers: - tcp://127.0.0.1:50972: - actors: [] - address: tcp://127.0.0.1:50972 - bandwidth: 100000000 - executing: {} - extra: {} - has_what: [] - host: 127.0.0.1 - last_seen: 1663953414.261237 - local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T/dask-worker-space/worker-zep12oa1 - long_running: [] - memory: - managed: 0 - managed_in_memory: 0 - managed_spilled: 0 - optimistic: 194433024 - process: 194433024 - unmanaged: 194433024 - unmanaged_old: 194433024 - unmanaged_recent: 0 - memory_limit: 17179869184 - metrics: - bandwidth: - total: 100000000 - types: {} - workers: {} - cpu: 0.0 - event_loop_interval: 0.5 - executing: 0 - in_flight: 0 - in_memory: 0 - memory: 194433024 - num_fds: 25 - read_bytes: 0.0 - read_bytes_disk: 0.0 - ready: 0 - spilled_nbytes: - disk: 0 - memory: 0 - time: 1663953414.226922 - write_bytes: 0.0 - write_bytes_disk: 0.0 - name: 0 - nanny: null - nbytes: 0 - nthreads: 1 - occupancy: 0 - pid: 95840 - processing: {} - resources: {} - server_id: Worker-a9534fbd-86d5-428d-a260-ede2c284bea8 - services: - dashboard: 50973 - status: '' - time_delay: 0.022827863693237305 - used_resources: {} - tcp://127.0.0.1:50974: - actors: [] - address: tcp://127.0.0.1:50974 - bandwidth: 100000000 - executing: {} - extra: {} - has_what: [] - host: 127.0.0.1 - last_seen: 1663953414.2626 - local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T/dask-worker-space/worker-bqrcff5y - long_running: [] - memory: - managed: 0 - managed_in_memory: 0 - managed_spilled: 0 - optimistic: 194506752 - process: 194506752 - unmanaged: 194506752 - unmanaged_old: 194506752 - unmanaged_recent: 0 - memory_limit: 17179869184 - metrics: - bandwidth: - total: 100000000 - types: {} - workers: {} - cpu: 0.0 - event_loop_interval: 0.5 - executing: 0 - in_flight: 0 - in_memory: 0 - memory: 194506752 - num_fds: 26 - read_bytes: 0.0 - read_bytes_disk: 0.0 - ready: 0 - spilled_nbytes: - disk: 0 - memory: 0 - time: 1663953414.229425 - write_bytes: 0.0 - write_bytes_disk: 0.0 - name: 1 - nanny: null - nbytes: 0 - nthreads: 2 - occupancy: 0 - pid: 95840 - processing: {} - resources: {} - server_id: Worker-44330199-ed16-48c6-b2c0-3554d0acd9c0 - services: - dashboard: 50975 - status: '' - time_delay: 0.003255128860473633 - used_resources: {} -versions: - host: - LANG: None - LC_ALL: None - OS: Darwin - OS-release: 21.5.0 - byteorder: little - machine: x86_64 - processor: i386 - python: 3.9.12.final.0 - python-bits: 64 - packages: - cloudpickle: 2.1.0 - dask: 2022.8.0 - distributed: 2022.8.0 - lz4: 4.0.0 - msgpack: 1.0.4 - numpy: 1.23.1 - pandas: 1.4.3 - python: 3.9.12.final.0 - toolz: 0.12.0 - tornado: '6.1' -workers: - tcp://127.0.0.1:50972: - address: tcp://127.0.0.1:50972 - busy_workers: [] - config: - array: - chunk-size: 128MiB - rechunk-threshold: 4 - slicing: - split-large-chunks: null - svg: - size: 120 - dataframe: - parquet: - metadata-task-size-local: 512 - metadata-task-size-remote: 16 - shuffle-compression: null - distributed: - adaptive: - interval: 1s - maximum: .inf - minimum: 0 - target-duration: 5s - wait-count: 3 - admin: - event-loop: tornado - log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - log-length: 10000 - max-error-length: 10000 - pdb-on-err: false - system-monitor: - disk: true - interval: 500ms - tick: - cycle: 1s - interval: 500 ms - limit: 3s - client: - heartbeat: 5s - preload: [] - preload-argv: [] - scheduler-info-interval: 2s - security-loader: null - comm: - compression: auto - default-scheme: tcp - offload: 10MiB - recent-messages-log-length: 0 - require-encryption: null - retry: - count: 0 - delay: - max: 20s - min: 1s - shard: 64MiB - socket-backlog: 2048 - tcp: - backend: tornado - timeouts: - connect: 5s - tcp: 30s - tls: - ca-file: null - ciphers: null - client: - cert: null - key: null - max-version: null - min-version: 1.2 - scheduler: - cert: null - key: null - worker: - cert: null - key: null - ucx: - create-cuda-context: null - cuda-copy: null - infiniband: null - nvlink: null - rdmacm: null - tcp: null - websockets: - shard: 8MiB - zstd: - level: 3 - threads: 0 - dashboard: - export-tool: false - graph-max-items: 5000 - link: '{scheme}://{host}:{port}/status' - prometheus: - namespace: dask - deploy: - cluster-repr-interval: 500ms - lost-worker-timeout: 15s - diagnostics: - computations: - ignore-modules: - - distributed - - dask - - xarray - - cudf - - cuml - - prefect - - xgboost - max-history: 100 - erred-tasks: - max-history: 100 - nvml: true - nanny: - environ: - MALLOC_TRIM_THRESHOLD_: 65536 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - preload: [] - preload-argv: [] - rmm: - pool-size: null - scheduler: - active-memory-manager: - interval: 2s - policies: - - class: distributed.active_memory_manager.ReduceReplicas - start: false - allowed-failures: 3 - allowed-imports: - - dask - - distributed - bandwidth: 100000000 - blocked-handlers: [] - contact-address: null - dashboard: - bokeh-application: - allow_websocket_origin: - - '*' - check_unused_sessions_milliseconds: 500 - keep_alive_milliseconds: 500 - status: - task-stream-length: 1000 - tasks: - task-stream-length: 100000 - tls: - ca-file: null - cert: null - key: null - default-data-size: 1kiB - default-task-durations: - rechunk-split: 1us - split-shuffle: 1us - events-cleanup-delay: 1h - events-log-length: 100000 - http: - routes: - - distributed.http.scheduler.prometheus - - distributed.http.scheduler.info - - distributed.http.scheduler.json - - distributed.http.health - - distributed.http.proxy - - distributed.http.statistics - idle-timeout: null - locks: - lease-timeout: 30s - lease-validation-interval: 10s - pickle: true - preload: [] - preload-argv: [] - transition-log-length: 100000 - unknown-task-duration: 500ms - validate: false - work-stealing: true - work-stealing-interval: 100ms - worker-ttl: 5 minutes - version: 2 - worker: - blocked-handlers: [] - connections: - incoming: 10 - outgoing: 50 - daemon: true - http: - routes: - - distributed.http.worker.prometheus - - distributed.http.health - - distributed.http.statistics - lifetime: - duration: null - restart: false - stagger: 0 seconds - memory: - max-spill: false - monitor-interval: 100ms - pause: 0.8 - rebalance: - measure: optimistic - recipient-max: 0.6 - sender-min: 0.3 - sender-recipient-gap: 0.1 - recent-to-old-time: 30s - spill: 0.7 - target: 0.6 - terminate: 0.95 - multiprocessing-method: spawn - preload: [] - preload-argv: [] - profile: - cycle: 1000ms - enabled: false - interval: 10ms - low-level: false - resources: {} - use-file-locking: true - validate: false - local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T - optimization: - fuse: - active: null - ave-width: 1 - max-depth-new-edges: null - max-height: .inf - max-width: null - rename-keys: true - subgraphs: null - scheduler: dask.distributed - shuffle: tasks - temporary-directory: null - tokenize: - ensure-deterministic: false - visualization: - engine: null - constrained: [] - data: {} - data_needed: {} - executing: [] - id: Worker-a9534fbd-86d5-428d-a260-ede2c284bea8 - in_flight_tasks: [] - in_flight_workers: {} - incoming_transfer_log: [] - log: [] - logs: [] - long_running: [] - max_spill: false - memory_limit: 17179869184 - memory_monitor_interval: 0.1 - memory_pause_fraction: 0.8 - memory_spill_fraction: 0.7 - memory_target_fraction: 0.6 - nthreads: 1 - outgoing_transfer_log: [] - ready: [] - running: true - scheduler: tcp://127.0.0.1:50971 - status: '' - stimulus_log: [] - tasks: {} - thread_id: 8633697792 - transition_counter: 0 - type: Worker - tcp://127.0.0.1:50974: - address: tcp://127.0.0.1:50974 - busy_workers: [] - config: - array: - chunk-size: 128MiB - rechunk-threshold: 4 - slicing: - split-large-chunks: null - svg: - size: 120 - dataframe: - parquet: - metadata-task-size-local: 512 - metadata-task-size-remote: 16 - shuffle-compression: null - distributed: - adaptive: - interval: 1s - maximum: .inf - minimum: 0 - target-duration: 5s - wait-count: 3 - admin: - event-loop: tornado - log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - log-length: 10000 - max-error-length: 10000 - pdb-on-err: false - system-monitor: - disk: true - interval: 500ms - tick: - cycle: 1s - interval: 500 ms - limit: 3s - client: - heartbeat: 5s - preload: [] - preload-argv: [] - scheduler-info-interval: 2s - security-loader: null - comm: - compression: auto - default-scheme: tcp - offload: 10MiB - recent-messages-log-length: 0 - require-encryption: null - retry: - count: 0 - delay: - max: 20s - min: 1s - shard: 64MiB - socket-backlog: 2048 - tcp: - backend: tornado - timeouts: - connect: 5s - tcp: 30s - tls: - ca-file: null - ciphers: null - client: - cert: null - key: null - max-version: null - min-version: 1.2 - scheduler: - cert: null - key: null - worker: - cert: null - key: null - ucx: - create-cuda-context: null - cuda-copy: null - infiniband: null - nvlink: null - rdmacm: null - tcp: null - websockets: - shard: 8MiB - zstd: - level: 3 - threads: 0 - dashboard: - export-tool: false - graph-max-items: 5000 - link: '{scheme}://{host}:{port}/status' - prometheus: - namespace: dask - deploy: - cluster-repr-interval: 500ms - lost-worker-timeout: 15s - diagnostics: - computations: - ignore-modules: - - distributed - - dask - - xarray - - cudf - - cuml - - prefect - - xgboost - max-history: 100 - erred-tasks: - max-history: 100 - nvml: true - nanny: - environ: - MALLOC_TRIM_THRESHOLD_: 65536 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - preload: [] - preload-argv: [] - rmm: - pool-size: null - scheduler: - active-memory-manager: - interval: 2s - policies: - - class: distributed.active_memory_manager.ReduceReplicas - start: false - allowed-failures: 3 - allowed-imports: - - dask - - distributed - bandwidth: 100000000 - blocked-handlers: [] - contact-address: null - dashboard: - bokeh-application: - allow_websocket_origin: - - '*' - check_unused_sessions_milliseconds: 500 - keep_alive_milliseconds: 500 - status: - task-stream-length: 1000 - tasks: - task-stream-length: 100000 - tls: - ca-file: null - cert: null - key: null - default-data-size: 1kiB - default-task-durations: - rechunk-split: 1us - split-shuffle: 1us - events-cleanup-delay: 1h - events-log-length: 100000 - http: - routes: - - distributed.http.scheduler.prometheus - - distributed.http.scheduler.info - - distributed.http.scheduler.json - - distributed.http.health - - distributed.http.proxy - - distributed.http.statistics - idle-timeout: null - locks: - lease-timeout: 30s - lease-validation-interval: 10s - pickle: true - preload: [] - preload-argv: [] - transition-log-length: 100000 - unknown-task-duration: 500ms - validate: false - work-stealing: true - work-stealing-interval: 100ms - worker-ttl: 5 minutes - version: 2 - worker: - blocked-handlers: [] - connections: - incoming: 10 - outgoing: 50 - daemon: true - http: - routes: - - distributed.http.worker.prometheus - - distributed.http.health - - distributed.http.statistics - lifetime: - duration: null - restart: false - stagger: 0 seconds - memory: - max-spill: false - monitor-interval: 100ms - pause: 0.8 - rebalance: - measure: optimistic - recipient-max: 0.6 - sender-min: 0.3 - sender-recipient-gap: 0.1 - recent-to-old-time: 30s - spill: 0.7 - target: 0.6 - terminate: 0.95 - multiprocessing-method: spawn - preload: [] - preload-argv: [] - profile: - cycle: 1000ms - enabled: false - interval: 10ms - low-level: false - resources: {} - use-file-locking: true - validate: false - local_directory: /var/folders/68/bd5dqh4j3zgbwmhw2_9v4g180000gn/T - optimization: - fuse: - active: null - ave-width: 1 - max-depth-new-edges: null - max-height: .inf - max-width: null - rename-keys: true - subgraphs: null - scheduler: dask.distributed - shuffle: tasks - temporary-directory: null - tokenize: - ensure-deterministic: false - visualization: - engine: null - constrained: [] - data: {} - data_needed: {} - executing: [] - id: Worker-44330199-ed16-48c6-b2c0-3554d0acd9c0 - in_flight_tasks: [] - in_flight_workers: {} - incoming_transfer_log: [] - log: [] - logs: [] - long_running: [] - max_spill: false - memory_limit: 17179869184 - memory_monitor_interval: 0.1 - memory_pause_fraction: 0.8 - memory_spill_fraction: 0.7 - memory_target_fraction: 0.6 - nthreads: 2 - outgoing_transfer_log: [] - ready: [] - running: true - scheduler: tcp://127.0.0.1:50971 - status: '' - stimulus_log: [] - tasks: {} - thread_id: 8633697792 - transition_counter: 0 - type: Worker From 42a2934fc62de07bb64088dfe1b7b77c237f34e6 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 30 Sep 2022 17:09:54 -0700 Subject: [PATCH 62/89] add back in items in test_data README --- echopype/test_data/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/echopype/test_data/README.md b/echopype/test_data/README.md index c79ad71f3..d3295604e 100644 --- a/echopype/test_data/README.md +++ b/echopype/test_data/README.md @@ -11,6 +11,8 @@ Most of these files are stored on Git LFS but the ones that aren't (due to file - 2019118 group2survey-D20191214-T081342.raw: Contains 6 channels but only 2 of those channels collect ping data - D20200528-T125932.raw: Data collected from WBT mini (instead of WBT), from @emlynjdavies - Green2.Survey2.FM.short.slow.-D20191004-T211557.raw: Contains 2-in-1 transducer, from @FletcherFT (reduced from 104.9 MB to 765 KB in test data updates) +- raw4-D20220514-T172704.raw: Contains RAW4 datagram, 1 channel only, from @cornejotux +- D20210330-T123857.raw: do not contain filter coefficients ### EA640 @@ -22,6 +24,7 @@ Most of these files are stored on Git LFS but the ones that aren't (due to file - Winter2017-D20170115-T150122.raw: Contains a change of recording length in the middle of the file - 2015843-D20151023-T190636.raw: Not used in tests but contains ranges are not constant across ping times - SH1701_consecutive_files_w_range_change: Not used in tests. [Folder](https://drive.google.com/drive/u/1/folders/1PaDtL-xnG5EK3N3P1kGlXa5ub16Yic0f) on shared drive that contains sequential files with ranges that are not constant across ping times. +- NBP_B050N-D20180118-T090228.raw: split-beam setup without angle data ### AZFP From b244f4d0b8a4360b97c33196b55e4a31ddd61e02 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 3 Oct 2022 14:48:51 -0700 Subject: [PATCH 63/89] modify docs, close client if it was not provided, include duplicate_ping_times attribute, and modify test_combine_echodata so it works with current combine form --- echopype/echodata/combine.py | 14 +- echopype/echodata/zarr_combine.py | 16 +- .../tests/echodata/test_echodata_combine.py | 181 ++++++++++-------- 3 files changed, 134 insertions(+), 77 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 5dda41dca..6e7d1b011 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -370,7 +370,8 @@ def combine_echodata( a variable and the ``Provenance`` attribute ``reversed_ping_times`` will be set to ``1``. * If no ``zarr_path`` is provided, the combined zarr file will be ``'temp_echopype_output/combined_echodatas.zarr'`` under the current working directory. - * If no ``client`` is provided, then a client with a local scheduler will be used. + * If no ``client`` is provided, then a client with a local scheduler will be used. The + created scheduler and client will be shutdown once computation has finished. * For each run of this function, we print our the client dashboard link. Examples @@ -395,10 +396,17 @@ def combine_echodata( # check the client input and print dashboard link if client is None: + + # set flag specifying that a client was created + client_created = True + client = Client() # create client with local scheduler print(f"Client dashboard link: {client.dashboard_link}") else: + # set flag specifying that a client was not created + client_created = False + if isinstance(client, Client): print(f"Client dashboard link: {client.dashboard_link}") else: @@ -429,4 +437,8 @@ def combine_echodata( orchestrate_reverse_time_check(ed_comb, zarr_path, comb.possible_time_dims, storage_options) + if client_created: + # close client + client.close() + return ed_comb diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 5146c2f2c..672083696 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -524,6 +524,7 @@ def _get_chunk_dicts(self, dim: str) -> Tuple[Dict[int, np.ndarray], Dict[int, n og_chunk_dict = dict(zip(range(len(og_chunk)), og_chunk)) # obtain the uniform chunk size + # TODO: investigate if this if the best chunk size zarr_chunk_size = min(self.dims_max[dim], self.max_append_chunk_size) # get the indices of the final combined array that are in each uniform chunk @@ -864,8 +865,21 @@ def _append_provenance_attr_vars( # create Dataset coordinates xr_dict[name] = {"dims": [name], "data": val} + # construct the Provenance Dataset's attributes + prov_attributes = echopype_prov_attrs("conversion") + + if "duplicate_ping_times" in self.group_attrs["provenance_attr_key"]: + dup_pings_position = self.group_attrs["provenance_attr_key"].index( + "duplicate_ping_times" + ) + prov_attributes["duplicate_ping_times"] = ( + 1 + if np.isin(1, np.array(self.group_attrs["provenance_attrs"])[:, dup_pings_position]) + else 0 + ) + # construct Dataset and assign Provenance attributes - all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(echopype_prov_attrs("conversion")) + all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(prov_attributes) # append Dataset to zarr all_ds_attrs.to_zarr( diff --git a/echopype/tests/echodata/test_echodata_combine.py b/echopype/tests/echodata/test_echodata_combine.py index 223cf33d1..07f5cce40 100644 --- a/echopype/tests/echodata/test_echodata_combine.py +++ b/echopype/tests/echodata/test_echodata_combine.py @@ -12,6 +12,7 @@ from echopype.core import SONAR_MODELS import tempfile +from dask.distributed import Client @pytest.fixture @@ -49,8 +50,8 @@ def ek80_test_data(test_path): def azfp_test_data(test_path): files = [ ("ooi", "18100407.01A"), - ("ooi", "18100409.01A"), ("ooi", "18100408.01A"), + ("ooi", "18100409.01A"), ] return [test_path["AZFP"].joinpath(*f) for f in files] @@ -61,24 +62,24 @@ def azfp_test_xml(test_path): @pytest.fixture( - params=[{ + params=[ + { "sonar_model": "EK60", "xml_file": None, "files": "ek60_test_data", - }, { - "sonar_model": "EK60", - "xml_file": None, - "files": "ek60_reversed_ping_time_test_data", - }, { - "sonar_model": "EK80", - "xml_file": None, - "files": "ek80_test_data", - }, { + }, + # { + # "sonar_model": "EK80", + # "xml_file": None, + # "files": "ek80_test_data", + # }, + { "sonar_model": "AZFP", "xml_file": "azfp_test_xml", "files": "azfp_test_data", - }], - ids=["ek60", "ek60_reversed_ping_time", "ek80", "azfp"] + } + ], + ids=["ek60", "azfp"] #["ek60", "ek80", "azfp"] ) def raw_datasets(request): files = request.param["files"] @@ -106,78 +107,84 @@ def test_combine_echodata(raw_datasets): concat_data_vars, ) = raw_datasets - pytest.xfail("test_combine_echodata will be reviewed and corrected later.") - eds = [echopype.open_raw(file, sonar_model, xml_file) for file in files] + append_dims = {"filenames", "time1", "time2", "time3", "ping_time"} + # create temporary directory for zarr store temp_zarr_dir = tempfile.TemporaryDirectory() zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" - combined = echopype.combine_echodata(eds, zarr_file_name) + # create dask client + client = Client() - for group_name, value in combined.group_map.items(): - if group_name in ("top", "sonar", "provenance"): - continue - combined_group: xr.Dataset = combined[value['ep_group']] + combined = echopype.combine_echodata(eds, zarr_file_name, client=client) + + # get all possible dimensions that should be dropped + # these correspond to the attribute arrays created + all_drop_dims = [] + for grp in combined.group_paths: + # format group name appropriately + ed_name = grp.replace("-", "_").replace("/", "_").lower() + + # create and append attribute array dimension + all_drop_dims.append(ed_name + "_attr_key") + + # add dimension for Provenance group + all_drop_dims.append("echodata_filename") + + for group_name in combined.group_paths: + + # get all Datasets to be combined + combined_group: xr.Dataset = combined[group_name] eds_groups = [ - ed[value['ep_group']] + ed[group_name] for ed in eds - if ed[value['ep_group']] is not None + if ed[group_name] is not None ] - def union_attrs(datasets: List[xr.Dataset]) -> Dict[str, Any]: - """ - Merges attrs from a list of datasets. - Prioritizes keys from later datasets. - """ - - total_attrs = {} - for ds in datasets: - total_attrs.update(ds.attrs) - return total_attrs - - test_ds = xr.combine_nested( - eds_groups, - [concat_dims.get(group_name, concat_dims["default"])], - data_vars=concat_data_vars.get( - group_name, concat_data_vars["default"] - ), - coords="minimal", - combine_attrs="drop", - ) - test_ds.attrs.update(union_attrs(eds_groups)) - test_ds = test_ds.drop_dims( - [ - # xarray inserts "concat_dim" when concatenating along multiple dimensions - "concat_dim", - "old_ping_time", - "ping_time", - "old_time1", - "time1", - "old_time2", - "time2", - ], - errors="ignore", - ).drop_dims( - [f"{group}_attrs" for group in combined.group_map], errors="ignore" - ) - assert combined_group is None or test_ds.identical( - combined_group.drop_dims( - [ - "old_ping_time", - "ping_time", - "old_time1", - "time1", - "old_time2", - "time2", - ], - errors="ignore", - ) - ) + # all grp dimensions that are in all_drop_dims + if combined_group is None: + grp_drop_dims = [] + concat_dims = [] + else: + grp_drop_dims = list(set(combined_group.dims).intersection(set(all_drop_dims))) + concat_dims = list(set(combined_group.dims).intersection(append_dims)) + + # concat all Datasets along each concat dimension + diff_concats = [] + for dim in concat_dims: + + drop_dims = [c_dim for c_dim in concat_dims if c_dim != dim] + + diff_concats.append(xr.concat([ed_subset.drop_dims(drop_dims) for ed_subset in eds_groups], dim=dim, + coords="minimal", data_vars="minimal")) + + if len(diff_concats) < 1: + test_ds = eds_groups[0] # needed for groups that do not have append dims + else: + # create the full combined Dataset + test_ds = xr.merge(diff_concats, compat="override") + + # correctly set filenames values for constructed combined Dataset + if "filenames" in test_ds: + test_ds.filenames.values[:] = np.arange(len(test_ds.filenames), dtype=int) + + # correctly modify Provenance attributes so we can do a direct compare + if group_name == "Provenance": + test_ds.attrs["reversed_ping_times"] = 0 + + del test_ds.attrs["conversion_time"] + del combined_group.attrs["conversion_time"] + + if (combined_group is not None) and (test_ds is not None): + assert test_ds.identical(combined_group.drop_dims(grp_drop_dims)) temp_zarr_dir.cleanup() + # close client + client.close() + def test_ping_time_reversal(ek60_reversed_ping_time_test_data): @@ -190,7 +197,10 @@ def test_ping_time_reversal(ek60_reversed_ping_time_test_data): temp_zarr_dir = tempfile.TemporaryDirectory() zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" - combined = echopype.combine_echodata(eds, zarr_file_name) + # create dask client + client = Client() + + combined = echopype.combine_echodata(eds, zarr_file_name, client=client) for group_name, value in combined.group_map.items(): if value['ep_group'] is None: @@ -217,6 +227,9 @@ def test_ping_time_reversal(ek60_reversed_ping_time_test_data): temp_zarr_dir.cleanup() + # close client + client.close() + def test_attr_storage(ek60_test_data): # check storage of attributes before combination in provenance group @@ -226,7 +239,10 @@ def test_attr_storage(ek60_test_data): temp_zarr_dir = tempfile.TemporaryDirectory() zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" - combined = echopype.combine_echodata(eds, zarr_file_name) + # create dask client + client = Client() + + combined = echopype.combine_echodata(eds, zarr_file_name, client=client) for group, value in combined.group_map.items(): if value['ep_group'] is None: @@ -258,6 +274,9 @@ def test_attr_storage(ek60_test_data): temp_zarr_dir.cleanup() + # close client + client.close() + def test_combined_encodings(ek60_test_data): eds = [echopype.open_raw(file, "EK60") for file in ek60_test_data] @@ -266,7 +285,10 @@ def test_combined_encodings(ek60_test_data): temp_zarr_dir = tempfile.TemporaryDirectory() zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" - combined = echopype.combine_echodata(eds, zarr_file_name) + # create dask client + client = Client() + + combined = echopype.combine_echodata(eds, zarr_file_name, client=client) encodings_to_drop = {'chunks', 'preferred_chunks', 'compressor', 'filters'} @@ -294,6 +316,9 @@ def test_combined_encodings(ek60_test_data): temp_zarr_dir.cleanup() + # close client + client.close() + if len(group_checks) > 0: all_messages = ['Encoding mismatch found!'] + group_checks message_text = '\n'.join(all_messages) @@ -307,7 +332,10 @@ def test_combined_echodata_repr(ek60_test_data): temp_zarr_dir = tempfile.TemporaryDirectory() zarr_file_name = temp_zarr_dir.name + "/combined_echodatas.zarr" - combined = echopype.combine_echodata(eds, zarr_file_name) + # create dask client + client = Client() + + combined = echopype.combine_echodata(eds, zarr_file_name, client=client) expected_repr = dedent( f"""\ @@ -328,3 +356,6 @@ def test_combined_echodata_repr(ek60_test_data): assert actual == expected_repr temp_zarr_dir.cleanup() + + # close client + client.close() From 260215e93f186b2fa91bf14c0a1a6993eac2338d Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 3 Oct 2022 15:05:44 -0700 Subject: [PATCH 64/89] add distributed to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index af476b5c5..2b320ae9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ dask[array] +distributed jinja2 netCDF4 numpy From 6dbde236120200597a27df2585f34da1a8a0e73d Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 3 Oct 2022 15:14:08 -0700 Subject: [PATCH 65/89] change distributed in requirements.txt to the dask specific version --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2b320ae9f..3e4639224 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -dask[array] -distributed +dask[array,distributed] jinja2 netCDF4 numpy From 58e842ec889254e3f20f05818c11d7eb9e7d218f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 4 Oct 2022 08:10:46 -0700 Subject: [PATCH 66/89] import dask.distibuted and include dask.distibuted in typing of combine_echodata --- echopype/echodata/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 6e7d1b011..2df89d41b 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -2,6 +2,7 @@ from typing import List, Optional, Tuple from warnings import warn +import dask.distributed import xarray as xr from dask.distributed import Client @@ -297,7 +298,7 @@ def combine_echodata( echodatas: List[EchoData] = None, zarr_path: Optional[str] = None, storage_options: Optional[dict] = {}, - client: Optional[Client] = None, + client: Optional[dask.distributed.Client] = None, ) -> EchoData: """ Combines multiple ``EchoData`` objects into a single ``EchoData`` object. From dbe8b7a238e66b1dbbc86747498fdaef668b3da1 Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Tue, 4 Oct 2022 08:14:20 -0700 Subject: [PATCH 67/89] Simplify the logic for checking the input client and printing the dashboard link Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 2df89d41b..246f3fdbf 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -395,23 +395,20 @@ def combine_echodata( """ # TODO: change PR #297 reference to a link in our documentation + # set flag specifying that a client was not created + client_created = False + # check the client input and print dashboard link if client is None: - # set flag specifying that a client was created client_created = True client = Client() # create client with local scheduler print(f"Client dashboard link: {client.dashboard_link}") + elif isinstance(client, Client): + print(f"Client dashboard link: {client.dashboard_link}") else: - - # set flag specifying that a client was not created - client_created = False - - if isinstance(client, Client): - print(f"Client dashboard link: {client.dashboard_link}") - else: - raise TypeError("The input client is not of type dask.distributed.Client!") + raise TypeError(f"The input client is not of type {type(Client)}!") # Check the provided zarr_path is valid, or create a temp zarr_path if not provided zarr_path = check_zarr_path(zarr_path, storage_options) From d670fef3b3ddcec6bcbacb908724bf3524d13dc3 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 4 Oct 2022 15:19:20 -0700 Subject: [PATCH 68/89] add overwrite kwarg to combine_echodata and rectify warning caused by _append_provenance_attr_vars array comparison to a scalar --- echopype/echodata/combine.py | 37 ++++++++++++++++++++++++++++--- echopype/echodata/zarr_combine.py | 14 +++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 246f3fdbf..ca9ad21ee 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -3,6 +3,7 @@ from warnings import warn import dask.distributed +import fsspec import xarray as xr from dask.distributed import Client @@ -15,7 +16,7 @@ logger = _init_logger(__name__) -def check_zarr_path(zarr_path: str, storage_options: Optional[dict]) -> str: +def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: bool) -> str: """ Checks that the zarr path provided to ``combine`` is valid. @@ -27,6 +28,10 @@ def check_zarr_path(zarr_path: str, storage_options: Optional[dict]) -> str: storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) + overwrite: bool + If True, will overwrite the zarr store specified by + ``zarr_path`` if it already exists, otherwise an error + will be returned if the file already exists. Returns ------- @@ -37,6 +42,8 @@ def check_zarr_path(zarr_path: str, storage_options: Optional[dict]) -> str: ------ ValueError If the provided zarr path does not point to a zarr file + RuntimeError + If ``zarr_path`` already exists and ``overwrite=False`` """ if zarr_path is None: @@ -56,13 +63,30 @@ def check_zarr_path(zarr_path: str, storage_options: Optional[dict]) -> str: source_file = path_obj.parts[-1] save_path = path_obj.parent - return validate_output_path( + validated_path = validate_output_path( source_file=source_file, engine="zarr", output_storage_options=storage_options, save_path=save_path, ) + # check if validated_path already exists + fs = fsspec.get_mapper(validated_path, **storage_options).fs # get file system + exists = True if fs.exists(validated_path) else False + + if exists and not overwrite: + raise RuntimeError( + f"{zarr_path} already exists, please provide a different path" " or set overwrite=True." + ) + elif exists and overwrite: + + logger.info(f"overwriting {validated_path}") + + # remove zarr file + fs.rm(validated_path, recursive=True) + + return validated_path + def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: """ @@ -297,6 +321,7 @@ def orchestrate_reverse_time_check( def combine_echodata( echodatas: List[EchoData] = None, zarr_path: Optional[str] = None, + overwrite: bool = False, storage_options: Optional[dict] = {}, client: Optional[dask.distributed.Client] = None, ) -> EchoData: @@ -311,6 +336,10 @@ def combine_echodata( The list of ``EchoData`` objects to be combined zarr_path: str The full save path to the final combined zarr store + overwrite: bool + If True, will overwrite the zarr store specified by + ``zarr_path`` if it already exists, otherwise an error + will be returned if the file already exists. storage_options: Optional[dict] Any additional parameters for the storage backend (ignored for local paths) @@ -327,6 +356,8 @@ def combine_echodata( ------ ValueError If the provided zarr path does not point to a zarr file + RuntimeError + If ``zarr_path`` already exists and ``overwrite=False`` TypeError If a list of ``EchoData`` objects are not provided ValueError @@ -411,7 +442,7 @@ def combine_echodata( raise TypeError(f"The input client is not of type {type(Client)}!") # Check the provided zarr_path is valid, or create a temp zarr_path if not provided - zarr_path = check_zarr_path(zarr_path, storage_options) + zarr_path = check_zarr_path(zarr_path, storage_options, overwrite) # return empty EchoData object, if no EchoData objects are provided if echodatas is None: diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 672083696..5e08e74a4 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -872,11 +872,15 @@ def _append_provenance_attr_vars( dup_pings_position = self.group_attrs["provenance_attr_key"].index( "duplicate_ping_times" ) - prov_attributes["duplicate_ping_times"] = ( - 1 - if np.isin(1, np.array(self.group_attrs["provenance_attrs"])[:, dup_pings_position]) - else 0 - ) + + # see if the duplicate_ping_times value is equal to 1 + elem_is_one = [ + True if val[dup_pings_position] == 1 else False + for val in self.group_attrs["provenance_attrs"] + ] + + # set duplicate_ping_times = 1 if any file has 1 + prov_attributes["duplicate_ping_times"] = 1 if any(elem_is_one) else 0 # construct Dataset and assign Provenance attributes all_ds_attrs = xr.Dataset.from_dict(xr_dict).assign_attrs(prov_attributes) From d18aa6c8b44b0b976b2607d1330b7c2ce679af3f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 4 Oct 2022 17:06:08 -0700 Subject: [PATCH 69/89] modify input to validate_output_path so it will work with s3 buckets --- echopype/echodata/combine.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index ca9ad21ee..3dc12a0af 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -46,28 +46,18 @@ def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: If ``zarr_path`` already exists and ``overwrite=False`` """ - if zarr_path is None: + # check that the appropriate suffix was provided + if not str(zarr_path).strip("/").endswith(".zarr"): + raise ValueError("The provided zarr_path input must have '.zarr' suffix!") - # assign values, if no zarr path has been provided - source_file = "combined_echodatas.zarr" - save_path = None - else: - - # turn string path into Path object - path_obj = Path(zarr_path) - if path_obj.suffix != ".zarr": - raise ValueError("The provided zarr_path input must point to a zarr file!") - else: - - # assign values based on zarr path - source_file = path_obj.parts[-1] - save_path = path_obj.parent + # set default source_file name (will be used only if zarr_path is None) + source_file = "combined_echodatas.zarr" validated_path = validate_output_path( source_file=source_file, engine="zarr", output_storage_options=storage_options, - save_path=save_path, + save_path=zarr_path, ) # check if validated_path already exists From 1cee3eb9178f78be5935e47183922c33ca3d51fe Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Tue, 4 Oct 2022 17:09:19 -0700 Subject: [PATCH 70/89] remove double quotes Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 3dc12a0af..c77732f66 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -66,7 +66,7 @@ def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: if exists and not overwrite: raise RuntimeError( - f"{zarr_path} already exists, please provide a different path" " or set overwrite=True." + f"{zarr_path} already exists, please provide a different path or set overwrite=True." ) elif exists and overwrite: From ee05db4a4b9d91195f40aac45ec60b97ef9593de Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Tue, 4 Oct 2022 17:11:33 -0700 Subject: [PATCH 71/89] allow Path type for zarr_path Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index c77732f66..8c287d765 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -16,7 +16,7 @@ logger = _init_logger(__name__) -def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: bool) -> str: +def check_zarr_path(zarr_path: Union[Path, str], storage_options: Optional[dict], overwrite: bool) -> str: """ Checks that the zarr path provided to ``combine`` is valid. From 3d96012249a94ffd8604d859929157cf97320f9f Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Tue, 4 Oct 2022 17:11:47 -0700 Subject: [PATCH 72/89] add union typing Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 8c287d765..46ffaae7c 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union from warnings import warn import dask.distributed From e5334c1427d9d96177bda5122ee0324a2be9b289 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 00:11:52 +0000 Subject: [PATCH 73/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/combine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 46ffaae7c..cdfe237fc 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -16,7 +16,9 @@ logger = _init_logger(__name__) -def check_zarr_path(zarr_path: Union[Path, str], storage_options: Optional[dict], overwrite: bool) -> str: +def check_zarr_path( + zarr_path: Union[Path, str], storage_options: Optional[dict], overwrite: bool +) -> str: """ Checks that the zarr path provided to ``combine`` is valid. From fe0379bce14e5d5c4dbf93066eda77db7f9ab58e Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Tue, 4 Oct 2022 17:16:37 -0700 Subject: [PATCH 74/89] add storage_options to open_converted call Co-authored-by: Don Setiawan --- echopype/echodata/zarr_combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 5e08e74a4..ea2a60a15 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1012,7 +1012,7 @@ def combine( # open lazy loaded combined EchoData object ed_combined = open_converted( - zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer() + zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer(), storage_options=storage_options ) # TODO: is this appropriate for chunks? return ed_combined From bf4c48f0ca43f51b1d36fdf85cccd256619744ca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 00:16:54 +0000 Subject: [PATCH 75/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/zarr_combine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index ea2a60a15..aa24ec978 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1012,7 +1012,10 @@ def combine( # open lazy loaded combined EchoData object ed_combined = open_converted( - zarr_path, chunks={}, synchronizer=zarr.ThreadSynchronizer(), storage_options=storage_options + zarr_path, + chunks={}, + synchronizer=zarr.ThreadSynchronizer(), + storage_options=storage_options, ) # TODO: is this appropriate for chunks? return ed_combined From bfb9209379a3011dd5564ef30d6711df905b30b1 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 4 Oct 2022 17:34:11 -0700 Subject: [PATCH 76/89] add storage_options to zarr.open_array call --- echopype/echodata/zarr_combine.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index aa24ec978..829968f04 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -895,7 +895,9 @@ def _append_provenance_attr_vars( ) @staticmethod - def _modify_prov_filenames(zarr_path: str, len_eds: int) -> None: + def _modify_prov_filenames( + zarr_path: str, len_eds: int, storage_options: Optional[dict] + ) -> None: """ After the ``Provenance`` group has been constructed, the coordinate ``filenames`` will be filled with zeros. This @@ -908,10 +910,15 @@ def _modify_prov_filenames(zarr_path: str, len_eds: int) -> None: The full path of the final combined zarr store len_eds: int The number of ``EchoData`` objects being combined + storage_options: Optional[dict] + Any additional parameters for the storage + backend (ignored for local paths) """ # obtain the filenames zarr array - zarr_filenames = zarr.open_array(zarr_path + "/Provenance/filenames", mode="r+") + zarr_filenames = zarr.open_array( + zarr_path + "/Provenance/filenames", mode="r+", storage_options=storage_options + ) zarr_filenames[:] = np.arange(len_eds) @@ -1008,7 +1015,7 @@ def combine( self._append_provenance_attr_vars(zarr_path, storage_options=storage_options) # change filenames numbering to range(len(eds)) - self._modify_prov_filenames(zarr_path, len_eds=len(eds)) + self._modify_prov_filenames(zarr_path, len_eds=len(eds), storage_options=storage_options) # open lazy loaded combined EchoData object ed_combined = open_converted( From 5e5c882c276c00f36ad87728aa7bcc7e09646f99 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 08:40:20 -0700 Subject: [PATCH 77/89] only allow zarr_path to be a string and remove option for Path type --- echopype/echodata/combine.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index cdfe237fc..2b414ca3e 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple from warnings import warn import dask.distributed @@ -16,9 +16,7 @@ logger = _init_logger(__name__) -def check_zarr_path( - zarr_path: Union[Path, str], storage_options: Optional[dict], overwrite: bool -) -> str: +def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: bool) -> str: """ Checks that the zarr path provided to ``combine`` is valid. @@ -48,8 +46,12 @@ def check_zarr_path( If ``zarr_path`` already exists and ``overwrite=False`` """ + # check that zarr_path is a string + if not isinstance(zarr_path, str): + raise TypeError(f"zarr_path must be of type {str}") + # check that the appropriate suffix was provided - if not str(zarr_path).strip("/").endswith(".zarr"): + if not zarr_path.strip("/").endswith(".zarr"): raise ValueError("The provided zarr_path input must have '.zarr' suffix!") # set default source_file name (will be used only if zarr_path is None) From 3acce9dfa892ad598039f0687c1958f3a774ee32 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 08:43:05 -0700 Subject: [PATCH 78/89] send client dashboard link to the logger instead of printing it --- echopype/echodata/combine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 2b414ca3e..3e2da7499 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -429,9 +429,9 @@ def combine_echodata( client_created = True client = Client() # create client with local scheduler - print(f"Client dashboard link: {client.dashboard_link}") + logger.info(f"Client dashboard link: {client.dashboard_link}") elif isinstance(client, Client): - print(f"Client dashboard link: {client.dashboard_link}") + logger.info(f"Client dashboard link: {client.dashboard_link}") else: raise TypeError(f"The input client is not of type {type(Client)}!") From 4facc4903d1bb35d1955bf90c03798f11b6361d0 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 08:47:59 -0700 Subject: [PATCH 79/89] set storage_option equal to empty dict in _modify_prov_filenames --- echopype/echodata/zarr_combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 829968f04..58da371f7 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -896,7 +896,7 @@ def _append_provenance_attr_vars( @staticmethod def _modify_prov_filenames( - zarr_path: str, len_eds: int, storage_options: Optional[dict] + zarr_path: str, len_eds: int, storage_options: Optional[dict] = {} ) -> None: """ After the ``Provenance`` group has been constructed, the From b3b375d67358f28976787a31de13b159dc26a8fd Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Wed, 5 Oct 2022 15:06:57 -0700 Subject: [PATCH 80/89] change storage options typing and set default value for overwrite in check_zarr_path Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 3e2da7499..b0f7f026c 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -16,7 +16,7 @@ logger = _init_logger(__name__) -def check_zarr_path(zarr_path: str, storage_options: Optional[dict], overwrite: bool) -> str: +def check_zarr_path(zarr_path: str, storage_options: Dict[str, Any] = {}, overwrite: bool = False) -> str: """ Checks that the zarr path provided to ``combine`` is valid. From 4f1d4fb427fd155074e22ade1f2f03e23149f4b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 22:07:17 +0000 Subject: [PATCH 81/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/combine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index b0f7f026c..33e59318b 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -16,7 +16,9 @@ logger = _init_logger(__name__) -def check_zarr_path(zarr_path: str, storage_options: Dict[str, Any] = {}, overwrite: bool = False) -> str: +def check_zarr_path( + zarr_path: str, storage_options: Dict[str, Any] = {}, overwrite: bool = False +) -> str: """ Checks that the zarr path provided to ``combine`` is valid. From 376c56fb1b7b43a614ec209e14822da8be946ba1 Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Wed, 5 Oct 2022 15:07:55 -0700 Subject: [PATCH 82/89] add Dict and Any typing in combine.py Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 33e59318b..83baec861 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict, Any from warnings import warn import dask.distributed From 387479be7abebc94e3537744d9e4c62976c82cc3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 22:08:12 +0000 Subject: [PATCH 83/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 83baec861..5adceecab 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Optional, Tuple, Dict, Any +from typing import Any, Dict, List, Optional, Tuple from warnings import warn import dask.distributed From 0bf73afe4374614e2df81723441d8fba0f21185f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 15:10:51 -0700 Subject: [PATCH 84/89] change docstring type for storage_options in check_zarr_path --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 5adceecab..8ed678c6c 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -27,7 +27,7 @@ def check_zarr_path( ---------- zarr_path: str The full save path to the final combined zarr store - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) overwrite: bool From 9346fb16b9e9ff2546140a993d7c4a8f4b20f864 Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Wed, 5 Oct 2022 15:14:34 -0700 Subject: [PATCH 85/89] change storage options typing in combine_echodata input Co-authored-by: Don Setiawan --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 8ed678c6c..f41ebc127 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -318,7 +318,7 @@ def combine_echodata( echodatas: List[EchoData] = None, zarr_path: Optional[str] = None, overwrite: bool = False, - storage_options: Optional[dict] = {}, + storage_options: Dict[str, Any] = {}, client: Optional[dask.distributed.Client] = None, ) -> EchoData: """ From 290c72919a6bd841829dea3519e42d5385c25df6 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 15:17:20 -0700 Subject: [PATCH 86/89] update typing for storage_options in docstring of combine_echodata --- echopype/echodata/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index f41ebc127..8d3658d63 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -336,7 +336,7 @@ def combine_echodata( If True, will overwrite the zarr store specified by ``zarr_path`` if it already exists, otherwise an error will be returned if the file already exists. - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) client: Optional[dask.distributed.Client] From b3a51ffb722075c7ef8dd6da6e6284f24005af3e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 5 Oct 2022 15:39:50 -0700 Subject: [PATCH 87/89] change all typing for storage_options in zarr_combine --- echopype/echodata/zarr_combine.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index 58da371f7..c94cef58e 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -1,6 +1,6 @@ from collections import defaultdict from itertools import islice -from typing import Dict, Hashable, List, Optional, Set, Tuple +from typing import Any, Dict, Hashable, List, Set, Tuple import dask import dask.array @@ -201,7 +201,7 @@ def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: return numpy_keys - def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: Optional[str]) -> None: + def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: str) -> None: """ Constructs useful dictionaries that contain information about the dimensions of the Dataset. Additionally, collects @@ -599,7 +599,7 @@ def write_to_file( zarr_path: str, zarr_group: str, region: Dict[str, slice], - storage_options: Optional[dict], + storage_options: Dict[str, Any] = {}, ) -> None: """ Constructs a delayed write of ``ds_in`` to the appropriate zarr @@ -620,7 +620,7 @@ def write_to_file( region: Dict[str, slice] Keys set as the dimension name and values as the slice of the zarr portion to write to - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) """ @@ -644,7 +644,7 @@ def _append_ds_list_to_zarr( ds_list: List[xr.Dataset], zarr_group: str, ed_name: str, - storage_options: Optional[dict] = {}, + storage_options: Dict[str, Any] = {}, ) -> List[str]: """ Creates a zarr store and then appends each Dataset @@ -663,7 +663,7 @@ def _append_ds_list_to_zarr( ed_name: str The name of the EchoData group corresponding to the Datasets in ``ds_list`` - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) @@ -744,7 +744,7 @@ def _append_const_to_zarr( ds_list: List[xr.Dataset], zarr_path: str, zarr_group: str, - storage_options: Optional[dict], + storage_options: Dict[str, Any] = {}, ) -> None: """ Appends all constant (i.e. not chunked) variables and dimensions to the @@ -761,7 +761,7 @@ def _append_const_to_zarr( zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) @@ -789,7 +789,7 @@ def _write_append_dims( ds_list: List[xr.Dataset], zarr_path: str, zarr_group: str, - storage_options: Optional[dict], + storage_options: Dict[str, Any] = {}, ) -> None: """ Sequentially writes each Dataset's append dimension in ``ds_list`` to @@ -804,7 +804,7 @@ def _write_append_dims( zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) """ @@ -834,7 +834,7 @@ def _write_append_dims( ) def _append_provenance_attr_vars( - self, zarr_path: str, storage_options: Optional[dict] = {} + self, zarr_path: str, storage_options: Dict[str, Any] = {} ) -> None: """ Creates an xarray Dataset with variables set as the attributes @@ -846,7 +846,7 @@ def _append_provenance_attr_vars( ---------- zarr_path: str The full path of the final combined zarr store - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) """ @@ -896,7 +896,7 @@ def _append_provenance_attr_vars( @staticmethod def _modify_prov_filenames( - zarr_path: str, len_eds: int, storage_options: Optional[dict] = {} + zarr_path: str, len_eds: int, storage_options: Dict[str, Any] = {} ) -> None: """ After the ``Provenance`` group has been constructed, the @@ -910,7 +910,7 @@ def _modify_prov_filenames( The full path of the final combined zarr store len_eds: int The number of ``EchoData`` objects being combined - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) """ @@ -926,7 +926,7 @@ def combine( self, zarr_path: str, eds: List[EchoData] = [], - storage_options: Optional[dict] = {}, + storage_options: Dict[str, Any] = {}, sonar_model: str = None, echodata_filenames: List[str] = [], ) -> EchoData: @@ -941,7 +941,7 @@ def combine( The full path of the final combined zarr store eds: List[EchoData] The list of ``EchoData`` objects to be combined - storage_options: Optional[dict] + storage_options: Dict[str, Any] Any additional parameters for the storage backend (ignored for local paths) sonar_model : str From 63866466dd7637471829d4ab89e9eded295b6184 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 6 Oct 2022 08:43:06 -0700 Subject: [PATCH 88/89] remove typing types from docstrings and add optional where necessary --- echopype/echodata/combine.py | 18 ++++----- echopype/echodata/zarr_combine.py | 63 +++++++++++++++---------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index 8d3658d63..b36c847f0 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -27,7 +27,7 @@ def check_zarr_path( ---------- zarr_path: str The full save path to the final combined zarr store - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) overwrite: bool @@ -91,14 +91,14 @@ def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: Parameters ---------- - echodatas: List[EchoData] + echodatas: list The list of `EchoData` objects to be combined. Returns ------- sonar_model : str The sonar model used for all values in ``echodatas`` - echodata_filenames : List[str] + echodata_filenames : list The source files names for all values in ``echodatas`` Raises @@ -169,7 +169,7 @@ def check_and_correct_reversed_time( Returns ------- - old_time : Optional[xr.DataArray] + old_time : xr.DataArray or None If correction is necessary, returns the time before reversal correction, otherwise returns None @@ -256,7 +256,7 @@ def orchestrate_reverse_time_check( combined ``EchoData`` objects zarr_store: str The zarr store containing the ``ed_comb`` data - possible_time_dims: List[str] + possible_time_dims: list All possible time dimensions that can occur within ``ed_comb``, which should be checked storage_options: dict @@ -328,18 +328,18 @@ def combine_echodata( Parameters ---------- - echodatas : List[EchoData] + echodatas : list The list of ``EchoData`` objects to be combined - zarr_path: str + zarr_path: str, optional The full save path to the final combined zarr store overwrite: bool If True, will overwrite the zarr store specified by ``zarr_path`` if it already exists, otherwise an error will be returned if the file already exists. - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) - client: Optional[dask.distributed.Client] + client: dask.distributed.Client, optional An initialized Dask distributed client Returns diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index c94cef58e..abb465d90 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -59,7 +59,7 @@ def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> Parameters ---------- - ds_list: List[xr.Dataset] + ds_list: list List of Datasets to be combined ed_name: str The name of the ``EchoData`` group being combined @@ -104,7 +104,7 @@ def _check_channels(ds_list: List[xr.Dataset], ed_name: str) -> None: Parameters ---------- - ds_list: List[xr.Dataset] + ds_list: list List of Datasets to be combined ed_name: str The name of the ``EchoData`` group being combined @@ -147,7 +147,7 @@ def _compare_attrs(attr1: dict, attr2: dict) -> List[str]: Returns ------- - numpy_keys: List[str] + numpy_keys: list All keys that have numpy arrays as values Raises @@ -211,7 +211,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: str) -> None: Parameters ---------- - ds_list: List[xr.Dataset] + ds_list: list The Datasets that will be combined ed_name: str The name of the EchoData group corresponding to the @@ -280,7 +280,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), Parameters ---------- - dims: List[str] + dims: list A list of the dimension names dtype: type The data type of the variable @@ -288,9 +288,9 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), Returns ------- temp_arr: dask.array - a temporary (or dummy) array representing a + A temporary (or dummy) array representing a variable in its final combined form. - chnk_shape: List[int] + chnk_shape: list The chunk shape used to construct ``temp_arr`` Notes @@ -337,7 +337,7 @@ def _get_encodings(self, name: str, val: xr.Variable, chnk_shape: list) -> Dict[ Returns ------- - var_encoding : Dict[str, dict] + var_encoding : dict All encodings associated with ``name`` """ @@ -380,10 +380,10 @@ def _construct_lazy_ds_and_var_info( ds: xr.Dataset A lazy Dataset representing the EchoData group Dataset in its final combined form - const_names: List[str] + const_names: list The names of all variables and dimensions that are constant (with respect to chunking) across all Datasets to be combined - encodings: Dict[str, dict] + encodings: dict The encodings for all variables and dimensions that will be written to the zarr store by regions @@ -436,12 +436,12 @@ def _get_region(self, ds_ind: int, ds_dims: Set[Hashable]) -> Dict[str, slice]: ds_ind: int The key of the values of ``self.dims_csum`` or index of ``self.dims_df`` to use for each dimension name - ds_dims: Set[Hashable] + ds_dims: set The names of the dimensions used in the region creation Returns ------- - region: Dict[str, slice] + region: dict Keys set as the dimension name and values as the slice of the zarr portion to write to """ @@ -471,7 +471,7 @@ def _uniform_chunks_as_np_array(array: np.ndarray, chunk_size: int) -> List[np.n Returns ------- - List[np.ndarray] + list The chunked input ``array`` Example @@ -504,10 +504,10 @@ def _get_chunk_dicts(self, dim: str) -> Tuple[Dict[int, np.ndarray], Dict[int, n Returns ------- - og_chunk_dict: Dict[int, np.ndarray] + og_chunk_dict: dict The chunk dictionary corresponding to the original non-uniform chunks - uniform_chunk_dict: Dict[int, np.ndarray] + uniform_chunk_dict: dict The chunk dictionary corresponding to the uniform chunks """ @@ -547,7 +547,7 @@ def _get_uniform_to_nonuniform_map(self, dim: str) -> Dict[int, dict]: Returns ------- - final_mapping: Dict[int, dict] + final_mapping: dict Uniform to non-uniform mapping where the keys are the chunk index in the uniform chunk and the values are dictionaries. The value dictionaries have keys @@ -617,10 +617,10 @@ def write_to_file( zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - region: Dict[str, slice] + region: dict Keys set as the dimension name and values as the slice of the zarr portion to write to - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) """ @@ -631,7 +631,6 @@ def write_to_file( group=zarr_group, region=region, compute=True, - # safe_chunks=False, storage_options=storage_options, synchronizer=zarr.ThreadSynchronizer(), ) @@ -655,7 +654,7 @@ def _append_ds_list_to_zarr( ---------- zarr_path: str The full path of the final combined zarr store - ds_list: List[xr.Dataset] + ds_list: list The Datasets that will be combined zarr_group: str The name of the group of the zarr store @@ -663,13 +662,13 @@ def _append_ds_list_to_zarr( ed_name: str The name of the EchoData group corresponding to the Datasets in ``ds_list`` - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) Returns ------- - const_names: List[str] + const_names: list The names of all variables and dimensions that are constant (with respect to chunking) across all Datasets to be combined """ @@ -752,16 +751,16 @@ def _append_const_to_zarr( Parameters ---------- - const_vars: List[str] + const_vars: list The names of all variables/dimensions that are not chunked - ds_list: List[xr.Dataset] + ds_list: list The Datasets that will be combined zarr_path: str The full path of the final combined zarr store zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) @@ -797,14 +796,14 @@ def _write_append_dims( Parameters ---------- - ds_list: List[xr.Dataset] + ds_list: list The Datasets that will be combined zarr_path: str The full path of the final combined zarr store zarr_group: str The name of the group of the zarr store corresponding to the Datasets in ``ds_list`` - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) """ @@ -846,7 +845,7 @@ def _append_provenance_attr_vars( ---------- zarr_path: str The full path of the final combined zarr store - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) """ @@ -910,7 +909,7 @@ def _modify_prov_filenames( The full path of the final combined zarr store len_eds: int The number of ``EchoData`` objects being combined - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) """ @@ -939,14 +938,14 @@ def combine( ---------- zarr_path: str The full path of the final combined zarr store - eds: List[EchoData] + eds: list The list of ``EchoData`` objects to be combined - storage_options: Dict[str, Any] + storage_options: dict Any additional parameters for the storage backend (ignored for local paths) sonar_model : str The sonar model used for all elements in ``eds`` - echodata_filenames : List[str] + echodata_filenames : list The source files names for all elements in ``eds`` Returns From 5afa7152b56fbb5508f91bcd0e6922b0f899bc84 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 6 Oct 2022 11:21:21 -0700 Subject: [PATCH 89/89] specify the type of the elements in a list within docstrings --- echopype/echodata/combine.py | 8 ++++---- echopype/echodata/zarr_combine.py | 30 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/echopype/echodata/combine.py b/echopype/echodata/combine.py index b36c847f0..b9b725c1c 100644 --- a/echopype/echodata/combine.py +++ b/echopype/echodata/combine.py @@ -91,14 +91,14 @@ def check_echodatas_input(echodatas: List[EchoData]) -> Tuple[str, List[str]]: Parameters ---------- - echodatas: list + echodatas: list of EchoData object The list of `EchoData` objects to be combined. Returns ------- sonar_model : str The sonar model used for all values in ``echodatas`` - echodata_filenames : list + echodata_filenames : list of str The source files names for all values in ``echodatas`` Raises @@ -256,7 +256,7 @@ def orchestrate_reverse_time_check( combined ``EchoData`` objects zarr_store: str The zarr store containing the ``ed_comb`` data - possible_time_dims: list + possible_time_dims: list of str All possible time dimensions that can occur within ``ed_comb``, which should be checked storage_options: dict @@ -328,7 +328,7 @@ def combine_echodata( Parameters ---------- - echodatas : list + echodatas : list of EchoData object The list of ``EchoData`` objects to be combined zarr_path: str, optional The full save path to the final combined zarr store diff --git a/echopype/echodata/zarr_combine.py b/echopype/echodata/zarr_combine.py index abb465d90..48d069125 100644 --- a/echopype/echodata/zarr_combine.py +++ b/echopype/echodata/zarr_combine.py @@ -59,7 +59,7 @@ def _check_ascending_ds_times(self, ds_list: List[xr.Dataset], ed_name: str) -> Parameters ---------- - ds_list: list + ds_list: list of xr.Dataset List of Datasets to be combined ed_name: str The name of the ``EchoData`` group being combined @@ -104,7 +104,7 @@ def _check_channels(ds_list: List[xr.Dataset], ed_name: str) -> None: Parameters ---------- - ds_list: list + ds_list: list of xr.Dataset List of Datasets to be combined ed_name: str The name of the ``EchoData`` group being combined @@ -211,7 +211,7 @@ def _get_ds_info(self, ds_list: List[xr.Dataset], ed_name: str) -> None: Parameters ---------- - ds_list: list + ds_list: list of xr.Dataset The Datasets that will be combined ed_name: str The name of the EchoData group corresponding to the @@ -280,7 +280,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), Parameters ---------- - dims: list + dims: list of str A list of the dimension names dtype: type The data type of the variable @@ -290,7 +290,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), temp_arr: dask.array A temporary (or dummy) array representing a variable in its final combined form. - chnk_shape: list + chnk_shape: list of int The chunk shape used to construct ``temp_arr`` Notes @@ -317,7 +317,7 @@ def _get_temp_arr(self, dims: List[str], dtype: type) -> Tuple[type(dask.array), return temp_arr, chnk_shape - def _get_encodings(self, name: str, val: xr.Variable, chnk_shape: list) -> Dict[str, dict]: + def _get_encodings(self, name: str, val: xr.Variable, chnk_shape: List[int]) -> Dict[str, dict]: """ Obtains the encodings for the variable ``name`` by including all encodings in ``val``, except those encodings that are specified by @@ -332,7 +332,7 @@ def _get_encodings(self, name: str, val: xr.Variable, chnk_shape: list) -> Dict[ val: xr.Variable The variable that contains the encodings we want to assign to ``name`` - chnk_shape: list + chnk_shape: list of int The shape of the chunks for ``name`` (used in encodings) Returns @@ -380,7 +380,7 @@ def _construct_lazy_ds_and_var_info( ds: xr.Dataset A lazy Dataset representing the EchoData group Dataset in its final combined form - const_names: list + const_names: list of str The names of all variables and dimensions that are constant (with respect to chunking) across all Datasets to be combined encodings: dict @@ -471,7 +471,7 @@ def _uniform_chunks_as_np_array(array: np.ndarray, chunk_size: int) -> List[np.n Returns ------- - list + list of np.ndarray The chunked input ``array`` Example @@ -654,7 +654,7 @@ def _append_ds_list_to_zarr( ---------- zarr_path: str The full path of the final combined zarr store - ds_list: list + ds_list: list of xr.Dataset The Datasets that will be combined zarr_group: str The name of the group of the zarr store @@ -751,9 +751,9 @@ def _append_const_to_zarr( Parameters ---------- - const_vars: list + const_vars: list of str The names of all variables/dimensions that are not chunked - ds_list: list + ds_list: list of xr.Dataset The Datasets that will be combined zarr_path: str The full path of the final combined zarr store @@ -796,7 +796,7 @@ def _write_append_dims( Parameters ---------- - ds_list: list + ds_list: list of xr.Dataset The Datasets that will be combined zarr_path: str The full path of the final combined zarr store @@ -938,14 +938,14 @@ def combine( ---------- zarr_path: str The full path of the final combined zarr store - eds: list + eds: list of EchoData object The list of ``EchoData`` objects to be combined storage_options: dict Any additional parameters for the storage backend (ignored for local paths) sonar_model : str The sonar model used for all elements in ``eds`` - echodata_filenames : list + echodata_filenames : list of str The source files names for all elements in ``eds`` Returns