diff --git a/dacapo/apply.py b/dacapo/apply.py
index 872e3c532..8c3d7cbf7 100644
--- a/dacapo/apply.py
+++ b/dacapo/apply.py
@@ -91,7 +91,7 @@ def apply(
     ), "Either validation_dataset and criterion, or iteration must be provided."
 
     # retrieving run
-    print(f"Loading run {run_name}")
+    logger.info(f"Loading run {run_name}")
     config_store = create_config_store()
     run_config = config_store.retrieve_run_config(run_name)
     run = Run(run_config)
@@ -102,7 +102,7 @@ def apply(
     # load weights
     if iteration is None:
        iteration = weights_store.retrieve_best(run_name, validation_dataset, criterion)  # type: ignore
-    print(f"Loading weights for iteration {iteration}")
+    logger.info(f"Loading weights for iteration {iteration}")
     weights_store.retrieve_weights(run_name, iteration)
 
     if parameters is None:
@@ -121,7 +121,7 @@ def apply(
             raise ValueError(
                 "validation_dataset must be a dataset name or a Dataset object, or parameters must be provided explicitly."
             )
-        print(f"Finding best parameters for validation dataset {_validation_dataset}")
+        logger.info(f"Finding best parameters for validation dataset {_validation_dataset}")
         parameters = run.task.evaluator.get_overall_best_parameters(
             _validation_dataset, criterion
         )
@@ -183,7 +183,7 @@ def apply(
         output_container, f"output_{run_name}_{iteration}_{parameters}"
     )
 
-    print(
+    logger.info(
         f"Applying best results from run {run.name} at iteration {iteration} to dataset {Path(input_container, input_dataset)}"
     )
     return apply_run(
@@ -243,7 +243,7 @@ def apply_run(
         ... )
     """
     # render prediction dataset
-    print(f"Predicting on dataset {prediction_array_identifier}")
+    logger.info(f"Predicting on dataset {prediction_array_identifier}")
     predict(
         run.name,
         iteration,
@@ -257,7 +257,7 @@ def apply_run(
     )
 
     # post-process the output
-    print(
-        f"Post-processing output to dataset {output_array_identifier}",
+    logger.info(
+        "Post-processing output to dataset %s",
         output_array_identifier,
     )
@@ -265,5 +265,5 @@ def apply_run(
     post_processor.set_prediction(prediction_array_identifier)
     post_processor.process(parameters, output_array_identifier, num_workers=num_workers)
 
-    print("Done")
+    logger.info("Done")
     return
diff --git a/dacapo/cli.py b/dacapo/cli.py
index e0b86a1c1..6fe5c9281 100644
--- a/dacapo/cli.py
+++ b/dacapo/cli.py
@@ -680,7 +680,7 @@ def segment_blockwise(
         overwrite=overwrite,
         write_size=write_roi.shape,
     )
-    print(
+    logger.info(
         f"Created output array {output_array_identifier.container}:{output_array_identifier.dataset} with ROI {_total_roi}."
     )
 
@@ -791,7 +791,7 @@ def config():
 def generate_dacapo_yaml(config):
     with open("dacapo.yaml", "w") as f:
         yaml.dump(config.serialize(), f, default_flow_style=False)
-    print("dacapo.yaml has been created.")
+    logger.info("dacapo.yaml has been created.")
 
 
 def generate_config(
@@ -832,7 +832,7 @@ def unpack_ctx(ctx):
     Example:
         >>> ctx = ...
>>> kwargs = unpack_ctx(ctx) - >>> print(kwargs) + >>> logger.info(kwargs) {'arg1': value1, 'arg2': value2, ...} """ kwargs = { @@ -843,7 +843,7 @@ def unpack_ctx(ctx): kwargs[k] = int(v) elif v.replace(".", "").isnumeric(): kwargs[k] = float(v) - print(f"{k}: {kwargs[k]}") + logger.info(f"{k}: {kwargs[k]}") return kwargs diff --git a/dacapo/experiments/datasplits/datasplit.py b/dacapo/experiments/datasplits/datasplit.py index 4e94eb281..b2a91aae0 100644 --- a/dacapo/experiments/datasplits/datasplit.py +++ b/dacapo/experiments/datasplits/datasplit.py @@ -5,6 +5,9 @@ import json import itertools +import logging + +logger = logging.getLogger(__name__) class DataSplit(ABC): """ @@ -84,7 +87,7 @@ def _neuroglancer(self, embedded=False, bind_address="0.0.0.0", bind_port=0): neuroglancer.LayerGroupViewer(layers=list(validate_layers.keys())), ] ) - print(f"Neuroglancer link: {viewer}") + logger.info(f"Neuroglancer link: {viewer}") if embedded: from IPython.display import IFrame diff --git a/dacapo/experiments/datasplits/datasplit_generator.py b/dacapo/experiments/datasplits/datasplit_generator.py deleted file mode 100644 index f968b0fa1..000000000 --- a/dacapo/experiments/datasplits/datasplit_generator.py +++ /dev/null @@ -1,1073 +0,0 @@ -from dacapo.experiments.tasks import TaskConfig -from dacapo.experiments.datasplits.datasets.arrays import ArrayConfig -from upath import UPath as Path -from typing import List, Union, Optional, Sequence -from enum import Enum, EnumMeta -from funlib.geometry import Coordinate - -import zarr -from zarr.n5 import N5FSStore -import numpy as np -from dacapo.experiments.datasplits.datasets.arrays import ( - ResampledArrayConfig, - BinarizeArrayConfig, - IntensitiesArrayConfig, - ConcatArrayConfig, - LogicalOrArrayConfig, - ConstantArrayConfig, - CropArrayConfig, - ZarrArrayConfig, -) -from dacapo.experiments.datasplits import TrainValidateDataSplitConfig -from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig -import logging - - -logger = logging.getLogger(__name__) - - -def is_zarr_group(file_name: Path, dataset: str): - """ - Check if the dataset is a Zarr group. If the dataset is a Zarr group, it will return True, otherwise False. - - Args: - file_name : str - The name of the file. - dataset : str - The name of the dataset. - Returns: - bool : True if the dataset is a Zarr group, otherwise False. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> is_zarr_group(file_name, dataset) - Notes: - This function is used to check if the dataset is a Zarr group. - """ - if file_name.suffix == ".n5": - zarr_file = zarr.open(N5FSStore(str(file_name)), mode="r") - else: - zarr_file = zarr.open(str(file_name), mode="r") - return isinstance(zarr_file[dataset], zarr.hierarchy.Group) - - -def resize_if_needed( - array_config: ZarrArrayConfig, target_resolution: Coordinate, extra_str="" -): - """ - Resize the array if needed. If the array needs to be resized, it will return the resized array, otherwise it will return the original array. - - Args: - array_config : obj - The configuration of the array. - target_resolution : obj - The target resolution. - extra_str : str - An extra string. - Returns: - obj : The resized array if needed, otherwise the original array. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> resize_if_needed(array_config, target_resolution, extra_str) - Notes: - This function is used to resize the array if needed. 
- """ - zarr_array = array_config.array() - raw_voxel_size = zarr_array.voxel_size - - raw_upsample = raw_voxel_size / target_resolution - raw_downsample = target_resolution / raw_voxel_size - assert len(target_resolution) == zarr_array.dims, ( - f"Target resolution {target_resolution} and raw voxel size {raw_voxel_size} " - f"have different dimensions {zarr_array.dims}" - ) - if any([u > 1 or d > 1 for u, d in zip(raw_upsample, raw_downsample)]): - print( - f"dataset {array_config} needs resampling to {target_resolution}, upsample: {raw_upsample}, downsample: {raw_downsample}" - ) - return ResampledArrayConfig( - name=f"{extra_str}_{array_config.name}_{array_config.dataset}_resampled", - source_array_config=array_config, - upsample=raw_upsample, - downsample=raw_downsample, - interp_order=False, - ) - else: - # print(f"dataset {array_config.dataset} does not need resampling found {raw_voxel_size}=={target_resolution}") - return array_config - - -def limit_validation_crop_size(gt_config, mask_config, max_size): - gt_array = gt_config.array() - voxel_shape = gt_array.roi.shape / gt_array.voxel_size - crop = False - while np.prod(voxel_shape) > max_size: - crop = True - max_idx = np.argmax(voxel_shape) - voxel_shape = Coordinate( - s if i != max_idx else s // 2 for i, s in enumerate(voxel_shape) - ) - if crop: - crop_roi_shape = voxel_shape * gt_array.voxel_size - context = (gt_array.roi.shape - crop_roi_shape) / 2 - crop_roi = gt_array.roi.grow(-context, -context) - crop_roi = crop_roi.snap_to_grid(gt_array.voxel_size, mode="shrink") - - logger.debug( - f"Cropped {gt_config.name}: original roi: {gt_array.roi}, new_roi: {crop_roi}" - ) - - gt_config = CropArrayConfig( - name=gt_config.name + "_cropped", - source_array_config=gt_config, - roi=crop_roi, - ) - mask_config = CropArrayConfig( - name=mask_config.name + "_cropped", - source_array_config=gt_config, - roi=crop_roi, - ) - return gt_config, mask_config - - -def get_right_resolution_array_config( - container: Path, dataset, target_resolution, extra_str="" -): - """ - Get the right resolution array configuration. It will return the right resolution array configuration. - - Args: - container : obj - The container. - dataset : str - The dataset. - target_resolution : obj - The target resolution. - extra_str : str - An extra string. - Returns: - obj : The right resolution array configuration. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> get_right_resolution_array_config(container, dataset, target_resolution, extra_str) - Notes: - This function is used to get the right resolution array configuration. - """ - level = 0 - current_dataset_path = Path(dataset, f"s{level}") - if not (container / current_dataset_path).exists(): - raise FileNotFoundError( - f"Path {container} is a Zarr Group and /s0 does not exist." 
- ) - - zarr_config = ZarrArrayConfig( - name=f"{extra_str}_{container.stem}_{dataset}_uint8", - file_name=container, - dataset=str(current_dataset_path), - snap_to_grid=target_resolution, - mode="r", - ) - zarr_array = zarr_config.array() - while ( - all([z < t for (z, t) in zip(zarr_array.voxel_size, target_resolution)]) - and Path(container, Path(dataset, f"s{level+1}")).exists() - ): - level += 1 - zarr_config = ZarrArrayConfig( - name=f"{extra_str}_{container.stem}_{dataset}_s{level}_uint8", - file_name=container, - dataset=str(Path(dataset, f"s{level}")), - snap_to_grid=target_resolution, - mode="r", - ) - - zarr_array = zarr_config.array() - return resize_if_needed(zarr_config, target_resolution, extra_str) - - -class CustomEnumMeta(EnumMeta): - """ - Custom Enum Meta class to raise KeyError when an invalid option is passed. - - Attributes: - _member_names_ : list - The list of member names. - Methods: - __getitem__(self, item) - A method to get the item. - Notes: - This class is used to raise KeyError when an invalid option is passed. - """ - - def __getitem__(self, item): - """ - Get the item. - - Args: - item : obj - The item. - Returns: - obj : The item. - Raises: - KeyError - If the item is not a valid option, a KeyError is raised. - Examples: - >>> __getitem__(item) - Notes: - This function is used to get the item. - """ - if item not in self._member_names_: - raise KeyError( - f"{item} is not a valid option of {self.__name__}, the valid options are {self._member_names_}" - ) - return super().__getitem__(item) - - -class CustomEnum(Enum, metaclass=CustomEnumMeta): - """ - A custom Enum class to raise KeyError when an invalid option is passed. - - Attributes: - __str__ : str - The string representation of the class. - Methods: - __str__(self) - A method to get the string representation of the class. - Notes: - This class is used to raise KeyError when an invalid option is passed. - """ - - def __str__(self) -> str: - """ - Get the string representation of the class. - - Args: - self : obj - The object. - Returns: - str : The string representation of the class. - Raises: - KeyError - If the item is not a valid option, a KeyError is raised. - Examples: - >>> __str__() - Notes: - This function is used to get the string representation of the class. - """ - return self.name - - -class DatasetType(CustomEnum): - """ - An Enum class to represent the dataset type. It is derived from `CustomEnum` class. - - Attributes: - val : int - The value of the dataset type. - train : int - The training dataset type. - Methods: - __str__(self) - A method to get the string representation of the class. - Notes: - This class is used to represent the dataset type. - """ - - val = 1 - train = 2 - - -class SegmentationType(CustomEnum): - """ - An Enum class to represent the segmentation type. It is derived from `CustomEnum` class. - - Attributes: - semantic : int - The semantic segmentation type. - instance : int - The instance segmentation type. - Methods: - __str__(self) - A method to get the string representation of the class. - Notes: - This class is used to represent the segmentation type. - """ - - semantic = 1 - instance = 2 - - -class DatasetSpec: - """ - A class for dataset specification. It is used to specify the dataset. - - Attributes: - dataset_type : obj - The dataset type. - raw_container : obj - The raw container. - raw_dataset : str - The raw dataset. - gt_container : obj - The ground truth container. - gt_dataset : str - The ground truth dataset. 
- Methods: - __init__(dataset_type, raw_container, raw_dataset, gt_container, gt_dataset) - Initializes the DatasetSpec class with the specified dataset type, raw container, raw dataset, ground truth container, and ground truth dataset. - __str__(self) - A method to get the string representation of the class. - Notes: - This class is used to specify the dataset. - """ - - def __init__( - self, - dataset_type: Union[str, DatasetType], - raw_container: Union[str, Path], - raw_dataset: str, - gt_container: Union[str, Path], - gt_dataset: str, - ): - """ - Initializes the DatasetSpec class with the specified dataset type, raw container, raw dataset, ground truth container, and ground truth dataset. - - Args: - dataset_type : obj - The dataset type. - raw_container : obj - The raw container. - raw_dataset : str - The raw dataset. - gt_container : obj - The ground truth container. - gt_dataset : str - The ground truth dataset. - Raises: - KeyError - If the item is not a valid option, a KeyError is raised. - Methods: - __init__(dataset_type, raw_container, raw_dataset, gt_container, gt_dataset) - Notes: - This function is used to initialize the DatasetSpec class with the specified dataset type, raw container, raw dataset, ground truth container, and ground truth dataset. - """ - if isinstance(dataset_type, str): - dataset_type = DatasetType[dataset_type.lower()] - - if isinstance(raw_container, str): - raw_container = Path(raw_container) - - if isinstance(gt_container, str): - gt_container = Path(gt_container) - - self.dataset_type = dataset_type - self.raw_container = raw_container - self.raw_dataset = raw_dataset - self.gt_container = gt_container - self.gt_dataset = gt_dataset - - def __str__(self) -> str: - """ - Get the string representation of the class. - - Args: - self : obj - The object. - Returns: - str : The string representation of the class. - Raises: - KeyError - If the item is not a valid option, a KeyError is raised. - Examples: - >>> __str__() - Notes: - This function is used to get the string representation of the class. - """ - return f"{self.raw_container.stem}_{self.gt_dataset}" - - -def generate_dataspec_from_csv(csv_path: Path): - """ - Generate the dataset specification from the CSV file. It will return the dataset specification. - - Args: - csv_path : obj - The CSV file path. - Returns: - list : The dataset specification. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> generate_dataspec_from_csv(csv_path) - Notes: - This function is used to generate the dataset specification from the CSV file. - """ - datasets = [] - if not csv_path.exists(): - raise FileNotFoundError(f"CSV file {csv_path} does not exist.") - with open(csv_path, "r") as f: - for line in f: - ( - dataset_type, - raw_container, - raw_dataset, - gt_container, - gt_dataset, - ) = line.strip().split(",") - datasets.append( - DatasetSpec( - dataset_type=DatasetType[dataset_type.lower()], - raw_container=Path(raw_container), - raw_dataset=raw_dataset, - gt_container=Path(gt_container), - gt_dataset=gt_dataset, - ) - ) - - return datasets - - -class DataSplitGenerator: - """Generates DataSplitConfig for a given task config and datasets. - - Class names in gt_dataset should be within [] e.g. [mito&peroxisome&er] for - multiple classes or [mito] for one class. - - Currently only supports: - - semantic segmentation. - Supports: - - 2D and 3D datasets. - - Zarr, N5 and OME-Zarr datasets. - - Multi class targets. 
- - Different resolutions for raw and ground truth datasets. - - Different resolutions for training and validation datasets. - - Attributes: - name : str - The name of the data split generator. - datasets : list - The list of dataset specifications. - input_resolution : obj - The input resolution. - output_resolution : obj - The output resolution. - targets : list - The list of targets. - segmentation_type : obj - The segmentation type. - max_gt_downsample : int - The maximum ground truth downsample. - max_gt_upsample : int - The maximum ground truth upsample. - max_raw_training_downsample : int - The maximum raw training downsample. - max_raw_training_upsample : int - The maximum raw training upsample. - max_raw_validation_downsample : int - The maximum raw validation downsample. - max_raw_validation_upsample : int - The maximum raw validation upsample. - min_training_volume_size : int - The minimum training volume size. - raw_min : int - The minimum raw value. - raw_max : int - The maximum raw value. - classes_separator_character : str - The classes separator character. - max_validation_volume_size : int - The maximum validation volume size. Default is None. If None, the validation volume size is not limited. - else, the validation volume size is limited to the specified value. - e.g. 600**3 for 600^3 voxels = 216_000_000 voxels. - Methods: - __init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character) - Initializes the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character. - __str__(self) - A method to get the string representation of the class. - class_name(self) - A method to get the class name. - check_class_name(self, class_name) - A method to check the class name. - compute(self) - A method to compute the data split. - __generate_semantic_seg_datasplit(self) - A method to generate the semantic segmentation data split. - __generate_semantic_seg_dataset_crop(self, dataset) - A method to generate the semantic segmentation dataset crop. - generate_csv(datasets, csv_path) - A method to generate the CSV file. - generate_from_csv(csv_path, input_resolution, output_resolution, name, **kwargs) - A method to generate the data split from the CSV file. - Notes: - - This class is used to generate the DataSplitConfig for a given task config and datasets. - - Class names in gt_dataset shoulb be within [] e.g. 
[mito&peroxisome&er] for mutiple classes or [mito] for one class - """ - - def __init__( - self, - name: str, - datasets: List[DatasetSpec], - input_resolution: Union[Sequence[int], Coordinate], - output_resolution: Union[Sequence[int], Coordinate], - targets: Optional[List[str]] = None, - segmentation_type: Union[str, SegmentationType] = "semantic", - max_gt_downsample=32, - max_gt_upsample=4, - max_raw_training_downsample=16, - max_raw_training_upsample=2, - max_raw_validation_downsample=8, - max_raw_validation_upsample=2, - min_training_volume_size=8_000, # 20**3 - raw_min=0, - raw_max=255, - classes_separator_character="&", - use_negative_class=False, - max_validation_volume_size=None, - binarize_gt=False, - ): - """ - Initializes the DataSplitGenerator class with the specified: - - name - - datasets - - input resolution - - output resolution - - targets - - segmentation type - - maximum ground truth downsample - - maximum ground truth upsample - - maximum raw training downsample - - maximum raw training upsample - - maximum raw validation downsample - - maximum raw validation upsample - - minimum training volume size - - minimum raw value - - maximum raw value - - classes separator character - - use negative class - - binarize ground truth - - Args: - name : str - The name of the data split generator. - datasets : list - The list of dataset specifications. - input_resolution : obj - The input resolution. - output_resolution : obj - The output resolution. - targets : list - The list of targets. - segmentation_type : obj - The segmentation type. - max_gt_downsample : int - The maximum ground truth downsample. - max_gt_upsample : int - The maximum ground truth upsample. - max_raw_training_downsample : int - The maximum raw training downsample. - max_raw_training_upsample : int - The maximum raw training upsample. - max_raw_validation_downsample : int - The maximum raw validation downsample. - max_raw_validation_upsample : int - The maximum raw validation upsample. - min_training_volume_size : int - The minimum training volume size. - raw_min : int - The minimum raw value. - raw_max : int - The maximum raw value. - classes_separator_character : str - The classes separator character. - use_negative_class : bool - Whether to use negative classes. - binarize_gt : bool - Whether to binarize the ground truth as part of preprocessing. Use this if you are doing semantic segmentation on instance labels (where each object has a unique ID). - Returns: - obj : The DataSplitGenerator class. - Raises: - ValueError - If the class name is already set, a ValueError is raised. - Examples: - >>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character) - Notes: - This function is used to initialize the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character. 
- - """ - if not isinstance(input_resolution, Coordinate): - input_resolution = Coordinate(input_resolution) - if not isinstance(output_resolution, Coordinate): - output_resolution = Coordinate(output_resolution) - if isinstance(segmentation_type, str): - segmentation_type = SegmentationType[segmentation_type.lower()] - - self.name = name - self.datasets = datasets - self.input_resolution = input_resolution - self.output_resolution = output_resolution - self.targets = targets - self._class_name = None - self.segmentation_type = segmentation_type - self.max_gt_downsample = max_gt_downsample - self.max_gt_upsample = max_gt_upsample - self.max_raw_training_downsample = max_raw_training_downsample - self.max_raw_training_upsample = max_raw_training_upsample - self.max_raw_validation_downsample = max_raw_validation_downsample - self.max_raw_validation_upsample = max_raw_validation_upsample - self.min_training_volume_size = min_training_volume_size - self.raw_min = raw_min - self.raw_max = raw_max - self.classes_separator_character = classes_separator_character - self.use_negative_class = use_negative_class - self.max_validation_volume_size = max_validation_volume_size - self.binarize_gt = binarize_gt - if use_negative_class: - if targets is None: - raise ValueError( - "use_negative_class=True requires targets to be specified." - ) - - def __str__(self) -> str: - """ - Get the string representation of the class. - - Args: - self : obj - The object. - Returns: - str : The string representation of the class. - Raises: - ValueError - If the class name is already set, a ValueError is raised. - Examples: - >>> __str__() - Notes: - This function is used to get the string representation of the class. - """ - return f"DataSplitGenerator:{self.name}_{self.segmentation_type}_{self.class_name}_{self.output_resolution[0]}nm" - - @property - def class_name(self): - """ - Get the class name. - - Args: - self : obj - The object. - Returns: - obj : The class name. - Raises: - ValueError - If the class name is already set, a ValueError is raised. - Examples: - >>> class_name - Notes: - This function is used to get the class name. - """ - if self._class_name is None: - if self.targets is None: - logger.warning("Both targets and class name are None.") - return None - self._class_name = self.targets - return self._class_name - - # Goal is to force class_name to be set only once, so we have the same classes for all datasets - @class_name.setter - def class_name(self, class_name): - """ - Set the class name. - - Args: - self : obj - The object. - class_name : obj - The class name. - Returns: - obj : The class name. - Raises: - ValueError - If the class name is already set, a ValueError is raised. - Examples: - >>> class_name - Notes: - This function is used to set the class name. - """ - if self._class_name is not None: - raise ValueError( - f"Class name already set. Current class name is {self.class_name} and new class name is {class_name}" - ) - self._class_name = class_name - - def check_class_name(self, class_name): - """ - Check the class name. - - Args: - self : obj - The object. - class_name : obj - The class name. - Returns: - obj : The class name. - Raises: - ValueError - If the class name is already set, a ValueError is raised. - Examples: - >>> check_class_name(class_name) - Notes: - This function is used to check the class name. 
- - """ - datasets, classes = format_class_name( - class_name, self.classes_separator_character, self.targets - ) - if self.class_name is None: - self.class_name = classes - if self.targets is None: - logger.warning( - f" No targets specified, using all classes in the dataset as target {classes}." - ) - elif self.class_name != classes: - raise ValueError( - f"Datasets are having different classes names: {classes} does not match {self.class_name}" - ) - return datasets, classes - - def compute(self): - """ - Compute the data split. - - Args: - self : obj - The object. - Returns: - obj : The data split. - Raises: - NotImplementedError - If the segmentation type is not implemented, a NotImplementedError is raised. - Examples: - >>> compute() - Notes: - This function is used to compute the data split. - """ - if self.segmentation_type == SegmentationType.semantic: - return self.__generate_semantic_seg_datasplit() - else: - raise NotImplementedError( - f"{self.segmentation_type} segmentation not implemented yet!" - ) - - def __generate_semantic_seg_datasplit(self): - """ - Generate the semantic segmentation data split. - - Args: - self : obj - The object. - Returns: - obj : The data split. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> __generate_semantic_seg_datasplit() - Notes: - This function is used to generate the semantic segmentation data split. - - """ - train_dataset_configs = [] - validation_dataset_configs = [] - for dataset in self.datasets: - ( - raw_config, - gt_config, - mask_config, - ) = self.__generate_semantic_seg_dataset_crop(dataset) - if type(self.class_name) == list: - classes = self.classes_separator_character.join(self.class_name) - else: - classes = self.class_name - if dataset.dataset_type == DatasetType.train: - train_dataset_configs.append( - RawGTDatasetConfig( - name=f"{dataset}_{gt_config.name}_{classes}_{self.output_resolution[0]}nm", - raw_config=raw_config, - gt_config=gt_config, - mask_config=mask_config, - ) - ) - else: - if self.max_validation_volume_size is not None: - gt_config, mask_config = limit_validation_crop_size( - gt_config, mask_config, self.max_validation_volume_size - ) - validation_dataset_configs.append( - RawGTDatasetConfig( - name=f"{dataset}_{gt_config.name}_{classes}_{self.output_resolution[0]}nm", - raw_config=raw_config, - gt_config=gt_config, - mask_config=mask_config, - ) - ) - - return TrainValidateDataSplitConfig( - name=f"{self.name}_{self.segmentation_type}_{classes}_{self.output_resolution[0]}nm", - train_configs=train_dataset_configs, - validate_configs=validation_dataset_configs, - ) - - def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec): - """ - Generate the semantic segmentation dataset crop. - - Args: - self : obj - The object. - dataset : obj - The dataset. - Returns: - obj : The dataset crop. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> __generate_semantic_seg_dataset_crop(dataset) - Notes: - This function is used to generate the semantic segmentation dataset crop. - """ - raw_container = dataset.raw_container - raw_dataset = dataset.raw_dataset - gt_path = dataset.gt_container - gt_dataset = dataset.gt_dataset - - if not (raw_container / raw_dataset).exists(): - raise FileNotFoundError( - f"Raw path {raw_container/raw_dataset} does not exist." 
- ) - - # print( - # f"Processing raw_container:{raw_container} raw_dataset:{raw_dataset} gt_path:{gt_path} gt_dataset:{gt_dataset}" - # ) - - if is_zarr_group(raw_container, raw_dataset): - raw_config = get_right_resolution_array_config( - raw_container, raw_dataset, self.input_resolution, "raw" - ) - else: - raw_config = resize_if_needed( - ZarrArrayConfig( - name=f"raw_{raw_container.stem}_uint8", - file_name=raw_container, - dataset=raw_dataset, - mode="r", - ), - self.input_resolution, - "raw", - ) - raw_config = IntensitiesArrayConfig( - name=f"raw_{raw_container.stem}_uint8", - source_array_config=raw_config, - min=self.raw_min, - max=self.raw_max, - ) - organelle_arrays = {} - # classes_datasets, classes = self.check_class_name(gt_dataset) - classes_datasets, classes = format_class_name( - gt_dataset, self.classes_separator_character, self.targets - ) - for current_class_dataset, current_class_name in zip(classes_datasets, classes): - if not (gt_path / current_class_dataset).exists(): - raise FileNotFoundError( - f"GT path {gt_path/current_class_dataset} does not exist." - ) - if is_zarr_group(gt_path, current_class_dataset): - gt_config = get_right_resolution_array_config( - gt_path, current_class_dataset, self.output_resolution, "gt" - ) - else: - gt_config = resize_if_needed( - ZarrArrayConfig( - name=f"gt_{gt_path.stem}_{current_class_dataset}_uint8", - file_name=gt_path, - dataset=current_class_dataset, - mode="r", - ), - self.output_resolution, - "gt", - ) - if self.binarize_gt: - gt_config = BinarizeArrayConfig( - f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized", - source_array_config=gt_config, - groupings=[(current_class_name, [])], - ) - organelle_arrays[current_class_name] = gt_config - - if self.targets is None: - targets_str = "_".join(classes) - current_targets = classes - else: - current_targets = self.targets - targets_str = "_".join(self.targets) - - target_images = dict[str, ArrayConfig]() - target_masks = dict[str, ArrayConfig]() - - missing_classes = [c for c in current_targets if c not in classes] - found_classes = [c for c in current_targets if c in classes] - for t in found_classes: - target_images[t] = organelle_arrays[t] - - if len(missing_classes) > 0: - if not self.use_negative_class: - raise ValueError( - f"Missing classes found, {str(missing_classes)}, please specify use_negative_class=True to generate the missing classes." - ) - else: - if len(organelle_arrays) == 0: - raise ValueError( - f"No target classes found, please specify targets to generate the negative classes." 
- ) - # generate negative class - if len(organelle_arrays) > 1: - found_gt_config = ConcatArrayConfig( - name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_gt", - channels=list(organelle_arrays.keys()), - source_array_configs=organelle_arrays, - ) - missing_mask_config = LogicalOrArrayConfig( - name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_labelled_voxels", - source_array_config=found_gt_config, - ) - else: - missing_mask_config = list(organelle_arrays.values())[0] - missing_gt_config = ConstantArrayConfig( - name=f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_gt", - source_array_config=list(organelle_arrays.values())[0], - constant=0, - ) - for t in missing_classes: - target_images[t] = missing_gt_config - target_masks[t] = missing_mask_config - - for t in found_classes: - target_masks[t] = ConstantArrayConfig( - name=f"{dataset}_{t}_{self.output_resolution[0]}nm_labelled_voxels", - source_array_config=target_images[t], - constant=1, - ) - - # if len(target_images) > 1: - gt_config = ConcatArrayConfig( - name=f"{dataset}_{targets_str}_{self.output_resolution[0]}nm_gt", - channels=[organelle for organelle in current_targets], - # source_array_configs={k: gt for k, gt in target_images.items()}, - source_array_configs={k: target_images[k] for k in current_targets}, - ) - mask_config = ConcatArrayConfig( - name=f"{dataset}_{targets_str}_{self.output_resolution[0]}nm_mask", - channels=[organelle for organelle in current_targets], - # source_array_configs={k: mask for k, mask in target_masks.items()}, - # to be sure to have the same order - source_array_configs={k: target_masks[k] for k in current_targets}, - ) - # else: - # gt_config = list(target_images.values())[0] - # mask_config = list(target_masks.values())[0] - - return raw_config, gt_config, mask_config - - # @staticmethod - # def generate_csv(datasets: List[DatasetSpec], csv_path: Path): - # print(f"Writing dataspecs to {csv_path}") - # with open(csv_path, "w") as f: - # for dataset in datasets: - # f.write( - # f"{dataset.dataset_type.name},{str(dataset.raw_container)},{dataset.raw_dataset},{str(dataset.gt_container)},{dataset.gt_dataset}\n" - # ) - - @staticmethod - def generate_from_csv( - csv_path: Path, - input_resolution: Union[Sequence[int], Coordinate], - output_resolution: Union[Sequence[int], Coordinate], - name: Optional[str] = None, - **kwargs, - ): - """ - Generate the data split from the CSV file. - - Args: - csv_path : obj - The CSV file path. - input_resolution : obj - The input resolution. - output_resolution : obj - The output resolution. - name : str - The name. - **kwargs : dict - The keyword arguments. - Returns: - obj : The data split. - Raises: - FileNotFoundError - If the file does not exist, a FileNotFoundError is raised. - Examples: - >>> generate_from_csv(csv_path, input_resolution, output_resolution, name, **kwargs) - Notes: - This function is used to generate the data split from the CSV file. - - """ - if isinstance(csv_path, str): - csv_path = Path(csv_path) - - if name is None: - name = csv_path.stem - - return DataSplitGenerator( - name, - generate_dataspec_from_csv(csv_path), - input_resolution, - output_resolution, - **kwargs, - ) - - -def format_class_name(class_name, separator_character="&", targets=None): - """ - Format the class name. - - Args: - class_name : obj - The class name. - separator_character : str - The separator character. - Returns: - obj : The class name. 
- Raises: - ValueError - If the class name is invalid, a ValueError is raised. - Examples: - >>> format_class_name(class_name, separator_character) - Notes: - This function is used to format the class name. - """ - if "[" in class_name: - if "]" not in class_name: - raise ValueError(f"Invalid class name {class_name} missing ']'") - classes = class_name.split("[")[1].split("]")[0].split(separator_character) - base_class_name = class_name.split("[")[0] - return [f"{base_class_name}{c}" for c in classes], classes - else: - if targets is None: - raise ValueError(f"Invalid class name {class_name} missing '[' and ']'") - if len(targets) > 1: - raise ValueError(f"Invalid class name {class_name} missing '[' and ']'") - return [class_name], [targets[0]] diff --git a/dacapo/experiments/run_config.py b/dacapo/experiments/run_config.py index 73fa9badf..98ad84752 100644 --- a/dacapo/experiments/run_config.py +++ b/dacapo/experiments/run_config.py @@ -489,7 +489,7 @@ def save_bioimage_io_model( summary = test_model(my_model_descr) summary.display() - print( + logger.info( "package path:", save_bioimageio_package(my_model_descr, output_path=path), ) @@ -551,14 +551,14 @@ def resume_training(self, stats_store, weights_store) -> int: # remove validation past existing training stats if validated_until > trained_until: - print( + logger.info( f"Trained until {trained_until}, but validated until {validated_until}! " "Deleting extra validation stats" ) self.validation_scores.delete_after(trained_until) - # print current training state - print(f"Current state: trained {trained_until}/{self.num_iterations}") + # logger.info current training state + logger.info(f"Current state: trained {trained_until}/{self.num_iterations}") # read weights of the latest iteration latest_weights_iteration = weights_store.latest_iteration(self) @@ -597,7 +597,7 @@ def resume_training(self, stats_store, weights_store) -> int: # perfectly in sync. 
We can continue training elif latest_weights_iteration == trained_until: - print(f"Resuming training from iteration {trained_until}") + logger.info(f"Resuming training from iteration {trained_until}") weights = weights_store.retrieve_weights( self, iteration=trained_until @@ -623,12 +623,8 @@ def resume_training(self, stats_store, weights_store) -> int: def train_step(self, raw: torch.Tensor, target: torch.Tensor, weight: torch.Tensor): self.optimizer.zero_grad() - print(raw.min(), raw.max(), target.min(), target.max()) - predicted = self.model.forward(raw.float().to(self.device)) - print(predicted.min(), predicted.max()) - print(weight.min(), weight.max()) predicted.retain_grad() loss = self.task.loss.compute( predicted, diff --git a/dacapo/experiments/tasks/evaluators/binary_segmentation_evaluator.py b/dacapo/experiments/tasks/evaluators/binary_segmentation_evaluator.py index 896d215b4..0a4f01ac0 100644 --- a/dacapo/experiments/tasks/evaluators/binary_segmentation_evaluator.py +++ b/dacapo/experiments/tasks/evaluators/binary_segmentation_evaluator.py @@ -131,7 +131,7 @@ def evaluate(self, output_array_identifier, evaluation_array): # removed the .squeeze() because it was used for batch size and now we are feeding 4d c, z, y, x evaluation_data = evaluation_array[evaluation_array.roi] output_data = output_array[output_array.roi] - print( + logger.info( f"Evaluating binary segmentations on evaluation_data of shape: {evaluation_data.shape}" ) assert ( diff --git a/dacapo/experiments/trainers/gunpowder_trainer_config.py b/dacapo/experiments/trainers/gunpowder_trainer_config.py index 70bd317d4..9673997bf 100644 --- a/dacapo/experiments/trainers/gunpowder_trainer_config.py +++ b/dacapo/experiments/trainers/gunpowder_trainer_config.py @@ -1,5 +1,6 @@ import attr import random +import logging from .gp_augments import AugmentConfig from .trainer_config import TrainerConfig @@ -16,6 +17,8 @@ import torch +logger = logging.getLogger(__name__) + def pipeline_generator( pipeline: gp.Pipeline, @@ -298,7 +301,7 @@ def batch_generator(): batch_gen = batch_generator() def load_batch(event): - print("fetching_batch") + logger.info("fetching_batch") batch = next(batch_gen) with viewer.txn() as s: @@ -308,7 +311,7 @@ def load_batch(event): # reverse order for raw so we can set opacity to 1, this # way higher res raw replaces low res when available for name, array in batch.arrays.items(): - print(name) + logger.info(name) data = array.data[0] channel_dims = len(data.shape) - len(array.spec.voxel_size) @@ -357,7 +360,7 @@ def load_batch(event): with viewer.config_state.txn() as s: s.input_event_bindings.data_view["keyt"] = "load_batch" - print(viewer) + logger.info(viewer) load_batch(None) input("Enter to quit!") diff --git a/dacapo/gp/elastic_augment_fuse.py b/dacapo/gp/elastic_augment_fuse.py index a703559ec..a1c6ae917 100644 --- a/dacapo/gp/elastic_augment_fuse.py +++ b/dacapo/gp/elastic_augment_fuse.py @@ -146,10 +146,6 @@ def _create_rotation_transformation(shape, angle, subsample=1, voxel_size=None): # rotate control points center = np.array([0.5 * (d - 1) * vs for d, vs in zip(shape, voxel_size)]) - # print("Creating rotation transformation with:") - # print("\tangle : " + str(angle)) - # print("\tcenter: " + str(center)) - control_point_offsets = np.zeros((dims,) + control_points, dtype=np.float32) for control_point in np.ndindex(control_points): point = np.array(control_point) * control_point_scaling_factor @@ -197,10 +193,6 @@ def _create_uniform_3d_transformation(shape, rotation, subsample=1, 
voxel_size=N # rotate control points center = np.array([0.5 * (d - 1) * vs for d, vs in zip(shape, voxel_size)]) - # print("Creating rotation transformation with:") - # print("\tangle : " + str(angle)) - # print("\tcenter: " + str(center)) - control_point_offsets = np.zeros((dims,) + control_points, dtype=np.float32) for control_point in np.ndindex(control_points): point = np.array(control_point) * control_point_scaling_factor diff --git a/dacapo/predict.py b/dacapo/predict.py index 7eda281b5..3b087088b 100644 --- a/dacapo/predict.py +++ b/dacapo/predict.py @@ -120,9 +120,9 @@ def predict( if isinstance(output_dtype, str): output_dtype = np.dtype(output_dtype) - print(f"Predicting with input size {input_size}, output size {output_size}") + logger.info(f"Predicting with input size {input_size}, output size {output_size}") - print(f"Total input ROI: {_input_roi}, output ROI: {output_roi}") + logger.info(f"Total input ROI: {_input_roi}, output ROI: {output_roi}") # prepare prediction dataset if raw_array.channel_dims == 0: @@ -147,7 +147,7 @@ def predict( # run blockwise prediction worker_file = str(Path(Path(dacapo.blockwise.__file__).parent, "predict_worker.py")) - print("Running blockwise prediction with worker_file: ", worker_file) + logger.info("Running blockwise prediction with worker_file: ", worker_file) success = run_blockwise( worker_file=worker_file, total_roi=_input_roi, @@ -162,5 +162,5 @@ def predict( input_array_identifier=input_array_identifier, output_array_identifier=output_array_identifier, ) - print("Done predicting.") + logger.info("Done predicting.") return success diff --git a/dacapo/store/converter.py b/dacapo/store/converter.py index 7e5451b3c..8e9b24c16 100644 --- a/dacapo/store/converter.py +++ b/dacapo/store/converter.py @@ -2,6 +2,10 @@ from cattr.gen import make_dict_unstructure_fn, make_dict_structure_fn from .conversion_hooks import register_hooks +import logging + +logger = logging.getLogger(__name__) + class TypedConverter(Converter): """A converter that stores and retrieves type information for selected @@ -122,10 +126,10 @@ class from unstructured data. structure_fn = make_dict_structure_fn(cls, self) return structure_fn(obj_data, cls) except Exception as e: - print( + logger.error( f"Could not structure object of type {obj_data}. will try unstructured data. attr __type__ can be missing because of old version of the data." 
             )
-            print(e)
+            logger.error(e)
             return obj_data
diff --git a/dacapo/store/file_config_store.py b/dacapo/store/file_config_store.py
index 55543b462..853f031fe 100644
--- a/dacapo/store/file_config_store.py
+++ b/dacapo/store/file_config_store.py
@@ -58,7 +58,7 @@ def __init__(self, path):
         Examples:
             >>> store = FileConfigStore("path/to/configs")
         """
-        print(f"Creating FileConfigStore:\n\tpath: {path}")
+        logger.info(f"Creating FileConfigStore:\n\tpath: {path}")
 
         self.path = Path(path)
 
diff --git a/dacapo/store/file_stats_store.py b/dacapo/store/file_stats_store.py
index 6d01eb329..e70e8a670 100644
--- a/dacapo/store/file_stats_store.py
+++ b/dacapo/store/file_stats_store.py
@@ -47,7 +47,7 @@ def __init__(self, path):
             >>> store = FileStatsStore("store")
 
         """
-        print(f"Creating FileStatsStore:\n\tpath : {path}")
+        logger.info(f"Creating FileStatsStore:\n\tpath : {path}")
 
         self.path = Path(path)
 
@@ -137,7 +137,7 @@ def store_validation_iteration_scores(self, run_name, scores):
             self.__delete_validation_iteration_scores(run_name)
 
         if store_from_iteration > 0:
-            print(
+            logger.info(
                 f"Updating validation scores of run {run_name} after iteration {store_from_iteration}"
             )
 
diff --git a/dacapo/store/local_array_store.py b/dacapo/store/local_array_store.py
index f47119831..ea4fd993e 100644
--- a/dacapo/store/local_array_store.py
+++ b/dacapo/store/local_array_store.py
@@ -212,7 +212,7 @@ def remove(self, array_identifier: "LocalArrayIdentifier") -> None:
                 f"Asked to remove dataset {dataset} in container {container}, but it is not a directory. Will not delete."
             )
             return
-        print(f"Removing dataset {dataset} in container {container}")
+        logger.info(f"Removing dataset {dataset} in container {container}")
 
         shutil.rmtree(path)
 
diff --git a/dacapo/store/local_weights_store.py b/dacapo/store/local_weights_store.py
index 9e2367ede..eac1d7cd7 100644
--- a/dacapo/store/local_weights_store.py
+++ b/dacapo/store/local_weights_store.py
@@ -63,7 +63,7 @@ def __init__(self, basedir):
             The directory is created if it does not exist.
 
         """
-        print(f"Creating local weights store in directory {basedir}")
+        logger.info(f"Creating local weights store in directory {basedir}")
 
         self.basedir = basedir
 
@@ -84,7 +84,7 @@ def save_trace(self, run: RunConfig):
                     trace_file,
                 )
             except SystemError as e:
-                print(f"Error saving trace: {e}, this model will not be traced")
+                logger.warning(f"Error saving trace: {e}, this model will not be traced")
                 trace_file.touch()
 
     def latest_iteration(self, run: str) -> Optional[int]:
@@ -159,7 +159,7 @@ def retrieve_weights(self, run: str, iteration: int) -> Weights:
             The weights are stored in the format of a Weights object, which is a simple
             container for the model and optimizer state dicts.
         """
-        print(f"Retrieving weights for run {run}, iteration {iteration}")
+        logger.info(f"Retrieving weights for run {run}, iteration {iteration}")
 
         weights_name = self.__get_weights_dir(run) / "iterations" / str(iteration)
 
@@ -272,7 +272,7 @@ def retrieve_best(self, run: str, dataset: str | Dataset, criterion: str) -> int
         Note:
             The best weights are stored in a json file that contains the iteration number.
         """
-        print(f"Retrieving weights for run {run}, criterion {criterion}")
+        logger.info(f"Retrieving weights for run {run}, criterion {criterion}")
 
         with (self.__get_weights_dir(run) / criterion / f"{dataset}.json").open(
             "r"
@@ -293,7 +293,7 @@ def _load_best(self, run: Run, criterion: str):
         Note:
             This method is used internally by the store to load the best weights for a given run and criterion.
         """
-        print(f"Retrieving weights for run {run}, criterion {criterion}")
+        logger.info(f"Retrieving weights for run {run}, criterion {criterion}")
 
         weights_name = self.__get_weights_dir(run) / f"{criterion}"
 
diff --git a/dacapo/store/mongo_config_store.py b/dacapo/store/mongo_config_store.py
index c5daecd4d..a3d3672e2 100644
--- a/dacapo/store/mongo_config_store.py
+++ b/dacapo/store/mongo_config_store.py
@@ -75,7 +75,7 @@ def __init__(self, db_host, db_name):
         Examples:
            >>> store = MongoConfigStore('localhost', 'dacapo')
         """
-        print(
+        logger.info(
             f"Creating MongoConfigStore:\n\thost : {db_host}\n\tdatabase: {db_name}"
         )
 
diff --git a/dacapo/store/mongo_stats_store.py b/dacapo/store/mongo_stats_store.py
index c05928032..c289b441f 100644
--- a/dacapo/store/mongo_stats_store.py
+++ b/dacapo/store/mongo_stats_store.py
@@ -45,7 +45,7 @@ def __init__(self, db_host, db_name):
             The MongoStatsStore will connect to the MongoDB database at the given host.
 
         """
-        print(
+        logger.info(
             f"Creating MongoStatsStore:\n\thost : {db_host}\n\tdatabase: {db_name}"
         )
 
@@ -149,7 +149,7 @@ def store_validation_iteration_scores(
             self.__delete_validation_scores(run_name)
 
         if store_from_iteration > 0:
-            print(
+            logger.info(
                 f"Updating validation scores of run {run_name} after iteration {store_from_iteration}"
             )
 
diff --git a/dacapo/train.py b/dacapo/train.py
index 9b2ff6803..cfc01ded7 100644
--- a/dacapo/train.py
+++ b/dacapo/train.py
@@ -37,7 +37,7 @@ def train(run_name: str, validate=True):
     #     # we are done here.
     #     return
 
-    print(f"Training run {run_name}")
+    logger.info(f"Training run {run_name}")
 
     # create run
 
@@ -57,7 +57,7 @@ def train_run(run: RunConfig, validate: bool = True, save_snapshots: bool = Fals
         ValueError: If run_name is not found in config store
 
     """
-    print(f"Starting/resuming training for run {run.name}...")
+    logger.info(f"Starting/resuming training for run {run.name}...")
 
     stats_store = create_stats_store()
     weights_store = create_weights_store()
diff --git a/dacapo/utils/view.py b/dacapo/utils/view.py
index 5cadc29d5..1cb7f61be 100644
--- a/dacapo/utils/view.py
+++ b/dacapo/utils/view.py
@@ -13,6 +13,9 @@
 import copy
 import json
 from typing import Optional
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 def get_viewer(
@@ -394,7 +397,7 @@ def start_neuroglancer(self, bind_address="0.0.0.0", bind_port=None):
             bind_address=bind_address, bind_port=bind_port
         )
         self.viewer = neuroglancer.Viewer()
-        print(f"Neuroglancer viewer: {self.viewer}")
+        logger.info(f"Neuroglancer viewer: {self.viewer}")
 
         with self.viewer.txn() as state:
             state.showSlices = False
@@ -589,7 +592,7 @@ def update_with_new_validation_if_possible(self):
             time.sleep(10)
             new_best_exists = self.best_score.does_new_best_exist()
            if new_best_exists:
-                print(
+                logger.info(
                    f"New best f1 score of {self.best_score.score} at iteration {self.best_score.iteration} and parameter {self.best_score.parameter}"
                )
                self.update_best_layer()