From 7b388295769d5198b7475906a6e4eb2d83235063 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Wed, 8 Jan 2025 18:26:49 +0400 Subject: [PATCH] sync with upstream - renaming some files, moving some classes (#76) * moving ItemStatus, DatasetItemStorage, DatasetItemStorageDatasetView to components/dataset_item_storage.py * moving DatasetPatch, DatasetStorage to components/dataset_storage.py * moving DEFAULT_SUBSET_NAME to util/definitions.py * renaming components/dataset_filter.py to components/filter.py * moving components/hl_ops.py to components/hl_ops/__init__.py * renaming components/dataset_generator.py to components/generator.py * moving Importer to components/importer.py * moving Transform, ItemTransform to components/transformer.py --- src/datumaro/__init__.py | 18 +- src/datumaro/cli/contexts/project/__init__.py | 2 +- src/datumaro/components/cli_plugin.py | 4 +- src/datumaro/components/dataset.py | 606 +----------------- src/datumaro/components/dataset_base.py | 164 +---- .../components/dataset_item_storage.py | 161 +++++ src/datumaro/components/dataset_storage.py | 470 ++++++++++++++ src/datumaro/components/environment.py | 12 +- src/datumaro/components/exporter.py | 2 +- .../{dataset_filter.py => filter.py} | 2 +- .../{dataset_generator.py => generator.py} | 0 .../{hl_ops.py => hl_ops/__init__.py} | 7 +- src/datumaro/components/importer.py | 103 +++ src/datumaro/components/launcher.py | 2 +- src/datumaro/components/operations.py | 3 +- src/datumaro/components/transformer.py | 62 ++ .../plugins/data_formats/ade20k2017.py | 3 +- .../plugins/data_formats/ade20k2020.py | 3 +- src/datumaro/plugins/data_formats/brats.py | 3 +- .../plugins/data_formats/brats_numpy.py | 3 +- src/datumaro/plugins/data_formats/camvid.py | 5 +- .../data_formats/celeba/align_celeba.py | 3 +- .../plugins/data_formats/celeba/celeba.py | 3 +- src/datumaro/plugins/data_formats/cifar.py | 5 +- .../plugins/data_formats/cityscapes.py | 5 +- .../plugins/data_formats/coco/base.py | 3 +- 
.../plugins/data_formats/coco/exporter.py | 2 +- .../plugins/data_formats/coco/importer.py | 3 +- .../common_semantic_segmentation.py | 3 +- .../data_formats/common_super_resolution.py | 3 +- .../plugins/data_formats/cvat/base.py | 3 +- .../plugins/data_formats/cvat/exporter.py | 2 +- .../plugins/data_formats/datumaro/base.py | 3 +- .../plugins/data_formats/datumaro/exporter.py | 5 +- .../plugins/data_formats/icdar/base.py | 3 +- .../plugins/data_formats/image_dir.py | 3 +- .../plugins/data_formats/image_zip.py | 3 +- src/datumaro/plugins/data_formats/imagenet.py | 3 +- .../plugins/data_formats/imagenet_txt.py | 3 +- src/datumaro/plugins/data_formats/kinetics.py | 3 +- .../plugins/data_formats/kitti/importer.py | 2 +- .../plugins/data_formats/kitti_raw/base.py | 3 +- .../data_formats/kitti_raw/exporter.py | 2 +- src/datumaro/plugins/data_formats/labelme.py | 3 +- src/datumaro/plugins/data_formats/lfw.py | 3 +- .../data_formats/mapillary_vistas/importer.py | 3 +- .../plugins/data_formats/market1501.py | 3 +- src/datumaro/plugins/data_formats/mars.py | 3 +- src/datumaro/plugins/data_formats/mnist.py | 3 +- .../plugins/data_formats/mnist_csv.py | 3 +- src/datumaro/plugins/data_formats/mot.py | 3 +- src/datumaro/plugins/data_formats/mots.py | 3 +- .../plugins/data_formats/mpii/mpii_json.py | 3 +- .../plugins/data_formats/mpii/mpii_mat.py | 3 +- .../plugins/data_formats/nyu_depth_v2.py | 3 +- .../plugins/data_formats/open_images.py | 5 +- .../data_formats/sly_pointcloud/base.py | 3 +- .../data_formats/sly_pointcloud/exporter.py | 2 +- src/datumaro/plugins/data_formats/synthia.py | 3 +- .../data_formats/tf_detection_api/base.py | 3 +- .../plugins/data_formats/vgg_face2.py | 3 +- src/datumaro/plugins/data_formats/video.py | 4 +- .../plugins/data_formats/voc/exporter.py | 2 +- .../plugins/data_formats/voc/importer.py | 2 +- src/datumaro/plugins/data_formats/vott_csv.py | 3 +- .../plugins/data_formats/vott_json.py | 3 +- .../plugins/data_formats/widerface.py | 3 +- 
.../plugins/data_formats/yolo/exporter.py | 6 +- .../plugins/data_formats/yolo/importer.py | 2 +- src/datumaro/plugins/ndr.py | 3 +- .../plugins/sampler/random_sampler.py | 3 +- .../plugins/sampler/relevancy_sampler.py | 3 +- src/datumaro/plugins/splitter.py | 3 +- .../plugins/synthetic_data/image_generator.py | 2 +- src/datumaro/plugins/transforms.py | 10 +- src/datumaro/util/definitions.py | 3 + tests/unit/data_formats/test_yolo_format.py | 3 +- tests/unit/test_compare.py | 3 +- tests/unit/test_dataset.py | 18 +- tests/unit/test_ops.py | 3 +- tests/unit/test_project.py | 3 +- 81 files changed, 963 insertions(+), 877 deletions(-) create mode 100644 src/datumaro/components/dataset_item_storage.py create mode 100644 src/datumaro/components/dataset_storage.py rename src/datumaro/components/{dataset_filter.py => filter.py} (99%) rename src/datumaro/components/{dataset_generator.py => generator.py} (100%) rename src/datumaro/components/{hl_ops.py => hl_ops/__init__.py} (95%) create mode 100644 src/datumaro/components/importer.py create mode 100644 src/datumaro/components/transformer.py create mode 100644 src/datumaro/util/definitions.py diff --git a/src/datumaro/__init__.py b/src/datumaro/__init__.py index d18f28647c..21c5b8d142 100644 --- a/src/datumaro/__init__.py +++ b/src/datumaro/__init__.py @@ -32,26 +32,17 @@ Skeleton, ) from .components.cli_plugin import CliPlugin -from .components.dataset import ( - Dataset, - DatasetPatch, - DatasetSubset, - IDataset, - ItemStatus, - eager_mode, -) +from .components.dataset import Dataset, DatasetSubset, IDataset, eager_mode from .components.dataset_base import ( - DEFAULT_SUBSET_NAME, CategoriesInfo, DatasetBase, DatasetItem, FailingImportErrorPolicy, - Importer, ImportErrorPolicy, - ItemTransform, SubsetBase, - Transform, ) +from .components.dataset_item_storage import ItemStatus +from .components.dataset_storage import DatasetPatch from .components.environment import Environment, PluginRegistry from .components.exporter 
import Exporter, ExportErrorPolicy, FailingExportErrorPolicy from .components.hl_ops import ( # pylint: disable=redefined-builtin @@ -62,9 +53,12 @@ transform, validate, ) +from .components.importer import Importer from .components.launcher import Launcher, ModelTransform from .components.media import ByteImage, Image, MediaElement, PointCloud, Video, VideoFrame from .components.media_manager import MediaManager from .components.progress_reporting import NullProgressReporter, ProgressReporter +from .components.transformer import ItemTransform, Transform from .components.validator import Validator +from .util.definitions import DEFAULT_SUBSET_NAME from .version import VERSION diff --git a/src/datumaro/cli/contexts/project/__init__.py b/src/datumaro/cli/contexts/project/__init__.py index d9cd50842f..ac19139766 100644 --- a/src/datumaro/cli/contexts/project/__init__.py +++ b/src/datumaro/cli/contexts/project/__init__.py @@ -8,9 +8,9 @@ import os.path as osp from enum import Enum -from datumaro.components.dataset_filter import DatasetItemEncoder from datumaro.components.environment import Environment from datumaro.components.errors import MigrationError, ProjectNotFoundError +from datumaro.components.filter import DatasetItemEncoder from datumaro.components.operations import compute_ann_statistics, compute_image_statistics from datumaro.components.project import Project, ProjectBuildTargets from datumaro.components.validator import TaskType diff --git a/src/datumaro/components/cli_plugin.py b/src/datumaro/components/cli_plugin.py index 21116f1e03..719dade324 100644 --- a/src/datumaro/components/cli_plugin.py +++ b/src/datumaro/components/cli_plugin.py @@ -15,9 +15,11 @@ def plugin_types() -> List[Type["CliPlugin"]]: global _plugin_types if _plugin_types is None: - from datumaro.components.dataset_base import DatasetBase, Importer, Transform + from datumaro.components.dataset_base import DatasetBase from datumaro.components.exporter import Exporter + from 
datumaro.components.importer import Importer from datumaro.components.launcher import Launcher + from datumaro.components.transformer import Transform from datumaro.components.validator import Validator _plugin_types = [Launcher, DatasetBase, Transform, Importer, Exporter, Validator] diff --git a/src/datumaro/components/dataset.py b/src/datumaro/components/dataset.py index 98f5ca8704..bbd170784e 100644 --- a/src/datumaro/components/dataset.py +++ b/src/datumaro/components/dataset.py @@ -11,40 +11,33 @@ import warnings from contextlib import contextmanager from copy import copy -from enum import Enum, auto from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union from datumaro.components.annotation import AnnotationType, LabelCategories from datumaro.components.config_model import Source from datumaro.components.dataset_base import ( - DEFAULT_SUBSET_NAME, CategoriesInfo, DatasetBase, DatasetItem, IDataset, ImportContext, ImportErrorPolicy, - ItemTransform, - Transform, _ImportFail, ) -from datumaro.components.dataset_filter import XPathAnnotationsFilter, XPathDatasetFilter +from datumaro.components.dataset_storage import DatasetPatch, DatasetStorage from datumaro.components.environment import Environment from datumaro.components.errors import ( - CategoriesRedefinedError, - ConflictingCategoriesError, - MediaTypeError, MultipleFormatsMatchError, NoMatchingFormatsError, - RepeatedItemError, UnknownFormatError, ) from datumaro.components.exporter import ExportContext, Exporter, ExportErrorPolicy, _ExportFail +from datumaro.components.filter import XPathAnnotationsFilter, XPathDatasetFilter from datumaro.components.launcher import Launcher, ModelTransform from datumaro.components.media import Image, MediaElement from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter -from datumaro.plugins.transforms import ProjectLabels -from datumaro.util import is_method_redefined +from 
datumaro.components.transformer import ItemTransform, Transform +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from datumaro.util.log_utils import logging_disabled from datumaro.util.os_util import rmtree from datumaro.util.scope import on_error_do, scoped @@ -52,193 +45,6 @@ DEFAULT_FORMAT = "datumaro" -class DatasetItemStorage: - def __init__(self): - self.data = {} # { subset_name: { id: DatasetItem } } - self._traversal_order = {} # maintain the order of elements - - def __iter__(self) -> Iterator[DatasetItem]: - for item in self._traversal_order.values(): - yield item - - def __len__(self) -> int: - return len(self._traversal_order) - - def is_empty(self) -> bool: - # Subsets might contain removed items, so this may differ from __len__ - return all(len(s) == 0 for s in self.data.values()) - - def put(self, item: DatasetItem) -> bool: - subset = self.data.setdefault(item.subset, {}) - is_new = subset.get(item.id) is None - self._traversal_order[(item.id, item.subset)] = item - subset[item.id] = item - return is_new - - def get( - self, id: Union[str, DatasetItem], subset: Optional[str] = None, dummy: Any = None - ) -> Optional[DatasetItem]: - if isinstance(id, DatasetItem): - id, subset = id.id, id.subset - else: - id = str(id) - subset = subset or DEFAULT_SUBSET_NAME - - return self.data.get(subset, {}).get(id, dummy) - - def remove(self, id: Union[str, DatasetItem], subset: Optional[str] = None) -> bool: - if isinstance(id, DatasetItem): - id, subset = id.id, id.subset - else: - id = str(id) - subset = subset or DEFAULT_SUBSET_NAME - - subset_data = self.data.setdefault(subset, {}) - is_removed = subset_data.get(id) is not None - subset_data[id] = None - if is_removed: - self._traversal_order.pop((id, subset)) - return is_removed - - def __contains__(self, x: Union[DatasetItem, Tuple[str, str]]) -> bool: - if not isinstance(x, tuple): - x = [x] - dummy = 0 - return self.get(*x, dummy=dummy) is not dummy - - def get_subset(self, name): - return 
self.data.get(name, {}) - - def subsets(self): - return self.data - - def __copy__(self): - copied = DatasetItemStorage() - copied._traversal_order = copy(self._traversal_order) - copied.data = copy(self.data) - return copied - - -class DatasetItemStorageDatasetView(IDataset): - class Subset(IDataset): - def __init__(self, parent: DatasetItemStorageDatasetView, name: str): - super().__init__() - self.parent = parent - self.name = name - - @property - def _data(self): - return self.parent._get_subset_data(self.name) - - def __iter__(self): - for item in self._data.values(): - if item: - yield item - - def __len__(self): - return len(self._data) - - def put(self, item): - return self._data.put(item) - - def get(self, id, subset=None): - assert (subset or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) - return self._data.get(id, subset) - - def remove(self, id, subset=None): - assert (subset or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) - return self._data.remove(id, subset) - - def get_subset(self, name): - assert (name or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) - return self - - def subsets(self): - return {self.name or DEFAULT_SUBSET_NAME: self} - - def categories(self): - return self.parent.categories() - - def media_type(self): - return self.parent.media_type() - - def __init__( - self, - parent: DatasetItemStorage, - categories: CategoriesInfo, - media_type: Optional[Type[MediaElement]], - ): - self._parent = parent - self._categories = categories - self._media_type = media_type - - def __iter__(self): - yield from self._parent - - def __len__(self): - return len(self._parent) - - def categories(self): - return self._categories - - def get_subset(self, name): - return self.Subset(self, name) - - def _get_subset_data(self, name): - return self._parent.get_subset(name) - - def subsets(self): - return {k: self.get_subset(k) for k in self._parent.subsets()} - - def get(self, id, subset=None): - return 
self._parent.get(id, subset=subset) - - def media_type(self): - return self._media_type - - -class ItemStatus(Enum): - added = auto() - modified = auto() - removed = auto() - - -class DatasetPatch: - class DatasetPatchWrapper(DatasetItemStorageDatasetView): - # The purpose of this class is to indicate that the input dataset is - # a patch and autofill patch info in Exporter - def __init__(self, patch: DatasetPatch, parent: IDataset): - super().__init__(patch.data, parent.categories(), parent.media_type()) - self.patch = patch - - def subsets(self): - return {s: self.get_subset(s) for s in self.patch.updated_subsets} - - def __init__( - self, - data: DatasetItemStorage, - categories: CategoriesInfo, - updated_items: Dict[Tuple[str, str], ItemStatus], - updated_subsets: Dict[str, ItemStatus] = None, - ): - self.data = data - self.categories = categories - self.updated_items = updated_items - self._updated_subsets = updated_subsets - - @property - def updated_subsets(self) -> Dict[str, ItemStatus]: - if self._updated_subsets is None: - self._updated_subsets = {s: ItemStatus.modified for s in self.data.subsets()} - return self._updated_subsets - - def __contains__(self, x: Union[DatasetItem, Tuple[str, str]]) -> bool: - return x in self.data - - def as_dataset(self, parent: IDataset) -> IDataset: - return __class__.DatasetPatchWrapper(self, parent) - - class DatasetSubset(IDataset): # non-owning view def __init__(self, parent: Dataset, name: str): super().__init__() @@ -281,410 +87,6 @@ def as_dataset(self) -> Dataset: return Dataset.from_extractors(self, env=self.parent.env) -class DatasetStorage(IDataset): - def __init__( - self, - source: Union[IDataset, DatasetItemStorage] = None, - categories: Optional[CategoriesInfo] = None, - media_type: Optional[Type[MediaElement]] = None, - ): - if source is None and categories is None: - categories = {} - elif isinstance(source, IDataset) and categories is not None: - raise ValueError("Can't use both source and categories") - 
self._categories = categories - - if media_type: - pass - elif isinstance(source, IDataset) and source.media_type(): - media_type = source.media_type() - else: - raise ValueError("Media type must be provided for a dataset") - assert issubclass(media_type, MediaElement) - self._media_type = media_type - - # Possible combinations: - # 1. source + storage - # - Storage contains a patch to the Source data. - # 2. no source + storage - # - a dataset created from scratch - # - a dataset from a source or transform, which was cached - if isinstance(source, DatasetItemStorage): - self._source = None - self._storage = source - else: - self._source = source - self._storage = DatasetItemStorage() # patch or cache - self._transforms = [] # A stack of postponed transforms - - # Describes changes in the dataset since initialization - self._updated_items = {} # (id, subset) -> ItemStatus - - self._flush_changes = False # Deferred flush indicator - - self._length = len(self._storage) if self._source is None else None - - def is_cache_initialized(self) -> bool: - return self._source is None and not self._transforms - - @property - def _is_unchanged_wrapper(self) -> bool: - return self._source is not None and self._storage.is_empty() and not self._transforms - - def init_cache(self): - if not self.is_cache_initialized(): - for _ in self._iter_init_cache(): - pass - - def _iter_init_cache(self) -> Iterable[DatasetItem]: - try: - # Can't just return from the method, because it won't add exception handling - # It covers cases when we save the null error handler in the source - for item in self._iter_init_cache_unchecked(): - yield item - except _ImportFail as e: - raise e.__cause__ - - def _iter_init_cache_unchecked(self) -> Iterable[DatasetItem]: - # Merges the source, source transforms and patch, caches the result - # and provides an iterator for the resulting item sequence. - # - # If iterated in parallel, the result is undefined. 
- # If storage is changed during iteration, the result is undefined. - # - # TODO: can potentially be optimized by sharing - # the cache between parallel consumers and introducing some kind of lock - # - # Cases: - # 1. Has source and patch - # 2. Has source, transforms and patch - # a. Transforms affect only an item (i.e. they are local) - # b. Transforms affect whole dataset - # - # The patch is always applied on top of the source / transforms stack. - - class _StackedTransform(Transform): - def __init__(self, source, transforms): - super().__init__(source) - - self.is_local = True - self.transforms: List[Transform] = [] - for transform in transforms: - source = transform[0](source, *transform[1], **transform[2]) - self.transforms.append(source) - - if self.is_local and not isinstance(source, ItemTransform): - self.is_local = False - - def transform_item(self, item): - for t in self.transforms: - if item is None: - break - item = t.transform_item(item) - return item - - def __iter__(self): - yield from self.transforms[-1] - - def categories(self): - return self.transforms[-1].categories() - - def media_type(self): - return self.transforms[-1].media_type() - - def _update_status(item_id, new_status: ItemStatus): - current_status = self._updated_items.get(item_id) - - if current_status is None: - self._updated_items[item_id] = new_status - elif new_status == ItemStatus.removed: - if current_status == ItemStatus.added: - self._updated_items.pop(item_id) - else: - self._updated_items[item_id] = ItemStatus.removed - elif new_status == ItemStatus.modified: - if current_status != ItemStatus.added: - self._updated_items[item_id] = ItemStatus.modified - elif new_status == ItemStatus.added: - if current_status != ItemStatus.added: - self._updated_items[item_id] = ItemStatus.modified - else: - assert False, "Unknown status %s" % new_status - - media_type = self._media_type - patch = self._storage # must be empty after transforming - cache = DatasetItemStorage() - source = 
self._source or DatasetItemStorageDatasetView( - self._storage, categories=self._categories, media_type=media_type - ) - - transform = None - old_ids = None - if self._transforms: - transform = _StackedTransform(source, self._transforms) - if transform.is_local: - # An optimized way to find modified items: - # Transform items inplace and analyze transform outputs - pass - else: - # A generic way to find modified items: - # Collect all the dataset original ids and compare - # with transform outputs. - # TODO: introduce Extractor.items() / .ids() to avoid extra - # dataset traversals? - old_ids = set((item.id, item.subset) for item in source) - source = transform - - if not issubclass(transform.media_type(), media_type): - # TODO: make it statically available - raise MediaTypeError( - "Transforms are not allowed to change media " "type of dataset items" - ) - - i = -1 - for i, item in enumerate(source): - if item.media and not isinstance(item.media, media_type): - raise MediaTypeError( - "Unexpected media type of a dataset item '%s'. 
" - "Expected '%s', actual '%s' " % (item.id, media_type, type(item.media)) - ) - - if transform and transform.is_local: - old_id = (item.id, item.subset) - item = transform.transform_item(item) - - item_id = (item.id, item.subset) if item else None - - if item_id in cache: - raise RepeatedItemError(item_id) - - if item in patch: - # Apply changes from the patch - item = patch.get(*item_id) - elif transform and not self._flush_changes: - # Find changes made by transforms, if not overridden by patch - if transform.is_local: - if not item: - _update_status(old_id, ItemStatus.removed) - elif old_id != item_id: - _update_status(old_id, ItemStatus.removed) - _update_status(item_id, ItemStatus.added) - else: - # Consider all items modified without comparison, - # because such comparison would be very expensive - _update_status(old_id, ItemStatus.modified) - else: - if item: - if item_id not in old_ids: - _update_status(item_id, ItemStatus.added) - else: - _update_status(item_id, ItemStatus.modified) - - if not item: - continue - - cache.put(item) - yield item - - if i == -1: - cache = patch - for item in patch: - if not self._flush_changes: - _update_status((item.id, item.subset), ItemStatus.added) - yield item - else: - for item in patch: - if item in cache: # already processed - continue - if not self._flush_changes: - _update_status((item.id, item.subset), ItemStatus.added) - cache.put(item) - yield item - - if not self._flush_changes and transform and not transform.is_local: - # Mark removed items that were not produced by transforms - for old_id in old_ids: - if old_id not in self._updated_items: - self._updated_items[old_id] = ItemStatus.removed - - self._storage = cache - self._length = len(cache) - - if transform: - source_cat = transform.categories() - else: - source_cat = source.categories() - if source_cat is not None: - # Don't need to override categories if already defined - self._categories = source_cat - - self._source = None - self._transforms = [] - - if 
self._flush_changes: - self._flush_changes = False - self._updated_items = {} - - def __iter__(self) -> Iterator[DatasetItem]: - if self._is_unchanged_wrapper: - yield from self._iter_init_cache() - else: - yield from self._merged() - - def _merged(self) -> IDataset: - if self._is_unchanged_wrapper: - return self._source - elif self._source is not None: - self.init_cache() - return DatasetItemStorageDatasetView(self._storage, self._categories, self._media_type) - - def __len__(self) -> int: - if self._length is None: - self.init_cache() - return self._length - - def categories(self) -> CategoriesInfo: - if self.is_cache_initialized(): - return self._categories - elif self._categories is not None: - return self._categories - elif any(is_method_redefined("categories", Transform, t[0]) for t in self._transforms): - self.init_cache() - return self._categories - else: - return self._source.categories() - - def define_categories(self, categories: CategoriesInfo): - if self._categories or self._source is not None: - raise CategoriesRedefinedError() - self._categories = categories - - def media_type(self) -> Type[MediaElement]: - return self._media_type - - def put(self, item: DatasetItem): - if item.media and not isinstance(item.media, self._media_type): - raise MediaTypeError( - "Mismatching item media type '%s', " - "the dataset contains '%s' items." 
% (type(item.media), self._media_type) - ) - - is_new = self._storage.put(item) - - if not self.is_cache_initialized() or is_new: - self._updated_items[(item.id, item.subset)] = ItemStatus.added - else: - self._updated_items[(item.id, item.subset)] = ItemStatus.modified - - if is_new and not self.is_cache_initialized(): - self._length = None - if self._length is not None: - self._length += is_new - - def get(self, id, subset=None) -> Optional[DatasetItem]: - id = str(id) - subset = subset or DEFAULT_SUBSET_NAME - - item = self._storage.get(id, subset) - if item is None and not self.is_cache_initialized(): - if self._source.get.__func__ == DatasetBase.get or self._transforms: - # can be improved if IDataset is ABC - self.init_cache() - item = self._storage.get(id, subset) - else: - item = self._source.get(id, subset) - if item: - self._storage.put(item) - return item - - def remove(self, id, subset=None): - id = str(id) - subset = subset or DEFAULT_SUBSET_NAME - - self._storage.remove(id, subset) - is_removed = self._updated_items.get((id, subset)) != ItemStatus.removed - if is_removed: - self._updated_items[(id, subset)] = ItemStatus.removed - if is_removed and not self.is_cache_initialized(): - self._length = None - if self._length is not None: - self._length -= is_removed - - def get_subset(self, name): - return self._merged().get_subset(name) - - def subsets(self): - # TODO: check if this can be optimized in case of transforms - # and other cases - return self._merged().subsets() - - def transform(self, method: Type[Transform], *args, **kwargs): - # Flush accumulated changes - if not self._storage.is_empty(): - source = self._merged() - self._storage = DatasetItemStorage() - else: - source = self._source - - if not self._transforms: - # The stack of transforms only needs a single source - self._source = source - self._transforms.append((method, args, kwargs)) - - if is_method_redefined("categories", Transform, method): - self._categories = None - self._length = 
None - - def has_updated_items(self): - return bool(self._transforms) or bool(self._updated_items) - - def get_patch(self): - # Patch includes only added or modified items. - # To find removed items, one needs to consult updated_items list. - if self._transforms: - self.init_cache() - - # The current patch (storage) - # - can miss some removals done so we add them manually - # - can include items than not in the patch - # (e.g. an item could get there after source was cached) - # So we reconstruct the patch instead of copying storage. - patch = DatasetItemStorage() - for (item_id, subset), status in self._updated_items.items(): - if status is ItemStatus.removed: - patch.remove(item_id, subset) - else: - patch.put(self._storage.get(item_id, subset)) - - return DatasetPatch(patch, self._categories, self._updated_items) - - def flush_changes(self): - self._updated_items = {} - if not (self.is_cache_initialized() or self._is_unchanged_wrapper): - self._flush_changes = True - - def update(self, source: Union[DatasetPatch, IDataset, Iterable[DatasetItem]]): - # TODO: provide a more efficient implementation with patch reuse - - if isinstance(source, DatasetPatch): - if source.categories() != self.categories(): - raise ConflictingCategoriesError() - - for item_id, status in source.updated_items.items(): - if status == ItemStatus.removed: - self.remove(*item_id) - else: - self.put(source.data.get(*item_id)) - elif isinstance(source, IDataset): - for item in ProjectLabels( - source, self.categories().get(AnnotationType.label, LabelCategories()) - ): - self.put(item) - else: - for item in source: - self.put(item) - - class Dataset(IDataset): """ Represents a dataset, contains metainfo about labels and dataset items. 
diff --git a/src/datumaro/components/dataset_base.py b/src/datumaro/components/dataset_base.py index 1e5ae36562..2325a1be8e 100644 --- a/src/datumaro/components/dataset_base.py +++ b/src/datumaro/components/dataset_base.py @@ -5,13 +5,9 @@ from __future__ import annotations -import os -import os.path as osp import warnings -from glob import iglob from typing import ( Any, - Callable, Dict, Iterator, List, @@ -31,19 +27,11 @@ from datumaro.components.annotation import Annotation, AnnotationType, Categories from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.errors import ( - AnnotationImportError, - DatasetNotFoundError, - DatumaroError, - ItemImportError, -) -from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext +from datumaro.components.errors import AnnotationImportError, DatumaroError, ItemImportError from datumaro.components.media import Image, MediaElement, PointCloud from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter -from datumaro.util import is_method_redefined from datumaro.util.attrs_util import default_if_none, not_empty - -DEFAULT_SUBSET_NAME = "default" +from datumaro.util.definitions import DEFAULT_SUBSET_NAME MediaType = TypeVar("MediaType", bound=MediaElement) @@ -412,151 +400,3 @@ def __len__(self): def get(self, id, subset=None): assert subset == self._subset, "%s != %s" % (subset, self._subset) return super().get(id, subset or self._subset) - - -class Importer(CliPlugin): - @classmethod - def detect( - cls, - context: FormatDetectionContext, - ) -> Optional[FormatDetectionConfidence]: - if not cls.find_sources_with_params(context.root_path): - context.fail("specific requirement information unavailable") - - return FormatDetectionConfidence.LOW - - @classmethod - def find_sources(cls, path) -> List[Dict]: - raise NotImplementedError() - - @classmethod - def find_sources_with_params(cls, path, **extra_params) -> List[Dict]: - return 
cls.find_sources(path) - - def __call__(self, path, **extra_params): - if not path or not osp.exists(path): - raise DatasetNotFoundError(path) - - found_sources = self.find_sources_with_params(osp.normpath(path), **extra_params) - if not found_sources: - raise DatasetNotFoundError(path) - - sources = [] - for desc in found_sources: - params = dict(extra_params) - params.update(desc.get("options", {})) - desc["options"] = params - sources.append(desc) - - return sources - - @classmethod - def _find_sources_recursive( - cls, - path: str, - ext: Optional[str], - extractor_name: str, - filename: str = "*", - dirname: str = "", - file_filter: Optional[Callable[[str], bool]] = None, - max_depth: int = 3, - ): - """ - Finds sources in the specified location, using the matching pattern - to filter file names and directories. - Supposed to be used, and to be the only call in subclasses. - - Parameters: - path: a directory or file path, where sources need to be found. - ext: file extension to match. To match directories, - set this parameter to None or ''. Comparison is case-independent, - a starting dot is not required. - extractor_name: the name of the associated Extractor type - filename: a glob pattern for file names - dirname: a glob pattern for filename prefixes - file_filter: a callable (abspath: str) -> bool, to filter paths found - max_depth: the maximum depth for recursive search. - - Returns: a list of source configurations - (i.e. Extractor type names and c-tor parameters) - """ - - if ext: - if not ext.startswith("."): - ext = "." 
+ ext - ext = ext.lower() - - if (path.lower().endswith(ext) and osp.isfile(path)) or ( - not ext - and dirname - and osp.isdir(path) - and os.sep + osp.normpath(dirname.lower()) + os.sep - in osp.abspath(path.lower()) + os.sep - ): - sources = [{"url": path, "format": extractor_name}] - else: - sources = [] - for d in range(max_depth + 1): - sources.extend( - {"url": p, "format": extractor_name} - for p in iglob(osp.join(path, *("*" * d), dirname, filename + ext)) - if (callable(file_filter) and file_filter(p)) or (not callable(file_filter)) - ) - if sources: - break - return sources - - -class Transform(_DatasetBase, CliPlugin): - """ - A base class for dataset transformations that change dataset items - or their annotations. - """ - - @staticmethod - def wrap_item(item, **kwargs): - return item.wrap(**kwargs) - - def __init__(self, extractor: IDataset): - super().__init__() - - self._extractor = extractor - - def categories(self): - return self._extractor.categories() - - def subsets(self): - if self._subsets is None: - self._subsets = set(self._extractor.subsets()) - return super().subsets() - - def __len__(self): - assert self._length in {None, "parent"} or isinstance(self._length, int) - if ( - self._length is None - and not is_method_redefined("__iter__", Transform, self) - or self._length == "parent" - ): - self._length = len(self._extractor) - return super().__len__() - - def media_type(self): - return self._extractor.media_type() - - -class ItemTransform(Transform): - def transform_item(self, item: DatasetItem) -> Optional[DatasetItem]: - """ - Returns a modified copy of the input item. - - Avoid changing and returning the input item, because it can lead to - unexpected problems. Use wrap_item() or item.wrap() to simplify copying. 
- """ - - raise NotImplementedError() - - def __iter__(self): - for item in self._extractor: - item = self.transform_item(item) - if item is not None: - yield item diff --git a/src/datumaro/components/dataset_item_storage.py b/src/datumaro/components/dataset_item_storage.py new file mode 100644 index 0000000000..b143e2639e --- /dev/null +++ b/src/datumaro/components/dataset_item_storage.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from copy import copy +from enum import Enum, auto +from typing import Any, Iterator, Optional, Tuple, Type, Union + +from datumaro.components.dataset_base import CategoriesInfo, DatasetItem, IDataset, MediaElement +from datumaro.util.definitions import DEFAULT_SUBSET_NAME + +__all__ = ["ItemStatus", "DatasetItemStorage", "DatasetItemStorageDatasetView"] + + +class ItemStatus(Enum): + added = auto() + modified = auto() + removed = auto() + + +class DatasetItemStorage: + def __init__(self): + self.data = {} # { subset_name: { id: DatasetItem } } + self._traversal_order = {} # maintain the order of elements + + def __iter__(self) -> Iterator[DatasetItem]: + for item in self._traversal_order.values(): + yield item + + def __len__(self) -> int: + return len(self._traversal_order) + + def is_empty(self) -> bool: + # Subsets might contain removed items, so this may differ from __len__ + return all(len(s) == 0 for s in self.data.values()) + + def put(self, item: DatasetItem) -> bool: + subset = self.data.setdefault(item.subset, {}) + is_new = subset.get(item.id) is None + self._traversal_order[(item.id, item.subset)] = item + subset[item.id] = item + return is_new + + def get( + self, id: Union[str, DatasetItem], subset: Optional[str] = None, dummy: Any = None + ) -> Optional[DatasetItem]: + if isinstance(id, DatasetItem): + id, subset = id.id, id.subset + else: + id = str(id) + subset = subset or DEFAULT_SUBSET_NAME + + return self.data.get(subset, {}).get(id, dummy) + + def remove(self, id: Union[str, DatasetItem], subset: 
Optional[str] = None) -> bool: + if isinstance(id, DatasetItem): + id, subset = id.id, id.subset + else: + id = str(id) + subset = subset or DEFAULT_SUBSET_NAME + + subset_data = self.data.setdefault(subset, {}) + is_removed = subset_data.get(id) is not None + subset_data[id] = None + if is_removed: + self._traversal_order.pop((id, subset)) + return is_removed + + def __contains__(self, x: Union[DatasetItem, Tuple[str, str]]) -> bool: + if not isinstance(x, tuple): + x = [x] + dummy = 0 + return self.get(*x, dummy=dummy) is not dummy + + def get_subset(self, name): + return self.data.get(name, {}) + + def subsets(self): + return self.data + + def __copy__(self): + copied = DatasetItemStorage() + copied._traversal_order = copy(self._traversal_order) + copied.data = copy(self.data) + return copied + + +class DatasetItemStorageDatasetView(IDataset): + class Subset(IDataset): + def __init__(self, parent: DatasetItemStorageDatasetView, name: str): + super().__init__() + self.parent = parent + self.name = name + + @property + def _data(self): + return self.parent._get_subset_data(self.name) + + def __iter__(self): + for item in self._data.values(): + if item: + yield item + + def __len__(self): + return len(self._data) + + def put(self, item): + return self._data.put(item) + + def get(self, id, subset=None): + assert (subset or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) + return self._data.get(id, subset) + + def remove(self, id, subset=None): + assert (subset or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) + return self._data.remove(id, subset) + + def get_subset(self, name): + assert (name or DEFAULT_SUBSET_NAME) == (self.name or DEFAULT_SUBSET_NAME) + return self + + def subsets(self): + return {self.name or DEFAULT_SUBSET_NAME: self} + + def categories(self): + return self.parent.categories() + + def media_type(self): + return self.parent.media_type() + + def __init__( + self, + parent: DatasetItemStorage, + categories: CategoriesInfo, + 
media_type: Optional[Type[MediaElement]], + ): + self._parent = parent + self._categories = categories + self._media_type = media_type + + def __iter__(self): + yield from self._parent + + def __len__(self): + return len(self._parent) + + def categories(self): + return self._categories + + def get_subset(self, name): + return self.Subset(self, name) + + def _get_subset_data(self, name): + return self._parent.get_subset(name) + + def subsets(self): + return {k: self.get_subset(k) for k in self._parent.subsets()} + + def get(self, id, subset=None): + return self._parent.get(id, subset=subset) + + def media_type(self): + return self._media_type diff --git a/src/datumaro/components/dataset_storage.py b/src/datumaro/components/dataset_storage.py new file mode 100644 index 0000000000..abf0a7e44f --- /dev/null +++ b/src/datumaro/components/dataset_storage.py @@ -0,0 +1,470 @@ +from __future__ import annotations + +from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union + +from datumaro.components.annotation import AnnotationType, LabelCategories +from datumaro.components.dataset_base import ( + CategoriesInfo, + DatasetBase, + DatasetItem, + IDataset, + _ImportFail, +) +from datumaro.components.dataset_item_storage import ( + DatasetItemStorage, + DatasetItemStorageDatasetView, + ItemStatus, +) +from datumaro.components.errors import ( + CategoriesRedefinedError, + ConflictingCategoriesError, + MediaTypeError, + RepeatedItemError, +) +from datumaro.components.media import MediaElement +from datumaro.components.transformer import ItemTransform, Transform +from datumaro.plugins.transforms import ProjectLabels +from datumaro.util import is_method_redefined +from datumaro.util.definitions import DEFAULT_SUBSET_NAME + +__all__ = ["DatasetPatch", "DatasetStorage"] + + +class DatasetPatch: + class DatasetPatchWrapper(DatasetItemStorageDatasetView): + # The purpose of this class is to indicate that the input dataset is + # a patch and autofill patch info 
in Exporter + def __init__(self, patch: DatasetPatch, parent: IDataset): + super().__init__(patch.data, parent.categories(), parent.media_type()) + self.patch = patch + + def subsets(self): + return {s: self.get_subset(s) for s in self.patch.updated_subsets} + + def __init__( + self, + data: DatasetItemStorage, + categories: CategoriesInfo, + updated_items: Dict[Tuple[str, str], ItemStatus], + updated_subsets: Dict[str, ItemStatus] = None, + ): + self.data = data + self.categories = categories + self.updated_items = updated_items + self._updated_subsets = updated_subsets + + @property + def updated_subsets(self) -> Dict[str, ItemStatus]: + if self._updated_subsets is None: + self._updated_subsets = {s: ItemStatus.modified for s in self.data.subsets()} + return self._updated_subsets + + def __contains__(self, x: Union[DatasetItem, Tuple[str, str]]) -> bool: + return x in self.data + + def as_dataset(self, parent: IDataset) -> IDataset: + return __class__.DatasetPatchWrapper(self, parent) + + +class DatasetStorage(IDataset): + def __init__( + self, + source: Union[IDataset, DatasetItemStorage] = None, + categories: Optional[CategoriesInfo] = None, + media_type: Optional[Type[MediaElement]] = None, + ): + if source is None and categories is None: + categories = {} + elif isinstance(source, IDataset) and categories is not None: + raise ValueError("Can't use both source and categories") + self._categories = categories + + if media_type: + pass + elif isinstance(source, IDataset) and source.media_type(): + media_type = source.media_type() + else: + raise ValueError("Media type must be provided for a dataset") + assert issubclass(media_type, MediaElement) + self._media_type = media_type + + # Possible combinations: + # 1. source + storage + # - Storage contains a patch to the Source data. + # 2. 
no source + storage + # - a dataset created from scratch + # - a dataset from a source or transform, which was cached + if isinstance(source, DatasetItemStorage): + self._source = None + self._storage = source + else: + self._source = source + self._storage = DatasetItemStorage() # patch or cache + self._transforms = [] # A stack of postponed transforms + + # Describes changes in the dataset since initialization + self._updated_items = {} # (id, subset) -> ItemStatus + + self._flush_changes = False # Deferred flush indicator + + self._length = len(self._storage) if self._source is None else None + + def is_cache_initialized(self) -> bool: + return self._source is None and not self._transforms + + @property + def _is_unchanged_wrapper(self) -> bool: + return self._source is not None and self._storage.is_empty() and not self._transforms + + def init_cache(self): + if not self.is_cache_initialized(): + for _ in self._iter_init_cache(): + pass + + def _iter_init_cache(self) -> Iterable[DatasetItem]: + try: + # Can't just return from the method, because it won't add exception handling + # It covers cases when we save the null error handler in the source + for item in self._iter_init_cache_unchecked(): + yield item + except _ImportFail as e: + raise e.__cause__ + + def _iter_init_cache_unchecked(self) -> Iterable[DatasetItem]: + # Merges the source, source transforms and patch, caches the result + # and provides an iterator for the resulting item sequence. + # + # If iterated in parallel, the result is undefined. + # If storage is changed during iteration, the result is undefined. + # + # TODO: can potentially be optimized by sharing + # the cache between parallel consumers and introducing some kind of lock + # + # Cases: + # 1. Has source and patch + # 2. Has source, transforms and patch + # a. Transforms affect only an item (i.e. they are local) + # b. Transforms affect whole dataset + # + # The patch is always applied on top of the source / transforms stack. 
+ + class _StackedTransform(Transform): + def __init__(self, source, transforms): + super().__init__(source) + + self.is_local = True + self.transforms: List[Transform] = [] + for transform in transforms: + source = transform[0](source, *transform[1], **transform[2]) + self.transforms.append(source) + + if self.is_local and not isinstance(source, ItemTransform): + self.is_local = False + + def transform_item(self, item): + for t in self.transforms: + if item is None: + break + item = t.transform_item(item) + return item + + def __iter__(self): + yield from self.transforms[-1] + + def categories(self): + return self.transforms[-1].categories() + + def media_type(self): + return self.transforms[-1].media_type() + + def _update_status(item_id, new_status: ItemStatus): + current_status = self._updated_items.get(item_id) + + if current_status is None: + self._updated_items[item_id] = new_status + elif new_status == ItemStatus.removed: + if current_status == ItemStatus.added: + self._updated_items.pop(item_id) + else: + self._updated_items[item_id] = ItemStatus.removed + elif new_status == ItemStatus.modified: + if current_status != ItemStatus.added: + self._updated_items[item_id] = ItemStatus.modified + elif new_status == ItemStatus.added: + if current_status != ItemStatus.added: + self._updated_items[item_id] = ItemStatus.modified + else: + assert False, "Unknown status %s" % new_status + + media_type = self._media_type + patch = self._storage # must be empty after transforming + cache = DatasetItemStorage() + source = self._source or DatasetItemStorageDatasetView( + self._storage, categories=self._categories, media_type=media_type + ) + + transform = None + old_ids = None + if self._transforms: + transform = _StackedTransform(source, self._transforms) + if transform.is_local: + # An optimized way to find modified items: + # Transform items inplace and analyze transform outputs + pass + else: + # A generic way to find modified items: + # Collect all the dataset 
original ids and compare + # with transform outputs. + # TODO: introduce Extractor.items() / .ids() to avoid extra + # dataset traversals? + old_ids = set((item.id, item.subset) for item in source) + source = transform + + if not issubclass(transform.media_type(), media_type): + # TODO: make it statically available + raise MediaTypeError( + "Transforms are not allowed to change media " "type of dataset items" + ) + + i = -1 + for i, item in enumerate(source): + if item.media and not isinstance(item.media, media_type): + raise MediaTypeError( + "Unexpected media type of a dataset item '%s'. " + "Expected '%s', actual '%s' " % (item.id, media_type, type(item.media)) + ) + + if transform and transform.is_local: + old_id = (item.id, item.subset) + item = transform.transform_item(item) + + item_id = (item.id, item.subset) if item else None + + if item_id in cache: + raise RepeatedItemError(item_id) + + if item in patch: + # Apply changes from the patch + item = patch.get(*item_id) + elif transform and not self._flush_changes: + # Find changes made by transforms, if not overridden by patch + if transform.is_local: + if not item: + _update_status(old_id, ItemStatus.removed) + elif old_id != item_id: + _update_status(old_id, ItemStatus.removed) + _update_status(item_id, ItemStatus.added) + else: + # Consider all items modified without comparison, + # because such comparison would be very expensive + _update_status(old_id, ItemStatus.modified) + else: + if item: + if item_id not in old_ids: + _update_status(item_id, ItemStatus.added) + else: + _update_status(item_id, ItemStatus.modified) + + if not item: + continue + + cache.put(item) + yield item + + if i == -1: + cache = patch + for item in patch: + if not self._flush_changes: + _update_status((item.id, item.subset), ItemStatus.added) + yield item + else: + for item in patch: + if item in cache: # already processed + continue + if not self._flush_changes: + _update_status((item.id, item.subset), ItemStatus.added) + 
cache.put(item) + yield item + + if not self._flush_changes and transform and not transform.is_local: + # Mark removed items that were not produced by transforms + for old_id in old_ids: + if old_id not in self._updated_items: + self._updated_items[old_id] = ItemStatus.removed + + self._storage = cache + self._length = len(cache) + + if transform: + source_cat = transform.categories() + else: + source_cat = source.categories() + if source_cat is not None: + # Don't need to override categories if already defined + self._categories = source_cat + + self._source = None + self._transforms = [] + + if self._flush_changes: + self._flush_changes = False + self._updated_items = {} + + def __iter__(self) -> Iterator[DatasetItem]: + if self._is_unchanged_wrapper: + yield from self._iter_init_cache() + else: + yield from self._merged() + + def _merged(self) -> IDataset: + if self._is_unchanged_wrapper: + return self._source + elif self._source is not None: + self.init_cache() + return DatasetItemStorageDatasetView(self._storage, self._categories, self._media_type) + + def __len__(self) -> int: + if self._length is None: + self.init_cache() + return self._length + + def categories(self) -> CategoriesInfo: + if self.is_cache_initialized(): + return self._categories + elif self._categories is not None: + return self._categories + elif any(is_method_redefined("categories", Transform, t[0]) for t in self._transforms): + self.init_cache() + return self._categories + else: + return self._source.categories() + + def define_categories(self, categories: CategoriesInfo): + if self._categories or self._source is not None: + raise CategoriesRedefinedError() + self._categories = categories + + def media_type(self) -> Type[MediaElement]: + return self._media_type + + def put(self, item: DatasetItem): + if item.media and not isinstance(item.media, self._media_type): + raise MediaTypeError( + "Mismatching item media type '%s', " + "the dataset contains '%s' items." 
% (type(item.media), self._media_type) + ) + + is_new = self._storage.put(item) + + if not self.is_cache_initialized() or is_new: + self._updated_items[(item.id, item.subset)] = ItemStatus.added + else: + self._updated_items[(item.id, item.subset)] = ItemStatus.modified + + if is_new and not self.is_cache_initialized(): + self._length = None + if self._length is not None: + self._length += is_new + + def get(self, id, subset=None) -> Optional[DatasetItem]: + id = str(id) + subset = subset or DEFAULT_SUBSET_NAME + + item = self._storage.get(id, subset) + if item is None and not self.is_cache_initialized(): + if self._source.get.__func__ == DatasetBase.get or self._transforms: + # can be improved if IDataset is ABC + self.init_cache() + item = self._storage.get(id, subset) + else: + item = self._source.get(id, subset) + if item: + self._storage.put(item) + return item + + def remove(self, id, subset=None): + id = str(id) + subset = subset or DEFAULT_SUBSET_NAME + + self._storage.remove(id, subset) + is_removed = self._updated_items.get((id, subset)) != ItemStatus.removed + if is_removed: + self._updated_items[(id, subset)] = ItemStatus.removed + if is_removed and not self.is_cache_initialized(): + self._length = None + if self._length is not None: + self._length -= is_removed + + def get_subset(self, name): + return self._merged().get_subset(name) + + def subsets(self): + # TODO: check if this can be optimized in case of transforms + # and other cases + return self._merged().subsets() + + def transform(self, method: Type[Transform], *args, **kwargs): + # Flush accumulated changes + if not self._storage.is_empty(): + source = self._merged() + self._storage = DatasetItemStorage() + else: + source = self._source + + if not self._transforms: + # The stack of transforms only needs a single source + self._source = source + self._transforms.append((method, args, kwargs)) + + if is_method_redefined("categories", Transform, method): + self._categories = None + self._length = 
None + + def has_updated_items(self): + return bool(self._transforms) or bool(self._updated_items) + + def get_patch(self): + # Patch includes only added or modified items. + # To find removed items, one needs to consult updated_items list. + if self._transforms: + self.init_cache() + + # The current patch (storage) + # - can miss some removals done so we add them manually + # - can include items than not in the patch + # (e.g. an item could get there after source was cached) + # So we reconstruct the patch instead of copying storage. + patch = DatasetItemStorage() + for (item_id, subset), status in self._updated_items.items(): + if status is ItemStatus.removed: + patch.remove(item_id, subset) + else: + patch.put(self._storage.get(item_id, subset)) + + return DatasetPatch(patch, self._categories, self._updated_items) + + def flush_changes(self): + self._updated_items = {} + if not (self.is_cache_initialized() or self._is_unchanged_wrapper): + self._flush_changes = True + + def update(self, source: Union[DatasetPatch, IDataset, Iterable[DatasetItem]]): + # TODO: provide a more efficient implementation with patch reuse + + if isinstance(source, DatasetPatch): + if source.categories() != self.categories(): + raise ConflictingCategoriesError() + + for item_id, status in source.updated_items.items(): + if status == ItemStatus.removed: + self.remove(*item_id) + else: + self.put(source.data.get(*item_id)) + elif isinstance(source, IDataset): + for item in ProjectLabels( + source, self.categories().get(AnnotationType.label, LabelCategories()) + ): + self.put(item) + else: + for item in source: + self.put(item) diff --git a/src/datumaro/components/environment.py b/src/datumaro/components/environment.py index 8541dcb262..9477f2149d 100644 --- a/src/datumaro/components/environment.py +++ b/src/datumaro/components/environment.py @@ -72,16 +72,12 @@ def _check_type(t, *, accept, skip): return issubclass(t, accept) and t not in skip def __init__(self): - from 
datumaro.components.dataset_base import ( - DatasetBase, - Importer, - ItemTransform, - SubsetBase, - Transform, - ) - from datumaro.components.dataset_generator import DatasetGenerator + from datumaro.components.dataset_base import DatasetBase, SubsetBase from datumaro.components.exporter import Exporter + from datumaro.components.generator import DatasetGenerator + from datumaro.components.importer import Importer from datumaro.components.launcher import Launcher + from datumaro.components.transformer import ItemTransform, Transform from datumaro.components.validator import Validator _filter = self._make_filter diff --git a/src/datumaro/components/exporter.py b/src/datumaro/components/exporter.py index ece9f72e07..8a7702e71c 100644 --- a/src/datumaro/components/exporter.py +++ b/src/datumaro/components/exporter.py @@ -204,7 +204,7 @@ def __init__( # TODO: refactor this variable. # Can be used by a subclass to store the current patch info - from datumaro.components.dataset import DatasetPatch + from datumaro.components.dataset_storage import DatasetPatch if isinstance(extractor, DatasetPatch.DatasetPatchWrapper): self._patch = extractor.patch diff --git a/src/datumaro/components/dataset_filter.py b/src/datumaro/components/filter.py similarity index 99% rename from src/datumaro/components/dataset_filter.py rename to src/datumaro/components/filter.py index 16cc82fff3..1962f5af9c 100644 --- a/src/datumaro/components/dataset_filter.py +++ b/src/datumaro/components/filter.py @@ -18,8 +18,8 @@ Polygon, PolyLine, ) -from datumaro.components.dataset_base import ItemTransform from datumaro.components.media import Image +from datumaro.components.transformer import ItemTransform class DatasetItemEncoder: diff --git a/src/datumaro/components/dataset_generator.py b/src/datumaro/components/generator.py similarity index 100% rename from src/datumaro/components/dataset_generator.py rename to src/datumaro/components/generator.py diff --git a/src/datumaro/components/hl_ops.py 
b/src/datumaro/components/hl_ops/__init__.py similarity index 95% rename from src/datumaro/components/hl_ops.py rename to src/datumaro/components/hl_ops/__init__.py index 0b756a2daf..b0116b8e6b 100644 --- a/src/datumaro/components/hl_ops.py +++ b/src/datumaro/components/hl_ops/__init__.py @@ -8,13 +8,14 @@ import shutil from typing import Dict, Optional, Type, Union -from datumaro.components.dataset import Dataset, DatasetItemStorageDatasetView, IDataset -from datumaro.components.dataset_base import Transform -from datumaro.components.dataset_filter import XPathAnnotationsFilter, XPathDatasetFilter +from datumaro.components.dataset import Dataset, IDataset +from datumaro.components.dataset_item_storage import DatasetItemStorageDatasetView from datumaro.components.environment import Environment from datumaro.components.exporter import Exporter +from datumaro.components.filter import XPathAnnotationsFilter, XPathDatasetFilter from datumaro.components.launcher import Launcher, ModelTransform from datumaro.components.operations import ExactMerge +from datumaro.components.transformer import Transform from datumaro.components.validator import TaskType, Validator from datumaro.util import parse_str_enum_value from datumaro.util.scope import on_error_do, scoped diff --git a/src/datumaro/components/importer.py b/src/datumaro/components/importer.py new file mode 100644 index 0000000000..8f7582e33a --- /dev/null +++ b/src/datumaro/components/importer.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import os +from glob import iglob +from os import path as osp +from typing import Callable, Dict, List, Optional + +from datumaro import CliPlugin +from datumaro.components.errors import DatasetNotFoundError +from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext + + +class Importer(CliPlugin): + @classmethod + def detect( + cls, + context: FormatDetectionContext, + ) -> Optional[FormatDetectionConfidence]: + if not 
cls.find_sources_with_params(context.root_path): + context.fail("specific requirement information unavailable") + + return FormatDetectionConfidence.LOW + + @classmethod + def find_sources(cls, path) -> List[Dict]: + raise NotImplementedError() + + @classmethod + def find_sources_with_params(cls, path, **extra_params) -> List[Dict]: + return cls.find_sources(path) + + def __call__(self, path, **extra_params): + if not path or not osp.exists(path): + raise DatasetNotFoundError(path) + + found_sources = self.find_sources_with_params(osp.normpath(path), **extra_params) + if not found_sources: + raise DatasetNotFoundError(path) + + sources = [] + for desc in found_sources: + params = dict(extra_params) + params.update(desc.get("options", {})) + desc["options"] = params + sources.append(desc) + + return sources + + @classmethod + def _find_sources_recursive( + cls, + path: str, + ext: Optional[str], + extractor_name: str, + filename: str = "*", + dirname: str = "", + file_filter: Optional[Callable[[str], bool]] = None, + max_depth: int = 3, + ): + """ + Finds sources in the specified location, using the matching pattern + to filter file names and directories. + Supposed to be used, and to be the only call in subclasses. + + Parameters: + path: a directory or file path, where sources need to be found. + ext: file extension to match. To match directories, + set this parameter to None or ''. Comparison is case-independent, + a starting dot is not required. + extractor_name: the name of the associated Extractor type + filename: a glob pattern for file names + dirname: a glob pattern for filename prefixes + file_filter: a callable (abspath: str) -> bool, to filter paths found + max_depth: the maximum depth for recursive search. + + Returns: a list of source configurations + (i.e. Extractor type names and c-tor parameters) + """ + + if ext: + if not ext.startswith("."): + ext = "." 
+ ext + ext = ext.lower() + + if (path.lower().endswith(ext) and osp.isfile(path)) or ( + not ext + and dirname + and osp.isdir(path) + and os.sep + osp.normpath(dirname.lower()) + os.sep + in osp.abspath(path.lower()) + os.sep + ): + sources = [{"url": path, "format": extractor_name}] + else: + sources = [] + for d in range(max_depth + 1): + sources.extend( + {"url": p, "format": extractor_name} + for p in iglob(osp.join(path, *("*" * d), dirname, filename + ext)) + if (callable(file_filter) and file_filter(p)) or (not callable(file_filter)) + ) + if sources: + break + return sources diff --git a/src/datumaro/components/launcher.py b/src/datumaro/components/launcher.py index 59e72191df..1baf43e8a8 100644 --- a/src/datumaro/components/launcher.py +++ b/src/datumaro/components/launcher.py @@ -6,7 +6,7 @@ from datumaro.components.annotation import AnnotationType, LabelCategories from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import Transform +from datumaro.components.transformer import Transform from datumaro.util import take_by diff --git a/src/datumaro/components/operations.py b/src/datumaro/components/operations.py index 5c06ee5788..f972b315fa 100644 --- a/src/datumaro/components/operations.py +++ b/src/datumaro/components/operations.py @@ -41,8 +41,9 @@ PointsCategories, ) from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset import Dataset, DatasetItemStorage, IDataset +from datumaro.components.dataset import Dataset, IDataset from datumaro.components.dataset_base import CategoriesInfo, DatasetItem +from datumaro.components.dataset_item_storage import DatasetItemStorage from datumaro.components.errors import ( AnnotationsTooCloseError, ConflictingCategoriesError, diff --git a/src/datumaro/components/transformer.py b/src/datumaro/components/transformer.py new file mode 100644 index 0000000000..b28a1f3965 --- /dev/null +++ b/src/datumaro/components/transformer.py @@ -0,0 +1,62 @@ +from 
__future__ import annotations + +from typing import Optional + +from datumaro.components.cli_plugin import CliPlugin +from datumaro.components.dataset_base import DatasetBase, DatasetItem, IDataset +from datumaro.util import is_method_redefined + + +class Transform(DatasetBase, CliPlugin): + """ + A base class for dataset transformations that change dataset items + or their annotations. + """ + + @staticmethod + def wrap_item(item, **kwargs): + return item.wrap(**kwargs) + + def __init__(self, extractor: IDataset): + super().__init__() + + self._extractor = extractor + + def categories(self): + return self._extractor.categories() + + def subsets(self): + if self._subsets is None: + self._subsets = set(self._extractor.subsets()) + return super().subsets() + + def __len__(self): + assert self._length in {None, "parent"} or isinstance(self._length, int) + if ( + self._length is None + and not is_method_redefined("__iter__", Transform, self) + or self._length == "parent" + ): + self._length = len(self._extractor) + return super().__len__() + + def media_type(self): + return self._extractor.media_type() + + +class ItemTransform(Transform): + def transform_item(self, item: DatasetItem) -> Optional[DatasetItem]: + """ + Returns a modified copy of the input item. + + Avoid changing and returning the input item, because it can lead to + unexpected problems. Use wrap_item() or item.wrap() to simplify copying. 
+ """ + + raise NotImplementedError() + + def __iter__(self): + for item in self._extractor: + item = self.transform_item(item) + if item is not None: + yield item diff --git a/src/datumaro/plugins/data_formats/ade20k2017.py b/src/datumaro/plugins/data_formats/ade20k2017.py index 96081db7be..7deb9bbf96 100644 --- a/src/datumaro/plugins/data_formats/ade20k2017.py +++ b/src/datumaro/plugins/data_formats/ade20k2017.py @@ -11,8 +11,9 @@ import numpy as np from datumaro.components.annotation import AnnotationType, CompiledMask, LabelCategories, Mask -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/ade20k2020.py b/src/datumaro/plugins/data_formats/ade20k2020.py index e8967328f2..e960dd68fe 100644 --- a/src/datumaro/plugins/data_formats/ade20k2020.py +++ b/src/datumaro/plugins/data_formats/ade20k2020.py @@ -17,8 +17,9 @@ Mask, Polygon, ) -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import parse_json from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image diff --git a/src/datumaro/plugins/data_formats/brats.py b/src/datumaro/plugins/data_formats/brats.py index 6d66371573..72bf839078 100644 --- a/src/datumaro/plugins/data_formats/brats.py +++ b/src/datumaro/plugins/data_formats/brats.py @@ 
-9,8 +9,9 @@ import numpy as np from datumaro.components.annotation import AnnotationType, LabelCategories, Mask -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import MultiframeImage diff --git a/src/datumaro/plugins/data_formats/brats_numpy.py b/src/datumaro/plugins/data_formats/brats_numpy.py index 0dad316324..20232b55bc 100644 --- a/src/datumaro/plugins/data_formats/brats_numpy.py +++ b/src/datumaro/plugins/data_formats/brats_numpy.py @@ -7,8 +7,9 @@ import numpy as np from datumaro.components.annotation import AnnotationType, Cuboid3d, LabelCategories, Mask -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import MultiframeImage from datumaro.util.pickle_util import PickleLoader diff --git a/src/datumaro/plugins/data_formats/camvid.py b/src/datumaro/plugins/data_formats/camvid.py index f99510fcc0..d3fa898022 100644 --- a/src/datumaro/plugins/data_formats/camvid.py +++ b/src/datumaro/plugins/data_formats/camvid.py @@ -20,11 +20,12 @@ Mask, MaskCategories, ) -from datumaro.components.dataset import ItemStatus -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from 
datumaro.components.media import Image from datumaro.util import find, str_to_bool from datumaro.util.annotation_util import make_label_id_mapping diff --git a/src/datumaro/plugins/data_formats/celeba/align_celeba.py b/src/datumaro/plugins/data_formats/celeba/align_celeba.py index fac944963f..64f02f1f14 100644 --- a/src/datumaro/plugins/data_formats/celeba/align_celeba.py +++ b/src/datumaro/plugins/data_formats/celeba/align_celeba.py @@ -11,8 +11,9 @@ Points, PointsCategories, ) -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import DatasetImportError +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/celeba/celeba.py b/src/datumaro/plugins/data_formats/celeba/celeba.py index 04e6d6b322..311fca83e5 100644 --- a/src/datumaro/plugins/data_formats/celeba/celeba.py +++ b/src/datumaro/plugins/data_formats/celeba/celeba.py @@ -12,8 +12,9 @@ Points, PointsCategories, ) -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import DatasetImportError +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/cifar.py b/src/datumaro/plugins/data_formats/cifar.py index 1e81b802cb..3b4fc54d2e 100644 --- a/src/datumaro/plugins/data_formats/cifar.py +++ b/src/datumaro/plugins/data_formats/cifar.py @@ -10,10 +10,11 @@ import numpy as np from datumaro.components.annotation import AnnotationType, Label, LabelCategories -from 
datumaro.components.dataset import ItemStatus -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import cast from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/cityscapes.py b/src/datumaro/plugins/data_formats/cityscapes.py index d62065ce01..19d5c55df5 100644 --- a/src/datumaro/plugins/data_formats/cityscapes.py +++ b/src/datumaro/plugins/data_formats/cityscapes.py @@ -21,11 +21,12 @@ MaskCategories, RgbColor, ) -from datumaro.components.dataset import ItemStatus -from datumaro.components.dataset_base import CategoriesInfo, DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import CategoriesInfo, DatasetItem, SubsetBase +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import find from datumaro.util.annotation_util import make_label_id_mapping diff --git a/src/datumaro/plugins/data_formats/coco/base.py b/src/datumaro/plugins/data_formats/coco/base.py index f1e192d84a..f3c188535d 100644 --- a/src/datumaro/plugins/data_formats/coco/base.py +++ b/src/datumaro/plugins/data_formats/coco/base.py @@ -24,7 +24,7 @@ RleMask, Skeleton, ) -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetItem, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from 
datumaro.components.errors import ( DatasetImportError, InvalidAnnotationError, @@ -34,6 +34,7 @@ ) from datumaro.components.media import Image from datumaro.util import NOTSET, parse_json_file, take_by +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from datumaro.util.image import lazy_image, load_image from datumaro.util.mask_tools import bgr2index from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/coco/exporter.py b/src/datumaro/plugins/data_formats/coco/exporter.py index a4abbe3e8c..ba7ecd2cc7 100644 --- a/src/datumaro/plugins/data_formats/coco/exporter.py +++ b/src/datumaro/plugins/data_formats/coco/exporter.py @@ -20,8 +20,8 @@ Points, Skeleton, ) -from datumaro.components.dataset import ItemStatus from datumaro.components.dataset_base import DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import Image diff --git a/src/datumaro/plugins/data_formats/coco/importer.py b/src/datumaro/plugins/data_formats/coco/importer.py index bd1c75951f..82edaf690c 100644 --- a/src/datumaro/plugins/data_formats/coco/importer.py +++ b/src/datumaro/plugins/data_formats/coco/importer.py @@ -6,9 +6,9 @@ import os.path as osp from glob import glob -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, Importer from datumaro.components.errors import DatasetNotFoundError from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.plugins.data_formats.coco.base import ( CocoCaptionsBase, CocoImageInfoBase, @@ -18,6 +18,7 @@ CocoPersonKeypointsBase, CocoStuffBase, ) +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from .format import CocoTask diff --git a/src/datumaro/plugins/data_formats/common_semantic_segmentation.py 
b/src/datumaro/plugins/data_formats/common_semantic_segmentation.py index 700e502b01..f9b5dcefb4 100644 --- a/src/datumaro/plugins/data_formats/common_semantic_segmentation.py +++ b/src/datumaro/plugins/data_formats/common_semantic_segmentation.py @@ -8,9 +8,10 @@ import numpy as np from datumaro.components.annotation import AnnotationType, LabelCategories, Mask, MaskCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import DatasetImportError from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images from datumaro.util.mask_tools import generate_colormap, lazy_mask diff --git a/src/datumaro/plugins/data_formats/common_super_resolution.py b/src/datumaro/plugins/data_formats/common_super_resolution.py index e62e640354..475ad3f2e0 100644 --- a/src/datumaro/plugins/data_formats/common_super_resolution.py +++ b/src/datumaro/plugins/data_formats/common_super_resolution.py @@ -5,8 +5,9 @@ import os.path as osp from datumaro.components.annotation import SuperResolutionAnnotation -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/cvat/base.py b/src/datumaro/plugins/data_formats/cvat/base.py index 95574434da..5a69c2def9 100644 --- a/src/datumaro/plugins/data_formats/cvat/base.py +++ b/src/datumaro/plugins/data_formats/cvat/base.py @@ -20,8 +20,9 @@ PolyLine, Skeleton, ) -from datumaro.components.dataset_base import DatasetItem, 
Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from .format import CvatPath diff --git a/src/datumaro/plugins/data_formats/cvat/exporter.py b/src/datumaro/plugins/data_formats/cvat/exporter.py index 62099bcb56..ee24f50d49 100644 --- a/src/datumaro/plugins/data_formats/cvat/exporter.py +++ b/src/datumaro/plugins/data_formats/cvat/exporter.py @@ -13,8 +13,8 @@ from xml.sax.saxutils import XMLGenerator # nosec from datumaro.components.annotation import AnnotationType, LabelCategories, PointsCategories -from datumaro.components.dataset import ItemStatus from datumaro.components.dataset_base import DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import Image diff --git a/src/datumaro/plugins/data_formats/datumaro/base.py b/src/datumaro/plugins/data_formats/datumaro/base.py index 31f07c4df4..b173f8f1c8 100644 --- a/src/datumaro/plugins/data_formats/datumaro/base.py +++ b/src/datumaro/plugins/data_formats/datumaro/base.py @@ -20,9 +20,10 @@ RleMask, Skeleton, ) -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import DatasetImportError, InvalidAnnotationError from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image, MediaElement, PointCloud from datumaro.util import parse_json, parse_json_file, take_by diff --git a/src/datumaro/plugins/data_formats/datumaro/exporter.py b/src/datumaro/plugins/data_formats/datumaro/exporter.py index ab9c324907..e18bb65f81 100644 --- 
a/src/datumaro/plugins/data_formats/datumaro/exporter.py +++ b/src/datumaro/plugins/data_formats/datumaro/exporter.py @@ -32,11 +32,12 @@ Skeleton, _Shape, ) -from datumaro.components.dataset import ItemStatus -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, CategoriesInfo, DatasetItem +from datumaro.components.dataset_base import CategoriesInfo, DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.exporter import Exporter from datumaro.components.media import Image, MediaElement, PointCloud from datumaro.util import cast, dump_json_file +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from .format import DatumaroPath diff --git a/src/datumaro/plugins/data_formats/icdar/base.py b/src/datumaro/plugins/data_formats/icdar/base.py index 3d47520741..f8087c3ef1 100644 --- a/src/datumaro/plugins/data_formats/icdar/base.py +++ b/src/datumaro/plugins/data_formats/icdar/base.py @@ -10,8 +10,9 @@ import numpy as np from datumaro.components.annotation import Bbox, Caption, Mask, MaskCategories, Polygon -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import IMAGE_EXTENSIONS, find_images from datumaro.util.mask_tools import lazy_mask diff --git a/src/datumaro/plugins/data_formats/image_dir.py b/src/datumaro/plugins/data_formats/image_dir.py index 9eed128f21..0b7210d6f0 100644 --- a/src/datumaro/plugins/data_formats/image_dir.py +++ b/src/datumaro/plugins/data_formats/image_dir.py @@ -6,8 +6,9 @@ import os import os.path as osp -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.exporter import 
Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/image_zip.py b/src/datumaro/plugins/data_formats/image_zip.py index d9e1d02b0c..22e0cb905f 100644 --- a/src/datumaro/plugins/data_formats/image_zip.py +++ b/src/datumaro/plugins/data_formats/image_zip.py @@ -8,8 +8,9 @@ from enum import Enum from zipfile import ZIP_BZIP2, ZIP_DEFLATED, ZIP_LZMA, ZIP_STORED, ZipFile -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import ByteImage from datumaro.util import parse_str_enum_value from datumaro.util.image import IMAGE_EXTENSIONS, encode_image diff --git a/src/datumaro/plugins/data_formats/imagenet.py b/src/datumaro/plugins/data_formats/imagenet.py index a323a847a9..303a613be4 100644 --- a/src/datumaro/plugins/data_formats/imagenet.py +++ b/src/datumaro/plugins/data_formats/imagenet.py @@ -7,9 +7,10 @@ import os.path as osp from datumaro.components.annotation import AnnotationType, Label, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/imagenet_txt.py b/src/datumaro/plugins/data_formats/imagenet_txt.py index adcfdceb3c..722573b177 100644 --- a/src/datumaro/plugins/data_formats/imagenet_txt.py +++ b/src/datumaro/plugins/data_formats/imagenet_txt.py @@ -9,10 +9,11 @@ from datumaro.components.annotation 
import AnnotationType, Label, LabelCategories from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import DatasetImportError, MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/kinetics.py b/src/datumaro/plugins/data_formats/kinetics.py index 541a98ec68..c0d942cfcb 100644 --- a/src/datumaro/plugins/data_formats/kinetics.py +++ b/src/datumaro/plugins/data_formats/kinetics.py @@ -7,8 +7,9 @@ import os.path as osp from datumaro.components.annotation import AnnotationType, Label, LabelCategories -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Video from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS from datumaro.util import parse_json, parse_json_file diff --git a/src/datumaro/plugins/data_formats/kitti/importer.py b/src/datumaro/plugins/data_formats/kitti/importer.py index a0955f221f..09bf1259b6 100644 --- a/src/datumaro/plugins/data_formats/kitti/importer.py +++ b/src/datumaro/plugins/data_formats/kitti/importer.py @@ -6,7 +6,7 @@ import os.path as osp from glob import glob -from datumaro.components.dataset_base import Importer +from datumaro.components.importer import Importer from .format import KittiPath, KittiTask diff --git a/src/datumaro/plugins/data_formats/kitti_raw/base.py 
b/src/datumaro/plugins/data_formats/kitti_raw/base.py index a1797f2a44..9cf61e912a 100644 --- a/src/datumaro/plugins/data_formats/kitti_raw/base.py +++ b/src/datumaro/plugins/data_formats/kitti_raw/base.py @@ -8,8 +8,9 @@ from defusedxml import ElementTree as ET from datumaro.components.annotation import AnnotationType, Cuboid3d, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image, PointCloud from datumaro.util import cast from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/kitti_raw/exporter.py b/src/datumaro/plugins/data_formats/kitti_raw/exporter.py index ee88a2e223..8b3f747df7 100644 --- a/src/datumaro/plugins/data_formats/kitti_raw/exporter.py +++ b/src/datumaro/plugins/data_formats/kitti_raw/exporter.py @@ -11,8 +11,8 @@ from xml.sax.saxutils import XMLGenerator # nosec from datumaro.components.annotation import AnnotationType, LabelCategories -from datumaro.components.dataset import ItemStatus from datumaro.components.dataset_base import DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import PointCloud diff --git a/src/datumaro/plugins/data_formats/labelme.py b/src/datumaro/plugins/data_formats/labelme.py index 1bc1bfb4cc..4b17b1011a 100644 --- a/src/datumaro/plugins/data_formats/labelme.py +++ b/src/datumaro/plugins/data_formats/labelme.py @@ -12,10 +12,11 @@ from defusedxml import ElementTree from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories, Mask, Polygon -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from 
datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import cast, escape, unescape from datumaro.util.image import save_image diff --git a/src/datumaro/plugins/data_formats/lfw.py b/src/datumaro/plugins/data_formats/lfw.py index 54cca15775..98fe8d8b9e 100644 --- a/src/datumaro/plugins/data_formats/lfw.py +++ b/src/datumaro/plugins/data_formats/lfw.py @@ -7,10 +7,11 @@ import re from datumaro.components.annotation import AnnotationType, Label, LabelCategories, Points -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/mapillary_vistas/importer.py b/src/datumaro/plugins/data_formats/mapillary_vistas/importer.py index 5e136e8189..148d71b045 100644 --- a/src/datumaro/plugins/data_formats/mapillary_vistas/importer.py +++ b/src/datumaro/plugins/data_formats/mapillary_vistas/importer.py @@ -5,7 +5,8 @@ import logging as log import os.path as osp -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, Importer +from datumaro.components.importer import Importer +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from .base import MapillaryVistasInstancesBase, MapillaryVistasPanopticBase from .format import MapillaryVistasPath, 
MapillaryVistasTask diff --git a/src/datumaro/plugins/data_formats/market1501.py b/src/datumaro/plugins/data_formats/market1501.py index b9eadf4e4e..322f6ec044 100644 --- a/src/datumaro/plugins/data_formats/market1501.py +++ b/src/datumaro/plugins/data_formats/market1501.py @@ -6,9 +6,10 @@ import os.path as osp import re -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import str_to_bool from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/mars.py b/src/datumaro/plugins/data_formats/mars.py index 882a885a69..52729419dd 100644 --- a/src/datumaro/plugins/data_formats/mars.py +++ b/src/datumaro/plugins/data_formats/mars.py @@ -9,8 +9,9 @@ from datumaro.components.annotation import AnnotationType, Label, LabelCategories from datumaro.components.dataset import DatasetItem -from datumaro.components.dataset_base import DatasetBase, Importer +from datumaro.components.dataset_base import DatasetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/mnist.py b/src/datumaro/plugins/data_formats/mnist.py index 8055a0c0a1..28a8a7578d 100644 --- a/src/datumaro/plugins/data_formats/mnist.py +++ b/src/datumaro/plugins/data_formats/mnist.py @@ -9,9 +9,10 @@ import numpy as np from datumaro.components.annotation import AnnotationType, Label, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from 
datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/mnist_csv.py b/src/datumaro/plugins/data_formats/mnist_csv.py index 6771a7bb48..2e67442937 100644 --- a/src/datumaro/plugins/data_formats/mnist_csv.py +++ b/src/datumaro/plugins/data_formats/mnist_csv.py @@ -8,9 +8,10 @@ import numpy as np from datumaro.components.annotation import AnnotationType, Label, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/mot.py b/src/datumaro/plugins/data_formats/mot.py index 126333509a..b18a9dc64d 100644 --- a/src/datumaro/plugins/data_formats/mot.py +++ b/src/datumaro/plugins/data_formats/mot.py @@ -15,10 +15,11 @@ from enum import Enum from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import cast from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/mots.py 
b/src/datumaro/plugins/data_formats/mots.py index 6743a704df..77cc611c1c 100644 --- a/src/datumaro/plugins/data_formats/mots.py +++ b/src/datumaro/plugins/data_formats/mots.py @@ -13,9 +13,10 @@ import numpy as np from datumaro.components.annotation import AnnotationType, LabelCategories, Mask -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images, load_image, save_image from datumaro.util.mask_tools import merge_masks diff --git a/src/datumaro/plugins/data_formats/mpii/mpii_json.py b/src/datumaro/plugins/data_formats/mpii/mpii_json.py index ec8c3c2205..75ca910710 100644 --- a/src/datumaro/plugins/data_formats/mpii/mpii_json.py +++ b/src/datumaro/plugins/data_formats/mpii/mpii_json.py @@ -7,8 +7,9 @@ import numpy as np from datumaro.components.annotation import Bbox, LabelCategories, Points, PointsCategories -from datumaro.components.dataset_base import AnnotationType, DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import AnnotationType, DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import parse_json_file diff --git a/src/datumaro/plugins/data_formats/mpii/mpii_mat.py b/src/datumaro/plugins/data_formats/mpii/mpii_mat.py index 884db5c450..60b287a378 100644 --- a/src/datumaro/plugins/data_formats/mpii/mpii_mat.py +++ b/src/datumaro/plugins/data_formats/mpii/mpii_mat.py @@ -9,8 +9,9 @@ from packaging.version import Version from datumaro.components.annotation import Bbox, LabelCategories, Points, PointsCategories -from datumaro.components.dataset_base 
import AnnotationType, DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import AnnotationType, DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from .format import MPII_POINTS_JOINTS, MPII_POINTS_LABELS diff --git a/src/datumaro/plugins/data_formats/nyu_depth_v2.py b/src/datumaro/plugins/data_formats/nyu_depth_v2.py index 442a13a52a..305f1bfb42 100644 --- a/src/datumaro/plugins/data_formats/nyu_depth_v2.py +++ b/src/datumaro/plugins/data_formats/nyu_depth_v2.py @@ -9,8 +9,9 @@ import numpy as np from datumaro.components.annotation import DepthAnnotation -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image diff --git a/src/datumaro/plugins/data_formats/open_images.py b/src/datumaro/plugins/data_formats/open_images.py index 338679472c..cee44f1e50 100644 --- a/src/datumaro/plugins/data_formats/open_images.py +++ b/src/datumaro/plugins/data_formats/open_images.py @@ -20,8 +20,8 @@ from attr import attrs from datumaro.components.annotation import AnnotationType, Bbox, Label, LabelCategories, Mask -from datumaro.components.dataset import ItemStatus -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import ( DatasetError, MediaTypeError, @@ -30,6 +30,7 @@ ) from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from 
datumaro.components.media import Image from datumaro.components.validator import Severity from datumaro.util import parse_json_file diff --git a/src/datumaro/plugins/data_formats/sly_pointcloud/base.py b/src/datumaro/plugins/data_formats/sly_pointcloud/base.py index f7ec90c7f3..a97cb8758a 100644 --- a/src/datumaro/plugins/data_formats/sly_pointcloud/base.py +++ b/src/datumaro/plugins/data_formats/sly_pointcloud/base.py @@ -6,7 +6,8 @@ from glob import iglob from datumaro.components.annotation import AnnotationType, Cuboid3d, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase +from datumaro.components.importer import Importer from datumaro.components.media import Image, PointCloud from datumaro.util import parse_json_file from datumaro.util.image import find_images diff --git a/src/datumaro/plugins/data_formats/sly_pointcloud/exporter.py b/src/datumaro/plugins/data_formats/sly_pointcloud/exporter.py index e0e2638cd2..a19ab51a6c 100644 --- a/src/datumaro/plugins/data_formats/sly_pointcloud/exporter.py +++ b/src/datumaro/plugins/data_formats/sly_pointcloud/exporter.py @@ -15,8 +15,8 @@ from datetime import datetime from datumaro.components.annotation import AnnotationType, LabelCategories -from datumaro.components.dataset import ItemStatus from datumaro.components.dataset_base import DatasetItem, IDataset +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import PointCloud diff --git a/src/datumaro/plugins/data_formats/synthia.py b/src/datumaro/plugins/data_formats/synthia.py index e35252f465..403992936c 100644 --- a/src/datumaro/plugins/data_formats/synthia.py +++ b/src/datumaro/plugins/data_formats/synthia.py @@ -8,8 +8,9 @@ import numpy as np from datumaro.components.annotation import AnnotationType, 
LabelCategories, Mask, MaskCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images, load_image from datumaro.util.mask_tools import generate_colormap, lazy_mask diff --git a/src/datumaro/plugins/data_formats/tf_detection_api/base.py b/src/datumaro/plugins/data_formats/tf_detection_api/base.py index 41ce010379..b5a8ac2ec1 100644 --- a/src/datumaro/plugins/data_formats/tf_detection_api/base.py +++ b/src/datumaro/plugins/data_formats/tf_detection_api/base.py @@ -9,7 +9,8 @@ import numpy as np from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories, Mask -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase +from datumaro.components.importer import Importer from datumaro.components.media import ByteImage from datumaro.util.image import decode_image, lazy_image from datumaro.util.tf_util import import_tf as _import_tf diff --git a/src/datumaro/plugins/data_formats/vgg_face2.py b/src/datumaro/plugins/data_formats/vgg_face2.py index e333f108de..5273242a13 100644 --- a/src/datumaro/plugins/data_formats/vgg_face2.py +++ b/src/datumaro/plugins/data_formats/vgg_face2.py @@ -8,10 +8,11 @@ import os.path as osp from datumaro.components.annotation import AnnotationType, Bbox, Label, LabelCategories, Points -from datumaro.components.dataset_base import DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from 
datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.image import find_images from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/video.py b/src/datumaro/plugins/data_formats/video.py index ce33d1a886..5e4b4aa5b8 100644 --- a/src/datumaro/plugins/data_formats/video.py +++ b/src/datumaro/plugins/data_formats/video.py @@ -5,9 +5,11 @@ import os.path as osp from typing import Optional -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetBase, DatasetItem, Importer +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Video, VideoFrame +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from datumaro.util.os_util import find_files # Taken from https://en.wikipedia.org/wiki/Comparison_of_video_container_formats diff --git a/src/datumaro/plugins/data_formats/voc/exporter.py b/src/datumaro/plugins/data_formats/voc/exporter.py index 16e84bc70e..a0e43246b6 100644 --- a/src/datumaro/plugins/data_formats/voc/exporter.py +++ b/src/datumaro/plugins/data_formats/voc/exporter.py @@ -24,8 +24,8 @@ LabelCategories, Mask, ) -from datumaro.components.dataset import ItemStatus from datumaro.components.dataset_base import DatasetItem +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import Image diff --git a/src/datumaro/plugins/data_formats/voc/importer.py b/src/datumaro/plugins/data_formats/voc/importer.py index f256b00ebc..ab4b8c2662 100644 --- a/src/datumaro/plugins/data_formats/voc/importer.py +++ b/src/datumaro/plugins/data_formats/voc/importer.py @@ -4,8 +4,8 @@ import os.path as osp 
-from datumaro.components.dataset_base import Importer from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from .format import VocPath, VocTask diff --git a/src/datumaro/plugins/data_formats/vott_csv.py b/src/datumaro/plugins/data_formats/vott_csv.py index 3398039238..b64138ed4f 100644 --- a/src/datumaro/plugins/data_formats/vott_csv.py +++ b/src/datumaro/plugins/data_formats/vott_csv.py @@ -6,8 +6,9 @@ import os.path as osp from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/vott_json.py b/src/datumaro/plugins/data_formats/vott_json.py index 2064d0df3b..d39dc431c7 100644 --- a/src/datumaro/plugins/data_formats/vott_json.py +++ b/src/datumaro/plugins/data_formats/vott_json.py @@ -5,8 +5,9 @@ import os.path as osp from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import parse_json_file from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/widerface.py b/src/datumaro/plugins/data_formats/widerface.py index f587bc3335..97cee58a06 100644 --- a/src/datumaro/plugins/data_formats/widerface.py +++ 
b/src/datumaro/plugins/data_formats/widerface.py @@ -7,10 +7,11 @@ import re from datumaro.components.annotation import AnnotationType, Bbox, Label, LabelCategories -from datumaro.components.dataset_base import DatasetItem, Importer, SubsetBase +from datumaro.components.dataset_base import DatasetItem, SubsetBase from datumaro.components.errors import MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.components.media import Image from datumaro.util import str_to_bool from datumaro.util.meta_file_util import has_meta_file, parse_meta_file diff --git a/src/datumaro/plugins/data_formats/yolo/exporter.py b/src/datumaro/plugins/data_formats/yolo/exporter.py index 0e54ddf411..acae0c5da7 100644 --- a/src/datumaro/plugins/data_formats/yolo/exporter.py +++ b/src/datumaro/plugins/data_formats/yolo/exporter.py @@ -24,12 +24,14 @@ Polygon, Skeleton, ) -from datumaro.components.dataset import DatasetPatch, ItemStatus -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetItem, IDataset +from datumaro.components.dataset_base import DatasetItem, IDataset +from datumaro.components.dataset_item_storage import ItemStatus +from datumaro.components.dataset_storage import DatasetPatch from datumaro.components.errors import DatasetExportError, MediaTypeError from datumaro.components.exporter import Exporter from datumaro.components.media import Image from datumaro.util import dump_json_file, str_to_bool +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from datumaro.util.os_util import split_path from .format import YoloPath, YoloUltralyticsClassificationFormat, YoloUltralyticsPath diff --git a/src/datumaro/plugins/data_formats/yolo/importer.py b/src/datumaro/plugins/data_formats/yolo/importer.py index d883947508..9f323a64f2 100644 --- a/src/datumaro/plugins/data_formats/yolo/importer.py +++ 
b/src/datumaro/plugins/data_formats/yolo/importer.py @@ -11,8 +11,8 @@ import yaml -from datumaro import Importer from datumaro.components.format_detection import FormatDetectionContext +from datumaro.components.importer import Importer from datumaro.plugins.data_formats.yolo.base import ( YoloUltralyticsClassificationBase, YoloUltralyticsDetectionBase, diff --git a/src/datumaro/plugins/ndr.py b/src/datumaro/plugins/ndr.py index 56d168e528..99ccb6309d 100644 --- a/src/datumaro/plugins/ndr.py +++ b/src/datumaro/plugins/ndr.py @@ -10,8 +10,9 @@ from scipy.linalg import orth from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, Transform +from datumaro.components.transformer import Transform from datumaro.util import parse_str_enum_value +from datumaro.util.definitions import DEFAULT_SUBSET_NAME class Algorithm(Enum): diff --git a/src/datumaro/plugins/sampler/random_sampler.py b/src/datumaro/plugins/sampler/random_sampler.py index 7c1b81bd4b..0bfd7551c8 100644 --- a/src/datumaro/plugins/sampler/random_sampler.py +++ b/src/datumaro/plugins/sampler/random_sampler.py @@ -9,7 +9,8 @@ from datumaro.components.annotation import AnnotationType from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import DatasetItem, IDataset, Transform +from datumaro.components.dataset_base import DatasetItem, IDataset +from datumaro.components.transformer import Transform from datumaro.util import cast diff --git a/src/datumaro/plugins/sampler/relevancy_sampler.py b/src/datumaro/plugins/sampler/relevancy_sampler.py index aa806261c0..6c204eae64 100644 --- a/src/datumaro/plugins/sampler/relevancy_sampler.py +++ b/src/datumaro/plugins/sampler/relevancy_sampler.py @@ -8,7 +8,8 @@ import pandas as pd from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import IDataset, Transform +from datumaro.components.dataset_base import IDataset +from 
datumaro.components.transformer import Transform from datumaro.util import parse_str_enum_value from .algorithm.algorithm import Algorithm, SamplingMethod diff --git a/src/datumaro/plugins/splitter.py b/src/datumaro/plugins/splitter.py index b1f6ff0ec2..4547d4754e 100644 --- a/src/datumaro/plugins/splitter.py +++ b/src/datumaro/plugins/splitter.py @@ -11,8 +11,9 @@ from datumaro.components.annotation import AnnotationType from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, Transform +from datumaro.components.transformer import Transform from datumaro.util import cast +from datumaro.util.definitions import DEFAULT_SUBSET_NAME NEAR_ZERO = 1e-7 diff --git a/src/datumaro/plugins/synthetic_data/image_generator.py b/src/datumaro/plugins/synthetic_data/image_generator.py index 3f6533ff59..824deb5475 100644 --- a/src/datumaro/plugins/synthetic_data/image_generator.py +++ b/src/datumaro/plugins/synthetic_data/image_generator.py @@ -15,7 +15,7 @@ import numpy as np import requests -from datumaro.components.dataset_generator import DatasetGenerator +from datumaro.components.generator import DatasetGenerator from datumaro.util.image import save_image from datumaro.util.scope import on_error_do, on_exit_do, scope_add, scoped diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py index 129ec8ed65..acb013467c 100644 --- a/src/datumaro/plugins/transforms.py +++ b/src/datumaro/plugins/transforms.py @@ -36,17 +36,13 @@ RleMask, ) from datumaro.components.cli_plugin import CliPlugin -from datumaro.components.dataset_base import ( - DEFAULT_SUBSET_NAME, - DatasetItem, - IDataset, - ItemTransform, - Transform, -) +from datumaro.components.dataset_base import DatasetItem, IDataset from datumaro.components.errors import DatumaroError from datumaro.components.media import Image +from datumaro.components.transformer import ItemTransform, Transform from datumaro.util import NOTSET, filter_dict, 
parse_str_enum_value, take_by from datumaro.util.annotation_util import find_group_leader, find_instances +from datumaro.util.definitions import DEFAULT_SUBSET_NAME class CropCoveredSegments(ItemTransform, CliPlugin): diff --git a/src/datumaro/util/definitions.py b/src/datumaro/util/definitions.py new file mode 100644 index 0000000000..2a95b8e683 --- /dev/null +++ b/src/datumaro/util/definitions.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +DEFAULT_SUBSET_NAME = "default" diff --git a/tests/unit/data_formats/test_yolo_format.py b/tests/unit/data_formats/test_yolo_format.py index a185e565af..b8601b8122 100644 --- a/tests/unit/data_formats/test_yolo_format.py +++ b/tests/unit/data_formats/test_yolo_format.py @@ -26,7 +26,7 @@ Skeleton, ) from datumaro.components.dataset import Dataset -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetItem +from datumaro.components.dataset_base import DatasetItem from datumaro.components.environment import Environment from datumaro.components.errors import ( AnnotationImportError, @@ -63,6 +63,7 @@ YoloUltralyticsPoseImporter, YoloUltralyticsSegmentationImporter, ) +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from datumaro.util.image import save_image from tests.requirements import Requirements, mark_requirement diff --git a/tests/unit/test_compare.py b/tests/unit/test_compare.py index b363dda779..528f1f422a 100644 --- a/tests/unit/test_compare.py +++ b/tests/unit/test_compare.py @@ -13,10 +13,11 @@ PointsCategories, Skeleton, ) -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetItem +from datumaro.components.dataset_base import DatasetItem from datumaro.components.media import Image from datumaro.components.operations import DistanceComparator, ExactComparator from datumaro.components.project import Dataset +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from tests.requirements import Requirements, mark_requirement diff --git 
a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 01b567bba5..869cd947df 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -18,23 +18,16 @@ Polygon, PolyLine, ) -from datumaro.components.dataset import DEFAULT_FORMAT, Dataset, ItemStatus, eager_mode +from datumaro.components.dataset import DEFAULT_FORMAT, Dataset, eager_mode from datumaro.components.dataset_base import ( - DEFAULT_SUBSET_NAME, DatasetBase, DatasetItem, FailingImportErrorPolicy, ImportErrorPolicy, - ItemTransform, ProgressReporter, SubsetBase, - Transform, -) -from datumaro.components.dataset_filter import ( - DatasetItemEncoder, - XPathAnnotationsFilter, - XPathDatasetFilter, ) +from datumaro.components.dataset_item_storage import ItemStatus from datumaro.components.environment import Environment from datumaro.components.errors import ( ConflictingCategoriesError, @@ -49,9 +42,16 @@ UnknownFormatError, ) from datumaro.components.exporter import Exporter +from datumaro.components.filter import ( + DatasetItemEncoder, + XPathAnnotationsFilter, + XPathDatasetFilter, +) from datumaro.components.launcher import Launcher from datumaro.components.media import Image, MediaElement, Video from datumaro.components.progress_reporting import NullProgressReporter +from datumaro.components.transformer import ItemTransform, Transform +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from tests.requirements import Requirements, mark_requirement from tests.utils.test_utils import TestDir, compare_datasets, compare_datasets_strict diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index fe3b11ca18..487012f474 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -17,7 +17,7 @@ PolyLine, ) from datumaro.components.dataset import Dataset -from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetItem +from datumaro.components.dataset_base import DatasetItem from datumaro.components.media import Image, MultiframeImage, PointCloud 
from datumaro.components.operations import ( FailedAttrVotingError, @@ -31,6 +31,7 @@ match_segments, mean_std, ) +from datumaro.util.definitions import DEFAULT_SUBSET_NAME from tests.requirements import Requirements, mark_requirement from tests.utils.assets import get_test_asset_path diff --git a/tests/unit/test_project.py b/tests/unit/test_project.py index c36832b119..1b35a97791 100644 --- a/tests/unit/test_project.py +++ b/tests/unit/test_project.py @@ -9,7 +9,7 @@ from datumaro.components.annotation import Bbox, Label from datumaro.components.config_model import Model, Source from datumaro.components.dataset import DEFAULT_FORMAT, Dataset -from datumaro.components.dataset_base import DatasetBase, DatasetItem, ItemTransform +from datumaro.components.dataset_base import DatasetBase, DatasetItem from datumaro.components.errors import ( DatasetMergeError, EmptyCommitError, @@ -29,6 +29,7 @@ from datumaro.components.launcher import Launcher from datumaro.components.media import Image from datumaro.components.project import DiffStatus, Project +from datumaro.components.transformer import ItemTransform from datumaro.util.scope import scope_add, scoped from tests.requirements import Requirements, mark_requirement