Enable stream import for COCO data format #1091

Merged

Commits (28)
4ccb5c1 Refactor dataset.py to seperate DatasetStorage (vinnamkim, Jul 4, 2023)
1412e62 Add StreamDatasetStorage and StreamDataset (vinnamkim, Jul 5, 2023)
7be138a Merge remote-tracking branch 'upstream/develop' into feature/stream-d… (vinnamkim, Jul 6, 2023)
75cd172 Update CHANGELOG.md (vinnamkim, Jul 6, 2023)
a080382 Fix typing error (vinnamkim, Jul 6, 2023)
3a09d02 Enable stream import for VOC and YOLO (vinnamkim, Jul 7, 2023)
67cd067 Merge branch 'develop' into feature/stream-importer-voc-and-yolo (vinnamkim, Jul 7, 2023)
02603fc Fix duplication (vinnamkim, Jul 7, 2023)
d0ff715 Update CHANGELOG.md (vinnamkim, Jul 7, 2023)
505f782 Update specs.json (vinnamkim, Jul 7, 2023)
2edea68 Refactor first (vinnamkim, Jul 7, 2023)
56215ca Implement streamable DatumaroBase (vinnamkim, Jul 7, 2023)
f82d070 Update tests (vinnamkim, Jul 7, 2023)
798d7fe Merge branch 'releases/1.4.0' into feature/stream-importer-datum (vinnamkim, Jul 10, 2023)
34f1e56 Update CHANGELOG.md (vinnamkim, Jul 10, 2023)
758f1ca Revert unused change (vinnamkim, Jul 10, 2023)
3df5f19 Update skip comment (vinnamkim, Jul 10, 2023)
ee79511 Fix Windows test errors (vinnamkim, Jul 11, 2023)
8d50797 Rename and update broken is_stream propagation (vinnamkim, Jul 11, 2023)
47c464b Add COCOPageMapper (vinnamkim, Jul 10, 2023)
5803c78 Update COCOImporter to introduce streaming (vinnamkim, Jul 11, 2023)
c8767c1 Update CHANGELOG.md (vinnamkim, Jul 11, 2023)
467749d Add utf-8 encoding directive for Windows (vinnamkim, Jul 12, 2023)
4d3caa5 Merge remote-tracking branch 'upstream/releases/1.4.0' into feature/s… (vinnamkim, Jul 12, 2023)
8629ef5 Change COCOPageMapper to be based on the bytes comparison for platfor… (vinnamkim, Jul 12, 2023)
08908ec Remove TYPE from test_roboflow.py (vinnamkim, Jul 12, 2023)
ab88cf9 Rename (vinnamkim, Jul 12, 2023)
3a8af6d Nit fix (vinnamkim, Jul 12, 2023)
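
Taken together, these commits turn the COCO base's eager JSON parsing into an optional streaming path. Below is a minimal usage sketch, not part of the PR's diff; it assumes the `stream` flag is exposed through `Dataset.import_from` the same way the earlier stream-import PRs (#1077, #1081, #1082) wired it for the Datumaro, VOC, and YOLO formats.

```python
# Hedged usage sketch: stream import of a COCO dataset. The entry point
# Dataset.import_from(..., stream=True) is assumed from the earlier
# stream-import PRs; "path/to/coco_root" is a placeholder COCO layout.
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from("path/to/coco_root", "coco_instances", stream=True)

# Items are parsed on demand while iterating instead of being preloaded;
# note that the first len(dataset) costs one full pass over the annotation file.
for item in dataset:
    print(item.id, len(item.annotations))
```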
CHANGELOG.md (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Migrate DVC v3.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1072>)
 - Stream dataset import/export
-  (<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>)
+  (<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>, <https://github.com/openvinotoolkit/datumaro/pull/1091>)
 - Support mask annotations for CVAT data format
   (<https://github.com/openvinotoolkit/datumaro/pull/1078>)

src/datumaro/plugins/data_formats/coco/base.py (203 changes: 130 additions & 73 deletions)
@@ -6,8 +6,9 @@
 import logging as log
 import os.path as osp
 from inspect import isclass
-from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union, overload
+from typing import Any, Dict, Iterator, Optional, Tuple, Type, TypeVar, Union, overload

+import json_stream
 import pycocotools.mask as mask_utils
 from attrs import define

@@ -34,12 +35,13 @@
 )
 from datumaro.components.importer import ImportContext
 from datumaro.components.media import Image
-from datumaro.util import NOTSET, parse_json_file, take_by
+from datumaro.util import NOTSET, parse_json_file, take_by, to_dict_from_streaming_json
 from datumaro.util.image import lazy_image, load_image
 from datumaro.util.mask_tools import bgr2index
 from datumaro.util.meta_file_util import has_meta_file, parse_meta_file

 from .format import CocoImporterType, CocoPath, CocoTask
+from .page_mapper import COCOPageMapper

 T = TypeVar("T")

@@ -102,6 +104,7 @@ def __init__(
         keep_original_category_ids: bool = False,
         coco_importer_type: CocoImporterType = CocoImporterType.default,
         subset: Optional[str] = None,
+        stream: bool = False,
         ctx: Optional[ImportContext] = None,
     ):
         if not osp.isfile(path):
@@ -126,21 +129,56 @@

         self._merge_instance_polygons = merge_instance_polygons

-        json_data = parse_json_file(path)
         self._label_map = {}  # coco_id -> dm_id
-        self._load_categories(
-            json_data,
-            keep_original_ids=keep_original_category_ids,
-        )

         if self._task == CocoTask.panoptic:
             self._mask_dir = osp.splitext(path)[0]
-        self._items = self._load_items(json_data)
-
-        del json_data
+        self._stream = stream
+        if not stream:
+            json_data = parse_json_file(path)
+
+            self._load_categories(
+                json_data,
+                keep_original_ids=keep_original_category_ids,
+            )
+
+            self._items = self._load_items(json_data)
+
+            del json_data
+        else:
+            categories_data = self.stream_parse_categories_data(path)
+
+            self._load_categories(
+                {"categories": categories_data},
+                keep_original_ids=keep_original_category_ids,
+            )
+            self._page_mapper = COCOPageMapper(path)
+            self._length = None

-    def __iter__(self):
-        yield from self._items.values()
+    def stream_parse_categories_data(self, path: str) -> Dict[str, Any]:
+        """Parse "categories" section from the given JSON file using the stream json parser"""
+        with open(path, "r", encoding="utf-8") as fp:
+            data = json_stream.load(fp)
+            categories = data.get("categories", None)
+
+            if categories is None:
+                raise DatasetImportError('Annotation JSON file should have "categories" entity.')
+
+            return to_dict_from_streaming_json(categories)
+
+    def __len__(self) -> int:
+        if self.is_stream:
+            if self._length is None:
+                self._length = sum(1 for _ in self)
+            return self._length
+        else:
+            return len(self._items)
+
+    def __iter__(self) -> Iterator[DatasetItem]:
+        if self.is_stream:
+            yield from self._stream_items()
+        else:
+            yield from self._items.values()
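
The streaming branch above leans on json_stream's transient, forward-only parsing: only the "categories" value is materialized, while the rest of the file is scanned without being kept in memory. A standalone sketch of that behavior follows (my illustration, not code from this PR; `to_dict_from_streaming_json` is assumed to behave like json_stream's `to_standard_types` helper).

```python
# Illustrative standalone version of the categories pre-pass. json_stream.load
# returns a transient, forward-only view of the JSON document; converting the
# "categories" subtree materializes just that part, so peak memory stays small
# even for multi-GB annotation files. "instances_train.json" is a made-up path.
import json_stream

with open("instances_train.json", "r", encoding="utf-8") as fp:
    data = json_stream.load(fp)                # nothing parsed yet
    categories = data.get("categories", None)  # scans forward to the key
    if categories is None:
        raise ValueError('missing "categories"')
    categories = json_stream.to_standard_types(categories)  # plain list of dicts

print(categories[0]["name"])
```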

     def _load_categories(self, json_data, *, keep_original_ids):
         self._categories = {}
@@ -225,6 +263,29 @@ def _load_person_kp_categories(self, json_cat):

         self._categories[AnnotationType.points] = categories

+    def _stream_items(self) -> Iterator[DatasetItem]:
+        pbars = self._ctx.progress_reporter
+        for img_info, ann_infos in pbars.iter(
+            self._page_mapper,
+            desc=f"Parsing image info in '{osp.basename(self._path)}'",
+        ):
+            parsed = self._parse_item(img_info)
+            if parsed is None:
+                continue
+
+            _, item = parsed
+
+            for ann_info in ann_infos:
+                self._parse_anns(img_info, ann_info, item)
+
+            yield item
+
+    def _parse_anns(self, img_info, ann_info, item):
+        if self._task is not CocoTask.panoptic:
+            self._load_annotations(ann_info, img_info, parsed_annotations=item.annotations)
+        else:
+            self._load_panoptic_ann(ann_info, parsed_annotations=item.annotations)
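
`_stream_items` relies on a contract for `COCOPageMapper` that this diff does not show (it lives in page_mapper.py, added in commit 47c464b): iterating the mapper yields one `(img_info, ann_infos)` pair per image. A naive in-memory stand-in that satisfies the same contract, for illustration only:

```python
# Naive stand-in for the COCOPageMapper contract assumed by _stream_items:
# yield (img_info, ann_infos) per image, with annotations grouped by image_id.
# The real mapper presumably records byte ranges ("pages") inside the JSON so
# pairs can be re-read lazily on every epoch; this version is eager.
from collections import defaultdict
from typing import Any, Dict, Iterator, List, Tuple

def naive_page_mapper(
    json_data: Dict[str, Any],
) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
    anns_by_img: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
    for ann in json_data.get("annotations", []):
        anns_by_img[ann["image_id"]].append(ann)
    for img_info in json_data.get("images", []):
        yield img_info, anns_by_img.get(img_info["id"], [])
```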

     def _load_items(self, json_data):
         pbars = self._ctx.progress_reporter.split(2)

@@ -239,73 +300,65 @@ def _gen_ann(info_lists):
             _gen_ann(img_lists),
             desc=f"Parsing image info in '{osp.basename(self._path)}'",
         ):
-            img_id = None
+            parsed = self._parse_item(img_info)
+            if parsed is None:
+                continue
+
+            img_id, item = parsed
+
+            # Store item (DatasetItem) and img_info (Dict) to the integer key dictionary
+            items[img_id] = item
+            img_infos[img_id] = img_info
+
+        ann_lists = self._parse_field(json_data, "annotations", list)
+
+        for ann_info in pbars[1].iter(
+            _gen_ann(ann_lists),
+            desc=f"Parsing annotations in '{osp.basename(self._path)}'",
+        ):
             try:
-                img_id = self._parse_field(img_info, "id", int)
-                img_infos[img_id] = img_info
+                img_id = self._parse_field(ann_info, "image_id", int)
+                if img_id not in img_infos:
+                    log.warn(f"Unknown image id '{img_id}'")
+                    continue

-                if img_info.get("height") and img_info.get("width"):
-                    image_size = (
-                        self._parse_field(img_info, "height", int),
-                        self._parse_field(img_info, "width", int),
-                    )
-                else:
-                    image_size = None
-
-                file_name = self._parse_field(img_info, "file_name", str)
-                items[img_id] = DatasetItem(
-                    id=osp.splitext(file_name)[0],
-                    subset=self._subset,
-                    media=Image.from_file(
-                        path=osp.join(self._images_dir, file_name), size=image_size
-                    ),
-                    annotations=[],
-                    attributes={"id": img_id},
-                )
-            except Exception as e:
-                self._ctx.error_policy.report_item_error(e, item_id=(img_id, self._subset))
+                # Retrieve item (DatasetItem) and img_info (Dict) from the integer key dictionary
+                item = items[img_id]
+                img_info = img_infos[img_id]
+                self._parse_anns(img_info, ann_info, item)
-
-        if self._task is not CocoTask.panoptic:
-            ann_lists = self._parse_field(json_data, "annotations", list)
-            for ann in pbars[1].iter(
-                _gen_ann(ann_lists),
-                desc=f"Parsing annotations in '{osp.basename(self._path)}'",
-            ):
-                img_id = None
-                try:
-                    img_id = self._parse_field(ann, "image_id", int)
-                    if img_id not in img_infos:
-                        log.warn(f"Unknown image id '{img_id}'")
-                        continue
-
-                    self._load_annotations(
-                        ann, img_infos[img_id], parsed_annotations=items[img_id].annotations
-                    )
-                except Exception as e:
-                    self._ctx.error_policy.report_annotation_error(
-                        e, item_id=(img_id, self._subset)
-                    )
-        else:
-            ann_lists = self._parse_field(json_data, "annotations", list)
-            for ann in pbars[1].iter(
-                _gen_ann(ann_lists),
-                desc=f"Parsing annotations in '{osp.basename(self._path)}'",
-            ):
-                img_id = None
-                try:
-                    img_id = self._parse_field(ann, "image_id", int)
-                    if img_id not in img_infos:
-                        log.warn(f"Unknown image id '{img_id}'")
-                        continue
-
-                    self._load_panoptic_ann(ann, items[img_id].annotations)
-                except Exception as e:
-                    self._ctx.error_policy.report_annotation_error(
-                        e, item_id=(img_id, self._subset)
-                    )
+            except Exception as e:
+                self._ctx.error_policy.report_annotation_error(
+                    e, item_id=(ann_info.get("id", None), self._subset)
+                )

         return items

+    def _parse_item(self, img_info: Dict[str, Any]) -> Optional[Tuple[int, DatasetItem]]:
+        try:
+            img_id = self._parse_field(img_info, "id", int)
+
+            if img_info.get("height") and img_info.get("width"):
+                image_size = (
+                    self._parse_field(img_info, "height", int),
+                    self._parse_field(img_info, "width", int),
+                )
+            else:
+                image_size = None
+
+            file_name = self._parse_field(img_info, "file_name", str)
+            return img_id, DatasetItem(
+                id=osp.splitext(file_name)[0],
+                subset=self._subset,
+                media=Image.from_file(path=osp.join(self._images_dir, file_name), size=image_size),
+                annotations=[],
+                attributes={"id": img_id},
+            )
+        except Exception as e:
+            self._ctx.error_policy.report_item_error(
+                e, item_id=(img_info.get("id", None), self._subset)
+            )
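
`_parse_item` factors out the per-image logic that both code paths now share. Roughly, a COCO "images" entry becomes an `(img_id, DatasetItem)` pair; the values below are invented for the example.

```python
# Invented example input/output for _parse_item (illustration only):
img_info = {"id": 5, "file_name": "000000000005.jpg", "height": 480, "width": 640}
# _parse_item(img_info) would return approximately:
# (5, DatasetItem(
#         id="000000000005",
#         subset=self._subset,
#         media=Image.from_file(path=".../images/000000000005.jpg", size=(480, 640)),
#         annotations=[],
#         attributes={"id": 5},
#     ))
# On a malformed entry it reports through the error policy and returns None,
# which is why both callers check `if parsed is None: continue`.
```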

     def _load_panoptic_ann(self, ann, parsed_annotations=None):
         if parsed_annotations is None:
             parsed_annotations = []
@@ -522,6 +575,10 @@ def _load_annotations(self, ann, image_info=None, parsed_annotations=None):

         return parsed_annotations

+    @property
+    def is_stream(self) -> bool:
+        return self._stream


 class CocoImageInfoBase(_CocoBase):
     def __init__(self, path, **kwargs):
src/datumaro/plugins/data_formats/coco/importer.py (9 changes: 8 additions & 1 deletion)
@@ -64,7 +64,7 @@ def detect(
                 with context.alternative():
                     context.require_file(f"annotations/{task.name}_*.json")

-    def __call__(self, path, **extra_params):
+    def __call__(self, path, stream: bool = False, **extra_params):
         subsets = self.find_sources(path)

         if len(subsets) == 0:
@@ -104,6 +104,9 @@ def __call__(self, path, **extra_params):
             options["coco_importer_type"] = self._IMPORTER_TYPE
             options["subset"] = subset

+            if stream:
+                options["stream"] = True
+
             sources.append(
                 {
                     "url": ann_file,
@@ -152,6 +155,10 @@ def detect_coco_task(filename):

         return subsets

+    @property
+    def can_stream(self) -> bool:
+        return True


 class CocoImageInfoImporter(CocoImporter):
     _TASK = CocoTask.image_info
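
As far as this diff shows, the flag travels end to end like this: the importer advertises streaming via `can_stream`, `__call__` copies `stream=True` into each discovered source's options, and those options reach `_CocoBase.__init__`, which flips it into the page-mapper code path. A small sketch of driving the importer by hand (`CocoInstancesImporter` is part of the existing COCO plugin; calling it directly like this is just for illustration):

```python
# Hedged sketch: exercising the importer directly to see the option plumbing.
from datumaro.plugins.data_formats.coco.importer import CocoInstancesImporter

importer = CocoInstancesImporter()
assert importer.can_stream  # advertised by this PR

sources = importer("path/to/coco_root", stream=True)  # find sources, forward flag
for src in sources:
    # Each discovered annotation file now carries stream=True in its options,
    # which _CocoBase.__init__ uses to pick the COCOPageMapper code path.
    print(src["url"], src["options"].get("stream"))
```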