Readable COCO and datumaro format for CJK (#307)

* Do not force ASCII in COCO and Datumaro JSONs for readable CJK
* Add tests
* Use utf-8 encoding for writing

Co-authored-by: Maxim Zhiltsov <[email protected]>
openvinotoolkit · Jun 23, 2021 · 28d2ed5
1 parent ed7706e
commit 28d2ed5
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 7 deletions.
diff --git a/datumaro/plugins/coco_format/converter.py b/datumaro/plugins/coco_format/converter.py
@@ -100,8 +100,8 @@ def write(self, path):
                 ann['id'] = next_id
                 next_id += 1
 
-        with open(path, 'w') as outfile:
-            json.dump(self._data, outfile)
+        with open(path, 'w', encoding='utf-8') as outfile:
+            json.dump(self._data, outfile, ensure_ascii=False)
 
     @property
     def annotations(self):
@@ -461,8 +461,8 @@ class _StuffConverter(_InstancesConverter):
 
 class _PanopticConverter(_TaskConverter):
     def write(self, path):
-        with open(path, 'w') as outfile:
-            json.dump(self._data, outfile)
+        with open(path, 'w', encoding='utf-8') as outfile:
+            json.dump(self._data, outfile, ensure_ascii=False)
 
     def save_categories(self, dataset):
         label_categories = dataset.categories().get(AnnotationType.label)

diff --git a/datumaro/plugins/datumaro_format/converter.py b/datumaro/plugins/datumaro_format/converter.py
@@ -100,8 +100,9 @@ def write_categories(self, categories):
             self.categories[ann_type.name] = converted_desc
 
     def write(self, save_dir):
-        with open(osp.join(save_dir, '%s.json' % self._name), 'w') as f:
-            json.dump(self._data, f)
+        with open(osp.join(save_dir, '%s.json' % self._name), 'w',
+                encoding='utf-8') as f:
+            json.dump(self._data, f, ensure_ascii=False)
 
     def _convert_annotation(self, obj):
         assert isinstance(obj, Annotation)
@@ -290,4 +291,4 @@ def convert(cls, extractor, save_dir, **kwargs):
         DatumaroConverter.convert(extractor,
             save_dir=osp.join(
                 project.config.project_dir, project.config.dataset_dir),
-            **kwargs)
+            **kwargs)
diff --git a/tests/requirements.py b/tests/requirements.py
@@ -18,6 +18,7 @@ class Requirements:
 
     # GitHub issues (not bugs)
     # https://github.com/openvinotoolkit/datumaro/issues
+    DATUM_231 = "Readable formats for CJK"
     DATUM_244 = "Add Snyk integration"
     DATUM_267 = "Add Image zip format"
     DATUM_280 = "Support KITTI dataset formats"

diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py
@@ -718,6 +718,36 @@ def test_can_save_and_load_images(self):
             self._test_save_and_load(expected_dataset,
                 CocoImageInfoConverter.convert, test_dir)
 
+    @mark_requirement(Requirements.DATUM_231)
+    def test_can_save_dataset_with_cjk_categories(self):
+        expected_dataset = Dataset.from_iterable([
+            DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(0, 1, 2, 2,
+                        label=0, group=1, id=1,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 1}),
+            DatasetItem(id=2, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(1, 0, 2, 2, label=1, group=2, id=2,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 2}),
+
+            DatasetItem(id=3, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(0, 1, 2, 2, label=2, group=3, id=3,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 3}),
+            ],
+            categories=[
+                "고양이", "ネコ", "猫"
+            ]
+        )
+
+        with TestDir() as test_dir:
+            self._test_save_and_load(expected_dataset,
+                CocoInstancesConverter.convert, test_dir)
+
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self):
         expected_dataset = Dataset.from_iterable([

diff --git a/tests/test_datumaro_format.py b/tests/test_datumaro_format.py
@@ -111,6 +111,37 @@ def test_relative_paths(self):
             self._test_save_and_load(test_dataset,
                 partial(DatumaroConverter.convert, save_images=True), test_dir)
 
+
+    @mark_requirement(Requirements.DATUM_231)
+    def test_can_save_dataset_with_cjk_categories(self):
+        expected = Dataset.from_iterable([
+            DatasetItem(id=1, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(0, 1, 2, 2,
+                        label=0, group=1, id=1,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 1}),
+            DatasetItem(id=2, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(1, 0, 2, 2, label=1, group=2, id=2,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 2}),
+
+            DatasetItem(id=3, subset='train', image=np.ones((4, 4, 3)),
+                annotations=[
+                    Bbox(0, 1, 2, 2, label=2, group=3, id=3,
+                        attributes={ 'is_crowd': False }),
+                ], attributes={'id': 3}),
+            ],
+            categories=[
+                "고양이", "ネコ", "猫"
+            ]
+        )
+
+        with TestDir() as test_dir:
+            self._test_save_and_load(expected,
+                partial(DatumaroConverter.convert, save_images=True), test_dir)
+
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self):
         test_dataset = Dataset.from_iterable([