Handle path separators in the subset when exporting a datumaro dataset (#1615)

Jihyeon Yi · web-flow · commit dc9bbf2e18e5 · 2024-09-25T13:31:29.000+09:00
### Summary Ticket: 152928 Raise an error when exporting a dataset in Datumaro format if the subset name contains path separators.  ### How to test  ### Checklist  - [x] I have added unit tests to cover my changes. - [ ] I have added integration tests to cover my changes. - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md). - [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [x] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2024 Intel Corporation # # SPDX-License-Identifier: MIT ```
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## \[Q3 2024 Release 1.9.0\]
+## \[Unreleased\]
+
+### New features
+
+### Enhancements
+- Raise `PathSeparatorInSubsetNameError` when exporting a dataset in the Datumaro or DatumaroBinary format if a subset name contains path separators.
+  (<https://github.com/openvinotoolkit/datumaro/pull/1615>)
+
+### Bug fixes
+
+## Q3 2024 Release 1.9.0
 ### New features
 - Add a new CLI command: datum format
   (<https://github.com/openvinotoolkit/datumaro/pull/1570>)
diff --git a/docs/source/docs/data-formats/formats/datumaro.md b/docs/source/docs/data-formats/formats/datumaro.md
@@ -73,6 +73,8 @@ A Datumaro dataset directory should have the following structure:
         └── ...
 ```
 
+Note that a subset name must not contain path separators: exporting a dataset with such a subset name raises `PathSeparatorInSubsetNameError`.
+
 If your dataset is not following the above directory structure,
 it cannot detect and import your dataset as the Datumaro format properly.
 
diff --git a/docs/source/docs/data-formats/formats/datumaro_binary.md b/docs/source/docs/data-formats/formats/datumaro_binary.md
@@ -113,6 +113,8 @@ A DatumaroBinary dataset directory should have the following structure:
         └── ...
 ```
 
+Note that a subset name must not contain path separators: exporting a dataset with such a subset name raises `PathSeparatorInSubsetNameError`.
+
 If your dataset is not following the above directory structure,
 it cannot detect and import your dataset as the DatumaroBinary format properly.
 
diff --git a/src/datumaro/components/errors.py b/src/datumaro/components/errors.py
@@ -342,6 +342,16 @@ def __str__(self):
         return f"Item {self.item_id} is repeated in the source sequence."
 
 
+@define(auto_exc=False)
+class PathSeparatorInSubsetNameError(DatasetError):
+    subset: str = field()
+
+    def __str__(self):
+        return (
+            f"Failed to export the subset '{self.subset}': subset name contains path separator(s)."
+        )
+
+
 class DatasetQualityError(DatasetError):
     pass
 
diff --git a/src/datumaro/plugins/data_formats/datumaro/exporter.py b/src/datumaro/plugins/data_formats/datumaro/exporter.py
@@ -38,6 +38,7 @@
 from datumaro.components.crypter import NULL_CRYPTER
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.dataset_item_storage import ItemStatus
+from datumaro.components.errors import PathSeparatorInSubsetNameError
 from datumaro.components.exporter import ExportContextComponent, Exporter
 from datumaro.components.media import Image, MediaElement, PointCloud, Video, VideoFrame
 from datumaro.util import cast, dump_json_file
@@ -185,7 +186,8 @@ def context_save_media(
 
             if context.save_media:
                 fname = context.make_video_filename(item)
-                context.save_video(item, fname=fname, subdir=item.subset)
+                subdir = item.subset.replace(os.sep, "_") if item.subset else None
+                context.save_video(item, fname=fname, subdir=subdir)
                 item.media = Video(
                     path=fname,
                     step=video._step,
@@ -200,7 +202,8 @@ def context_save_media(
 
             if context.save_media:
                 fname = context.make_video_filename(item)
-                context.save_video(item, fname=fname, subdir=item.subset)
+                subdir = item.subset.replace(os.sep, "_") if item.subset else None
+                context.save_video(item, fname=fname, subdir=subdir)
                 item.media = VideoFrame(Video(fname), video_frame.index)
 
             yield
@@ -210,8 +213,9 @@ def context_save_media(
 
             if context.save_media:
                 # Temporarily update image path and save it.
-                fname = context.make_image_filename(item)
-                context.save_image(item, encryption=encryption, fname=fname, subdir=item.subset)
+                fname = context.make_image_filename(item, name=str(item.id).replace(os.sep, "_"))
+                subdir = item.subset.replace(os.sep, "_") if item.subset else None
+                context.save_image(item, encryption=encryption, fname=fname, subdir=subdir)
                 item.media = Image.from_file(path=fname, size=image._size)
 
             yield
@@ -220,14 +224,18 @@ def context_save_media(
             pcd = item.media_as(PointCloud)
 
             if context.save_media:
-                pcd_fname = context.make_pcd_filename(item)
-                context.save_point_cloud(item, fname=pcd_fname, subdir=item.subset)
+                pcd_name = str(item.id).replace(os.sep, "_")
+                pcd_fname = context.make_pcd_filename(item, name=pcd_name)
+                subdir = item.subset.replace(os.sep, "_") if item.subset else None
+                context.save_point_cloud(item, fname=pcd_fname, subdir=subdir)
 
                 extra_images = []
                 for i, extra_image in enumerate(pcd.extra_images):
                     extra_images.append(
                         Image.from_file(
-                            path=context.make_pcd_extra_image_filename(item, i, extra_image)
+                            path=context.make_pcd_extra_image_filename(
+                                item, i, extra_image, name=f"{pcd_name}/extra_image_{i}"
+                            )
                         )
                     )
 
@@ -507,18 +515,27 @@ def create_writer(
             default_image_ext=self._default_image_ext,
         )
 
+        if os.path.sep in subset:
+            raise PathSeparatorInSubsetNameError(subset)
+
         return (
             _SubsetWriter(
                 context=self,
                 subset=subset,
-                ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
+                ann_file=osp.join(
+                    self._annotations_dir,
+                    subset + self.PATH_CLS.ANNOTATION_EXT,
+                ),
                 export_context=export_context,
             )
             if not self._stream
             else _StreamSubsetWriter(
                 context=self,
                 subset=subset,
-                ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
+                ann_file=osp.join(
+                    self._annotations_dir,
+                    subset + self.PATH_CLS.ANNOTATION_EXT,
+                ),
                 export_context=export_context,
             )
         )
diff --git a/src/datumaro/plugins/data_formats/datumaro_binary/exporter.py b/src/datumaro/plugins/data_formats/datumaro_binary/exporter.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Intel Corporation
+# Copyright (C) 2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -15,7 +15,7 @@
 
 from datumaro.components.crypter import NULL_CRYPTER, Crypter
 from datumaro.components.dataset_base import DatasetItem, IDataset
-from datumaro.components.errors import DatumaroError
+from datumaro.components.errors import DatumaroError, PathSeparatorInSubsetNameError
 from datumaro.components.exporter import ExportContext, ExportContextComponent, Exporter
 from datumaro.plugins.data_formats.datumaro.exporter import DatumaroExporter
 from datumaro.plugins.data_formats.datumaro.exporter import _SubsetWriter as __SubsetWriter
@@ -309,6 +309,9 @@ def create_writer(
             default_image_ext=self._default_image_ext,
         )
 
+        if osp.sep in subset:
+            raise PathSeparatorInSubsetNameError(subset)
+
         return _SubsetWriter(
             context=self,
             subset=subset,
diff --git a/tests/unit/data_formats/datumaro/conftest.py b/tests/unit/data_formats/datumaro/conftest.py
@@ -221,6 +221,191 @@ def fxt_test_datumaro_format_dataset():
     )
 
 
+@pytest.fixture
+def fxt_test_datumaro_format_dataset_with_path_separator():
+    label_categories = LabelCategories(attributes={"a", "b", "score"})
+    for i in range(5):
+        label_categories.add("cat" + str(i), attributes={"x", "y"})
+
+    mask_categories = MaskCategories(generate_colormap(len(label_categories.items)))
+
+    points_categories = PointsCategories()
+    for index, _ in enumerate(label_categories.items):
+        points_categories.add(index, ["cat1", "cat2"], joints=[[0, 1]])
+
+    sep = os.path.sep
+    return Dataset.from_iterable(
+        [
+            DatasetItem(
+                id="100/0",
+                subset=f"my{sep}train",
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                annotations=[
+                    Caption("hello", id=1),
+                    Caption("world", id=2, group=5),
+                    Label(
+                        2,
+                        id=3,
+                        attributes={
+                            "x": 1,
+                            "y": "2",
+                        },
+                    ),
+                    Bbox(
+                        1,
+                        2,
+                        3,
+                        4,
+                        label=4,
+                        id=4,
+                        z_order=1,
+                        attributes={
+                            "score": 1.0,
+                        },
+                    ),
+                    Bbox(
+                        5,
+                        6,
+                        7,
+                        8,
+                        id=5,
+                        group=5,
+                        attributes={
+                            "a": 1.5,
+                            "b": "text",
+                        },
+                    ),
+                    Points(
+                        [1, 2, 2, 0, 1, 1],
+                        label=0,
+                        id=5,
+                        z_order=4,
+                        attributes={
+                            "x": 1,
+                            "y": "2",
+                        },
+                    ),
+                    Mask(
+                        label=3,
+                        id=5,
+                        z_order=2,
+                        image=np.ones((2, 3)),
+                        attributes={
+                            "x": 1,
+                            "y": "2",
+                        },
+                    ),
+                    Ellipse(
+                        5,
+                        6,
+                        7,
+                        8,
+                        label=3,
+                        id=5,
+                        z_order=2,
+                        attributes={
+                            "x": 1,
+                            "y": "2",
+                        },
+                    ),
+                    Cuboid2D(
+                        [
+                            (1, 1),
+                            (3, 1),
+                            (3, 3),
+                            (1, 3),
+                            (1.5, 1.5),
+                            (3.5, 1.5),
+                            (3.5, 3.5),
+                            (1.5, 3.5),
+                        ],
+                        label=3,
+                        id=5,
+                        z_order=2,
+                        attributes={
+                            "x": 1,
+                            "y": "2",
+                        },
+                    ),
+                ],
+            ),
+            DatasetItem(
+                id=21,
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                subset="train",
+                annotations=[
+                    Caption("test"),
+                    Label(2),
+                    Bbox(1, 2, 3, 4, label=5, id=42, group=42),
+                ],
+            ),
+            DatasetItem(
+                id=2,
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                subset=f"my{sep}val",
+                annotations=[
+                    PolyLine([1, 2, 3, 4, 5, 6, 7, 8], id=11, z_order=1),
+                    Polygon([1, 2, 3, 4, 5, 6, 7, 8], id=12, z_order=4),
+                ],
+            ),
+            DatasetItem(
+                id="1/1",
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                subset="test",
+                annotations=[
+                    Cuboid3d(
+                        [1.0, 2.0, 3.0],
+                        [2.0, 2.0, 4.0],
+                        [1.0, 3.0, 4.0],
+                        id=6,
+                        label=0,
+                        attributes={"occluded": True},
+                        group=6,
+                    )
+                ],
+            ),
+            DatasetItem(
+                id=42,
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                subset=f"my{sep}test",
+                attributes={"a1": 5, "a2": "42"},
+            ),
+            DatasetItem(
+                id=42,
+                media=Image.from_numpy(data=np.ones((10, 6, 3))),
+                # id and group integer value can be higher than 32bits limits (COCO instances).
+                annotations=[
+                    Mask(
+                        id=900100087038, group=900100087038, image=np.ones((2, 3), dtype=np.uint8)
+                    ),
+                    RleMask(
+                        rle=mask_tools.encode(np.ones((2, 3), dtype=np.uint8, order="F")),
+                        id=900100087038,
+                        group=900100087038,
+                    ),
+                ],
+            ),
+            DatasetItem(
+                id="1/b/c",
+                media=Image.from_file(path="1/b/c.qq", size=(2, 4)),
+            ),
+        ],
+        categories={
+            AnnotationType.label: label_categories,
+            AnnotationType.mask: mask_categories,
+            AnnotationType.points: points_categories,
+        },
+        infos={
+            "string": "test",
+            "int": 0,
+            "float": 0.0,
+            "string_list": ["test0", "test1", "test2"],
+            "int_list": [0, 1, 2],
+            "float_list": [0.0, 0.1, 0.2],
+        },
+    )
+
+
 @pytest.fixture
 def fxt_test_datumaro_format_video_dataset(test_dir) -> Dataset:
     video_path = osp.join(test_dir, "video.avi")
diff --git a/tests/unit/data_formats/datumaro/test_datumaro_format.py b/tests/unit/data_formats/datumaro/test_datumaro_format.py