Skip to content

Commit dc9bbf2

Browse files
author
Jihyeon Yi
authored
Handle path separators in the subset when exporting a datumaro dataset (#1615)
<!-- Contributing guide: https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md --> ### Summary Ticket: 152928 Raise an error when exporting a dataset in Datumaro format if the subset name contains path separators. <!-- Resolves #111 and #222. Depends on #1000 (for series of dependent commits). This PR introduces this capability to make the project better in this and that. - Added this feature - Removed that feature - Fixed the problem #1234 --> ### How to test <!-- Describe the testing procedure for reviewers, if changes are not fully covered by unit tests or manual testing can be complicated. --> ### Checklist <!-- Put an 'x' in all the boxes that apply --> - [x] I have added unit tests to cover my changes.​ - [ ] I have added integration tests to cover my changes.​ - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).​ - [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [x] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2024 Intel Corporation # # SPDX-License-Identifier: MIT ```
1 parent bb28e68 commit dc9bbf2

File tree

8 files changed

+267
-12
lines changed

8 files changed

+267
-12
lines changed

CHANGELOG.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,17 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8-
## \[Q3 2024 Release 1.9.0\]
8+
## \[Unreleased\]
9+
10+
### New features
11+
12+
### Enhancements
13+
- Raise an appropriate error when exporting a datumaro dataset if its subset name contains path separators.
14+
(<https://github.com/openvinotoolkit/datumaro/pull/1615>)
15+
16+
### Bug fixes
17+
18+
## Q3 2024 Release 1.9.0
919
### New features
1020
- Add a new CLI command: datum format
1121
(<https://github.com/openvinotoolkit/datumaro/pull/1570>)

docs/source/docs/data-formats/formats/datumaro.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ A Datumaro dataset directory should have the following structure:
7373
└── ...
7474
```
7575

76+
Note that subset names must not contain path separators; exporting a dataset whose subset name contains one raises `PathSeparatorInSubsetNameError`.
77+
7678
If your dataset does not follow the above directory structure,
7779
Datumaro cannot properly detect and import it as the Datumaro format.
7880

docs/source/docs/data-formats/formats/datumaro_binary.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ A DatumaroBinary dataset directory should have the following structure:
113113
└── ...
114114
```
115115

116+
Note that subset names must not contain path separators; exporting a dataset whose subset name contains one raises `PathSeparatorInSubsetNameError`.
117+
116118
If your dataset does not follow the above directory structure,
117119
Datumaro cannot properly detect and import it as the DatumaroBinary format.
118120

src/datumaro/components/errors.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,16 @@ def __str__(self):
342342
return f"Item {self.item_id} is repeated in the source sequence."
343343

344344

345+
@define(auto_exc=False)
class PathSeparatorInSubsetNameError(DatasetError):
    """Error signalling that a subset could not be exported because of its name.

    The ``subset`` attribute holds the offending subset name; it is embedded
    in the message produced by ``__str__``.
    """

    # Name of the subset whose export failed.
    subset: str = field()

    def __str__(self):
        message = (
            f"Failed to export the subset '{self.subset}': subset name contains path separator(s)."
        )
        return message
353+
354+
345355
class DatasetQualityError(DatasetError):
346356
pass
347357

src/datumaro/plugins/data_formats/datumaro/exporter.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from datumaro.components.crypter import NULL_CRYPTER
3939
from datumaro.components.dataset_base import DatasetItem
4040
from datumaro.components.dataset_item_storage import ItemStatus
41+
from datumaro.components.errors import PathSeparatorInSubsetNameError
4142
from datumaro.components.exporter import ExportContextComponent, Exporter
4243
from datumaro.components.media import Image, MediaElement, PointCloud, Video, VideoFrame
4344
from datumaro.util import cast, dump_json_file
@@ -185,7 +186,8 @@ def context_save_media(
185186

186187
if context.save_media:
187188
fname = context.make_video_filename(item)
188-
context.save_video(item, fname=fname, subdir=item.subset)
189+
subdir = item.subset.replace(os.sep, "_") if item.subset else None
190+
context.save_video(item, fname=fname, subdir=subdir)
189191
item.media = Video(
190192
path=fname,
191193
step=video._step,
@@ -200,7 +202,8 @@ def context_save_media(
200202

201203
if context.save_media:
202204
fname = context.make_video_filename(item)
203-
context.save_video(item, fname=fname, subdir=item.subset)
205+
subdir = item.subset.replace(os.sep, "_") if item.subset else None
206+
context.save_video(item, fname=fname, subdir=subdir)
204207
item.media = VideoFrame(Video(fname), video_frame.index)
205208

206209
yield
@@ -210,8 +213,9 @@ def context_save_media(
210213

211214
if context.save_media:
212215
# Temporarily update image path and save it.
213-
fname = context.make_image_filename(item)
214-
context.save_image(item, encryption=encryption, fname=fname, subdir=item.subset)
216+
fname = context.make_image_filename(item, name=str(item.id).replace(os.sep, "_"))
217+
subdir = item.subset.replace(os.sep, "_") if item.subset else None
218+
context.save_image(item, encryption=encryption, fname=fname, subdir=subdir)
215219
item.media = Image.from_file(path=fname, size=image._size)
216220

217221
yield
@@ -220,14 +224,18 @@ def context_save_media(
220224
pcd = item.media_as(PointCloud)
221225

222226
if context.save_media:
223-
pcd_fname = context.make_pcd_filename(item)
224-
context.save_point_cloud(item, fname=pcd_fname, subdir=item.subset)
227+
pcd_name = str(item.id).replace(os.sep, "_")
228+
pcd_fname = context.make_pcd_filename(item, name=pcd_name)
229+
subdir = item.subset.replace(os.sep, "_") if item.subset else None
230+
context.save_point_cloud(item, fname=pcd_fname, subdir=subdir)
225231

226232
extra_images = []
227233
for i, extra_image in enumerate(pcd.extra_images):
228234
extra_images.append(
229235
Image.from_file(
230-
path=context.make_pcd_extra_image_filename(item, i, extra_image)
236+
path=context.make_pcd_extra_image_filename(
237+
item, i, extra_image, name=f"{pcd_name}/extra_image_{i}"
238+
)
231239
)
232240
)
233241

@@ -507,18 +515,27 @@ def create_writer(
507515
default_image_ext=self._default_image_ext,
508516
)
509517

518+
if os.path.sep in subset:
519+
raise PathSeparatorInSubsetNameError(subset)
520+
510521
return (
511522
_SubsetWriter(
512523
context=self,
513524
subset=subset,
514-
ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
525+
ann_file=osp.join(
526+
self._annotations_dir,
527+
subset + self.PATH_CLS.ANNOTATION_EXT,
528+
),
515529
export_context=export_context,
516530
)
517531
if not self._stream
518532
else _StreamSubsetWriter(
519533
context=self,
520534
subset=subset,
521-
ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
535+
ann_file=osp.join(
536+
self._annotations_dir,
537+
subset + self.PATH_CLS.ANNOTATION_EXT,
538+
),
522539
export_context=export_context,
523540
)
524541
)

src/datumaro/plugins/data_formats/datumaro_binary/exporter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (C) 2023 Intel Corporation
1+
# Copyright (C) 2024 Intel Corporation
22
#
33
# SPDX-License-Identifier: MIT
44

@@ -15,7 +15,7 @@
1515

1616
from datumaro.components.crypter import NULL_CRYPTER, Crypter
1717
from datumaro.components.dataset_base import DatasetItem, IDataset
18-
from datumaro.components.errors import DatumaroError
18+
from datumaro.components.errors import DatumaroError, PathSeparatorInSubsetNameError
1919
from datumaro.components.exporter import ExportContext, ExportContextComponent, Exporter
2020
from datumaro.plugins.data_formats.datumaro.exporter import DatumaroExporter
2121
from datumaro.plugins.data_formats.datumaro.exporter import _SubsetWriter as __SubsetWriter
@@ -309,6 +309,9 @@ def create_writer(
309309
default_image_ext=self._default_image_ext,
310310
)
311311

312+
if osp.sep in subset:
313+
raise PathSeparatorInSubsetNameError(subset)
314+
312315
return _SubsetWriter(
313316
context=self,
314317
subset=subset,

tests/unit/data_formats/datumaro/conftest.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,191 @@ def fxt_test_datumaro_format_dataset():
221221
)
222222

223223

224+
@pytest.fixture
def fxt_test_datumaro_format_dataset_with_path_separator():
    """Build a dataset in which several subset names contain ``os.path.sep``.

    Three items use the subsets ``my<sep>train``, ``my<sep>val`` and
    ``my<sep>test``; the remaining items use the plain subsets ``train`` and
    ``test`` or no subset at all. Label, mask and points categories as well as
    dataset infos are attached to the returned dataset.
    """

    def _blank_image():
        # A fresh in-memory 10x6x3 image of ones for every item.
        return Image.from_numpy(data=np.ones((10, 6, 3)))

    def _xy_attrs():
        # A new dict on every call so that annotations never share a mapping.
        return {"x": 1, "y": "2"}

    sep = os.path.sep

    # --- categories -------------------------------------------------------
    label_categories = LabelCategories(attributes={"a", "b", "score"})
    for label_idx in range(5):
        label_categories.add("cat" + str(label_idx), attributes={"x", "y"})

    mask_categories = MaskCategories(generate_colormap(len(label_categories.items)))

    points_categories = PointsCategories()
    for label_idx in range(len(label_categories.items)):
        points_categories.add(label_idx, ["cat1", "cat2"], joints=[[0, 1]])

    categories = {
        AnnotationType.label: label_categories,
        AnnotationType.mask: mask_categories,
        AnnotationType.points: points_categories,
    }

    # --- items ------------------------------------------------------------
    separator_train_item = DatasetItem(
        id="100/0",
        subset=f"my{sep}train",
        media=_blank_image(),
        annotations=[
            Caption("hello", id=1),
            Caption("world", id=2, group=5),
            Label(2, id=3, attributes=_xy_attrs()),
            Bbox(1, 2, 3, 4, label=4, id=4, z_order=1, attributes={"score": 1.0}),
            Bbox(5, 6, 7, 8, id=5, group=5, attributes={"a": 1.5, "b": "text"}),
            Points([1, 2, 2, 0, 1, 1], label=0, id=5, z_order=4, attributes=_xy_attrs()),
            Mask(label=3, id=5, z_order=2, image=np.ones((2, 3)), attributes=_xy_attrs()),
            Ellipse(5, 6, 7, 8, label=3, id=5, z_order=2, attributes=_xy_attrs()),
            Cuboid2D(
                [
                    (1, 1),
                    (3, 1),
                    (3, 3),
                    (1, 3),
                    (1.5, 1.5),
                    (3.5, 1.5),
                    (3.5, 3.5),
                    (1.5, 3.5),
                ],
                label=3,
                id=5,
                z_order=2,
                attributes=_xy_attrs(),
            ),
        ],
    )

    plain_train_item = DatasetItem(
        id=21,
        media=_blank_image(),
        subset="train",
        annotations=[
            Caption("test"),
            Label(2),
            Bbox(1, 2, 3, 4, label=5, id=42, group=42),
        ],
    )

    separator_val_item = DatasetItem(
        id=2,
        media=_blank_image(),
        subset=f"my{sep}val",
        annotations=[
            PolyLine([1, 2, 3, 4, 5, 6, 7, 8], id=11, z_order=1),
            Polygon([1, 2, 3, 4, 5, 6, 7, 8], id=12, z_order=4),
        ],
    )

    plain_test_item = DatasetItem(
        id="1/1",
        media=_blank_image(),
        subset="test",
        annotations=[
            Cuboid3d(
                [1.0, 2.0, 3.0],
                [2.0, 2.0, 4.0],
                [1.0, 3.0, 4.0],
                id=6,
                label=0,
                attributes={"occluded": True},
                group=6,
            )
        ],
    )

    separator_test_item = DatasetItem(
        id=42,
        media=_blank_image(),
        subset=f"my{sep}test",
        attributes={"a1": 5, "a2": "42"},
    )

    # id and group integer value can be higher than 32bits limits (COCO instances).
    big_id = 900100087038
    big_id_item = DatasetItem(
        id=42,
        media=_blank_image(),
        annotations=[
            Mask(id=big_id, group=big_id, image=np.ones((2, 3), dtype=np.uint8)),
            RleMask(
                rle=mask_tools.encode(np.ones((2, 3), dtype=np.uint8, order="F")),
                id=big_id,
                group=big_id,
            ),
        ],
    )

    file_backed_item = DatasetItem(
        id="1/b/c",
        media=Image.from_file(path="1/b/c.qq", size=(2, 4)),
    )

    items = [
        separator_train_item,
        plain_train_item,
        separator_val_item,
        plain_test_item,
        separator_test_item,
        big_id_item,
        file_backed_item,
    ]

    # --- dataset-level infos ----------------------------------------------
    infos = {
        "string": "test",
        "int": 0,
        "float": 0.0,
        "string_list": ["test0", "test1", "test2"],
        "int_list": [0, 1, 2],
        "float_list": [0.0, 0.1, 0.2],
    }

    return Dataset.from_iterable(items, categories=categories, infos=infos)
407+
408+
224409
@pytest.fixture
225410
def fxt_test_datumaro_format_video_dataset(test_dir) -> Dataset:
226411
video_path = osp.join(test_dir, "video.avi")

0 commit comments

Comments
 (0)