Skip to content

Commit f716078

Browse files
author
Jihyeon Yi
authored
Enhance 'id_from_image_name' transform to ensure each identifier is unique (#1635)
<!-- Contributing guide: https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md --> ### Summary Ticket: 153389 1. Enhance 'id_from_image_name' transform to ensure each identifier is unique. - add random suffix if the image name is not distinct: [image_name]__[suffix] - introduce related parameters: ensure_unique(default: false), suffix_length (default: 3) 2. Handle VideoFrame considering its index - format: [video_name]_frame-[index] <!-- Resolves #111 and #222. Depends on #1000 (for series of dependent commits). This PR introduces this capability to make the project better in this and that. - Added this feature - Removed that feature - Fixed the problem #1234 --> ### How to test <!-- Describe the testing procedure for reviewers, if changes are not fully covered by unit tests or manual testing can be complicated. --> ### Checklist <!-- Put an 'x' in all the boxes that apply --> - [x] I have added unit tests to cover my changes.​ - [ ] I have added integration tests to cover my changes.​ - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).​ - [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [ ] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2024 Intel Corporation # # SPDX-License-Identifier: MIT ```
1 parent 5f647da commit f716078

File tree

4 files changed

+190
-27
lines changed

4 files changed

+190
-27
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
(<https://github.com/openvinotoolkit/datumaro/pull/1594>)
1515

1616
### Enhancements
17+
- Enhance 'id_from_image_name' transform to ensure each identifier is unique
18+
(<https://github.com/openvinotoolkit/datumaro/pull/1635>)
1719
- Raise an appropriate error when exporting a datumaro dataset if its subset name contains path separators.
1820
(<https://github.com/openvinotoolkit/datumaro/pull/1615>)
1921
- Update docs for transform plugins

docs/source/docs/command-reference/context_free/transform.md

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,15 +176,36 @@ Examples:
176176

177177
#### `id_from_image_name`
178178

179-
Renames items in the dataset using image file name (without extension).
179+
Renames items in the dataset based on the image file name, excluding the extension.
180+
When 'ensure_unique' is enabled, a random suffix is appended to ensure each identifier is unique
181+
in cases where the image name is not distinct. By default, the random suffix is three characters long,
182+
but this can be adjusted with the 'suffix_length' parameter.
180183

181184
Usage:
182185
```console
183-
id_from_image_name [-h]
186+
id_from_image_name [-h] [-u] [-l SUFFIX_LENGTH]
184187
```
185188

186189
Optional arguments:
187-
- `-h`, `--help` (flag) - Show this help message and exit
190+
- `-h`, `--help` (flag) - show this help message and exit
191+
- `-u`, `--ensure_unique` (flag) - Appends a random suffix to ensure each identifier is unique if the image name is duplicated
192+
- `-l`, `--suffix_length` (int) - Sets the length of the random suffix when `ensure_unique` is enabled (default: 3)
193+
194+
Examples:
195+
- Renames items without duplication check
196+
```console
197+
datum transform -t id_from_image_name
198+
```
199+
200+
- Renames items with duplication check
201+
```console
202+
datum transform -t id_from_image_name -- --ensure_unique
203+
```
204+
205+
- Renames items with duplication check and alters the suffix length (default: 3)
206+
```console
207+
datum transform -t id_from_image_name -- --ensure_unique --suffix_length 2
208+
```
188209

189210
#### `reindex`
190211

src/datumaro/plugins/transforms.py

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os.path as osp
1010
import random
1111
import re
12+
import string
1213
from collections import Counter, defaultdict
1314
from copy import deepcopy
1415
from enum import Enum, auto
@@ -63,7 +64,7 @@
6364
UndefinedAttribute,
6465
UndefinedLabel,
6566
)
66-
from datumaro.components.media import Image, TableRow
67+
from datumaro.components.media import Image, TableRow, VideoFrame
6768
from datumaro.components.transformer import ItemTransform, Transform
6869
from datumaro.util import NOTSET, filter_dict, parse_json_file, parse_str_enum_value, take_by
6970
from datumaro.util.annotation_util import find_group_leader, find_instances
@@ -595,12 +596,92 @@ def __iter__(self):
595596

596597
class IdFromImageName(ItemTransform, CliPlugin):
597598
"""
598-
Renames items in the dataset using image file name (without extension).
599+
Renames items in the dataset based on the image file name, excluding the extension.|n
600+
When 'ensure_unique' is enabled, a random suffix is appened to ensure each identifier is unique
601+
in cases where the image name is not distinct. By default, the random suffix is three characters long,
602+
but this can be adjusted with the 'suffix_length' parameter.|n
603+
|n
604+
Examples:|n
605+
|n
606+
|s|s- Renames items without duplication check:|n
607+
608+
.. code-block::
609+
610+
|s|s|s|s%(prog)s|n
611+
|n
612+
|s|s- Renames items with duplication check:|n
613+
614+
.. code-block::
615+
616+
|s|s|s|s%(prog)s --ensure_unique|n
617+
|n
618+
|s|s- Renames items with duplication check and alters the suffix length(default: 3):|n
619+
620+
.. code-block::
621+
622+
|s|s|s|s%(prog)s --ensure_unique --suffix_length 2
599623
"""
600624

625+
DEFAULT_RETRY = 1000
626+
SUFFIX_LETTERS = string.ascii_lowercase + string.digits
627+
628+
@classmethod
def build_cmdline_parser(cls, **kwargs):
    """
    Build the command-line parser for this transform.

    The parser returned by the parent class is extended with two options:
    the ``-u/--ensure_unique`` flag and the ``-l/--suffix_length`` integer
    option (default: 3).

    Args:
        **kwargs: Forwarded unchanged to the parent ``build_cmdline_parser``.

    Returns:
        The parser with both options registered.
    """
    cli = super().build_cmdline_parser(**kwargs)

    # (flags, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        (
            ("-u", "--ensure_unique"),
            {
                "action": "store_true",
                "help": "Appends a random suffix to ensure each identifier is unique if the image name is duplicated.",
            },
        ),
        (
            ("-l", "--suffix_length"),
            {
                "type": int,
                "default": 3,
                "help": "Alters the length of the random suffix if the 'ensure_unique' is enabled.",
            },
        ),
    )
    for flags, spec in option_specs:
        cli.add_argument(*flags, **spec)

    return cli
646+
647+
def __init__(self, extractor, ensure_unique: bool = False, suffix_length: int = 3):
    """
    Set up the transform.

    Args:
        extractor: Source passed straight to the parent initializer.
        ensure_unique: When true, ``transform_item`` routes every new name
            through ``_add_unique_suffix``.
        suffix_length: Number of random characters in a generated suffix.

    Raises:
        ValueError: If ``suffix_length`` is zero or negative. The check runs
            regardless of ``ensure_unique``.
    """
    super().__init__(extractor)

    # NOTE(review): presumably tells the base class to reuse the parent's
    # length instead of counting items itself - confirm against Transform.
    self._length = "parent"

    self._ensure_unique = ensure_unique
    self._suffix_length = suffix_length
    # Every identifier handed out so far by _add_unique_suffix.
    self._names: set[str] = set()

    if suffix_length <= 0:
        raise ValueError(
            f"The 'suffix_length' must be greater than 0. Received: {suffix_length}."
        )

    # Never retry more often than there are distinct suffixes of this length.
    suffix_space = len(self.SUFFIX_LETTERS) ** self._suffix_length
    self._max_retry = min(self.DEFAULT_RETRY, suffix_space)
660+
661+
def _add_unique_suffix(self, name):
662+
count = 0
663+
while name in self._names:
664+
suffix = "".join(random.choices(self.SUFFIX_LETTERS, k=self._suffix_length))
665+
new_name = f"{name}__{suffix}"
666+
if new_name not in self._names:
667+
name = new_name
668+
break
669+
count += 1
670+
if count == self._max_retry:
671+
raise Exception(
672+
f"Too many duplicate names. Failed to generate a unique suffix after {self._max_retry} attempts."
673+
)
674+
675+
self._names.add(name)
676+
return name
677+
601678
def transform_item(self, item):
602679
if isinstance(item.media, Image) and hasattr(item.media, "path"):
603680
name = osp.splitext(osp.basename(item.media.path))[0]
681+
if isinstance(item.media, VideoFrame):
682+
name += f"_frame-{item.media.index}"
683+
if self._ensure_unique:
684+
name = self._add_unique_suffix(name)
604685
return self.wrap_item(item, id=name)
605686
else:
606687
log.debug("Can't change item id for item '%s': " "item has no path info" % item.id)

tests/unit/test_transforms.py

Lines changed: 81 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os.path as osp
77
import random
88
from unittest import TestCase
9+
from unittest.mock import MagicMock, patch
910

1011
import numpy as np
1112
import pandas as pd
@@ -33,10 +34,10 @@
3334
Tabular,
3435
TabularCategories,
3536
)
36-
from datumaro.components.dataset import Dataset
37+
from datumaro.components.dataset import Dataset, eager_mode
3738
from datumaro.components.dataset_base import DatasetItem
3839
from datumaro.components.errors import AnnotationTypeError
39-
from datumaro.components.media import Image, Table, TableRow
40+
from datumaro.components.media import Image, Table, TableRow, Video, VideoFrame
4041

4142
from ..requirements import Requirements, mark_bug, mark_requirement
4243

@@ -420,26 +421,6 @@ def test_shapes_to_boxes(self):
420421
actual = transforms.ShapesToBoxes(source_dataset)
421422
compare_datasets(self, target_dataset, actual)
422423

423-
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
424-
def test_id_from_image(self):
425-
source_dataset = Dataset.from_iterable(
426-
[
427-
DatasetItem(id=1, media=Image.from_file(path="path.jpg")),
428-
DatasetItem(id=2),
429-
DatasetItem(id=3, media=Image.from_numpy(data=np.ones([5, 5, 3]))),
430-
]
431-
)
432-
target_dataset = Dataset.from_iterable(
433-
[
434-
DatasetItem(id="path", media=Image.from_file(path="path.jpg")),
435-
DatasetItem(id=2),
436-
DatasetItem(id=3, media=Image.from_numpy(data=np.ones([5, 5, 3]))),
437-
]
438-
)
439-
440-
actual = transforms.IdFromImageName(source_dataset)
441-
compare_datasets(self, target_dataset, actual)
442-
443424
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
444425
def test_boxes_to_masks(self):
445426
source_dataset = Dataset.from_iterable(
@@ -1227,6 +1208,84 @@ def test_annotation_reindex(self, fxt_dataset: Dataset, reindex_each_item: bool)
12271208
)
12281209

12291210

1211+
class IdFromImageNameTest:
    """Tests for the ``IdFromImageName`` transform and its command-line options."""

    @pytest.fixture
    def fxt_dataset(self, n_labels=3, n_anns=5, n_items=7) -> Dataset:
        """
        Dataset with repeated image paths, repeated and distinct video frames,
        an item without media and an image that has no path.

        NOTE(review): the defaulted parameters are not used by the body; they
        are kept only so the fixture signature stays unchanged.
        """
        video = Video("video.mp4")
        # Stub out frame access so no real video file is needed.
        video._frame_size = MagicMock(return_value=(32, 32))
        # Zero-filled data instead of uninitialised memory keeps runs deterministic.
        video.get_frame_data = MagicMock(return_value=np.zeros((32, 32, 3), dtype=np.uint8))
        return Dataset.from_iterable(
            [
                DatasetItem(id=1, media=Image.from_file(path="path1.jpg")),
                DatasetItem(id=2, media=Image.from_file(path="path1.jpg")),
                DatasetItem(id=3, media=Image.from_file(path="path1.jpg")),
                DatasetItem(id=4, media=VideoFrame(video, index=30)),
                DatasetItem(id=5, media=VideoFrame(video, index=30)),
                DatasetItem(id=6, media=VideoFrame(video, index=60)),
                DatasetItem(id=7),
                DatasetItem(id=8, media=Image.from_numpy(data=np.ones([5, 5, 3]))),
            ]
        )

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    @pytest.mark.parametrize("ensure_unique", [True, False])
    def test_id_from_image(self, fxt_dataset, ensure_unique):
        """Items with a media path are renamed; all other items pass through unchanged."""
        source_dataset: Dataset = fxt_dataset
        actual_dataset = transforms.IdFromImageName(source_dataset, ensure_unique=ensure_unique)

        unique_names: set[str] = set()
        for src, actual in zip(source_dataset, actual_dataset):
            if not isinstance(src.media, Image) or not hasattr(src.media, "path"):
                # Without path information the item must come back untouched.
                assert src == actual
                continue

            if isinstance(src.media, VideoFrame):
                expected_id = f"video_frame-{src.media.index}"
            else:
                # Mirror the transform: base name of the path, extension removed.
                expected_id = osp.splitext(osp.basename(src.media.path))[0]

            if ensure_unique:
                # A random suffix may follow, so only the prefix is predictable.
                assert actual.id.startswith(expected_id)
                assert actual.wrap(id=src.id) == src
                assert actual.id not in unique_names
                unique_names.add(actual.id)
            else:
                assert src.wrap(id=expected_id) == actual

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_id_from_image_wrong_suffix_length(self, fxt_dataset):
        """A non-positive ``suffix_length`` is rejected at construction time."""
        with pytest.raises(ValueError) as e:
            transforms.IdFromImageName(fxt_dataset, ensure_unique=True, suffix_length=0)
        assert str(e.value).startswith("The 'suffix_length' must be greater than 0.")

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_id_from_image_too_many_duplication(self, fxt_dataset):
        """With one possible suffix and one retry, the third duplicate must fail."""
        with patch("datumaro.plugins.transforms.IdFromImageName.DEFAULT_RETRY", 1), patch(
            "datumaro.plugins.transforms.IdFromImageName.SUFFIX_LETTERS", "a"
        ), pytest.raises(Exception) as e:
            # Eager mode makes the transform run inside the ``with`` block.
            with eager_mode():
                fxt_dataset.transform(
                    "id_from_image_name",
                    ensure_unique=True,
                    suffix_length=1,
                )
        assert str(e.value).startswith("Too many duplicate names.")

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    @pytest.mark.parametrize(
        "args,ensure_unique,suffix_length",
        [
            ([], False, 3),
            (["--ensure_unique", "--suffix_length", "2"], True, 2),
        ],
        ids=["default", "ensure_unique"],
    )
    def test_parser(self, args, ensure_unique, suffix_length):
        """The parser exposes both options with the documented defaults."""
        parser = transforms.IdFromImageName.build_cmdline_parser()
        parsed = parser.parse_args(args)

        assert hasattr(parsed, "ensure_unique") and parsed.ensure_unique == ensure_unique
        assert hasattr(parsed, "suffix_length") and parsed.suffix_length == suffix_length
1287+
1288+
12301289
class AstypeAnnotationsTest(TestCase):
12311290
def setUp(self):
12321291
self.table = Table.from_list(

0 commit comments

Comments
 (0)