Enhance 'id_from_image_name' transform to ensure each identifier is unique (#1635)

Jihyeon Yi · web-flow · commit f71607879c3e · 2024-10-11T16:24:35.000+09:00
### Summary Ticket: 153389 1. Enhance 'id_from_image_name' transform to ensure each identifier is unique. - add random suffix if the image name is not distinct: [image_name]__[suffix] - introduce related parameters: ensure_unique(default: false), suffix_length (default: 3) 2. Handle VideoFrame considering its index - format: [video_name]_frame-[index]  ### How to test  ### Checklist  - [x] I have added unit tests to cover my changes.​ - [ ] I have added integration tests to cover my changes.​ - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).​ - [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [ ] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2024 Intel Corporation # # SPDX-License-Identifier: MIT ```
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1594>)
 
 ### Enhancements
+- Enhance 'id_from_image_name' transform to ensure each identifier is unique
+  (<https://github.com/openvinotoolkit/datumaro/pull/1635>)
 - Raise an appropriate error when exporting a datumaro dataset if its subset name contains path separators.
   (<https://github.com/openvinotoolkit/datumaro/pull/1615>)
 - Update docs for transform plugins
diff --git a/docs/source/docs/command-reference/context_free/transform.md b/docs/source/docs/command-reference/context_free/transform.md
@@ -176,15 +176,36 @@ Examples:
 
 #### `id_from_image_name`
 
-Renames items in the dataset using image file name (without extension).
+Renames items in the dataset based on the image file name, excluding the extension.
+When 'ensure_unique' is enabled, a random suffix is appended to ensure each identifier is unique
+in cases where the image name is not distinct. By default, the random suffix is three characters long,
+but this can be adjusted with the 'suffix_length' parameter.
 
 Usage:
 ```console
-id_from_image_name [-h]
+id_from_image_name [-h] [-u] [-l SUFFIX_LENGTH]
 ```
 
 Optional arguments:
-- `-h`, `--help` (flag) - Show this help message and exit
+- `-h`, `--help` (flag) - show this help message and exit
+- `-u`, `--ensure_unique` (flag) - Appends a random suffix to ensure each identifier is unique if the image name is duplicated
+- `-l`, `--suffix_length` (int) - Alters the length of the random suffix if `ensure_unique` is enabled (default: 3)
+
+Examples:
+- Renames items without duplication check
+  ```console
+  datum transform -t id_from_image_name
+  ```
+
+- Renames items with duplication check
+  ```console
+  datum transform -t id_from_image_name -- --ensure_unique
+  ```
+
+- Renames items with duplication check and alters the suffix length(default: 3)
+  ```console
+  datum transform -t id_from_image_name -- --ensure_unique --suffix_length 2
+  ```
 
 #### `reindex`
 
diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py
@@ -9,6 +9,7 @@
 import os.path as osp
 import random
 import re
+import string
 from collections import Counter, defaultdict
 from copy import deepcopy
 from enum import Enum, auto
@@ -63,7 +64,7 @@
     UndefinedAttribute,
     UndefinedLabel,
 )
-from datumaro.components.media import Image, TableRow
+from datumaro.components.media import Image, TableRow, VideoFrame
 from datumaro.components.transformer import ItemTransform, Transform
 from datumaro.util import NOTSET, filter_dict, parse_json_file, parse_str_enum_value, take_by
 from datumaro.util.annotation_util import find_group_leader, find_instances
@@ -595,12 +596,92 @@ def __iter__(self):
 
 class IdFromImageName(ItemTransform, CliPlugin):
     """
-    Renames items in the dataset using image file name (without extension).
+    Renames items in the dataset based on the image file name, excluding the extension.|n
+    When 'ensure_unique' is enabled, a random suffix is appended to ensure each identifier is unique
+    in cases where the image name is not distinct. By default, the random suffix is three characters long,
+    but this can be adjusted with the 'suffix_length' parameter.|n
+    |n
+    Examples:|n
+    |n
+    |s|s- Renames items without duplication check:|n
+
+    .. code-block::
+
+    |s|s|s|s%(prog)s|n
+    |n
+    |s|s- Renames items with duplication check:|n
+
+    .. code-block::
+
+    |s|s|s|s%(prog)s --ensure_unique|n
+    |n
+    |s|s- Renames items with duplication check and alters the suffix length(default: 3):|n
+
+    .. code-block::
+
+    |s|s|s|s%(prog)s --ensure_unique --suffix_length 2
     """
 
+    DEFAULT_RETRY = 1000
+    SUFFIX_LETTERS = string.ascii_lowercase + string.digits
+
+    @classmethod
+    def build_cmdline_parser(cls, **kwargs):
+        """Extend the inherited parser with ``--ensure_unique`` and ``--suffix_length``."""
+        unique_help = (
+            "Appends a random suffix to ensure each identifier is unique if the image name is duplicated."
+        )
+        length_help = "Alters the length of the random suffix if the 'ensure_unique' is enabled."
+        # (flags, add_argument keyword arguments), registered in this order
+        options = (
+            (("-u", "--ensure_unique"), {"action": "store_true", "help": unique_help}),
+            (("-l", "--suffix_length"), {"type": int, "default": 3, "help": length_help}),
+        )
+
+        cli = super().build_cmdline_parser(**kwargs)
+        for flags, spec in options:
+            cli.add_argument(*flags, **spec)
+
+        return cli
+
+    def __init__(self, extractor, ensure_unique: bool = False, suffix_length: int = 3):
+        """Validate ``suffix_length`` and set up the registry of identifiers issued so far."""
+        super().__init__(extractor)
+        if suffix_length <= 0:
+            raise ValueError(
+                f"The 'suffix_length' must be greater than 0. Received: {suffix_length}."
+            )
+        self._length = "parent"
+        self._ensure_unique = ensure_unique
+        self._suffix_length = suffix_length
+        self._names: set[str] = set()
+        # Cap retries at the number of distinct suffixes (or DEFAULT_RETRY, if smaller).
+        self._max_retry = min(self.DEFAULT_RETRY, len(self.SUFFIX_LETTERS) ** suffix_length)
+
+    def _add_unique_suffix(self, name):
+        """Return ``name``, suffixed with ``__<random>`` if already issued, and record it."""
+        if name in self._names:
+            for _ in range(self._max_retry):
+                suffix = "".join(random.choices(self.SUFFIX_LETTERS, k=self._suffix_length))
+                candidate = f"{name}__{suffix}"
+                if candidate not in self._names:
+                    name = candidate
+                    break
+            else:  # every attempt collided with an already issued identifier
+                raise Exception(
+                    f"Too many duplicate names. Failed to generate a unique suffix after {self._max_retry} attempts."
+                )
+
+        self._names.add(name)
+        return name
+
     def transform_item(self, item):
         if isinstance(item.media, Image) and hasattr(item.media, "path"):
             name = osp.splitext(osp.basename(item.media.path))[0]
+            if isinstance(item.media, VideoFrame):
+                name += f"_frame-{item.media.index}"
+            if self._ensure_unique:
+                name = self._add_unique_suffix(name)
             return self.wrap_item(item, id=name)
         else:
             log.debug("Can't change item id for item '%s': " "item has no path info" % item.id)
diff --git a/tests/unit/test_transforms.py b/tests/unit/test_transforms.py
@@ -6,6 +6,7 @@
 import os.path as osp
 import random
 from unittest import TestCase
+from unittest.mock import MagicMock, patch
 
 import numpy as np
 import pandas as pd
@@ -33,10 +34,10 @@
     Tabular,
     TabularCategories,
 )
-from datumaro.components.dataset import Dataset
+from datumaro.components.dataset import Dataset, eager_mode
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.errors import AnnotationTypeError
-from datumaro.components.media import Image, Table, TableRow
+from datumaro.components.media import Image, Table, TableRow, Video, VideoFrame
 
 from ..requirements import Requirements, mark_bug, mark_requirement
 
@@ -420,26 +421,6 @@ def test_shapes_to_boxes(self):
         actual = transforms.ShapesToBoxes(source_dataset)
         compare_datasets(self, target_dataset, actual)
 
-    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
-    def test_id_from_image(self):
-        source_dataset = Dataset.from_iterable(
-            [
-                DatasetItem(id=1, media=Image.from_file(path="path.jpg")),
-                DatasetItem(id=2),
-                DatasetItem(id=3, media=Image.from_numpy(data=np.ones([5, 5, 3]))),
-            ]
-        )
-        target_dataset = Dataset.from_iterable(
-            [
-                DatasetItem(id="path", media=Image.from_file(path="path.jpg")),
-                DatasetItem(id=2),
-                DatasetItem(id=3, media=Image.from_numpy(data=np.ones([5, 5, 3]))),
-            ]
-        )
-
-        actual = transforms.IdFromImageName(source_dataset)
-        compare_datasets(self, target_dataset, actual)
-
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_boxes_to_masks(self):
         source_dataset = Dataset.from_iterable(
@@ -1227,6 +1208,84 @@ def test_annotation_reindex(self, fxt_dataset: Dataset, reindex_each_item: bool)
         )
 
 
+class IdFromImageNameTest:  # unit tests for the 'id_from_image_name' transform
+    @pytest.fixture
+    def fxt_dataset(self) -> Dataset:  # duplicated image/frame names plus path-less items
+        video = Video("video.mp4")
+        video._frame_size = MagicMock(return_value=(32, 32))
+        video.get_frame_data = MagicMock(return_value=np.zeros((32, 32, 3), dtype=np.uint8))
+        return Dataset.from_iterable(
+            [
+                DatasetItem(id=1, media=Image.from_file(path="path1.jpg")),
+                DatasetItem(id=2, media=Image.from_file(path="path1.jpg")),
+                DatasetItem(id=3, media=Image.from_file(path="path1.jpg")),
+                DatasetItem(id=4, media=VideoFrame(video, index=30)),
+                DatasetItem(id=5, media=VideoFrame(video, index=30)),
+                DatasetItem(id=6, media=VideoFrame(video, index=60)),
+                DatasetItem(id=7),  # no media at all
+                DatasetItem(id=8, media=Image.from_numpy(data=np.ones([5, 5, 3]))),  # no path
+            ]
+        )
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    @pytest.mark.parametrize("ensure_unique", [True, False])
+    def test_id_from_image(self, fxt_dataset, ensure_unique):
+        source_dataset: Dataset = fxt_dataset
+        actual_dataset = transforms.IdFromImageName(source_dataset, ensure_unique=ensure_unique)
+
+        unique_names: set[str] = set()
+        for src, actual in zip(source_dataset, actual_dataset):
+            if not isinstance(src.media, Image) or not hasattr(src.media, "path"):
+                assert src == actual  # items without a path must pass through unchanged
+            else:
+                if isinstance(src.media, VideoFrame):
+                    expected_id = f"video_frame-{src.media.index}"
+                else:
+                    expected_id = osp.splitext(osp.basename(src.media.path))[0]
+                if ensure_unique:
+                    assert actual.id.startswith(expected_id)
+                    assert actual.wrap(id=src.id) == src  # only the id may differ
+                    assert actual.id not in unique_names
+                    unique_names.add(actual.id)
+                else:
+                    assert src.wrap(id=expected_id) == actual
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_id_from_image_wrong_suffix_length(self, fxt_dataset):
+        with pytest.raises(ValueError) as e:
+            transforms.IdFromImageName(fxt_dataset, ensure_unique=True, suffix_length=0)
+        assert str(e.value).startswith("The 'suffix_length' must be greater than 0.")
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_id_from_image_too_many_duplication(self, fxt_dataset):
+        with patch("datumaro.plugins.transforms.IdFromImageName.DEFAULT_RETRY", 1), patch(
+            "datumaro.plugins.transforms.IdFromImageName.SUFFIX_LETTERS", "a"
+        ), pytest.raises(Exception) as e:
+            with eager_mode():  # run the transform immediately so the error surfaces here
+                fxt_dataset.transform(
+                    "id_from_image_name",
+                    ensure_unique=True,
+                    suffix_length=1,
+                )
+        assert str(e.value).startswith("Too many duplicate names.")
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    @pytest.mark.parametrize(
+        "args,ensure_unique,suffix_length",
+        [
+            ([], False, 3),
+            (["--ensure_unique", "--suffix_length", "2"], True, 2),
+        ],
+        ids=["default", "ensure_unique"],
+    )
+    def test_parser(self, args, ensure_unique, suffix_length):
+        parser = transforms.IdFromImageName.build_cmdline_parser()
+        parsed = parser.parse_args(args)
+
+        assert hasattr(parsed, "ensure_unique") and parsed.ensure_unique == ensure_unique
+        assert hasattr(parsed, "suffix_length") and parsed.suffix_length == suffix_length
+
+
 class AstypeAnnotationsTest(TestCase):
     def setUp(self):
         self.table = Table.from_list(