open-edge-platform · zhiltsov-max · Mar 30, 2022 · Mar 18, 2022 · Mar 18, 2022 · Mar 18, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/684>,
    <https://github.com/openvinotoolkit/datumaro/pull/686>,
    <https://github.com/openvinotoolkit/datumaro/pull/687>)
+- YOLO format now supports almost any subset name, except `backup`, `names` and `classes`
+  (instead of just `train` and `valid`). The reserved names now raise an error on export.
+  (<https://github.com/openvinotoolkit/datumaro/pull/688>)
 
 ### Deprecated
 - `--save-images` is replaced with `--save-media` in CLI and converter API

diff --git a/datumaro/plugins/yolo_format/converter.py b/datumaro/plugins/yolo_format/converter.py
@@ -10,7 +10,7 @@
 from datumaro.components.annotation import AnnotationType, Bbox
 from datumaro.components.converter import Converter
 from datumaro.components.dataset import ItemStatus
-from datumaro.components.errors import MediaTypeError
+from datumaro.components.errors import DatasetExportError, MediaTypeError
 from datumaro.components.extractor import DEFAULT_SUBSET_NAME, DatasetItem, IExtractor
 from datumaro.components.media import Image
 from datumaro.util import str_to_bool
@@ -75,13 +75,10 @@ def apply(self):
         for (subset_name, subset), pbar in zip(subsets.items(), pbars):
             if not subset_name or subset_name == DEFAULT_SUBSET_NAME:
                 subset_name = YoloPath.DEFAULT_SUBSET_NAME
-            elif subset_name not in YoloPath.SUBSET_NAMES:
-                log.warning(
-                    "Skipping subset export '%s'. "
-                    "If specified, the only valid names are %s"
-                    % (subset_name, ", ".join("'%s'" % s for s in YoloPath.SUBSET_NAMES))
+            elif subset_name in YoloPath.RESERVED_CONFIG_KEYS:
+                raise DatasetExportError(
+                    f"Can't export '{subset_name}' subset in YOLO format, this word is reserved."
                 )
-                continue
 
             subset_dir = osp.join(save_dir, "obj_%s_data" % subset_name)
             os.makedirs(subset_dir, exist_ok=True)

diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py
@@ -71,29 +71,24 @@ def __init__(
 
         self._image_info = image_info
 
-        with open(config_path, "r", encoding="utf-8") as f:
-            config_lines = f.readlines()
-
-        subsets = OrderedDict()
-        names_path = None
-
-        for line in config_lines:
-            match = re.match(r"^\s*(\w+)\s*=\s*(.+)$", line)
-            if not match:
-                continue
-
-            key = match.group(1)
-            value = match.group(2)
-            if key == "names":
-                names_path = value
-            elif key in YoloPath.SUBSET_NAMES:
-                subsets[key] = value
-            else:
-                continue
+        config = self._parse_config(config_path)
 
+        names_path = config.get("names")
         if not names_path:
             raise InvalidAnnotationError(f"Failed to parse names file path from config")
 
+        # The original format is like this:
+        #
+        # classes = 2
+        # train  = data/train.txt
+        # valid  = data/test.txt
+        # names = data/obj.names
+        # backup = backup/
+        #
+        # To support more subset names, we disallow subsets
+        # called 'classes', 'names' and 'backup'.
+        subsets = {k: v for k, v in config.items() if k not in YoloPath.RESERVED_CONFIG_KEYS}
+
         for subset_name, list_path in subsets.items():
             list_path = osp.join(self._path, self.localize_path(list_path))
             if not osp.isfile(list_path):
@@ -114,6 +109,24 @@ def __init__(
             )
         }
 
+    @staticmethod
+    def _parse_config(path: str) -> Dict[str, str]:
+        """
+        Reads "key = value" pairs from a YOLO config file.
+
+        Lines that don't match this pattern are skipped. If a key is
+        repeated, the last value wins. Values are stripped of surrounding
+        whitespace, so trailing spaces don't leak into file paths.
+        """
+        config = {}
+        with open(path, "r", encoding="utf-8") as f:
+            for line in f:
+                match = re.match(r"^\s*(\w+)\s*=\s*(.+)$", line)
+                if match:
+                    config[match.group(1)] = match.group(2).strip()
+
+        return config
+
     @staticmethod
     def localize_path(path: str) -> str:
         """

diff --git a/datumaro/plugins/yolo_format/format.py b/datumaro/plugins/yolo_format/format.py
@@ -6,3 +6,4 @@
 class YoloPath:
     DEFAULT_SUBSET_NAME = "train"
     SUBSET_NAMES = ["train", "valid"]
+    RESERVED_CONFIG_KEYS = ["backup", "classes", "names"]  # not usable as subset names
diff --git a/site/content/en/docs/formats/yolo.md b/site/content/en/docs/formats/yolo.md
@@ -18,7 +18,12 @@ Supported annotation types:
 
 YOLO format doesn't support attributes for annotations.
 
-The format only supports subsets named `train` or `valid`.
+The format supports arbitrary subset names, except `classes`, `names` and `backup`.
+
+> Note that, by default, the YOLO framework does not expect any subset names
+  except `train` and `valid`; Datumaro supports other names as an extension.
+  If there is no subset separation in a project, the data
+  will be saved in the `train` subset.
 
 ## Import YOLO dataset
 
@@ -55,12 +60,6 @@ YOLO dataset directory should have the following structure:
         ├── image102.jpg
         └── ...
 ```
-> YOLO dataset cannot contain a subset with a name other than `train` or `valid`.
-  If an imported dataset contains such subsets, they will be ignored.
-  If you are exporting a project into YOLO format,
-  all subsets different from `train` and `valid` will be skipped.
-  If there is no subset separation in a project, the data
-  will be saved in `train` subset.
 
 - `obj.data` should have the following content, it is not necessary to have both
   subsets, but necessary to have one of them:

diff --git a/tests/cli/test_yolo_format.py b/tests/cli/test_yolo_format.py
@@ -51,7 +51,7 @@ def test_can_save_and_load_yolo_dataset(self):
                 "-f",
                 "yolo",
                 "--",
-                "--save-images",
+                "--save-media",
             )
 
             parsed_dataset = Dataset.import_from(export_dir, format="yolo")
@@ -73,7 +73,7 @@ def test_can_export_mot_as_yolo(self):
             run(self, "import", "-p", test_dir, "-f", "mot_seq", mot_dir)
 
             yolo_dir = osp.join(test_dir, "yolo_dir")
-            run(self, "export", "-p", test_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-images")
+            run(self, "export", "-p", test_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-media")
 
             parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
             compare_datasets(self, target_dataset, parsed_dataset)
@@ -91,7 +91,10 @@ def test_can_convert_voc_to_yolo(self):
                         Bbox(4.0, 5.0, 2.0, 2.0, label=15),
                         Bbox(5.5, 6, 2, 2, label=22),
                     ],
-                )
+                ),
+                DatasetItem(
+                    id="2007_000002", subset="test", media=Image(data=np.ones((10, 20, 3)))
+                ),
             ],
             categories=[label.name for label in VOC.make_voc_categories()[AnnotationType.label]],
         )
@@ -118,58 +121,12 @@ def test_can_convert_voc_to_yolo(self):
                 "-o",
                 yolo_dir,
                 "--",
-                "--save-images",
+                "--save-media",
             )
 
             parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
             compare_datasets(self, target_dataset, parsed_dataset, require_media=True)
 
-    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
-    def test_can_ignore_non_supported_subsets(self):
-        source_dataset = Dataset.from_iterable(
-            [
-                DatasetItem(
-                    id="img1",
-                    subset="test",
-                    media=Image(data=np.ones((10, 20, 3))),
-                    annotations=[Bbox(1.0, 2.0, 1.0, 1.0, label=0)],
-                ),
-                DatasetItem(
-                    id="img2",
-                    subset="train",
-                    media=Image(data=np.ones((10, 5, 3))),
-                    annotations=[Bbox(3.0, 1.0, 2.0, 1.0, label=1)],
-                ),
-            ],
-            categories=[str(i) for i in range(4)],
-        )
-
-        target_dataset = Dataset.from_iterable(
-            [
-                DatasetItem(
-                    id="img2",
-                    subset="train",
-                    media=Image(data=np.ones((10, 5, 3))),
-                    annotations=[Bbox(3.0, 1.0, 2.0, 1.0, label=1)],
-                )
-            ],
-            categories=[str(i) for i in range(4)],
-        )
-
-        with TestDir() as test_dir:
-            dataset_dir = osp.join(test_dir, "dataset_dir")
-            source_dataset.save(dataset_dir, save_media=True)
-
-            proj_dir = osp.join(test_dir, "proj")
-            run(self, "create", "-o", proj_dir)
-            run(self, "import", "-p", proj_dir, "-f", "datumaro", dataset_dir)
-
-            yolo_dir = osp.join(test_dir, "yolo_dir")
-            run(self, "export", "-p", proj_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-images")
-
-            parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
-            compare_datasets(self, target_dataset, parsed_dataset)
-
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_can_delete_labels_from_yolo_dataset(self):
         target_dataset = Dataset.from_iterable(

diff --git a/tests/requirements.py b/tests/requirements.py
@@ -35,6 +35,7 @@ class Requirements:
     DATUM_475 = "Support import for CelebA dataset"
     DATUM_497 = "Support import for SYNTHIA dataset"
     DATUM_542 = "Images missing after merging two datasets"
+    DATUM_565 = "Export in YOLO with custom subset name"
     DATUM_580 = "Import for MPII Human Pose Dataset"
     DATUM_609 = "Allow not to prepend data/ prefix in YOLO export"
     DATUM_616 = "Import for BraTS dataset"

diff --git a/tests/test_yolo_format.py b/tests/test_yolo_format.py
@@ -10,6 +10,7 @@
 from datumaro.components.environment import Environment
 from datumaro.components.errors import (
     AnnotationImportError,
+    DatasetExportError,
     DatasetImportError,
     InvalidAnnotationError,
     ItemImportError,
@@ -262,6 +263,48 @@ def test_can_save_and_load_with_meta_file(self):
             self.assertTrue(osp.isfile(osp.join(test_dir, "dataset_meta.json")))
             compare_datasets(self, source_dataset, parsed_dataset)
 
+    @mark_requirement(Requirements.DATUM_565)
+    def test_can_save_and_load_with_custom_subset_name(self):
+        source_dataset = Dataset.from_iterable(
+            [
+                DatasetItem(
+                    id=3,
+                    subset="anything",  # neither 'train' nor 'valid'
+                    media=Image(data=np.ones((8, 8, 3))),
+                    annotations=[
+                        Bbox(0, 1, 5, 2, label=2),
+                        Bbox(0, 2, 3, 2, label=5),
+                    ],
+                ),
+            ],
+            categories=["label_" + str(i) for i in range(10)],
+        )
+        # Round trip: export with media, import the result, compare with the source
+        with TestDir() as test_dir:
+            YoloConverter.convert(source_dataset, test_dir, save_media=True)
+            parsed_dataset = Dataset.import_from(test_dir, "yolo")
+
+            compare_datasets(self, source_dataset, parsed_dataset)
+
+    @mark_requirement(Requirements.DATUM_565)
+    def test_cant_save_with_reserved_subset_name(self):
+        for subset in ["backup", "classes", "names"]:
+            # Each reserved config key must be rejected as a subset name on export
+            dataset = Dataset.from_iterable(
+                [
+                    DatasetItem(
+                        id=3,
+                        subset=subset,
+                        media=Image(data=np.ones((8, 8, 3))),
+                    ),
+                ],
+                categories=["a"],
+            )
+
+            with TestDir() as test_dir:
+                with self.assertRaisesRegex(DatasetExportError, f"Can't export '{subset}' subset"):
+                    YoloConverter.convert(dataset, test_dir)
+
     @mark_requirement(Requirements.DATUM_609)
     def test_can_save_and_load_without_path_prefix(self):
         source_dataset = Dataset.from_iterable(