Skip to content

Allow custom subset names in YOLO #688

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
20703c5
Improve error messages in COCO format
Mar 18, 2022
7fc3071
Improve VOC parsing errors
Mar 18, 2022
9238ca2
Improve error messages in COCO format
Mar 18, 2022
a05b835
Add quotes to invalid label error message
Mar 21, 2022
8d8acd9
Unwrap reporter errors during dataset iteration
Mar 21, 2022
fb882b2
Rename arg
Mar 21, 2022
5dd1eb7
Improve error messages in VOC
Mar 21, 2022
29835c4
Fix typo
Mar 21, 2022
f2aa35a
Add quotes to error messages
Mar 21, 2022
0c3f53e
Fix message
Mar 21, 2022
4cb1488
Remove extra type from annotations
Mar 21, 2022
c91f928
Split polygon points check
Mar 22, 2022
f27e699
Use f style for most messages
Mar 22, 2022
c523812
Add mask size value count check
Mar 22, 2022
3cb8ee8
Prohibit float values in size
Mar 22, 2022
eeba792
Extract commond annotation template in error tests
Mar 22, 2022
7c36135
Rename invalid label to undeclared label
Mar 22, 2022
175bae2
Add validators and docs, update error hierarchy
Mar 22, 2022
6cd2e0d
Merge branch 'zm/coco-parsing-errors' into zm/voc-parsing-errors
Mar 22, 2022
af3ec5d
Extract common xml template in tests
Mar 22, 2022
551e651
Improve voc labelmap parsing errors
Mar 22, 2022
6979f81
Improve YOLO parsing errors
Mar 22, 2022
0cc5dcf
Allow custom subset names in YOLO format
Mar 22, 2022
317b581
Update docs
Mar 22, 2022
3b43ea9
Update changelog
Mar 22, 2022
52c7112
Fix docs
Mar 22, 2022
d7a13fb
Merge branch 'develop' into zm/yolo-custom-subset-name
Mar 29, 2022
a14752b
Add a test for failing export with a reserved subset name
Mar 29, 2022
1b6a081
Update CLI tests
Mar 29, 2022
ce29c0f
Replace save-images with save-media in tests
Mar 29, 2022
3d31fda
fix import
Mar 29, 2022
6ed400b
Merge branch 'develop' into zm/yolo-custom-subset-name
Mar 30, 2022
ff6631f
Fix comment
Mar 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/684>,
<https://github.com/openvinotoolkit/datumaro/pull/686>,
<https://github.com/openvinotoolkit/datumaro/pull/687>)
- YOLO format now supports arbitrary subset names (instead of just `train` and `valid`),
except for the reserved names `backup`, `names` and `classes`. Exporting a subset with a reserved name now raises an error.
(<https://github.com/openvinotoolkit/datumaro/pull/688>)

### Deprecated
- `--save-images` is replaced with `--save-media` in CLI and converter API
Expand Down
11 changes: 4 additions & 7 deletions datumaro/plugins/yolo_format/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from datumaro.components.annotation import AnnotationType, Bbox
from datumaro.components.converter import Converter
from datumaro.components.dataset import ItemStatus
from datumaro.components.errors import MediaTypeError
from datumaro.components.errors import DatasetExportError, MediaTypeError
from datumaro.components.extractor import DEFAULT_SUBSET_NAME, DatasetItem, IExtractor
from datumaro.components.media import Image
from datumaro.util import str_to_bool
Expand Down Expand Up @@ -75,13 +75,10 @@ def apply(self):
for (subset_name, subset), pbar in zip(subsets.items(), pbars):
if not subset_name or subset_name == DEFAULT_SUBSET_NAME:
subset_name = YoloPath.DEFAULT_SUBSET_NAME
elif subset_name not in YoloPath.SUBSET_NAMES:
log.warning(
"Skipping subset export '%s'. "
"If specified, the only valid names are %s"
% (subset_name, ", ".join("'%s'" % s for s in YoloPath.SUBSET_NAMES))
elif subset_name in YoloPath.RESERVED_CONFIG_KEYS:
raise DatasetExportError(
f"Can't export '{subset_name}' subset in YOLO format, this word is reserved."
)
continue

subset_dir = osp.join(save_dir, "obj_%s_data" % subset_name)
os.makedirs(subset_dir, exist_ok=True)
Expand Down
51 changes: 32 additions & 19 deletions datumaro/plugins/yolo_format/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,29 +71,24 @@ def __init__(

self._image_info = image_info

with open(config_path, "r", encoding="utf-8") as f:
config_lines = f.readlines()

subsets = OrderedDict()
names_path = None

for line in config_lines:
match = re.match(r"^\s*(\w+)\s*=\s*(.+)$", line)
if not match:
continue

key = match.group(1)
value = match.group(2)
if key == "names":
names_path = value
elif key in YoloPath.SUBSET_NAMES:
subsets[key] = value
else:
continue
config = self._parse_config(config_path)

names_path = config.get("names")
if not names_path:
raise InvalidAnnotationError(f"Failed to parse names file path from config")

# The original format is like this:
#
# classes = 2
# train = data/train.txt
# valid = data/test.txt
# names = data/obj.names
# backup = backup/
#
# To support more subset names, every other key is treated as a subset,
# and subsets called 'classes', 'names' and 'backup' are disallowed.
subsets = {k: v for k, v in config.items() if k not in YoloPath.RESERVED_CONFIG_KEYS}

for subset_name, list_path in subsets.items():
list_path = osp.join(self._path, self.localize_path(list_path))
if not osp.isfile(list_path):
Expand All @@ -114,6 +109,24 @@ def __init__(
)
}

@staticmethod
def _parse_config(path: str) -> Dict[str, str]:
    """
    Reads a YOLO config file (such as ``obj.data``) into a key-value mapping.

    Every meaningful line is expected to look like ``key = value``.
    Lines that do not match this form (blank lines, comments, etc.)
    are skipped. If a key occurs several times, the last value wins.

    Parameters:
        path: path to the config file, which is read as UTF-8 text

    Returns:
        A dict of the parsed keys and values, in the order of their first
        appearance in the file. Values have no leading or trailing
        whitespace.
    """
    entry_pattern = re.compile(r"^\s*(\w+)\s*=\s*(.+)$")

    config = {}

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            match = entry_pattern.match(line)
            if not match:
                continue

            # The pattern keeps trailing spaces and tabs in the value,
            # and such a value would later produce a non-existent file
            # path, so the surrounding whitespace is removed here.
            config[match.group(1)] = match.group(2).strip()

    return config

@staticmethod
def localize_path(path: str) -> str:
"""
Expand Down
1 change: 1 addition & 0 deletions datumaro/plugins/yolo_format/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
class YoloPath:
    """Names and keys with a special meaning in the YOLO dataset layout."""

    # Keys of the config file that are not subset names
    RESERVED_CONFIG_KEYS = [
        "backup",
        "classes",
        "names",
    ]

    # Subset names of the original format
    SUBSET_NAMES = [
        "train",
        "valid",
    ]

    # The subset name used for items with no (or the default) subset
    DEFAULT_SUBSET_NAME = "train"
13 changes: 6 additions & 7 deletions site/content/en/docs/formats/yolo.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ Supported annotation types:

YOLO format doesn't support attributes for annotations.

The format only supports subsets named `train` or `valid`.
The format supports arbitrary subset names, except `classes`, `names` and `backup`.

> Note that, by default, the YOLO framework does not expect any subset names
except `train` and `valid`; Datumaro supports other subset names as an extension.
If there is no subset separation in a project, the data
will be saved in the `train` subset.

## Import YOLO dataset

Expand Down Expand Up @@ -55,12 +60,6 @@ YOLO dataset directory should have the following structure:
├── image102.jpg
└── ...
```
> YOLO dataset cannot contain a subset with a name other than `train` or `valid`.
If an imported dataset contains such subsets, they will be ignored.
If you are exporting a project into YOLO format,
all subsets different from `train` and `valid` will be skipped.
If there is no subset separation in a project, the data
will be saved in `train` subset.

- `obj.data` should have the following content. It is not necessary to have both
subsets, but at least one of them is required:
Expand Down
57 changes: 7 additions & 50 deletions tests/cli/test_yolo_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_can_save_and_load_yolo_dataset(self):
"-f",
"yolo",
"--",
"--save-images",
"--save-media",
)

parsed_dataset = Dataset.import_from(export_dir, format="yolo")
Expand All @@ -73,7 +73,7 @@ def test_can_export_mot_as_yolo(self):
run(self, "import", "-p", test_dir, "-f", "mot_seq", mot_dir)

yolo_dir = osp.join(test_dir, "yolo_dir")
run(self, "export", "-p", test_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-images")
run(self, "export", "-p", test_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-media")

parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
compare_datasets(self, target_dataset, parsed_dataset)
Expand All @@ -91,7 +91,10 @@ def test_can_convert_voc_to_yolo(self):
Bbox(4.0, 5.0, 2.0, 2.0, label=15),
Bbox(5.5, 6, 2, 2, label=22),
],
)
),
DatasetItem(
id="2007_000002", subset="test", media=Image(data=np.ones((10, 20, 3)))
),
],
categories=[label.name for label in VOC.make_voc_categories()[AnnotationType.label]],
)
Expand All @@ -118,58 +121,12 @@ def test_can_convert_voc_to_yolo(self):
"-o",
yolo_dir,
"--",
"--save-images",
"--save-media",
)

parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
compare_datasets(self, target_dataset, parsed_dataset, require_media=True)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_ignore_non_supported_subsets(self):
source_dataset = Dataset.from_iterable(
[
DatasetItem(
id="img1",
subset="test",
media=Image(data=np.ones((10, 20, 3))),
annotations=[Bbox(1.0, 2.0, 1.0, 1.0, label=0)],
),
DatasetItem(
id="img2",
subset="train",
media=Image(data=np.ones((10, 5, 3))),
annotations=[Bbox(3.0, 1.0, 2.0, 1.0, label=1)],
),
],
categories=[str(i) for i in range(4)],
)

target_dataset = Dataset.from_iterable(
[
DatasetItem(
id="img2",
subset="train",
media=Image(data=np.ones((10, 5, 3))),
annotations=[Bbox(3.0, 1.0, 2.0, 1.0, label=1)],
)
],
categories=[str(i) for i in range(4)],
)

with TestDir() as test_dir:
dataset_dir = osp.join(test_dir, "dataset_dir")
source_dataset.save(dataset_dir, save_media=True)

proj_dir = osp.join(test_dir, "proj")
run(self, "create", "-o", proj_dir)
run(self, "import", "-p", proj_dir, "-f", "datumaro", dataset_dir)

yolo_dir = osp.join(test_dir, "yolo_dir")
run(self, "export", "-p", proj_dir, "-o", yolo_dir, "-f", "yolo", "--", "--save-images")

parsed_dataset = Dataset.import_from(yolo_dir, format="yolo")
compare_datasets(self, target_dataset, parsed_dataset)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_delete_labels_from_yolo_dataset(self):
target_dataset = Dataset.from_iterable(
Expand Down
1 change: 1 addition & 0 deletions tests/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Requirements:
DATUM_475 = "Support import for CelebA dataset"
DATUM_497 = "Support import for SYNTHIA dataset"
DATUM_542 = "Images missing after merging two datasets"
DATUM_565 = "Export in YOLO with custom subset name"
DATUM_580 = "Import for MPII Human Pose Dataset"
DATUM_609 = "Allow not to prepend data/ prefix in YOLO export"
DATUM_616 = "Import for BraTS dataset"
Expand Down
43 changes: 43 additions & 0 deletions tests/test_yolo_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from datumaro.components.environment import Environment
from datumaro.components.errors import (
AnnotationImportError,
DatasetExportError,
DatasetImportError,
InvalidAnnotationError,
ItemImportError,
Expand Down Expand Up @@ -262,6 +263,48 @@ def test_can_save_and_load_with_meta_file(self):
self.assertTrue(osp.isfile(osp.join(test_dir, "dataset_meta.json")))
compare_datasets(self, source_dataset, parsed_dataset)

@mark_requirement(Requirements.DATUM_565)
def test_can_save_and_load_with_custom_subset_name(self):
    """Checks the export/import round trip for a subset with a non-standard name."""
    boxes = [
        Bbox(0, 1, 5, 2, label=2),
        Bbox(0, 2, 3, 2, label=5),
    ]
    item = DatasetItem(
        id=3,
        subset="anything",
        media=Image(data=np.ones((8, 8, 3))),
        annotations=boxes,
    )
    label_names = [f"label_{idx}" for idx in range(10)]
    source_dataset = Dataset.from_iterable([item], categories=label_names)

    with TestDir() as test_dir:
        YoloConverter.convert(source_dataset, test_dir, save_media=True)
        parsed_dataset = Dataset.import_from(test_dir, "yolo")

        compare_datasets(self, source_dataset, parsed_dataset)

@mark_requirement(Requirements.DATUM_565)
def test_cant_save_with_reserved_subset_name(self):
    """Checks that exporting a subset named after a reserved config key fails."""
    # All the keys listed in YoloPath.RESERVED_CONFIG_KEYS are covered,
    # including "names"
    for subset in ["backup", "classes", "names"]:
        # Each name is reported separately, so one failure doesn't hide the others
        with self.subTest(subset=subset):
            dataset = Dataset.from_iterable(
                [
                    DatasetItem(
                        id=3,
                        subset=subset,
                        media=Image(data=np.ones((8, 8, 3))),
                    ),
                ],
                categories=["a"],
            )

            with TestDir() as test_dir:
                with self.assertRaisesRegex(
                    DatasetExportError, f"Can't export '{subset}' subset"
                ):
                    YoloConverter.convert(dataset, test_dir)

@mark_requirement(Requirements.DATUM_609)
def test_can_save_and_load_without_path_prefix(self):
source_dataset = Dataset.from_iterable(
Expand Down