Skip to content

Commit d7f3d85

Browse files
Maxim Zhiltsovchuneuny-emily
andauthored
Add removal transforms (#670)
* add item, annotation and attribute removal transforms * update docs * add tests Co-authored-by: Emily <[email protected]>
1 parent 1724ccf commit d7f3d85

File tree

4 files changed

+332
-7
lines changed

4 files changed

+332
-7
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3232
<https://github.com/openvinotoolkit/datumaro/pull/659>)
3333
- A way for formats to signal that they don't support detection
3434
(<https://github.com/openvinotoolkit/datumaro/pull/665>)
35+
- Removal transforms to remove items/annotations/attributes from a dataset
36+
(`remove_items`, `remove_annotations`, `remove_attributes`)
37+
(<https://github.com/openvinotoolkit/datumaro/pull/670>)
3538

3639
### Changed
3740
- Allowed direct file paths in `datum import`. Such sources are imported like

datumaro/plugins/transforms.py

Lines changed: 144 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
#
33
# SPDX-License-Identifier: MIT
44

5+
from __future__ import annotations
6+
57
from collections import Counter
68
from copy import deepcopy
79
from enum import Enum, auto
810
from itertools import chain
9-
from typing import Dict, Iterable, List, Tuple, Union
11+
from typing import Dict, Iterable, List, Optional, Tuple, Union
12+
import argparse
1013
import logging as log
1114
import os.path as osp
1215
import random
@@ -23,10 +26,10 @@
2326
from datumaro.components.cli_plugin import CliPlugin
2427
from datumaro.components.errors import DatumaroError
2528
from datumaro.components.extractor import (
26-
DEFAULT_SUBSET_NAME, IExtractor, ItemTransform, Transform,
29+
DEFAULT_SUBSET_NAME, DatasetItem, IExtractor, ItemTransform, Transform,
2730
)
2831
from datumaro.components.media import Image
29-
from datumaro.util import NOTSET, parse_str_enum_value, take_by
32+
from datumaro.util import NOTSET, filter_dict, parse_str_enum_value, take_by
3033
from datumaro.util.annotation_util import find_group_leader, find_instances
3134
import datumaro.util.mask_tools as mask_tools
3235

@@ -306,7 +309,6 @@ class MapSubsets(ItemTransform, CliPlugin):
306309
def _mapping_arg(s):
307310
parts = s.split(':')
308311
if len(parts) != 2:
309-
import argparse
310312
raise argparse.ArgumentTypeError()
311313
return parts
312314

@@ -354,7 +356,6 @@ class RandomSplit(Transform, CliPlugin):
354356
def _split_arg(s):
355357
parts = s.split(':')
356358
if len(parts) != 2:
357-
import argparse
358359
raise argparse.ArgumentTypeError()
359360
return (parts[0], float(parts[1]))
360361

@@ -497,7 +498,6 @@ class DefaultAction(Enum):
497498
def _split_arg(s):
498499
parts = s.split(':')
499500
if len(parts) != 2:
500-
import argparse
501501
raise argparse.ArgumentTypeError()
502502
return (parts[0], parts[1])
503503

@@ -863,3 +863,141 @@ def transform_item(self, item):
863863
return self.wrap_item(item,
864864
image=resized_image,
865865
annotations=resized_annotations)
866+
867+
class RemoveItems(ItemTransform):
    """
    Allows to remove specific dataset items from dataset by their ids.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary samples.|n
    |n
    Examples:|n
    - Remove specific items from the dataset|n
    |s|s%(prog)s --id 'image1:train' --id 'image2:test'
    """
    # NOTE(review): the class docstring doubles as the CLI help text, so its
    # wording and the `|n` / `|s` markers are kept untouched.

    @staticmethod
    def _parse_id(s):
        """
        Parses a command-line item id of the form 'name:subset'.

        Returns the two-element list [name, subset].

        Raises argparse.ArgumentTypeError if the string does not consist of
        exactly two ':'-separated parts.
        """
        full_id = s.split(':')
        if len(full_id) != 2:
            # Exceptions take no keyword arguments: passing 'message=' would
            # raise TypeError here and argparse would print a generic error
            # instead of this text. The message must be positional.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """
        Extends the parser built by the parent class with the required,
        repeatable '--id' option. Parsed ids are collected in 'ids'.
        """
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append', required=True,
            help="Item id to remove. Id is 'name:subset' pair (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor, ids: Iterable[Tuple[str, str]]):
        """
        Parameters:
            extractor - the source of items to be filtered
            ids - (name, subset) pairs of the items to be removed.
                A falsy value (None, empty) means nothing is removed.
        """
        super().__init__(extractor)

        # Pairs may arrive as lists (e.g. from _parse_id), so they are
        # converted to tuples to be hashable and comparable with
        # (item.id, item.subset)
        self._ids = set(tuple(v) for v in (ids or []))

    def transform_item(self, item):
        """
        Returns None for an item whose (id, subset) pair is listed for
        removal, and the unchanged item otherwise.
        """
        if (item.id, item.subset) in self._ids:
            return None
        return item
903+
904+
class RemoveAnnotations(ItemTransform):
    """
    Allows to remove annotations on specific dataset items.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary annotations.|n
    |n
    Examples:|n
    - Remove annotations from specific items in the dataset|n
    |s|s%(prog)s --id 'image1:train' --id 'image2:test'
    """
    # NOTE(review): the class docstring doubles as the CLI help text, so its
    # wording and the `|n` / `|s` markers are kept untouched.

    @staticmethod
    def _parse_id(s):
        """
        Parses a command-line item id of the form 'name:subset'.

        Returns the two-element list [name, subset].

        Raises argparse.ArgumentTypeError if the string does not consist of
        exactly two ':'-separated parts.
        """
        full_id = s.split(':')
        if len(full_id) != 2:
            # Exceptions take no keyword arguments: passing 'message=' would
            # raise TypeError here and argparse would print a generic error
            # instead of this text. The message must be positional.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """
        Extends the parser built by the parent class with the optional,
        repeatable '--id' option. Parsed ids are collected in 'ids'.
        """
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append',
            help="Image id to clean from annotations. "
                "Id is 'name:subset' pair. If not specified, removes "
                "all annotations (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor, *,
            ids: Optional[Iterable[Tuple[str, str]]] = None):
        """
        Parameters:
            extractor - the source of items to be processed
            ids - (name, subset) pairs of the items to be cleaned.
                If not specified or empty, all items are affected.
        """
        super().__init__(extractor)

        # Pairs may arrive as lists (e.g. from _parse_id), so they are
        # converted to tuples to be hashable and comparable with
        # (item.id, item.subset)
        self._ids = set(tuple(v) for v in (ids or []))

    def transform_item(self, item: DatasetItem):
        """
        Returns a copy of the item with an empty annotation list if the item
        is selected, and the unchanged item otherwise.
        """
        # An empty id set selects every item
        if not self._ids or (item.id, item.subset) in self._ids:
            return item.wrap(annotations=[])
        return item
943+
944+
class RemoveAttributes(ItemTransform):
    """
    Allows to remove item and annotation attributes in a dataset.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary attributes.|n
    |n
    Examples:|n
    - Remove the `is_crowd` attribute from dataset|n
    |s|s%(prog)s --attr 'is_crowd'|n
    |n
    - Remove the `occluded` attribute from annotations of|n
    |s|sthe `2010_001705` item in the `train` subset|n
    |s|s%(prog)s --id '2010_001705:train' --attr 'occluded'
    """
    # NOTE(review): the class docstring doubles as the CLI help text, so its
    # wording and the `|n` / `|s` markers are kept untouched.

    @staticmethod
    def _parse_id(s):
        """
        Parses a command-line item id of the form 'name:subset'.

        Returns the two-element list [name, subset].

        Raises argparse.ArgumentTypeError if the string does not consist of
        exactly two ':'-separated parts.
        """
        full_id = s.split(':')
        if len(full_id) != 2:
            # Exceptions take no keyword arguments: passing 'message=' would
            # raise TypeError here and argparse would print a generic error
            # instead of this text. The message must be positional.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """
        Extends the parser built by the parent class with the optional,
        repeatable '--id' and '-a/--attr' options. Parsed values are
        collected in 'ids' and 'attributes' respectively.
        """
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append',
            help="Item id to clean from attributes. "
                "Id is 'name:subset' pair. If not specified, "
                "affects all items and annotations (repeatable)")
        parser.add_argument('-a', '--attr', action='append', dest='attributes',
            help="Attribute name to be removed. If not specified, "
                "removes all attributes (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor,
            ids: Optional[Iterable[Tuple[str, str]]] = None,
            attributes: Optional[Iterable[str]] = None):
        """
        Parameters:
            extractor - the source of items to be processed
            ids - (name, subset) pairs of the items to be cleaned.
                If not specified or empty, all items are affected.
            attributes - names of the attributes to be removed.
                If not specified or empty, all attributes are removed.
        """
        super().__init__(extractor)

        # Pairs may arrive as lists (e.g. from _parse_id), so they are
        # converted to tuples to be hashable and comparable with
        # (item.id, item.subset)
        self._ids = set(tuple(v) for v in (ids or []))
        self._attributes = set(attributes or [])

    def _filter_attrs(self, attrs):
        """
        Returns the attributes that must remain after the removal.

        When no attribute names were requested, everything is removed and
        None is returned. Otherwise, returns the result of filter_dict()
        with the requested names excluded.
        """
        if not self._attributes:
            # NOTE(review): relies on wrap(attributes=None) producing
            # an empty attribute set - confirm against the wrap() behavior
            return None
        return filter_dict(attrs, exclude_keys=self._attributes)

    def transform_item(self, item: DatasetItem):
        """
        Returns a copy of the item with filtered item-level and
        annotation-level attributes if the item is selected,
        and the unchanged item otherwise.
        """
        # A non-empty id set restricts the transform to the listed items
        if self._ids and (item.id, item.subset) not in self._ids:
            return item

        filtered_annotations = [
            ann.wrap(attributes=self._filter_attrs(ann.attributes))
            for ann in item.annotations
        ]

        return item.wrap(attributes=self._filter_attrs(item.attributes),
            annotations=filtered_annotations)

site/content/en/docs/user-manual/command-reference/transform.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ Basic dataset item manipulations:
9696
- [`label_random_sampler`](#label_random_sampler-transform) - Leaves at least
9797
k images with annotations per class
9898
- [`resize`](#resize-transform) - Resizes images and annotations in the dataset
99+
- [`remove_items`](#remove_images-transform) - Removes specific dataset items
100+
- [`remove_annotations`](#remove_annotations-transform) - Removes annotations
101+
- [`remove_attributes`](#remove_attributes-transform) - Removes attributes
99102

100103
Subset manipulations:
101104
- [`random_split`](#random_split-transform) - Splits dataset into subsets
@@ -400,6 +403,80 @@ Resize all images to 256x256 size
400403
datum transform -t resize -- -dw 256 -dh 256
401404
```
402405

406+
##### `remove_items` <a id="remove_images-transform"></a>
407+
408+
Removes specific dataset items by their ids.
409+
410+
Usage:
411+
```bash
412+
remove_items [-h] [--id IDs]
413+
```
414+
415+
Optional arguments:
416+
- `-h`, `--help` (flag) - Show this help message and exit
417+
- `--id` (str) - Item id to remove. Id is '<name>:<subset>' pair (repeatable)
418+
419+
Examples:
420+
421+
Remove specific images from the dataset
422+
```bash
423+
datum transform -t remove_items -- --id 'image1:train' --id 'image2:test'
424+
```
425+
426+
##### `remove_annotations` <a id="remove_annotations-transform"></a>
427+
428+
Allows to remove annotations on specific dataset items.
429+
430+
Can be useful to clean the dataset from broken or unnecessary annotations.
431+
432+
Usage:
433+
```bash
434+
remove_annotations [-h] [--id IDs]
435+
```
436+
437+
Optional arguments:
438+
- `-h`, `--help` (flag) - Show this help message and exit
439+
- `--id` (str) - Item id to clean from annotations. Id is '<name>:<subset>' pair.
440+
If not specified, removes all annotations (repeatable)
441+
442+
Examples:
443+
Remove annotations from specific items in the dataset
444+
```bash
445+
datum transform -t remove_annotations -- --id 'image1:train' --id 'image2:test'
446+
```
447+
448+
##### `remove_attributes` <a id="remove_attributes-transform"></a>
449+
450+
Allows to remove item and annotation attributes in a dataset.
451+
452+
Can be useful to clean the dataset from broken or unnecessary attributes.
453+
454+
Usage:
455+
```bash
456+
remove_attributes [-h] [--id IDs] [--attr ATTRIBUTE_NAME]
457+
```
458+
459+
Optional arguments:
460+
- `-h`, `--help` (flag) - Show this help message and exit
461+
- `--id` (str) - Item id to clean from attributes. Id is '<name>:<subset>' pair.
462+
If not specified, affects all items and annotations (repeatable)
463+
- `-a`, `--attr` (str) - Attribute name to be removed. If not specified,
464+
removes all attributes (repeatable)
465+
466+
Examples:
467+
Remove the `is_crowd` attribute from dataset
468+
```bash
469+
datum transform -t remove_attributes -- \
470+
--attr 'is_crowd'
471+
```
472+
473+
Remove the `occluded` attribute from annotations of
474+
the `2010_001705` item in the `train` subset
475+
```bash
476+
datum transform -t remove_attributes -- \
477+
--id '2010_001705:train' --attr 'occluded'
478+
```
479+
403480
##### `random_split` <a id="random_split-transform"></a>
404481

405482
Joins all subsets into one and splits the result into few parts.

0 commit comments

Comments
 (0)