|
2 | 2 | #
|
3 | 3 | # SPDX-License-Identifier: MIT
|
4 | 4 |
|
| 5 | +from __future__ import annotations |
| 6 | + |
5 | 7 | from collections import Counter
|
6 | 8 | from copy import deepcopy
|
7 | 9 | from enum import Enum, auto
|
8 | 10 | from itertools import chain
|
9 |
| -from typing import Dict, Iterable, List, Tuple, Union |
| 11 | +from typing import Dict, Iterable, List, Optional, Tuple, Union |
| 12 | +import argparse |
10 | 13 | import logging as log
|
11 | 14 | import os.path as osp
|
12 | 15 | import random
|
|
23 | 26 | from datumaro.components.cli_plugin import CliPlugin
|
24 | 27 | from datumaro.components.errors import DatumaroError
|
25 | 28 | from datumaro.components.extractor import (
|
26 |
| - DEFAULT_SUBSET_NAME, IExtractor, ItemTransform, Transform, |
| 29 | + DEFAULT_SUBSET_NAME, DatasetItem, IExtractor, ItemTransform, Transform, |
27 | 30 | )
|
28 | 31 | from datumaro.components.media import Image
|
29 |
| -from datumaro.util import NOTSET, parse_str_enum_value, take_by |
| 32 | +from datumaro.util import NOTSET, filter_dict, parse_str_enum_value, take_by |
30 | 33 | from datumaro.util.annotation_util import find_group_leader, find_instances
|
31 | 34 | import datumaro.util.mask_tools as mask_tools
|
32 | 35 |
|
@@ -306,7 +309,6 @@ class MapSubsets(ItemTransform, CliPlugin):
|
def _mapping_arg(s):
    """
    Command-line value converter: splits a 'first:second' string
    into its two parts.

    Returns the two parts as a list of strings.

    Raises argparse.ArgumentTypeError if the value does not consist of
    exactly two ':'-separated parts.
    """
    parts = s.split(':')
    if len(parts) != 2:
        # Give argparse a message to show; an empty ArgumentTypeError
        # produces an error line with no explanation.
        raise argparse.ArgumentTypeError(
            "Invalid mapping format of '%s'. "
            "Expected a pair of values separated by ':'." % s)
    return parts
|
312 | 314 |
|
@@ -354,7 +356,6 @@ class RandomSplit(Transform, CliPlugin):
|
def _split_arg(s):
    """
    Command-line value converter: splits a 'first:second' string
    and converts the second part to a float.

    Returns a (str, float) tuple.

    Raises argparse.ArgumentTypeError if the value does not consist of
    exactly two ':'-separated parts. A non-numeric second part raises
    ValueError from float(), as before.
    """
    parts = s.split(':')
    if len(parts) != 2:
        # Give argparse a message to show; an empty ArgumentTypeError
        # produces an error line with no explanation.
        raise argparse.ArgumentTypeError(
            "Invalid split format of '%s'. "
            "Expected two values separated by ':', "
            "the second one being a number." % s)
    return (parts[0], float(parts[1]))
|
360 | 361 |
|
@@ -497,7 +498,6 @@ class DefaultAction(Enum):
|
def _split_arg(s):
    """
    Command-line value converter: splits a 'first:second' string
    into a pair.

    Returns a (str, str) tuple.

    Raises argparse.ArgumentTypeError if the value does not consist of
    exactly two ':'-separated parts.
    """
    parts = s.split(':')
    if len(parts) != 2:
        # Give argparse a message to show; an empty ArgumentTypeError
        # produces an error line with no explanation.
        raise argparse.ArgumentTypeError(
            "Invalid pair format of '%s'. "
            "Expected two values separated by ':'." % s)
    return (parts[0], parts[1])
|
503 | 503 |
|
@@ -863,3 +863,141 @@ def transform_item(self, item):
|
863 | 863 | return self.wrap_item(item,
|
864 | 864 | image=resized_image,
|
865 | 865 | annotations=resized_annotations)
|
| 866 | + |
class RemoveItems(ItemTransform):
    """
    Allows to remove specific dataset items from dataset by their ids.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary samples.|n
    |n
    Examples:|n
    - Remove specific items from the dataset|n
    |s|s%(prog)s --id 'image1:train' --id 'image2:test'
    """

    @staticmethod
    def _parse_id(s):
        # Converter for a single '--id' command-line value.
        # Returns the ['name', 'subset'] parts of a 'name:subset' string.
        full_id = s.split(':')
        if len(full_id) != 2:
            # ArgumentTypeError is a plain Exception subclass: the message
            # must be the positional argument, keyword arguments are
            # rejected with a TypeError and the message would be lost.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        # Extends the parent parser with a required, repeatable '--id'
        # option; the parsed values are collected in 'ids'.
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append', required=True,
            help="Item id to remove. Id is 'name:subset' pair (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor, ids: Iterable[Tuple[str, str]]):
        # ids - (name, subset) pairs of the items to be removed
        super().__init__(extractor)
        # Normalize the entries (e.g. the lists produced by _parse_id)
        # to hashable tuples for constant-time membership checks.
        self._ids = {tuple(v) for v in (ids or [])}

    def transform_item(self, item):
        # Listed items are replaced with None; the other items are passed
        # through unchanged.
        # NOTE(review): presumably ItemTransform skips None results,
        # which is what removes the item - confirm in the base class.
        if (item.id, item.subset) in self._ids:
            return None
        return item
| 903 | + |
class RemoveAnnotations(ItemTransform):
    """
    Allows to remove annotations on specific dataset items.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary annotations.|n
    |n
    Examples:|n
    - Remove annotations from specific items in the dataset|n
    |s|s%(prog)s --id 'image1:train' --id 'image2:test'
    """

    @staticmethod
    def _parse_id(s):
        # Converter for a single '--id' command-line value.
        # Returns the ['name', 'subset'] parts of a 'name:subset' string.
        full_id = s.split(':')
        if len(full_id) != 2:
            # ArgumentTypeError is a plain Exception subclass: the message
            # must be the positional argument, keyword arguments are
            # rejected with a TypeError and the message would be lost.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        # Extends the parent parser with an optional, repeatable '--id'
        # option; the parsed values are collected in 'ids'.
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append',
            help="Image id to clean from annotations. "
                "Id is 'name:subset' pair. If not specified, removes "
                "all annotations (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor, *,
            ids: Optional[Iterable[Tuple[str, str]]] = None):
        # ids - (name, subset) pairs of the items to be cleaned.
        # None or an empty collection selects every item.
        super().__init__(extractor)
        # Normalize the entries (e.g. the lists produced by _parse_id)
        # to hashable tuples for constant-time membership checks.
        self._ids = {tuple(v) for v in (ids or [])}

    def transform_item(self, item: DatasetItem):
        # An empty id set means "all items"; otherwise only the listed
        # items get their annotations replaced with an empty list.
        if not self._ids or (item.id, item.subset) in self._ids:
            return item.wrap(annotations=[])
        return item
| 943 | + |
class RemoveAttributes(ItemTransform):
    """
    Allows to remove item and annotation attributes in a dataset.|n
    |n
    Can be useful to clean the dataset from broken or unnecessary attributes.|n
    |n
    Examples:|n
    - Remove the `is_crowd` attribute from dataset|n
    |s|s%(prog)s --attr 'is_crowd'|n
    |n
    - Remove the `occluded` attribute from annotations of|n
    |s|sthe `2010_001705` item in the `train` subset|n
    |s|s%(prog)s --id '2010_001705:train' --attr 'occluded'
    """

    @staticmethod
    def _parse_id(s):
        # Converter for a single '--id' command-line value.
        # Returns the ['name', 'subset'] parts of a 'name:subset' string.
        full_id = s.split(':')
        if len(full_id) != 2:
            # ArgumentTypeError is a plain Exception subclass: the message
            # must be the positional argument, keyword arguments are
            # rejected with a TypeError and the message would be lost.
            raise argparse.ArgumentTypeError(
                "Invalid id format of '%s'. "
                "Expected a 'name:subset' pair." % s)
        return full_id

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        # Extends the parent parser with two optional, repeatable options:
        # '--id' (collected in 'ids') and '-a/--attr' (collected in
        # 'attributes').
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument('--id', dest='ids', type=cls._parse_id,
            action='append',
            help="Image id to clean from attributes. "
                "Id is 'name:subset' pair. If not specified, "
                "affects all images and annotations (repeatable)")
        parser.add_argument('-a', '--attr', action='append', dest='attributes',
            help="Attribute name to be removed. If not specified, "
                "removes all attributes (repeatable)")
        return parser

    def __init__(self, extractor: IExtractor,
            ids: Optional[Iterable[Tuple[str, str]]] = None,
            attributes: Optional[Iterable[str]] = None):
        # ids - (name, subset) pairs of the items to be processed.
        #   None or an empty collection selects every item.
        # attributes - names of the attributes to be removed.
        #   None or an empty collection selects every attribute.
        super().__init__(extractor)
        # Normalize the entries (e.g. the lists produced by _parse_id)
        # to hashable tuples for constant-time membership checks.
        self._ids = {tuple(v) for v in (ids or [])}
        self._attributes = set(attributes or [])

    def _filter_attrs(self, attrs):
        # Returns the attributes that must be kept.
        if not self._attributes:
            # No names were given - everything has to be removed.
            # NOTE(review): None is handed over to wrap(); presumably it
            # is treated there as "no attributes" - confirm.
            return None
        return filter_dict(attrs, exclude_keys=self._attributes)

    def transform_item(self, item: DatasetItem):
        # An empty id set means "all items"; the other items are passed
        # through unchanged.
        if not self._ids or (item.id, item.subset) in self._ids:
            # Attributes are filtered both on every annotation
            # and on the item itself.
            filtered_annotations = [
                ann.wrap(attributes=self._filter_attrs(ann.attributes))
                for ann in item.annotations
            ]

            return item.wrap(attributes=self._filter_attrs(item.attributes),
                annotations=filtered_annotations)
        return item
0 commit comments