Add TabularValidator #1498


Merged · 17 commits · May 22, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  (<https://github.com/openvinotoolkit/datumaro/pull/1475>)
- Add AstypeAnnotations Transform
  (<https://github.com/openvinotoolkit/datumaro/pull/1484>)
- Add TabularValidator
  (<https://github.com/openvinotoolkit/datumaro/pull/1498>)

### Enhancements
- Fix ambiguous COCO format detector
42 changes: 40 additions & 2 deletions docs/source/docs/command-reference/context_free/validate.md
@@ -5,8 +5,8 @@
This command inspects annotations with respect to the task type
and stores the results in a JSON file.

The task types supported are `classification`, `detection`, and
`segmentation` (the `-t/--task-type` parameter).
The task types supported are `classification`, `detection`, `segmentation`, and
`tabular` (the `-t/--task-type` parameter).

The validation result contains
- `annotation statistics` based on the task type
@@ -82,6 +82,14 @@ Examples:
| InvalidValue | There's an invalid (e.g. inf, nan) value for bounding box info. | detection |
| FarFromLabelMean | An annotation has a value that is too small or large compared to the label average | detection, segmentation |
| FarFromAttrMean | An annotation has a value that is too small or large compared to the attribute average | detection, segmentation |
| BrokenAnnotation | Some annotations are missing for an item | tabular |
| EmptyLabel | A value of the label column is not defined for an item | tabular |
| EmptyCaption | A value of the caption column is not defined for an item | tabular |
| FewSamplesInCaption | The number of samples in a caption might be too low | tabular |
| RedundanciesInCaption | A caption for an item contains redundancies (e.g. stopwords or URLs) | tabular |
| ImbalancedCaptions | There is an imbalance in the caption distribution | tabular |
| ImbalancedDistInCaption | Values are not evenly distributed for a numeric caption | tabular |
| FarFromCaptionMean | An annotation of a numeric caption has a value that is too small or large compared to the caption average | tabular |
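
The tabular checks can also be exercised from the Python API. A minimal sketch, assuming `TabularValidator` follows the same `validate(dataset)` interface as the existing validators (the dataset path and format below are placeholders):

```python
import json

from datumaro.components.dataset import Dataset
from datumaro.plugins.validators import TabularValidator

# Placeholder path/format; substitute your own tabular dataset.
dataset = Dataset.import_from("./my_tabular_dataset", "tabular")

# Produces the report structure documented below, including the
# tabular statistics and a `validation_reports` list.
report = TabularValidator().validate(dataset)

with open("validation_report.json", "w") as f:
    json.dump(report, f, indent=2, default=str)
```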

Validation Result Format:

@@ -146,6 +154,36 @@ Validation Result Format:
# }
'mask_distribution_in_dataset_item': <dict>,
# '<item_key>': <mask/polygon count: int>

## statistics for tabular task
'items_broken_annotation': <list>,  # [<item_key>, ]
'label_distribution': {
    'defined_labels': <dict>,  # <label:str>: <count:int>
    'empty_labels': <dict>
    # <label:str>: {
    #     'count': <int>,
    #     'items_with_empty_label': [<item_key>, ]
    # }
},
'caption_distribution': {
    'defined_captions': <dict>,  # <caption:str>: <count:int>
    'empty_captions': <dict>
    # <caption:str>: {
    #     'count': <int>,
    #     'items_with_empty_label': [<item_key>, ]
    # }
    'redundancies': <dict>
    # <caption:str>: {
    #     'stopword': <dict>,
    #         'count': <int>,
    #         'items_with_redundancies': [<item_key>, ]
    #     'url': <dict>,
    #         'count': <int>,
    #         'items_with_redundancies': [<item_key>, ]
    # }
},

},
'validation_reports': <list>, # [ <validation_error_format>, ]
# validation_error_format = {
3 changes: 3 additions & 0 deletions requirements-core.txt
@@ -61,3 +61,6 @@ scikit-learn

# Stream JSON parser
json-stream

# TabularValidator
nltk
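
`nltk` backs the caption redundancy checks (stopword and URL noise). A rough sketch of what a stopword-based check can look like; the helper below is illustrative, not the validator's actual implementation:

```python
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)  # one-time corpus fetch

def find_stopword_redundancies(caption):
    # Collect the words of a caption that are English stopwords.
    words = caption.lower().split()
    sw = set(stopwords.words("english"))
    return [w for w in words if w in sw]

print(find_stopword_redundancies("The cat sat on the mat"))
# -> ['the', 'on', 'the']
```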
84 changes: 83 additions & 1 deletion src/datumaro/components/errors.py
@@ -1,4 +1,4 @@
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

@@ -540,6 +540,32 @@ def __str__(self):
return f"Item needs '{self.ann_type}' annotation(s), " "but not found."


@define(auto_exc=False)
class BrokenAnnotation(DatasetItemValidationError):
    ann_type = field()

    def __str__(self):
        return f"Item needs all '{self.ann_type}' annotations, but some are missing."


@define(auto_exc=False)
class EmptyLabel(DatasetItemValidationError):
    label_name = field()

    def __str__(self):
        return f"Item should have the label '{self.label_name}' annotation(s), but none was found."


@define(auto_exc=False)
class EmptyCaption(DatasetItemValidationError):
    caption_name = field()

    def __str__(self):
        return (
            f"Item should have the caption '{self.caption_name}' annotation(s), but none was found."
        )


@define(auto_exc=False)
class MultiLabelAnnotations(DatasetItemValidationError):
    def __str__(self):
@@ -633,6 +659,31 @@ def __str__(self):
        )


@define(auto_exc=False)
class FewSamplesInCaption(DatasetValidationError):
    caption_name = field()
    count = field()

    def __str__(self):
        return (
            f"The number of samples in the caption '{self.caption_name}'"
            f" might be too low. Found '{self.count}' samples."
        )


@define(auto_exc=False)
class RedundanciesInCaption(DatasetValidationError):
    caption_name = field()
    redundancy_type = field()
    count = field()

    def __str__(self):
        return (
            f"Found '{self.count}' '{self.redundancy_type}' redundancies"
            f" in the caption '{self.caption_name}'."
        )


@define(auto_exc=False)
class FewSamplesInAttribute(DatasetValidationError):
    label_name = field()
@@ -655,6 +706,12 @@ def __str__(self):
        return "There is an imbalance in the label distribution."


@define(auto_exc=False)
class ImbalancedCaptions(DatasetValidationError):
    def __str__(self):
        return "There is an imbalance in the caption distribution."


@define(auto_exc=False)
class ImbalancedAttribute(DatasetValidationError):
    label_name = field()
@@ -678,6 +735,14 @@ def __str__(self):
        )


@define(auto_exc=False)
class ImbalancedDistInCaption(DatasetValidationError):
    caption_name = field()

    def __str__(self):
        return f"Values are not evenly distributed for '{self.caption_name}' caption."


@define(auto_exc=False)
class ImbalancedDistInAttribute(DatasetValidationError):
    label_name = field()
@@ -737,6 +802,23 @@ def __str__(self):
        )


@define(auto_exc=False)
class FarFromCaptionMean(DatasetItemValidationError):
    caption_name = field()
    ann_id = field()
    prop = field()
    mean = field()
    val = field()

    def __str__(self):
        return (
            f"Annotation '{self.ann_id}' in "
            f"the item has a value of '{self.prop}' that "
            "is too far from the caption average. (mean of "
            f"'{self.caption_name}' caption: {self.mean}, got '{self.val}')."
        )


@define(auto_exc=False)
class FarFromAttrMean(DatasetItemValidationError):
    label_name = field()
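
The new checks follow the module's existing attrs pattern: `@define(auto_exc=False)` generates the constructor and `__str__` renders the report message. A sketch of how a message is produced, assuming the validation-error base classes contribute a leading `severity` field as the surrounding classes do:

```python
from datumaro.components.errors import FewSamplesInCaption, RedundanciesInCaption
from datumaro.components.validator import Severity  # assumed home of the Severity enum

# attrs builds __init__ in field order, after the base `severity` field.
err = FewSamplesInCaption(Severity.info, caption_name="price", count=2)
print(err)
# -> The number of samples in the caption 'price' might be too low. Found '2' samples.

err = RedundanciesInCaption(Severity.warning, "title", "stopword", 17)
print(err)
# -> Found '17' 'stopword' redundancies in the caption 'title'.
```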
1 change: 1 addition & 0 deletions src/datumaro/components/validator.py
@@ -19,6 +19,7 @@ class TaskType(Enum):
    classification = auto()
    detection = auto()
    segmentation = auto()
    tabular = auto()


class Validator(CliPlugin):
5 changes: 5 additions & 0 deletions src/datumaro/plugins/specs.json
@@ -1968,5 +1968,10 @@
        "import_path": "datumaro.plugins.validators.SegmentationValidator",
        "plugin_name": "segmentation",
        "plugin_type": "Validator"
    },
    {
        "import_path": "datumaro.plugins.validators.TabularValidator",
        "plugin_name": "tabular",
        "plugin_type": "Validator"
    }
]
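
With this entry in place, the validator resolves by plugin name. A sketch, assuming `Environment` exposes a `validators` registry for this plugin type as it does for the others in this file:

```python
from datumaro.components.environment import Environment

# "tabular" maps to datumaro.plugins.validators.TabularValidator
# via the specs.json entry above.
env = Environment()
validator_cls = env.validators.get("tabular")
print(validator_cls)
```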
6 changes: 4 additions & 2 deletions src/datumaro/plugins/transforms.py
@@ -1536,12 +1536,14 @@ def categories(self):
        return self._categories

    def transform_item(self, item: DatasetItem):
        import pandas as pd

        annotations = [
            Label(label=self._id_mapping[name + self._sep_token + str(value)])
            if self._tabular_cat_types.get(name) == CategoricalDtype() and value is not None
            else Caption(value)
            else Caption(name + self._sep_token + str(value))
            for name, value in item.annotations[0].values.items()
            if value is not None
            if not pd.isna(value)
        ]

        return self.wrap_item(item, annotations=annotations)
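
The filter change from `value is not None` to `not pd.isna(value)` matters because missing cells in a pandas-backed table surface as NaN rather than None; NaN passes an `is not None` check but is still a missing value. A small demonstration:

```python
import pandas as pd

missing = float("nan")  # how pandas represents an empty cell
print(missing is not None)  # True  -> the old check would keep it
print(pd.isna(missing))     # True  -> the new check filters it out
print(pd.isna(None))        # True  -> None is still covered
```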