"""Dataloader for Datumaro format.

Note: This currently only works for annotations exported from Intel Geti™.
"""

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
from pathlib import Path

import pandas as pd
from torchvision.transforms.v2 import Transform

from anomalib import TaskType
from anomalib.data.base import AnomalibDataModule, AnomalibDataset
from anomalib.data.utils import LabelName, Split, TestSplitMode, ValSplitMode


def make_datumaro_dataset(root: str | Path, split: str | Split | None = None) -> pd.DataFrame:
    """Make Datumaro Dataset.

    Assumes the following directory structure:

    dataset
    ├── annotations
    │   └── default.json
    └── images
        └── default
            ├── image1.jpg
            ├── image2.jpg
            └── ...

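    The parser expects the (abridged) ``default.json`` layout sketched below.
    This sketch is inferred from the parsing code in this function rather than
    from a format specification, so treat the field names as assumptions:

    .. code-block:: python

        # Abridged structure of annotations/default.json as read by this function.
        {
            "categories": {"label": {"labels": [{"name": "Normal"}, ...]}},
            "items": [
                {"image": {"path": "image1.jpg"}, "annotations": [{"label_id": 0}]},
            ],
        }
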
    Args:
        root (str | Path): Path to the dataset root directory.
        split (str | Split | None): Split of the dataset, usually Split.TRAIN or Split.TEST.
            Defaults to ``None``.

    Examples:
        >>> root = Path("path/to/dataset")
        >>> samples = make_datumaro_dataset(root)
        >>> samples.head()
                   image_path   label  label_index        split mask_path
        0  path/to/dataset...  Normal            0  Split.TRAIN
        1  path/to/dataset...  Normal            0  Split.TRAIN
        2  path/to/dataset...  Normal            0  Split.TRAIN
        3  path/to/dataset...  Normal            0  Split.TRAIN
        4  path/to/dataset...  Normal            0  Split.TRAIN

    Returns:
        DataFrame: An output dataframe containing samples for the requested split (i.e., train or test).
    """
    annotation_file = Path(root) / "annotations" / "default.json"
    with annotation_file.open() as f:
        annotations = json.load(f)

    # Map label index to label name, e.g. {0: "Normal", 1: "Anomalous"}.
    categories = annotations["categories"]
    categories = {idx: label["name"] for idx, label in enumerate(categories["label"]["labels"])}

    samples = []
    for item in annotations["items"]:
        image_path = Path(root) / "images" / "default" / item["image"]["path"]
        # Each item is assumed to carry a single image-level annotation.
        label_index = item["annotations"][0]["label_id"]
        label = categories[label_index]
        samples.append({
            "image_path": str(image_path),
            "label": label,
            "label_index": label_index,
            "split": None,
            "mask_path": "",  # mask is provided in the annotation file and is not on disk.
        })
    samples_df = pd.DataFrame(
        samples,
        columns=["image_path", "label", "label_index", "split", "mask_path"],
        index=range(len(samples)),
    )
    # Create test/train split.
    # By default, assign all "Normal" samples to train and all "Anomalous" samples to test.
    samples_df.loc[samples_df["label_index"] == LabelName.NORMAL, "split"] = Split.TRAIN
    samples_df.loc[samples_df["label_index"] == LabelName.ABNORMAL, "split"] = Split.TEST

    # Get the data frame for the requested split.
    if split:
        samples_df = samples_df[samples_df.split == split].reset_index(drop=True)

    return samples_df


class DatumaroDataset(AnomalibDataset):
    """Datumaro dataset class.

    Args:
        task (TaskType): Task type, ``classification``, ``detection`` or ``segmentation``.
        root (str | Path): Path to the dataset root directory.
        transform (Transform, optional): Transforms that should be applied to the input images.
            Defaults to ``None``.
        split (str | Split | None): Split of the dataset, usually Split.TRAIN or Split.TEST.
            Defaults to ``None``.

    Examples:
        .. code-block:: python

            from anomalib.data.image.datumaro import DatumaroDataset
            from torchvision.transforms.v2 import Resize

            dataset = DatumaroDataset(
                root="path/to/dataset",
                task="classification",
                transform=Resize((256, 256)),
            )
            print(dataset[0].keys())
            # Output: dict_keys(['image_path', 'label', 'image'])

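        The loaded samples are exposed as a pandas DataFrame (built by
        ``make_datumaro_dataset`` above) and can be inspected directly;
        a small sketch:

        .. code-block:: python

            # Columns: image_path, label, label_index, split, mask_path.
            print(dataset.samples.head())
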
    """

    def __init__(
        self,
        task: TaskType,
        root: str | Path,
        transform: Transform | None = None,
        split: str | Split | None = None,
    ) -> None:
        super().__init__(task, transform)
        self.split = split
        self.samples = make_datumaro_dataset(root, split)


class Datumaro(AnomalibDataModule):
    """Datumaro datamodule.

    Args:
        root (str | Path): Path to the dataset root directory.
        train_batch_size (int): Batch size for training dataloader.
            Defaults to ``32``.
        eval_batch_size (int): Batch size for evaluation dataloader.
            Defaults to ``32``.
        num_workers (int): Number of workers for dataloaders.
            Defaults to ``8``.
        task (TaskType): Task type, ``classification``, ``detection`` or ``segmentation``.
            Defaults to ``TaskType.CLASSIFICATION``. Currently only supports classification.
        image_size (tuple[int, int], optional): Size to which input images should be resized.
            Defaults to ``None``.
        transform (Transform, optional): Transforms that should be applied to the input images.
            Defaults to ``None``.
        train_transform (Transform, optional): Transforms that should be applied to the input images during training.
            Defaults to ``None``.
        eval_transform (Transform, optional): Transforms that should be applied to the input images during evaluation.
            Defaults to ``None``.
        test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained.
            Defaults to ``TestSplitMode.FROM_DIR``.
        test_split_ratio (float): Fraction of images from the train set that will be reserved for testing.
            Defaults to ``0.5``.
        val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained.
            Defaults to ``ValSplitMode.FROM_TEST``.
        val_split_ratio (float): Fraction of train or test images that will be reserved for validation.
            Defaults to ``0.5``.
        seed (int | None, optional): Seed which may be set to a fixed value for reproducibility.
            Defaults to ``None``.

    Examples:
        To create a Datumaro datamodule:

        >>> from pathlib import Path
        >>> from torchvision.transforms.v2 import Resize
        >>> root = Path("path/to/dataset")
        >>> datamodule = Datumaro(root, transform=Resize((256, 256)))
        >>> datamodule.setup()
        >>> i, data = next(enumerate(datamodule.train_dataloader()))
        >>> data.keys()
        dict_keys(['image_path', 'label', 'image'])

        >>> data["image"].shape
        torch.Size([32, 3, 256, 256])
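
        The datamodule can then be passed straight to an anomalib model for
        training (a sketch; ``Padim`` is just one example model from
        ``anomalib.models``, any anomalib model should work):

        >>> from anomalib.engine import Engine
        >>> from anomalib.models import Padim
        >>> engine = Engine()
        >>> engine.fit(model=Padim(), datamodule=datamodule)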
    """

    def __init__(
        self,
        root: str | Path,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        num_workers: int = 8,
        task: TaskType = TaskType.CLASSIFICATION,
        image_size: tuple[int, int] | None = None,
        transform: Transform | None = None,
        train_transform: Transform | None = None,
        eval_transform: Transform | None = None,
        test_split_mode: TestSplitMode | str = TestSplitMode.FROM_DIR,
        test_split_ratio: float = 0.5,
        val_split_mode: ValSplitMode | str = ValSplitMode.FROM_TEST,
        val_split_ratio: float = 0.5,
        seed: int | None = None,
    ) -> None:
        if task != TaskType.CLASSIFICATION:
            msg = "Datumaro dataloader currently only supports classification task."
            raise ValueError(msg)
        super().__init__(
            train_batch_size=train_batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=num_workers,
            val_split_mode=val_split_mode,
            val_split_ratio=val_split_ratio,
            test_split_mode=test_split_mode,
            test_split_ratio=test_split_ratio,
            image_size=image_size,
            transform=transform,
            train_transform=train_transform,
            eval_transform=eval_transform,
            seed=seed,
        )
        self.root = root
        self.task = task

    def _setup(self, _stage: str | None = None) -> None:
        self.train_data = DatumaroDataset(
            task=self.task,
            root=self.root,
            transform=self.train_transform,
            split=Split.TRAIN,
        )
        self.test_data = DatumaroDataset(
            task=self.task,
            root=self.root,
            transform=self.eval_transform,
            split=Split.TEST,
        )