Skip to content

Commit ca79fbd

Browse files
authored
Develop DatumaroBinaryFormat to export/import the dataset header & DatasetItem (#828)
- Support DatasetItem in the Datumaro binary format (annotations are not supported yet). - Quick fix for a CI failure caused by dill. Signed-off-by: Kim, Vinnam <[email protected]>
1 parent 1e6761b commit ca79fbd

File tree

15 files changed

+486
-36
lines changed

15 files changed

+486
-36
lines changed

.github/workflows/pr_check.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
python-version: ${{ matrix.python-version }}
3232
- name: Installing dependencies
3333
run: |
34-
pip install -e '.[default,tf,tfds]' pytest pytest-cov
34+
pip install -e '.[default,tf,tfds-dev]' pytest pytest-cov
3535
- name: Unit testing
3636
run: |
3737
pytest -v tests/unit/ --cov

datumaro/components/media.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os.path as osp
99
import shutil
1010
import weakref
11+
from enum import IntEnum
1112
from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
1213

1314
import cv2
@@ -19,7 +20,22 @@
1920
BboxIntCoords = Tuple[int, int, int, int] # (x, y, w, h)
2021

2122

23+
class MediaType(IntEnum):
    """Integer tag naming the concrete kind of a media element.

    Media classes expose one of these members through their ``MEDIA_TYPE``
    class attribute. The numeric value of every member is spelled out
    explicitly rather than auto-generated.
    """

    NO_MEDIA = 0  # not assigned to any media class in this module's visible code
    UNKNOWN = 1  # default carried by the ``MediaElement`` base class
    IMAGE = 2  # ``Image``
    BYTE_IMAGE = 3  # ``ByteImage``
    VIDEO_FRAME = 4  # ``VideoFrame``
    VIDEO = 5  # ``Video``
    POINT_CLOUD = 6  # ``PointCloud``
    MULTIFRAME_IMAGE = 7  # ``MultiframeImage``
    ROI_IMAGE = 8  # ``RoIImage``
    MOSAIC_IMAGE = 9  # ``MosaicImage``
34+
35+
2236
class MediaElement:
37+
MEDIA_TYPE = MediaType.UNKNOWN
38+
2339
def __init__(self, path: str) -> None:
2440
assert path, "Path can't be empty"
2541
self._path = path
@@ -42,6 +58,8 @@ def __eq__(self, other: object) -> bool:
4258

4359

4460
class Image(MediaElement):
61+
MEDIA_TYPE = MediaType.IMAGE
62+
4563
def __init__(
4664
self,
4765
data: Union[np.ndarray, Callable[[str], np.ndarray], None] = None,
@@ -169,6 +187,8 @@ def save(self, path):
169187

170188

171189
class ByteImage(Image):
190+
MEDIA_TYPE = MediaType.BYTE_IMAGE
191+
172192
_FORMAT_MAGICS = (
173193
(b"\x89PNG\r\n\x1a\n", ".png"),
174194
(b"\xff\xd8\xff", ".jpg"),
@@ -235,6 +255,8 @@ def save(self, path):
235255

236256

237257
class VideoFrame(Image):
258+
MEDIA_TYPE = MediaType.VIDEO_FRAME
259+
238260
def __init__(self, video: Video, index: int):
239261
self._video = video
240262
self._index = index
@@ -333,6 +355,8 @@ def _navigate_to(self, idx: int) -> VideoFrame:
333355

334356

335357
class Video(MediaElement, Iterable[VideoFrame]):
358+
MEDIA_TYPE = MediaType.VIDEO
359+
336360
"""
337361
Provides random access to the video frames.
338362
"""
@@ -501,13 +525,17 @@ def __hash__(self):
501525

502526

503527
class PointCloud(MediaElement):
528+
MEDIA_TYPE = MediaType.POINT_CLOUD
529+
504530
def __init__(self, path: str, extra_images: Optional[List[Image]] = None):
505531
self._path = path
506532

507533
self.extra_images: List[Image] = extra_images or []
508534

509535

510536
class MultiframeImage(MediaElement):
537+
MEDIA_TYPE = MediaType.MULTIFRAME_IMAGE
538+
511539
def __init__(
512540
self,
513541
images: Optional[Iterable[Union[str, Image, np.ndarray, Callable[[str], np.ndarray]]]],
@@ -538,6 +566,8 @@ def data(self) -> List[Image]:
538566

539567

540568
class RoIImage(Image):
569+
MEDIA_TYPE = MediaType.ROI_IMAGE
570+
541571
def __init__(
542572
self,
543573
data: Union[np.ndarray, Callable[[str], np.ndarray], None] = None,
@@ -580,6 +610,8 @@ def save(self, path):
580610

581611

582612
class MosaicImage(Image):
613+
MEDIA_TYPE = MediaType.MOSAIC_IMAGE
614+
583615
def __init__(self, imgs: List[ImageWithRoI], size: Tuple[int, int]) -> None:
584616
def _get_mosaic_img(_) -> np.ndarray:
585617
h, w = self.size

datumaro/plugins/data_formats/datumaro/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,10 @@ def __init__(self, path):
5151
self._related_images_dir = related_images_dir
5252

5353
super().__init__(subset=osp.splitext(osp.basename(path))[0])
54+
self._load_impl(path)
5455

56+
def _load_impl(self, path: str) -> None:
57+
"""Actual implementation of loading Datumaro format."""
5558
parsed_anns = parse_json_file(path)
5659
self._infos = self._load_infos(parsed_anns)
5760
self._categories = self._load_categories(parsed_anns)

datumaro/plugins/data_formats/datumaro/exporter.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,14 @@ def _convert_points_categories(self, obj):
342342

343343
class DatumaroExporter(Exporter):
344344
DEFAULT_IMAGE_EXT = DatumaroPath.IMAGE_EXT
345-
WRITER_CLS = _SubsetWriter
346345
PATH_CLS = DatumaroPath
347346

347+
def create_writer(self, subset: str) -> _SubsetWriter:
    """Build the writer responsible for one subset's annotation file.

    The file lives in ``self._annotations_dir`` and is named after the
    subset, with the extension taken from ``PATH_CLS.ANNOTATION_EXT``.
    """
    ann_filename = subset + self.PATH_CLS.ANNOTATION_EXT
    ann_path = osp.join(self._annotations_dir, ann_filename)
    return _SubsetWriter(context=self, ann_file=ann_path)
352+
348353
def apply(self):
349354
os.makedirs(self._save_dir, exist_ok=True)
350355

@@ -359,13 +364,8 @@ def apply(self):
359364
self._pcd_dir = osp.join(self._save_dir, self.PATH_CLS.PCD_DIR)
360365
self._related_images_dir = osp.join(self._save_dir, self.PATH_CLS.RELATED_IMAGES_DIR)
361366

362-
writers = {
363-
subset: self.WRITER_CLS(
364-
context=self,
365-
ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
366-
)
367-
for subset in self._extractor.subsets()
368-
}
367+
writers = {subset: self.create_writer(subset) for subset in self._extractor.subsets()}
368+
369369
for writer in writers.values():
370370
writer.add_infos(self._extractor.infos())
371371
writer.add_categories(self._extractor.categories())

datumaro/plugins/data_formats/datumaro_binary/base.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,64 @@
22
#
33
# SPDX-License-Identifier: MIT
44

5+
import struct
6+
from io import BufferedWriter
7+
from typing import Optional
8+
9+
from datumaro.components.errors import DatasetImportError
10+
from datumaro.plugins.data_formats.datumaro_binary.format import DatumaroBinaryPath
11+
from datumaro.plugins.data_formats.datumaro_binary.mapper import DictMapper
12+
513
from ..datumaro.base import DatumaroBase
14+
from .crypter import Crypter
615

716

817
class DatumaroBinaryBase(DatumaroBase):
    """Reader for the header of a Datumaro binary annotation file.

    The header is consumed in this order: signature, encryption field,
    dataset infos, categories. Every section after the signature is stored
    as a 4-byte unsigned length followed by that many payload bytes.
    """

    def __init__(self, path: str, encryption_key: Optional[bytes] = None):
        """Prepare the reader state and load ``path``.

        Args:
            path: Path of the binary annotation file to read.
            encryption_key: Key used to decrypt the file, or ``None`` when
                the file is expected to be unencrypted.
        """
        # Function-scope import: only needed for the annotation below.
        from io import BufferedReader

        # Both attributes must exist before the parent constructor runs,
        # because the parent constructor invokes ``_load_impl``.
        self._fp: Optional[BufferedReader] = None
        self._crypter = Crypter(encryption_key)
        super().__init__(path)

    def _load_impl(self, path: str) -> None:
        """Actual implementation of loading Datumaro binary format.

        Raises:
            DatasetImportError: If the signature does not match, the key
                handshake fails, or the file ends in the middle of a section.
        """
        try:
            with open(path, "rb") as fp:
                self._fp = fp
                self._check_signature()
                self._check_encryption_field()
                self._read_info()
                self._read_categories()
        finally:
            # Never keep a reference to the (now closed) file object.
            self._fp = None

    def _read_chunk(self) -> bytes:
        """Read one length-prefixed section and return its raw payload.

        Raises:
            DatasetImportError: If the file ends before the length prefix or
                the announced payload has been read completely.
        """
        # NOTE(review): "I" uses the native byte order, so files are not
        # portable between little- and big-endian hosts. The writer uses the
        # same format; switching to "<I" would have to happen on both sides.
        len_byte = self._fp.read(4)
        if len(len_byte) != 4:
            raise DatasetImportError(
                "Unexpected end of file while reading the length of a header section."
            )
        (length,) = struct.unpack("I", len_byte)

        payload = self._fp.read(length)
        if len(payload) != length:
            raise DatasetImportError(
                f"Unexpected end of file: expected {length} bytes but got {len(payload)}."
            )
        return payload

    def _check_signature(self):
        """Verify that the file starts with the Datumaro binary signature."""
        raw = self._fp.read(DatumaroBinaryPath.SIGNATURE_LEN)
        # Undecodable bytes are replaced instead of raising UnicodeDecodeError,
        # so a foreign file is reported through the signature check itself.
        signature = raw.decode(errors="replace")
        DatumaroBinaryPath.check_signature(signature)

    def _check_encryption_field(self):
        """Read the encryption field and compare it with the configured key.

        Raises:
            DatasetImportError: If the handshake with the stored key fails.
        """
        # NOTE(review): when a key is configured, ``decrypt`` presumably raises
        # the cryptography library's own error for a non-matching key before
        # the handshake below is reached -- confirm and translate if desired.
        extracted_key = self._crypter.decrypt(self._read_chunk())

        if not self._crypter.handshake(extracted_key):
            raise DatasetImportError("Encryption key handshake fails. You give a wrong key.")

    def _read_info(self):
        """Read and decode the dataset infos section into ``self._infos``."""
        _bytes = self._crypter.decrypt(self._read_chunk())

        # ``backward`` returns a pair; only the first element is needed here.
        self._infos, _ = DictMapper.backward(_bytes)

    def _read_categories(self):
        """Read and decode the categories section into ``self._categories``."""
        _bytes = self._crypter.decrypt(self._read_chunk())

        categories, _ = DictMapper.backward(_bytes)
        # Reuse the parent's category loader by wrapping the decoded value in
        # the dict shape it is called with here.
        self._categories = self._load_categories({"categories": categories})
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (C) 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
from typing import Optional
6+
7+
from cryptography.fernet import Fernet
8+
9+
10+
class Crypter:
    """Optional symmetric encryption helper built on ``Fernet``.

    When constructed with ``None`` as the key, ``encrypt`` and ``decrypt``
    return their input unchanged.
    """

    # Not referenced inside this class.
    # NOTE(review): presumably the length of an encoded Fernet key -- confirm.
    FERNET_KEY_LEN = 44

    def __init__(self, key: Optional[bytes]) -> None:
        """Store ``key`` and build the cipher for it.

        Args:
            key: Fernet key, or ``None`` to disable encryption entirely.
        """
        self._key = key
        self._fernet = Fernet(key) if key is not None else None

    @property
    def key(self) -> Optional[bytes]:
        """The key given at construction time, or ``None``."""
        return self._key

    def decrypt(self, msg: bytes) -> bytes:
        """Return ``msg`` decrypted, or unchanged when no key is configured."""
        if self._fernet is None:
            return msg
        return self._fernet.decrypt(msg)

    def encrypt(self, msg: bytes) -> bytes:
        """Return ``msg`` encrypted, or unchanged when no key is configured."""
        if self._fernet is None:
            return msg
        return self._fernet.encrypt(msg)

    def handshake(self, key: bytes) -> bool:
        """Tell whether ``key`` matches this crypter's configuration.

        Args:
            key: Key material to compare against; ``b""`` stands for
                "no encryption".

        Returns:
            ``True`` if no key is configured and ``key`` is empty, or if the
            configured key equals ``key``; ``False`` otherwise.
        """
        if self._key is None:
            return key == b""

        import hmac

        try:
            # Constant-time comparison: do not leak how many leading bytes of
            # the secret key matched through timing.
            return hmac.compare_digest(self._key, key)
        except TypeError:
            # Mismatched operand types can never be equal.
            return False

    @staticmethod
    def gen_key() -> bytes:
        """Generate a fresh Fernet key."""
        return Fernet.generate_key()

datumaro/plugins/data_formats/datumaro_binary/exporter.py

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,100 @@
44

55
# pylint: disable=no-self-use
66

7-
from io import TextIOWrapper
7+
import os.path as osp
8+
import struct
9+
from io import BufferedWriter
10+
from typing import Any, Optional
811

12+
from datumaro.components.dataset_base import IDataset
13+
from datumaro.components.exporter import ExportContext
914
from datumaro.plugins.data_formats.datumaro.exporter import DatumaroExporter
1015
from datumaro.plugins.data_formats.datumaro.exporter import _SubsetWriter as __SubsetWriter
16+
from datumaro.plugins.data_formats.datumaro_binary.crypter import Crypter
17+
from datumaro.plugins.data_formats.datumaro_binary.mapper import DictMapper
1118

1219
from .format import DatumaroBinaryPath
1320

1421

1522
class _SubsetWriter(__SubsetWriter):
    """Writes the binary header of one subset to its annotation file.

    ``write`` emits, in order: the signature, the encryption field, the
    dataset infos and the categories. Every section after the signature is
    a 4-byte unsigned length followed by the payload bytes.
    """

    def __init__(self, context: IDataset, ann_file: str, encryption_key: Optional[bytes] = None):
        """Set up the writer.

        Args:
            context: Passed through to the parent writer.
            ann_file: Path of the annotation file to create.
            encryption_key: Key used to encrypt the sections, or ``None`` to
                write them unencrypted.
        """
        super().__init__(context, ann_file)
        # Only set while ``write`` has the output file open.
        self._fp: Optional[BufferedWriter] = None
        self._crypter = Crypter(encryption_key)

    def _sign(self):
        """Write the format signature at the current file position."""
        self._fp.write(DatumaroBinaryPath.SIGNATURE.encode())

    def _write_chunk(self, payload: bytes) -> int:
        """Write ``payload`` prefixed by its length; return the bytes written."""
        # NOTE(review): "I" uses the native byte order, so files are not
        # portable between little- and big-endian hosts. The reader uses the
        # same format; switching to "<I" would have to happen on both sides.
        return self._fp.write(struct.pack("I", len(payload)) + payload)

    def _dump_encryption_field(self) -> int:
        """Write the encryption field.

        The field is empty when no key is configured; otherwise it holds the
        key encrypted with itself.
        """
        key = self._crypter.key
        msg = b"" if key is None else self._crypter.encrypt(key)
        return self._write_chunk(msg)

    def _dump_header(self, header: Any):
        """Serialize ``header`` with ``DictMapper`` and write it as one section."""
        # ``encrypt`` returns its input unchanged when no key is configured,
        # so no separate check is needed here.
        msg = self._crypter.encrypt(DictMapper.forward(header))
        return self._write_chunk(msg)

    def _dump_info(self):
        """Write the dataset infos section."""
        self._dump_header(self.infos)

    def _dump_categories(self):
        """Write the categories section."""
        self._dump_header(self.categories)

    def write(self):
        """Create the annotation file and write the complete header to it."""
        try:
            with open(self.ann_file, "wb") as fp:
                self._fp = fp
                self._sign()
                self._dump_encryption_field()
                # Go through the dedicated hooks so overrides take effect.
                self._dump_info()
                self._dump_categories()
        finally:
            # Never keep a reference to the (now closed) file object.
            self._fp = None
2467

2568

2669
class DatumaroBinaryExporter(DatumaroExporter):
    """Exporter for the Datumaro binary format with optional encryption.

    Accepts the same arguments as the parent exporter plus
    ``encryption_key``, which is handed to every subset writer it creates.
    """

    DEFAULT_IMAGE_EXT = DatumaroBinaryPath.IMAGE_EXT
    PATH_CLS = DatumaroBinaryPath

    def __init__(
        self,
        extractor: IDataset,
        save_dir: str,
        *,
        save_images=None,
        save_media: Optional[bool] = None,
        image_ext: Optional[str] = None,
        default_image_ext: Optional[str] = None,
        save_dataset_meta: bool = False,
        ctx: Optional[ExportContext] = None,
        encryption_key: Optional[bytes] = None,
    ):
        """Remember the encryption key and forward everything else to the parent."""
        self._encryption_key = encryption_key

        parent_options = dict(
            save_images=save_images,
            save_media=save_media,
            image_ext=image_ext,
            default_image_ext=default_image_ext,
            save_dataset_meta=save_dataset_meta,
            ctx=ctx,
        )
        super().__init__(extractor, save_dir, **parent_options)

    def create_writer(self, subset: str):
        """Build the binary writer for ``subset``, passing on the encryption key."""
        ann_filename = subset + self.PATH_CLS.ANNOTATION_EXT
        ann_path = osp.join(self._annotations_dir, ann_filename)
        return _SubsetWriter(
            context=self,
            ann_file=ann_path,
            encryption_key=self._encryption_key,
        )

datumaro/plugins/data_formats/datumaro_binary/format.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
#
33
# SPDX-License-Identifier: MIT
44

5+
from datumaro.errors import DatasetImportError
6+
7+
_SIGNATURE = "signature:datumaro_binary"
8+
59

610
class DatumaroBinaryPath:
711
IMAGES_DIR = "images"
@@ -13,4 +17,12 @@ class DatumaroBinaryPath:
1317
ANNOTATION_EXT = ".datumaro"
1418
IMAGE_EXT = ".jpg"
1519
MASK_EXT = ".png"
16-
SIGNATURE = "signature:datumaro_binary"
20+
SIGNATURE = _SIGNATURE
21+
SIGNATURE_LEN = len(_SIGNATURE)
22+
23+
@classmethod
def check_signature(cls, signature: str):
    """Validate ``signature`` against the expected format signature.

    Raises:
        DatasetImportError: If ``signature`` differs from ``cls.SIGNATURE``.
    """
    if signature == cls.SIGNATURE:
        return

    raise DatasetImportError(
        f"Input signature={signature} is not aligned with the ground truth signature={cls.SIGNATURE}"
    )

0 commit comments

Comments
 (0)