Skip to content

Commit e624c5b

Browse files
Simplify upload data for task (#5498)
It is now possible to create a task from cloud storage data by specifying only the manifest file and a filename pattern. The pattern supports the special characters `*`, `?`, `[seq]`, and `[!seq]`. See [the tests](https://github.com/opencv/cvat/blob/8898a8b2647514dd6f3f6ce83745b1ca8ef72bce/tests/python/rest_api/test_tasks.py#L686) for examples of how to use this functionality. Co-authored-by: Maxim Zhiltsov <[email protected]>
1 parent 2071167 commit e624c5b

File tree

5 files changed

+211
-49
lines changed

5 files changed

+211
-49
lines changed

CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## \[2.4.0] - Unreleased
99
### Added
10-
- TDB
10+
- Filename pattern to simplify uploading cloud storage data for a task (<https://github.com/opencv/cvat/pull/5498>)
1111

1212
### Changed
1313
- TDB

cvat/apps/engine/serializers.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -371,12 +371,13 @@ class DataSerializer(WriteOnceMixin, serializers.ModelSerializer):
371371
use_cache = serializers.BooleanField(default=False)
372372
copy_data = serializers.BooleanField(default=False)
373373
cloud_storage_id = serializers.IntegerField(write_only=True, allow_null=True, required=False)
374+
filename_pattern = serializers.CharField(allow_null=True, required=False)
374375

375376
class Meta:
376377
model = models.Data
377378
fields = ('chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter',
378379
'compressed_chunk_type', 'original_chunk_type', 'client_files', 'server_files', 'remote_files', 'use_zip_chunks',
379-
'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method')
380+
'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'filename_pattern')
380381

381382
# pylint: disable=no-self-use
382383
def validate_frame_filter(self, value):
@@ -396,6 +397,7 @@ def validate(self, attrs):
396397
if 'start_frame' in attrs and 'stop_frame' in attrs \
397398
and attrs['start_frame'] > attrs['stop_frame']:
398399
raise serializers.ValidationError('Stop frame must be more or equal start frame')
400+
399401
return attrs
400402

401403
def create(self, validated_data):

cvat/apps/engine/task.py

+85-45
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# SPDX-License-Identifier: MIT
66

77
import itertools
8+
import fnmatch
89
import os
910
import sys
1011
from rest_framework.serializers import ValidationError
@@ -127,7 +128,7 @@ def _save_task_to_db(db_task, extractor):
127128
db_task.data.save()
128129
db_task.save()
129130

130-
def _count_files(data, manifest_files=None):
131+
def _count_files(data):
131132
share_root = settings.SHARE_ROOT
132133
server_files = []
133134

@@ -158,7 +159,7 @@ def count_files(file_mapping, counter):
158159
if mime in counter:
159160
counter[mime].append(rel_path)
160161
elif rel_path.endswith('.jsonl'):
161-
manifest_files.append(rel_path)
162+
continue
162163
else:
163164
slogger.glob.warn("Skip '{}' file (its mime type doesn't "
164165
"correspond to supported MIME file type)".format(full_path))
@@ -177,6 +178,12 @@ def count_files(file_mapping, counter):
177178

178179
return counter
179180

181+
def _find_manifest_files(data):
    """Collect the manifest entries listed in the task data.

    Scans the ``client_files``, ``server_files`` and ``remote_files`` lists of
    ``data`` (in that order) and returns every entry whose name ends with
    ``.jsonl``.

    Args:
        data: mapping with the task data; each of the three keys above is
            expected to hold a list of file names.

    Returns:
        A new list with the matching entries, preserving their original order.
    """
    manifest_files = []
    for source in ('client_files', 'server_files', 'remote_files'):
        # A source that is absent or explicitly None contributes nothing
        # instead of failing with KeyError/TypeError.
        manifest_files.extend(
            name for name in (data.get(source) or []) if name.endswith('.jsonl')
        )
    return manifest_files
186+
180187
def _validate_data(counter, manifest_files=None):
181188
unique_entries = 0
182189
multiple_entries = 0
@@ -207,10 +214,10 @@ def _validate_data(counter, manifest_files=None):
207214

208215
return counter, task_modes[0]
209216

210-
def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage):
217+
def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage, data_storage_method):
211218
if manifests:
212219
if len(manifests) != 1:
213-
raise Exception('Only one manifest file can be attached with data')
220+
raise ValidationError('Only one manifest file can be attached to data')
214221
manifest_file = manifests[0]
215222
full_manifest_path = os.path.join(root_dir, manifests[0])
216223
if is_in_cloud:
@@ -221,8 +228,10 @@ def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage):
221228
< cloud_storage_instance.get_file_last_modified(manifest_file):
222229
cloud_storage_instance.download_file(manifest_file, full_manifest_path)
223230
if is_manifest(full_manifest_path):
231+
if not (settings.USE_CACHE or data_storage_method != models.StorageMethodChoice.CACHE):
232+
raise ValidationError("Manifest file can be uploaded only if 'Use cache' option is also selected")
224233
return manifest_file
225-
raise Exception('Invalid manifest was uploaded')
234+
raise ValidationError('Invalid manifest was uploaded')
226235
return None
227236

228237
def _validate_url(url):
@@ -291,6 +300,26 @@ def _download_data(urls, upload_dir):
291300
def _get_manifest_frame_indexer(start_frame=0, frame_step=1):
292301
return lambda frame_id: start_frame + frame_id * frame_step
293302

303+
def _create_task_manifest_based_on_cloud_storage_manifest(
    sorted_media,
    cloud_storage_manifest_prefix,
    cloud_storage_manifest,
    manifest
):
    """Fill the task manifest with the cloud storage manifest entries of the given media.

    Args:
        sorted_media: file names selected for the task.
        cloud_storage_manifest_prefix: directory part of the cloud storage
            manifest path; empty when the manifest has no directory prefix.
        cloud_storage_manifest: object whose ``get_subset(names)`` returns a
            ``(sequence, content)`` pair of equal-length iterables.
        manifest: object whose ``create(content)`` receives the entries,
            ordered by their ``sequence`` value.
    """
    if cloud_storage_manifest_prefix:
        # get_subset is queried with names relative to the prefix
        relative_media = [
            os.path.relpath(name, cloud_storage_manifest_prefix) for name in sorted_media
        ]
        sequence, raw_content = cloud_storage_manifest.get_subset(relative_media)

        # Restore the prefix on copies so that the entries owned by the cloud
        # storage manifest are not modified in place.
        # NOTE(review): assumes each entry is a plain mapping with a 'name' key — confirm
        content = []
        for properties in raw_content:
            prefixed = dict(properties)
            prefixed['name'] = os.path.join(cloud_storage_manifest_prefix, properties['name'])
            content.append(prefixed)
    else:
        sequence, content = cloud_storage_manifest.get_subset(sorted_media)

    # Sort on the sequence value only: without a key, equal sequence values
    # would make sorted() compare the entries themselves (dicts are unorderable).
    ordered = sorted(zip(sequence, content), key=lambda pair: pair[0])
    manifest.create(properties for _, properties in ordered)
294323

295324
@transaction.atomic
296325
def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False):
@@ -300,69 +329,80 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False):
300329
slogger.glob.info("create task #{}".format(db_task.id))
301330

302331
db_data = db_task.data
303-
upload_dir = db_data.get_upload_dirname()
332+
upload_dir = db_data.get_upload_dirname() if db_data.storage != models.StorageChoice.SHARE else settings.SHARE_ROOT
304333
is_data_in_cloud = db_data.storage == models.StorageChoice.CLOUD_STORAGE
305334

306335
if data['remote_files'] and not isDatasetImport:
307336
data['remote_files'] = _download_data(data['remote_files'], upload_dir)
308337

309-
manifest_files = []
310-
media = _count_files(data, manifest_files)
311-
media, task_mode = _validate_data(media, manifest_files)
312-
313-
if data['server_files']:
314-
if db_data.storage == models.StorageChoice.LOCAL:
315-
_copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path'))
316-
elif db_data.storage == models.StorageChoice.SHARE:
317-
upload_dir = settings.SHARE_ROOT
318-
338+
# find and validate manifest file
339+
manifest_files = _find_manifest_files(data)
319340
manifest_root = None
320-
if db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}:
341+
342+
# we should also handle this case because files from the share source have not been downloaded yet
343+
if data['copy_data']:
344+
manifest_root = settings.SHARE_ROOT
345+
elif db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}:
321346
manifest_root = upload_dir
322347
elif is_data_in_cloud:
323348
manifest_root = db_data.cloud_storage.get_storage_dirname()
324349

325350
manifest_file = _validate_manifest(
326351
manifest_files, manifest_root,
327-
is_data_in_cloud, db_data.cloud_storage if is_data_in_cloud else None
352+
is_data_in_cloud, db_data.cloud_storage if is_data_in_cloud else None,
353+
db_data.storage_method,
328354
)
329-
if manifest_file and (not settings.USE_CACHE or db_data.storage_method != models.StorageMethodChoice.CACHE):
330-
raise Exception("File with meta information can be uploaded if 'Use cache' option is also selected")
331355

332-
if data['server_files'] and is_data_in_cloud:
356+
if is_data_in_cloud:
333357
cloud_storage_instance = db_storage_to_storage_instance(db_data.cloud_storage)
334-
sorted_media = sort(media['image'], data['sorting_method'])
335-
336-
data_size = len(sorted_media)
337-
segment_step, *_ = _get_task_segment_data(db_task, data_size)
338-
for start_frame in range(0, data_size, segment_step):
339-
first_sorted_media_image = sorted_media[start_frame]
340-
cloud_storage_instance.download_file(first_sorted_media_image, os.path.join(upload_dir, first_sorted_media_image))
341358

342-
# prepare task manifest file from cloud storage manifest file
343-
# NOTE we should create manifest before defining chunk_size
344-
# FIXME in the future when will be implemented archive support
345359
manifest = ImageManifestManager(db_data.get_manifest_path())
346360
cloud_storage_manifest = ImageManifestManager(
347361
os.path.join(db_data.cloud_storage.get_storage_dirname(), manifest_file),
348362
db_data.cloud_storage.get_storage_dirname()
349363
)
350-
cloud_storage_manifest_prefix = os.path.dirname(manifest_file)
351364
cloud_storage_manifest.set_index()
352-
if cloud_storage_manifest_prefix:
353-
sorted_media_without_manifest_prefix = [
354-
os.path.relpath(i, cloud_storage_manifest_prefix) for i in sorted_media
355-
]
356-
sequence, raw_content = cloud_storage_manifest.get_subset(sorted_media_without_manifest_prefix)
357-
def _add_prefix(properties):
358-
file_name = properties['name']
359-
properties['name'] = os.path.join(cloud_storage_manifest_prefix, file_name)
360-
return properties
361-
content = list(map(_add_prefix, raw_content))
365+
cloud_storage_manifest_prefix = os.path.dirname(manifest_file)
366+
367+
# update list with server files if task creation approach with pattern and manifest file is used
368+
if is_data_in_cloud and data['filename_pattern']:
369+
if 1 != len(data['server_files']):
370+
l = len(data['server_files']) - 1
371+
raise ValidationError(
372+
'Using a filename_pattern is only supported with a manifest file, '
373+
f'but others {l} file{"s" if l > 1 else ""} {"were" if l > 1 else "was"} found'
374+
'Please remove extra files and keep only manifest file in server_files field.'
375+
)
376+
377+
cloud_storage_manifest_data = list(cloud_storage_manifest.data) if not cloud_storage_manifest_prefix \
378+
else [os.path.join(cloud_storage_manifest_prefix, f) for f in cloud_storage_manifest.data]
379+
if data['filename_pattern'] == '*':
380+
server_files = cloud_storage_manifest_data
362381
else:
363-
sequence, content = cloud_storage_manifest.get_subset(sorted_media)
364-
sorted_content = (i[1] for i in sorted(zip(sequence, content)))
365-
manifest.create(sorted_content)
382+
server_files = fnmatch.filter(cloud_storage_manifest_data, data['filename_pattern'])
383+
data['server_files'].extend(server_files)
384+
385+
# count and validate uploaded files
386+
media = _count_files(data)
387+
media, task_mode = _validate_data(media, manifest_files)
388+
389+
if data['server_files']:
390+
if db_data.storage == models.StorageChoice.LOCAL:
391+
_copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path'))
392+
elif is_data_in_cloud:
393+
sorted_media = sort(media['image'], data['sorting_method'])
394+
395+
# download previews from cloud storage
396+
data_size = len(sorted_media)
397+
segment_step, *_ = _get_task_segment_data(db_task, data_size)
398+
for preview_frame in range(0, data_size, segment_step):
399+
preview = sorted_media[preview_frame]
400+
cloud_storage_instance.download_file(preview, os.path.join(upload_dir, preview))
401+
402+
# Define task manifest content based on cloud storage manifest content and uploaded files
403+
_create_task_manifest_based_on_cloud_storage_manifest(
404+
sorted_media, cloud_storage_manifest_prefix,
405+
cloud_storage_manifest, manifest)
366406

367407
av_scan_paths(upload_dir)
368408

tests/python/rest_api/test_tasks.py

+119
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,20 @@
44
# SPDX-License-Identifier: MIT
55

66
import json
7+
import os.path as osp
8+
import subprocess
79
from copy import deepcopy
10+
from functools import partial
811
from http import HTTPStatus
12+
from tempfile import TemporaryDirectory
913
from time import sleep
1014

1115
import pytest
1216
from cvat_sdk.api_client import apis, models
1317
from cvat_sdk.core.helpers import get_paginated_collection
1418
from deepdiff import DeepDiff
1519

20+
import shared.utils.s3 as s3
1621
from shared.utils.config import get_method, make_api_client, patch_method
1722
from shared.utils.helpers import generate_image_files
1823

@@ -675,6 +680,120 @@ def test_create_task_with_cloud_storage_files(
675680
self._USERNAME, task_spec, data_spec, content_type="application/json", org=org
676681
)
677682

683+
@pytest.mark.with_external_services
@pytest.mark.parametrize("cloud_storage_id", [1])
@pytest.mark.parametrize(
    "manifest, filename_pattern, sub_dir, task_size",
    [
        ("manifest.jsonl", "*", True, 3),  # public bucket
        ("manifest.jsonl", "test/*", True, 3),
        ("manifest.jsonl", "test/sub*1.jpeg", True, 1),
        ("manifest.jsonl", "*image*.jpeg", True, 3),
        ("manifest.jsonl", "wrong_pattern", True, 0),
        ("abc_manifest.jsonl", "[a-c]*.jpeg", False, 2),
        ("abc_manifest.jsonl", "[d]*.jpeg", False, 1),
        ("abc_manifest.jsonl", "[e-z]*.jpeg", False, 0),
    ],
)
@pytest.mark.parametrize("org", [""])
def test_create_task_with_file_pattern(
    self,
    cloud_storage_id,
    manifest,
    filename_pattern,
    sub_dir,
    task_size,
    org,
    cloud_storages,
    request,
):
    """Create a task from a cloud storage manifest plus a filename pattern.

    Uploads three generated images and a manifest to the bucket of the cloud
    storage, then requests a task whose ``server_files`` holds only the
    manifest. When ``task_size`` is non-zero the created task must have that
    size; otherwise task creation must fail with "No media data found".
    """
    # Object keys live under "test/sub/" only for the sub-directory cases.
    key_prefix = "test/sub/" if sub_dir else ""
    manifest_key = f"test/sub/{manifest}" if sub_dir else manifest

    # prepare dataset on the bucket
    prefixes = ("test_image_",) * 3 if sub_dir else ("a_", "b_", "d_")
    images = generate_image_files(3, prefixes=prefixes)
    s3_client = s3.make_client()

    cloud_storage = cloud_storages[cloud_storage_id]
    bucket = cloud_storage["resource"]

    for image in images:
        image_key = f"{key_prefix}{image.name}"
        s3_client.create_file(data=image, bucket=bucket, filename=image_key)
        # every uploaded object is removed again at test teardown
        request.addfinalizer(
            partial(s3_client.remove_file, bucket=bucket, filename=image_key)
        )

    with TemporaryDirectory() as tmp_dir:
        for image in images:
            with open(osp.join(tmp_dir, image.name), "wb") as f:
                f.write(image.getvalue())

        # Run the manifest creation script from the server image over the
        # temporary directory, which is mounted as /local.
        command = ["docker", "run", "--rm", "-u", "root:root"]
        command += ["-v", f"{tmp_dir}:/local"]
        command += ["--entrypoint", "python3", "cvat/server"]
        command += ["utils/dataset_manifest/create.py", "--output-dir", "/local", "/local"]
        subprocess.run(command, check=True)

        # The file read back is always "manifest.jsonl"; it is uploaded under
        # the parametrized manifest name.
        with open(osp.join(tmp_dir, "manifest.jsonl"), mode="rb") as m_file:
            s3_client.create_file(data=m_file.read(), bucket=bucket, filename=manifest_key)
            request.addfinalizer(
                partial(s3_client.remove_file, bucket=bucket, filename=manifest_key)
            )

    task_spec = {
        "name": f"Task with files from cloud storage {cloud_storage_id}",
        "labels": [
            {
                "name": "car",
            }
        ],
    }

    data_spec = {
        "image_quality": 75,
        "use_cache": True,
        "cloud_storage_id": cloud_storage_id,
        "server_files": [manifest_key],
        "filename_pattern": filename_pattern,
    }

    if not task_size:
        # the pattern matches nothing, so the task must be rejected
        status = self._test_cannot_create_task(self._USERNAME, task_spec, data_spec)
        assert "No media data found" in status.message
    else:
        task_id = self._test_create_task(
            self._USERNAME, task_spec, data_spec, content_type="application/json", org=org
        )

        with make_api_client(self._USERNAME) as api_client:
            (task, response) = api_client.tasks_api.retrieve(task_id, org=org)
            assert response.status == HTTPStatus.OK
            assert task.size == task_size
796+
678797
@pytest.mark.with_external_services
679798
@pytest.mark.parametrize(
680799
"cloud_storage_id, manifest, org",

tests/python/shared/utils/helpers.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)):
1818
return f
1919

2020

21-
def generate_image_files(count) -> List[BytesIO]:
21+
def generate_image_files(count, prefixes=None) -> List[BytesIO]:
    """Generate ``count`` JPEG image files named ``<prefix><index>.jpeg``.

    Args:
        count: number of images to generate.
        prefixes: optional sequence indexed by image number that supplies the
            file name prefix of each image; when falsy, no prefix is used.

    Returns:
        The generated images; image ``i`` is created with color ``(i, i, i)``.
    """
    return [
        generate_image_file(
            f"{prefixes[idx] if prefixes else ''}{idx}.jpeg", color=(idx, idx, idx)
        )
        for idx in range(count)
    ]

0 commit comments

Comments
 (0)