From f1adf1bbec4821ad3ca0baef73357aeecca39b5d Mon Sep 17 00:00:00 2001 From: Maya Date: Wed, 14 Dec 2022 23:33:36 +0100 Subject: [PATCH 01/21] Add data pattern --- cvat/apps/engine/serializers.py | 17 ++++++- cvat/apps/engine/task.py | 63 +++++++++++++++---------- tests/python/rest_api/test_tasks.py | 69 ++++++++++++++++++++++++++-- tests/python/shared/utils/helpers.py | 4 +- utils/dataset_manifest/core.py | 7 ++- 5 files changed, 128 insertions(+), 32 deletions(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index 4b6c604acb64..3be9f6a30c8b 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -377,12 +377,13 @@ class DataSerializer(WriteOnceMixin, serializers.ModelSerializer): use_cache = serializers.BooleanField(default=False) copy_data = serializers.BooleanField(default=False) cloud_storage_id = serializers.IntegerField(write_only=True, allow_null=True, required=False) + pattern = serializers.CharField(allow_null=True, required=False) class Meta: model = models.Data fields = ('chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter', 'compressed_chunk_type', 'original_chunk_type', 'client_files', 'server_files', 'remote_files', 'use_zip_chunks', - 'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method') + 'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'pattern') # pylint: disable=no-self-use def validate_frame_filter(self, value): @@ -397,11 +398,25 @@ def validate_chunk_size(self, value): raise serializers.ValidationError('Chunk size must be a positive integer') return value + def validate_pattern(self, value): + import string + supported_wildcards = set(["*", "[", "]", "."]) + no_escapted_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) + legal_special_chars = supported_wildcards.union(no_escapted_symbols) + for c in set(string.punctuation) - legal_special_chars: + if c in value: + value.replace(c, r'\\{}'.format(c)) + + if '*' in value and value != "*": + value = r'.*'.join(value.split('*')) + return value + # pylint: disable=no-self-use def validate(self, attrs): if 'start_frame' in attrs and 'stop_frame' in attrs \ and attrs['start_frame'] > attrs['stop_frame']: raise serializers.ValidationError('Stop frame must be more or equal start frame') + return attrs def create(self, validated_data): diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index cf2530d7d878..d325b8ec549c 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -130,7 +130,7 @@ def _save_task_to_db(db_task, extractor): db_task.data.save() db_task.save() -def _count_files(data, manifest_files=None): +def _count_files(data): share_root = settings.SHARE_ROOT server_files = [] @@ -161,7 +161,7 @@ def count_files(file_mapping, counter): if mime in counter: counter[mime].append(rel_path) elif rel_path.endswith('.jsonl'): - manifest_files.append(rel_path) + continue else: slogger.glob.warn("Skip '{}' file (its mime type doesn't " "correspond to supported MIME file type)".format(full_path)) @@ -180,6 +180,12 @@ def count_files(file_mapping, counter): return counter +def _count_manifest_files(data): + manifest_files = [] + for files in ['client_files', 'server_files', 'remote_files']: + manifest_files.extend(list(filter(lambda x: x.endswith('.jsonl'), data[files]))) + return manifest_files + def _validate_data(counter, manifest_files=None): unique_entries = 0 multiple_entries = 0 @@ -309,16 +315,8 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): if data['remote_files'] and not isDatasetImport: data['remote_files'] = _download_data(data['remote_files'], upload_dir) - manifest_files = [] - media = _count_files(data, manifest_files) - media, task_mode = _validate_data(media, manifest_files) - - if data['server_files']: - if db_data.storage == models.StorageChoice.LOCAL: - _copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path')) - elif db_data.storage == models.StorageChoice.SHARE: - upload_dir = settings.SHARE_ROOT - + # find and validate manigfest file + manifest_files = _count_manifest_files(data) manifest_root = None if db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}: manifest_root = upload_dir @@ -332,24 +330,43 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): if manifest_file and (not settings.USE_CACHE or db_data.storage_method != models.StorageMethodChoice.CACHE): raise Exception("File with meta information can be uploaded if 'Use cache' option is also selected") - if data['server_files'] and is_data_in_cloud: + if is_data_in_cloud: cloud_storage_instance = db_storage_to_storage_instance(db_data.cloud_storage) - sorted_media = sort(media['image'], data['sorting_method']) - data_size = len(sorted_media) - segment_step, *_ = _get_task_segment_data(db_task, data_size) - for start_frame in range(0, data_size, segment_step): - first_sorted_media_image = sorted_media[start_frame] - cloud_storage_instance.download_file(first_sorted_media_image, os.path.join(upload_dir, first_sorted_media_image)) - - # prepare task manifest file from cloud storage manifest file - # NOTE we should create manifest before defining chunk_size - # FIXME in the future when will be implemented archive support manifest = ImageManifestManager(db_data.get_manifest_path()) cloud_storage_manifest = ImageManifestManager( os.path.join(db_data.cloud_storage.get_storage_dirname(), manifest_file), db_data.cloud_storage.get_storage_dirname() ) + cloud_storage_manifest.set_index() + + if len(data['server_files']) == 1 and data['pattern']: + if data['pattern'] == '*': # or use "/" ? + shutil.copyfile(cloud_storage_manifest.manifest.path, manifest.manifest.path) + manifest.set_index() + data['server_files'].extend(list(cloud_storage_manifest.data)) + else: + r = re.compile(data['pattern']) + data['server_files'].extend(list(filter(r.match, cloud_storage_manifest.data))) + + media = _count_files(data) + media, task_mode = _validate_data(media, manifest_files) + + if data['server_files']: + if db_data.storage == models.StorageChoice.LOCAL: + _copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path')) + elif db_data.storage == models.StorageChoice.SHARE: + upload_dir = settings.SHARE_ROOT + + if data['server_files'] and is_data_in_cloud: + sorted_media = sort(media['image'], data['sorting_method']) + + data_size = len(sorted_media) + segment_step, *_ = _get_task_segment_data(db_task, data_size) + for preview_frame in range(0, data_size, segment_step): + preview = sorted_media[preview_frame] + cloud_storage_instance.download_file(preview, os.path.join(upload_dir, preview)) + cloud_storage_manifest_prefix = os.path.dirname(manifest_file) cloud_storage_manifest.set_index() if cloud_storage_manifest_prefix: diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index b29f27f44f6b..0a5dc3f69433 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -15,6 +15,7 @@ from shared.utils.config import get_method, make_api_client, patch_method from shared.utils.helpers import generate_image_files +from shared.utils.s3 import make_client from .utils import export_dataset @@ -26,7 +27,7 @@ def get_cloud_storage_content(username, cloud_storage_id, manifest): ) return data - +@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_class") class TestGetTasks: def _test_task_list_200(self, user, project_id, data, exclude_paths="", **kwargs): @@ -141,7 +142,7 @@ def test_org_task_assigneed_to_see_task( self._test_assigned_users_to_see_task_data(tasks, users, is_task_staff, org=org["slug"]) - +@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_function") class TestPostTasks: def _test_create_task_201(self, user, spec, **kwargs): @@ -263,7 +264,7 @@ def test_can_create_task_with_skeleton(self): self._test_create_task_201(username, spec) - +@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_class") class TestGetData: _USERNAME = "user1" @@ -284,7 +285,7 @@ def test_frame_content_type(self, content_type, task_id): assert response.status == HTTPStatus.OK assert response.headers["Content-Type"] == content_type - +@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_function") class TestPatchTaskAnnotations: def _test_check_response(self, is_allow, response, data=None): @@ -391,6 +392,7 @@ def test_member_update_task_annotation( self._test_check_response(is_allow, response, data) +@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_class") class TestGetTaskDataset: def _test_export_task(self, username, tid, **kwargs): @@ -447,6 +449,7 @@ def _test_cannot_create_task(self, username, spec, data, **kwargs): return status + @pytest.mark.skip() def test_can_create_task_with_defined_start_and_stop_frames(self): task_spec = { "name": f"test {self._USERNAME} to create a task with defined start and stop frames", @@ -483,6 +486,7 @@ def test_can_create_task_with_defined_start_and_stop_frames(self): (task, _) = api_client.tasks_api.retrieve(task_id) assert task.size == 4 + @pytest.mark.skip() def test_can_get_annotations_from_new_task_with_skeletons(self): spec = { "name": f"test admin1 to create a task with skeleton", @@ -636,6 +640,7 @@ def test_can_get_annotations_from_new_task_with_skeletons(self): response = get_method(self._USERNAME, f"jobs/{job_id}/annotations") assert response.status_code == HTTPStatus.OK + @pytest.mark.skip() @pytest.mark.parametrize( "cloud_storage_id, manifest, use_bucket_content, org", [ @@ -674,6 +679,62 @@ def test_create_task_with_cloud_storage_files( self._USERNAME, task_spec, data_spec, content_type="application/json", org=org ) + @pytest.mark.parametrize( + "cloud_storage_id, manifest, org", + [ + (1, "manifest.jsonl", ""), # public bucket + ], + ) + @pytest.mark.parametrize("pattern", ["test/*"]) # "test/sub*1.jpg", "*image*.jpg" + def test_create_task_with_file_pattern( + self, cloud_storage_id, manifest, pattern, org, cloud_storages + ): + # prepare dataset on the bucket + images = generate_image_files(3, 'test_image') + s3_client = make_client() + + cloud_storage = cloud_storages[cloud_storage_id] + for image in images: + s3_client.create_file(data=image, bucket=cloud_storage["resource"], filename=f'test/sub/{image.name}') + + from utils.dataset_manifest import ImageManifestManager + from io import BytesIO + + from tempfile import NamedTemporaryFile + with NamedTemporaryFile(mode='wb', suffix='manifest.jsonl') as tmp_file: + tmp_name = tmp_file.name + manifest = ImageManifestManager(manifest_path=tmp_name) + manifest.link(sources=images) + manifest.create() + s3_client.create_file(data=self._manifest.path, bucket=cloud_storage["resource"], filename=f'test/sub/{image.name}') + + task_spec = { + "name": f"Task with files from cloud storage {cloud_storage_id}", + "labels": [ + { + "name": "car", + } + ], + } + + data_spec = { + "image_quality": 75, + "use_cache": True, + "cloud_storage_id": cloud_storage_id, + "server_files": [manifest], + "pattern": pattern, + } + + task_id = self._test_create_task( + self._USERNAME, task_spec, data_spec, content_type="application/json", org=org + ) + + with make_api_client(self._USERNAME) as api_client: + (task, response) = api_client.tasks_api.retrieve(task_id, org=org) + assert response.status == HTTPStatus.OK + assert task.size + + @pytest.mark.skip() @pytest.mark.parametrize( "cloud_storage_id, manifest, org", [(1, "manifest.jsonl", "")], # public bucket diff --git a/tests/python/shared/utils/helpers.py b/tests/python/shared/utils/helpers.py index a8a7120c78bd..8f6e088e6c50 100644 --- a/tests/python/shared/utils/helpers.py +++ b/tests/python/shared/utils/helpers.py @@ -18,10 +18,10 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)): return f -def generate_image_files(count) -> List[BytesIO]: +def generate_image_files(count, filename='') -> List[BytesIO]: images = [] for i in range(count): - image = generate_image_file(f"{i}.jpeg", color=(i, i, i)) + image = generate_image_file(f"{filename}{i}.jpeg", color=(i, i, i)) images.append(image) return images diff --git a/utils/dataset_manifest/core.py b/utils/dataset_manifest/core.py index 9ea7b6757d53..8ca08434a467 100644 --- a/utils/dataset_manifest/core.py +++ b/utils/dataset_manifest/core.py @@ -195,8 +195,11 @@ def __iter__(self): image = next(sources) img = Image.open(image, mode='r') orientation = img.getexif().get(274, 1) - img_name = os.path.relpath(image, self._data_dir) if self._data_dir \ - else os.path.basename(image) + if isinstance(image, str): + img_name = os.path.relpath(image, self._data_dir) if self._data_dir \ + else os.path.basename(image) + else: + img_name = image.name name, extension = os.path.splitext(img_name) width, height = img.width, img.height if orientation > 4: From 6f0e50f18c211af5d017d18f978a7d9c69b482c4 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 16:19:36 +0100 Subject: [PATCH 02/21] Update test --- .vscode/settings.json | 5 +- tests/python/rest_api/test_tasks.py | 82 ++++++++++++++++++----------- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 76614f368f9a..36e92230e3cc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -39,5 +39,8 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "python.testing.pytestPath": "${workspaceFolder}/.env/bin/pytest", - "python.testing.cwd": "${workspaceFolder}/tests" + "python.testing.cwd": "${workspaceFolder}/tests", + "python.analysis.extraPaths": [ + "./utils" + ] } diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 0a5dc3f69433..c8d8b9bafab5 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: MIT import json +import os +import sys from copy import deepcopy from http import HTTPStatus from time import sleep @@ -17,9 +19,18 @@ from shared.utils.helpers import generate_image_files from shared.utils.s3 import make_client +from tempfile import NamedTemporaryFile from .utils import export_dataset +sys.path.append( + os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))).split("tests")[0], "utils" + ) +) +from dataset_manifest import ImageManifestManager + + def get_cloud_storage_content(username, cloud_storage_id, manifest): with make_api_client(username) as api_client: (data, _) = api_client.cloudstorages_api.retrieve_content( @@ -27,7 +38,7 @@ def get_cloud_storage_content(username, cloud_storage_id, manifest): ) return data -@pytest.mark.skip() + @pytest.mark.usefixtures("restore_db_per_class") class TestGetTasks: def _test_task_list_200(self, user, project_id, data, exclude_paths="", **kwargs): @@ -142,7 +153,7 @@ def test_org_task_assigneed_to_see_task( self._test_assigned_users_to_see_task_data(tasks, users, is_task_staff, org=org["slug"]) -@pytest.mark.skip() + @pytest.mark.usefixtures("restore_db_per_function") class TestPostTasks: def _test_create_task_201(self, user, spec, **kwargs): @@ -264,7 +275,7 @@ def test_can_create_task_with_skeleton(self): self._test_create_task_201(username, spec) -@pytest.mark.skip() + @pytest.mark.usefixtures("restore_db_per_class") class TestGetData: _USERNAME = "user1" @@ -285,7 +296,7 @@ def test_frame_content_type(self, content_type, task_id): assert response.status == HTTPStatus.OK assert response.headers["Content-Type"] == content_type -@pytest.mark.skip() + @pytest.mark.usefixtures("restore_db_per_function") class TestPatchTaskAnnotations: def _test_check_response(self, is_allow, response, data=None): @@ -392,7 +403,6 @@ def test_member_update_task_annotation( self._test_check_response(is_allow, response, data) -@pytest.mark.skip() @pytest.mark.usefixtures("restore_db_per_class") class TestGetTaskDataset: def _test_export_task(self, username, tid, **kwargs): @@ -449,7 +459,6 @@ def _test_cannot_create_task(self, username, spec, data, **kwargs): return status - @pytest.mark.skip() def test_can_create_task_with_defined_start_and_stop_frames(self): task_spec = { "name": f"test {self._USERNAME} to create a task with defined start and stop frames", @@ -486,7 +495,6 @@ def test_can_create_task_with_defined_start_and_stop_frames(self): (task, _) = api_client.tasks_api.retrieve(task_id) assert task.size == 4 - @pytest.mark.skip() def test_can_get_annotations_from_new_task_with_skeletons(self): spec = { "name": f"test admin1 to create a task with skeleton", @@ -640,7 +648,6 @@ def test_can_get_annotations_from_new_task_with_skeletons(self): response = get_method(self._USERNAME, f"jobs/{job_id}/annotations") assert response.status_code == HTTPStatus.OK - @pytest.mark.skip() @pytest.mark.parametrize( "cloud_storage_id, manifest, use_bucket_content, org", [ @@ -682,31 +689,44 @@ def test_create_task_with_cloud_storage_files( @pytest.mark.parametrize( "cloud_storage_id, manifest, org", [ - (1, "manifest.jsonl", ""), # public bucket + (1, "test/sub/manifest.jsonl", ""), # public bucket + ], + ) + @pytest.mark.parametrize( + "pattern, task_size", + [ + ("*", 3), + ("test/*", 3), + ("test/sub*1.jpeg", 1), + ("*image*.jpeg", 3), + ("wrong_pattern", 0), ], ) - @pytest.mark.parametrize("pattern", ["test/*"]) # "test/sub*1.jpg", "*image*.jpg" def test_create_task_with_file_pattern( - self, cloud_storage_id, manifest, pattern, org, cloud_storages + self, cloud_storage_id, manifest, pattern, task_size, org, cloud_storages ): # prepare dataset on the bucket - images = generate_image_files(3, 'test_image') + images = generate_image_files(3, "test_image") s3_client = make_client() cloud_storage = cloud_storages[cloud_storage_id] for image in images: - s3_client.create_file(data=image, bucket=cloud_storage["resource"], filename=f'test/sub/{image.name}') - - from utils.dataset_manifest import ImageManifestManager - from io import BytesIO + s3_client.create_file( + data=image, bucket=cloud_storage["resource"], filename=f"test/sub/{image.name}" + ) - from tempfile import NamedTemporaryFile - with NamedTemporaryFile(mode='wb', suffix='manifest.jsonl') as tmp_file: + with NamedTemporaryFile(mode="wb", suffix="manifest.jsonl", delete=False) as tmp_file: tmp_name = tmp_file.name - manifest = ImageManifestManager(manifest_path=tmp_name) - manifest.link(sources=images) - manifest.create() - s3_client.create_file(data=self._manifest.path, bucket=cloud_storage["resource"], filename=f'test/sub/{image.name}') + tmp_manifest = ImageManifestManager(manifest_path=tmp_name) + tmp_manifest.link(sources=images) + tmp_manifest.create() + with open(tmp_name, "rb") as tmp_file: + s3_client.create_file( + data=tmp_file.read(), + bucket=cloud_storage["resource"], + filename=manifest, + ) + os.remove(tmp_name) task_spec = { "name": f"Task with files from cloud storage {cloud_storage_id}", @@ -725,16 +745,18 @@ def test_create_task_with_file_pattern( "pattern": pattern, } - task_id = self._test_create_task( - self._USERNAME, task_spec, data_spec, content_type="application/json", org=org - ) + if task_size: + task_id = self._test_create_task( + self._USERNAME, task_spec, data_spec, content_type="application/json", org=org + ) - with make_api_client(self._USERNAME) as api_client: - (task, response) = api_client.tasks_api.retrieve(task_id, org=org) - assert response.status == HTTPStatus.OK - assert task.size + with make_api_client(self._USERNAME) as api_client: + (task, response) = api_client.tasks_api.retrieve(task_id, org=org) + assert response.status == HTTPStatus.OK + assert task.size == task_size + else: + self._test_cannot_create_task(self._USERNAME, task_spec, data_spec) - @pytest.mark.skip() @pytest.mark.parametrize( "cloud_storage_id, manifest, org", [(1, "manifest.jsonl", "")], # public bucket From 8e3bc4f0450c1973079579a487cf43e7dc9342dd Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 16:20:06 +0100 Subject: [PATCH 03/21] Small changes --- cvat/apps/engine/task.py | 99 ++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index d325b8ec549c..654429d8956b 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -216,10 +216,10 @@ def _validate_data(counter, manifest_files=None): return counter, task_modes[0] -def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage): +def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage, data_storage_method): if manifests: if len(manifests) != 1: - raise Exception('Only one manifest file can be attached with data') + raise ValidationError('Only one manifest file can be attached with data') manifest_file = manifests[0] full_manifest_path = os.path.join(root_dir, manifests[0]) if is_in_cloud: @@ -230,8 +230,10 @@ def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage): < cloud_storage_instance.get_file_last_modified(manifest_file): cloud_storage_instance.download_file(manifest_file, full_manifest_path) if is_manifest(full_manifest_path): + if not (settings.USE_CACHE or data_storage_method != models.StorageMethodChoice.CACHE): + raise ValidationError("Manifest file can be uploaded only if 'Use cache' option is also selected") return manifest_file - raise Exception('Invalid manifest was uploaded') + raise ValidationError('Invalid manifest was uploaded') return None def _validate_url(url): @@ -300,6 +302,26 @@ def _download_data(urls, upload_dir): def _get_manifest_frame_indexer(start_frame=0, frame_step=1): return lambda frame_id: start_frame + frame_id * frame_step +def _create_task_manifest_based_on_cloud_storage_manifest( + sorted_media, + cloud_storage_manifest_prefix, + cloud_storage_manifest, + manifest +): + if cloud_storage_manifest_prefix: + sorted_media_without_manifest_prefix = [ + os.path.relpath(i, cloud_storage_manifest_prefix) for i in sorted_media + ] + sequence, raw_content = cloud_storage_manifest.get_subset(sorted_media_without_manifest_prefix) + def _add_prefix(properties): + file_name = properties['name'] + properties['name'] = os.path.join(cloud_storage_manifest_prefix, file_name) + return properties + content = list(map(_add_prefix, raw_content)) + else: + sequence, content = cloud_storage_manifest.get_subset(sorted_media) + sorted_content = (i[1] for i in sorted(zip(sequence, content))) + manifest.create(sorted_content) @transaction.atomic def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): @@ -318,6 +340,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): # find and validate manigfest file manifest_files = _count_manifest_files(data) manifest_root = None + if db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}: manifest_root = upload_dir elif is_data_in_cloud: @@ -325,10 +348,9 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): manifest_file = _validate_manifest( manifest_files, manifest_root, - is_data_in_cloud, db_data.cloud_storage if is_data_in_cloud else None + is_data_in_cloud, db_data.cloud_storage if is_data_in_cloud else None, + db_data.storage_method, ) - if manifest_file and (not settings.USE_CACHE or db_data.storage_method != models.StorageMethodChoice.CACHE): - raise Exception("File with meta information can be uploaded if 'Use cache' option is also selected") if is_data_in_cloud: cloud_storage_instance = db_storage_to_storage_instance(db_data.cloud_storage) @@ -339,16 +361,25 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): db_data.cloud_storage.get_storage_dirname() ) cloud_storage_manifest.set_index() + cloud_storage_manifest_prefix = os.path.dirname(manifest_file) - if len(data['server_files']) == 1 and data['pattern']: - if data['pattern'] == '*': # or use "/" ? - shutil.copyfile(cloud_storage_manifest.manifest.path, manifest.manifest.path) - manifest.set_index() - data['server_files'].extend(list(cloud_storage_manifest.data)) - else: - r = re.compile(data['pattern']) - data['server_files'].extend(list(filter(r.match, cloud_storage_manifest.data))) + # update list with server files if task creation approach with pattern and manifest file is used + if is_data_in_cloud and data['pattern']: + if 1 != len(data['server_files']): + raise ValidationError('Using a pattern is only supported with a manifest, but others files were found') + + cloud_storage_manifest_data = list(cloud_storage_manifest.data) if not cloud_storage_manifest_prefix \ + else [os.path.join(cloud_storage_manifest_prefix, f) for f in cloud_storage_manifest.data] + if data['pattern'] == '*': + shutil.copyfile(cloud_storage_manifest.manifest.path, manifest.manifest.path) + manifest.set_index() + server_files = cloud_storage_manifest_data + else: + r = re.compile(data['pattern']) + server_files = list(filter(r.match, cloud_storage_manifest_data)) + data['server_files'].extend(server_files) + # count and validate uploaded files media = _count_files(data) media, task_mode = _validate_data(media, manifest_files) @@ -357,32 +388,20 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): _copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path')) elif db_data.storage == models.StorageChoice.SHARE: upload_dir = settings.SHARE_ROOT - - if data['server_files'] and is_data_in_cloud: - sorted_media = sort(media['image'], data['sorting_method']) - - data_size = len(sorted_media) - segment_step, *_ = _get_task_segment_data(db_task, data_size) - for preview_frame in range(0, data_size, segment_step): - preview = sorted_media[preview_frame] - cloud_storage_instance.download_file(preview, os.path.join(upload_dir, preview)) - - cloud_storage_manifest_prefix = os.path.dirname(manifest_file) - cloud_storage_manifest.set_index() - if cloud_storage_manifest_prefix: - sorted_media_without_manifest_prefix = [ - os.path.relpath(i, cloud_storage_manifest_prefix) for i in sorted_media - ] - sequence, raw_content = cloud_storage_manifest.get_subset(sorted_media_without_manifest_prefix) - def _add_prefix(properties): - file_name = properties['name'] - properties['name'] = os.path.join(cloud_storage_manifest_prefix, file_name) - return properties - content = list(map(_add_prefix, raw_content)) - else: - sequence, content = cloud_storage_manifest.get_subset(sorted_media) - sorted_content = (i[1] for i in sorted(zip(sequence, content))) - manifest.create(sorted_content) + elif is_data_in_cloud: + sorted_media = sort(media['image'], data['sorting_method']) + + # download previews from cloud storage + data_size = len(sorted_media) + segment_step, *_ = _get_task_segment_data(db_task, data_size) + for preview_frame in range(0, data_size, segment_step): + preview = sorted_media[preview_frame] + cloud_storage_instance.download_file(preview, os.path.join(upload_dir, preview)) + + # Define task manifest content based on cloud storage manifest content and uploaded files + _create_task_manifest_based_on_cloud_storage_manifest( + sorted_media, cloud_storage_manifest_prefix, + cloud_storage_manifest, manifest) av_scan_paths(upload_dir) From 3c743880ba600e2eaf32acf8f7b28fe320a3a618 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 21:50:31 +0100 Subject: [PATCH 04/21] Update requirements --- tests/python/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/requirements.txt b/tests/python/requirements.txt index 33c7d92f052e..d18c683b9a3d 100644 --- a/tests/python/requirements.txt +++ b/tests/python/requirements.txt @@ -4,3 +4,4 @@ requests==2.26.0 deepdiff==5.6.0 boto3==1.17.61 Pillow==9.3.0 +av==9.2.0 --no-binary=av From b8be000ab3505550c6803d8d3296efc1b475ff48 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 21:52:55 +0100 Subject: [PATCH 05/21] Black && isort --- tests/python/rest_api/test_tasks.py | 3 +-- tests/python/shared/utils/helpers.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index c8d8b9bafab5..54005abaf367 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -8,6 +8,7 @@ import sys from copy import deepcopy from http import HTTPStatus +from tempfile import NamedTemporaryFile from time import sleep import pytest @@ -19,10 +20,8 @@ from shared.utils.helpers import generate_image_files from shared.utils.s3 import make_client -from tempfile import NamedTemporaryFile from .utils import export_dataset - sys.path.append( os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))).split("tests")[0], "utils" diff --git a/tests/python/shared/utils/helpers.py b/tests/python/shared/utils/helpers.py index 8f6e088e6c50..2922ce271501 100644 --- a/tests/python/shared/utils/helpers.py +++ b/tests/python/shared/utils/helpers.py @@ -18,7 +18,7 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)): return f -def generate_image_files(count, filename='') -> List[BytesIO]: +def generate_image_files(count, filename="") -> List[BytesIO]: images = [] for i in range(count): image = generate_image_file(f"{filename}{i}.jpeg", color=(i, i, i)) From 7b2d235d7682a5de68ab765e880c43b939c3277c Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 23:40:33 +0100 Subject: [PATCH 06/21] Update test --- cvat/apps/engine/task.py | 2 +- tests/python/rest_api/test_tasks.py | 40 ++++++++++++++-------------- tests/python/shared/utils/helpers.py | 4 +-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 3274faa5978b..08a3f36dd0da 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -334,7 +334,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): if data['remote_files'] and not isDatasetImport: data['remote_files'] = _download_data(data['remote_files'], upload_dir) - # find and validate manigfest file + # find and validate manifest file manifest_files = _count_manifest_files(data) manifest_root = None diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 54005abaf367..07fe5cdea988 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -686,46 +686,46 @@ def test_create_task_with_cloud_storage_files( ) @pytest.mark.parametrize( - "cloud_storage_id, manifest, org", + "cloud_storage_id, manifest, pattern, sub_dir, task_size, org", [ - (1, "test/sub/manifest.jsonl", ""), # public bucket - ], - ) - @pytest.mark.parametrize( - "pattern, task_size", - [ - ("*", 3), - ("test/*", 3), - ("test/sub*1.jpeg", 1), - ("*image*.jpeg", 3), - ("wrong_pattern", 0), + (1, "manifest.jsonl", "*", True, 3, ""), # public bucket + (1, "manifest.jsonl", "test/*", True, 3, ""), + (1, "manifest.jsonl", "test/sub*1.jpeg", True, 1, ""), + (1, "manifest.jsonl", "*image*.jpeg", True, 3, ""), + (1, "manifest.jsonl", "wrong_pattern", True, 0, ""), + (1, "abc_manifest.jsonl", "[a-c]*.jpeg", False, 2, ""), + (1, "abc_manifest.jsonl", "[d]*.jpeg", False, 1, ""), + (1, "abc_manifest.jsonl", "[e-z]*.jpeg", False, 0, ""), ], ) def test_create_task_with_file_pattern( - self, cloud_storage_id, manifest, pattern, task_size, org, cloud_storages + self, cloud_storage_id, manifest, pattern, sub_dir, task_size, org, cloud_storages ): # prepare dataset on the bucket - images = generate_image_files(3, "test_image") + prefixes = ("test_image_",) * 3 if sub_dir else ("a_", "b_", "d_") + images = generate_image_files(3, prefixes=prefixes) s3_client = make_client() cloud_storage = cloud_storages[cloud_storage_id] + for image in images: s3_client.create_file( - data=image, bucket=cloud_storage["resource"], filename=f"test/sub/{image.name}" + data=image, + bucket=cloud_storage["resource"], + filename=f"{'test/sub/' if sub_dir else ''}{image.name}", ) - with NamedTemporaryFile(mode="wb", suffix="manifest.jsonl", delete=False) as tmp_file: + with NamedTemporaryFile(mode="wb+", suffix="manifest.jsonl") as tmp_file: tmp_name = tmp_file.name tmp_manifest = ImageManifestManager(manifest_path=tmp_name) tmp_manifest.link(sources=images) tmp_manifest.create() - with open(tmp_name, "rb") as tmp_file: + tmp_file.seek(0) s3_client.create_file( data=tmp_file.read(), bucket=cloud_storage["resource"], - filename=manifest, + filename=f"test/sub/{manifest}" if sub_dir else manifest, ) - os.remove(tmp_name) task_spec = { "name": f"Task with files from cloud storage {cloud_storage_id}", @@ -740,7 +740,7 @@ def test_create_task_with_file_pattern( "image_quality": 75, "use_cache": True, "cloud_storage_id": cloud_storage_id, - "server_files": [manifest], + "server_files": [f"test/sub/{manifest}" if sub_dir else manifest], "pattern": pattern, } diff --git a/tests/python/shared/utils/helpers.py b/tests/python/shared/utils/helpers.py index 2922ce271501..277007a68a86 100644 --- a/tests/python/shared/utils/helpers.py +++ b/tests/python/shared/utils/helpers.py @@ -18,10 +18,10 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)): return f -def generate_image_files(count, filename="") -> List[BytesIO]: +def generate_image_files(count, prefixes=None) -> List[BytesIO]: images = [] for i in range(count): - image = generate_image_file(f"{filename}{i}.jpeg", color=(i, i, i)) + image = generate_image_file(f"{prefixes[i] if prefixes else ''}{i}.jpeg", color=(i, i, i)) images.append(image) return images From 2cb0cbf4a6f352e996756fe429144e846a7307d7 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 21 Dec 2022 23:54:41 +0100 Subject: [PATCH 07/21] Small fix --- cvat/apps/engine/task.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 08a3f36dd0da..d93e918579a8 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -368,8 +368,6 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): cloud_storage_manifest_data = list(cloud_storage_manifest.data) if not cloud_storage_manifest_prefix \ else [os.path.join(cloud_storage_manifest_prefix, f) for f in cloud_storage_manifest.data] if data['pattern'] == '*': - shutil.copyfile(cloud_storage_manifest.manifest.path, manifest.manifest.path) - manifest.set_index() server_files = cloud_storage_manifest_data else: r = re.compile(data['pattern']) From 6a58dc3c3b9e32cbf4379838d54b95cdbc8c90b6 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 00:55:35 +0100 Subject: [PATCH 08/21] Fix manifest root --- cvat/apps/engine/task.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index d93e918579a8..f93fa1781eff 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -328,7 +328,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): slogger.glob.info("create task #{}".format(db_task.id)) db_data = db_task.data - upload_dir = db_data.get_upload_dirname() + upload_dir = db_data.get_upload_dirname() if db_data.storage != models.StorageChoice.SHARE else settings.SHARE_ROOT is_data_in_cloud = db_data.storage == models.StorageChoice.CLOUD_STORAGE if data['remote_files'] and not isDatasetImport: @@ -338,7 +338,10 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): manifest_files = _count_manifest_files(data) manifest_root = None - if db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}: + # we sould also handle this case because files from the share source have not been downloaded yet + if data['copy_data']: + manifest_root = settings.SHARE_ROOT + elif db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}: manifest_root = upload_dir elif is_data_in_cloud: manifest_root = db_data.cloud_storage.get_storage_dirname() @@ -381,8 +384,6 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): if data['server_files']: if db_data.storage == models.StorageChoice.LOCAL: _copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path')) - elif db_data.storage == models.StorageChoice.SHARE: - upload_dir = settings.SHARE_ROOT elif is_data_in_cloud: sorted_media = sort(media['image'], data['sorting_method']) From 5e75d97cd9ceacadb65766dd861471fcf2690f7a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 10:08:55 +0100 Subject: [PATCH 09/21] Remove av requirement --- tests/python/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/requirements.txt b/tests/python/requirements.txt index d18c683b9a3d..33c7d92f052e 100644 --- a/tests/python/requirements.txt +++ b/tests/python/requirements.txt @@ -4,4 +4,3 @@ requests==2.26.0 deepdiff==5.6.0 boto3==1.17.61 Pillow==9.3.0 -av==9.2.0 --no-binary=av From 445d9146b845520b8046890e49f11928252bb53e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 17:00:31 +0100 Subject: [PATCH 10/21] Update cvat/apps/engine/serializers.py Co-authored-by: Maxim Zhiltsov --- cvat/apps/engine/serializers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index 7f1c9cd7458f..f549ffc9e581 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -395,9 +395,9 @@ def validate_chunk_size(self, value): def validate_pattern(self, value): import string supported_wildcards = set(["*", "[", "]", "."]) - no_escapted_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) - legal_special_chars = supported_wildcards.union(no_escapted_symbols) - for c in set(string.punctuation) - legal_special_chars: + non_escaped_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) + allowed_special_chars = supported_wildcards.union(non_escaped_symbols) + for c in set(string.punctuation) - allowed_special_chars: if c in value: value.replace(c, r'\\{}'.format(c)) From 1290edd3af50040c636c3832c5a0de2dad680018 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 17:31:50 +0100 Subject: [PATCH 11/21] Update cvat/apps/engine/task.py Co-authored-by: Maxim Zhiltsov --- cvat/apps/engine/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index f93fa1781eff..2a4e1022425d 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -338,7 +338,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): manifest_files = _count_manifest_files(data) manifest_root = None - # we sould also handle this case because files from the share source have not been downloaded yet + # we should also handle this case because files from the share source have not been downloaded yet if data['copy_data']: manifest_root = settings.SHARE_ROOT elif db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}: From 0ab87dea02d75b1f9e1f03064f591ebd05bf6337 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 19:22:30 +0100 Subject: [PATCH 12/21] Apply comments --- .vscode/settings.json | 5 +- cvat/apps/engine/serializers.py | 8 +-- cvat/apps/engine/task.py | 19 ++++--- tests/python/rest_api/test_tasks.py | 78 ++++++++++++++++++---------- tests/python/shared/utils/helpers.py | 3 +- utils/dataset_manifest/core.py | 7 +-- 6 files changed, 72 insertions(+), 48 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 36e92230e3cc..76614f368f9a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -39,8 +39,5 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "python.testing.pytestPath": "${workspaceFolder}/.env/bin/pytest", - "python.testing.cwd": "${workspaceFolder}/tests", - "python.analysis.extraPaths": [ - "./utils" - ] + "python.testing.cwd": "${workspaceFolder}/tests" } diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index f549ffc9e581..b22c384a849d 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -6,6 +6,7 @@ import os import re import shutil +import string from tempfile import NamedTemporaryFile from typing import OrderedDict @@ -371,13 +372,13 @@ class DataSerializer(WriteOnceMixin, serializers.ModelSerializer): use_cache = serializers.BooleanField(default=False) copy_data = serializers.BooleanField(default=False) cloud_storage_id = serializers.IntegerField(write_only=True, allow_null=True, required=False) - pattern = serializers.CharField(allow_null=True, required=False) + filename_pattern = serializers.CharField(allow_null=True, required=False) class Meta: model = models.Data fields = ('chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter', 'compressed_chunk_type', 'original_chunk_type', 'client_files', 'server_files', 'remote_files', 'use_zip_chunks', - 'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'pattern') + 'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'filename_pattern') # pylint: disable=no-self-use def validate_frame_filter(self, value): @@ -392,8 +393,7 @@ def validate_chunk_size(self, value): raise serializers.ValidationError('Chunk size must be a positive integer') return value - def validate_pattern(self, value): - import string + def validate_filename_pattern(self, value): supported_wildcards = set(["*", "[", "]", "."]) non_escaped_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) allowed_special_chars = supported_wildcards.union(non_escaped_symbols) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index f93fa1781eff..252713e72fe0 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -177,7 +177,7 @@ def count_files(file_mapping, counter): return counter -def _count_manifest_files(data): +def _find_manifest_files(data): manifest_files = [] for files in ['client_files', 'server_files', 'remote_files']: manifest_files.extend(list(filter(lambda x: x.endswith('.jsonl'), data[files]))) @@ -216,7 +216,7 @@ def _validate_data(counter, manifest_files=None): def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage, data_storage_method): if manifests: if len(manifests) != 1: - raise ValidationError('Only one manifest file can be attached with data') + raise ValidationError('Only one manifest file can be attached to data') manifest_file = manifests[0] full_manifest_path = os.path.join(root_dir, manifests[0]) if is_in_cloud: @@ -335,7 +335,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): data['remote_files'] = _download_data(data['remote_files'], upload_dir) # find and validate manifest file - manifest_files = _count_manifest_files(data) + manifest_files = _find_manifest_files(data) manifest_root = None # we sould also handle this case because files from the share source have not been downloaded yet @@ -364,16 +364,21 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): cloud_storage_manifest_prefix = os.path.dirname(manifest_file) # update list with server files if task creation approach with pattern and manifest file is used - if is_data_in_cloud and data['pattern']: + if is_data_in_cloud and data['filename_pattern']: if 1 != len(data['server_files']): - raise ValidationError('Using a pattern is only supported with a manifest, but others files were found') + l = len(data['server_files']) - 1 + raise ValidationError( + 'Using a filename_pattern is only supported with a manifest file, ' + f'but others {l} file{"s" if l > 1 else ""} {"were" if l > 1 else "was"} found' + 'Please remove extra files and keep only manifest file in server_files field.' + ) cloud_storage_manifest_data = list(cloud_storage_manifest.data) if not cloud_storage_manifest_prefix \ else [os.path.join(cloud_storage_manifest_prefix, f) for f in cloud_storage_manifest.data] - if data['pattern'] == '*': + if data['filename_pattern'] == '*': server_files = cloud_storage_manifest_data else: - r = re.compile(data['pattern']) + r = re.compile(data['filename_pattern']) server_files = list(filter(r.match, cloud_storage_manifest_data)) data['server_files'].extend(server_files) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 07fe5cdea988..d5e5b7125006 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -4,11 +4,12 @@ # SPDX-License-Identifier: MIT import json -import os -import sys +import os.path as osp +import subprocess +from contextlib import ExitStack from copy import deepcopy from http import HTTPStatus -from tempfile import NamedTemporaryFile +from tempfile import TemporaryDirectory from time import sleep import pytest @@ -16,19 +17,12 @@ from cvat_sdk.core.helpers import get_paginated_collection from deepdiff import DeepDiff +import shared.utils.s3 as s3 from shared.utils.config import get_method, make_api_client, patch_method from shared.utils.helpers import generate_image_files -from shared.utils.s3 import make_client from .utils import export_dataset -sys.path.append( - os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))).split("tests")[0], "utils" - ) -) -from dataset_manifest import ImageManifestManager - def get_cloud_storage_content(username, cloud_storage_id, manifest): with make_api_client(username) as api_client: @@ -686,7 +680,7 @@ def test_create_task_with_cloud_storage_files( ) @pytest.mark.parametrize( - "cloud_storage_id, manifest, pattern, sub_dir, task_size, org", + "cloud_storage_id, manifest, filename_pattern, sub_dir, task_size, org", [ (1, "manifest.jsonl", "*", True, 3, ""), # public bucket (1, "manifest.jsonl", "test/*", True, 3, ""), @@ -699,12 +693,12 @@ def test_create_task_with_cloud_storage_files( ], ) def test_create_task_with_file_pattern( - self, cloud_storage_id, manifest, pattern, sub_dir, task_size, org, cloud_storages + self, cloud_storage_id, manifest, filename_pattern, sub_dir, task_size, org, cloud_storages ): # prepare dataset on the bucket prefixes = ("test_image_",) * 3 if sub_dir else ("a_", "b_", "d_") images = generate_image_files(3, prefixes=prefixes) - s3_client = make_client() + s3_client = s3.make_client() cloud_storage = cloud_storages[cloud_storage_id] @@ -715,17 +709,35 @@ def test_create_task_with_file_pattern( filename=f"{'test/sub/' if sub_dir else ''}{image.name}", ) - with NamedTemporaryFile(mode="wb+", suffix="manifest.jsonl") as tmp_file: - tmp_name = tmp_file.name - tmp_manifest = ImageManifestManager(manifest_path=tmp_name) - tmp_manifest.link(sources=images) - tmp_manifest.create() - tmp_file.seek(0) - s3_client.create_file( - data=tmp_file.read(), - bucket=cloud_storage["resource"], - filename=f"test/sub/{manifest}" if sub_dir else manifest, - ) + with TemporaryDirectory() as tmp_dir: + for image in images: + with open(osp.join(tmp_dir, image.name), "wb") as f: + f.write(image.getvalue()) + + command = [ + "docker", + "run", + "-it", + "--rm", + "-u", + "django:django", + "-v", + f"{tmp_dir}:/local", + "--entrypoint", + "python3", + "cvat/server:dev", + "utils/dataset_manifest/create.py", + "--output-dir", + "/local", + "/local", + ] + subprocess.run(command, check=True, stdout=subprocess.DEVNULL) + with open(osp.join(tmp_dir, "manifest.jsonl"), mode="rb") as m_file: + s3_client.create_file( + data=m_file.read(), + bucket=cloud_storage["resource"], + filename=f"test/sub/{manifest}" if sub_dir else manifest, + ) task_spec = { "name": f"Task with files from cloud storage {cloud_storage_id}", @@ -741,7 +753,7 @@ def test_create_task_with_file_pattern( "use_cache": True, "cloud_storage_id": cloud_storage_id, "server_files": [f"test/sub/{manifest}" if sub_dir else manifest], - "pattern": pattern, + "filename_pattern": filename_pattern, } if task_size: @@ -754,7 +766,19 @@ def test_create_task_with_file_pattern( assert response.status == HTTPStatus.OK assert task.size == task_size else: - self._test_cannot_create_task(self._USERNAME, task_spec, data_spec) + status = self._test_cannot_create_task(self._USERNAME, task_spec, data_spec) + assert "No media data found" in status.message + + with ExitStack() as stack: + files = [f"{'test/sub/' if sub_dir else ''}{image.name}" for image in images] + [ + f"test/sub/{manifest}" if sub_dir else manifest + ] + for f in files: + stack.callback( + s3_client.remove_file, + bucket=cloud_storage["resource"], + filename=f, + ) @pytest.mark.parametrize( "cloud_storage_id, manifest, org", diff --git a/tests/python/shared/utils/helpers.py b/tests/python/shared/utils/helpers.py index 277007a68a86..0e71d3e65507 100644 --- a/tests/python/shared/utils/helpers.py +++ b/tests/python/shared/utils/helpers.py @@ -21,7 +21,8 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)): def generate_image_files(count, prefixes=None) -> List[BytesIO]: images = [] for i in range(count): - image = generate_image_file(f"{prefixes[i] if prefixes else ''}{i}.jpeg", color=(i, i, i)) + prefix = prefixes[i] if prefixes else '' + image = generate_image_file(f"{prefix}{i}.jpeg", color=(i, i, i)) images.append(image) return images diff --git a/utils/dataset_manifest/core.py b/utils/dataset_manifest/core.py index 8ca08434a467..9ea7b6757d53 100644 --- a/utils/dataset_manifest/core.py +++ b/utils/dataset_manifest/core.py @@ -195,11 +195,8 @@ def __iter__(self): image = next(sources) img = Image.open(image, mode='r') orientation = img.getexif().get(274, 1) - if isinstance(image, str): - img_name = os.path.relpath(image, self._data_dir) if self._data_dir \ - else os.path.basename(image) - else: - img_name = image.name + img_name = os.path.relpath(image, self._data_dir) if self._data_dir \ + else os.path.basename(image) name, extension = os.path.splitext(img_name) width, height = img.width, img.height if orientation > 4: From 782f30a33123a147fc5c233d4a98542d31164444 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 19:34:25 +0100 Subject: [PATCH 13/21] black --- tests/python/shared/utils/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/shared/utils/helpers.py b/tests/python/shared/utils/helpers.py index 0e71d3e65507..e7289ac3bee0 100644 --- a/tests/python/shared/utils/helpers.py +++ b/tests/python/shared/utils/helpers.py @@ -21,7 +21,7 @@ def generate_image_file(filename="image.png", size=(50, 50), color=(0, 0, 0)): def generate_image_files(count, prefixes=None) -> List[BytesIO]: images = [] for i in range(count): - prefix = prefixes[i] if prefixes else '' + prefix = prefixes[i] if prefixes else "" image = generate_image_file(f"{prefix}{i}.jpeg", color=(i, i, i)) images.append(image) From 1fc948f9a6009a8db72983834a73051e61380ea0 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 22 Dec 2022 23:16:58 +0100 Subject: [PATCH 14/21] Apply comments && debug --- cvat/apps/engine/serializers.py | 22 +++++------ cvat/apps/engine/task.py | 4 +- tests/python/rest_api/test_tasks.py | 57 +++++++++++++++++------------ 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index b22c384a849d..d1ae4e8050f7 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -393,17 +393,17 @@ def validate_chunk_size(self, value): raise serializers.ValidationError('Chunk size must be a positive integer') return value - def validate_filename_pattern(self, value): - supported_wildcards = set(["*", "[", "]", "."]) - non_escaped_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) - allowed_special_chars = supported_wildcards.union(non_escaped_symbols) - for c in set(string.punctuation) - allowed_special_chars: - if c in value: - value.replace(c, r'\\{}'.format(c)) - - if '*' in value and value != "*": - value = r'.*'.join(value.split('*')) - return value + # def validate_filename_pattern(self, value): + # supported_wildcards = set(["*", "[", "]", "."]) + # non_escaped_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) + # allowed_special_chars = supported_wildcards.union(non_escaped_symbols) + # for c in set(string.punctuation) - allowed_special_chars: + # if c in value: + # value.replace(c, r'\\{}'.format(c)) + + # if '*' in value and value != "*": + # value = r'.*'.join(value.split('*')) + # return value # pylint: disable=no-self-use def validate(self, attrs): diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 4f1ae7933d1d..a4bcd8d5c332 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -5,6 +5,7 @@ # SPDX-License-Identifier: MIT import itertools +import fnmatch import os import sys from rest_framework.serializers import ValidationError @@ -378,8 +379,7 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False): if data['filename_pattern'] == '*': server_files = cloud_storage_manifest_data else: - r = re.compile(data['filename_pattern']) - server_files = list(filter(r.match, cloud_storage_manifest_data)) + server_files = fnmatch.filter(cloud_storage_manifest_data, data['filename_pattern']) data['server_files'].extend(server_files) # count and validate uploaded files diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index d5e5b7125006..1d229c8ca87d 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -6,7 +6,7 @@ import json import os.path as osp import subprocess -from contextlib import ExitStack +from functools import partial from copy import deepcopy from http import HTTPStatus from tempfile import TemporaryDirectory @@ -679,21 +679,31 @@ def test_create_task_with_cloud_storage_files( self._USERNAME, task_spec, data_spec, content_type="application/json", org=org ) + @pytest.mark.parametrize("cloud_storage_id", [1]) @pytest.mark.parametrize( - "cloud_storage_id, manifest, filename_pattern, sub_dir, task_size, org", + "manifest, filename_pattern, sub_dir, task_size", [ - (1, "manifest.jsonl", "*", True, 3, ""), # public bucket - (1, "manifest.jsonl", "test/*", True, 3, ""), - (1, "manifest.jsonl", "test/sub*1.jpeg", True, 1, ""), - (1, "manifest.jsonl", "*image*.jpeg", True, 3, ""), - (1, "manifest.jsonl", "wrong_pattern", True, 0, ""), - (1, "abc_manifest.jsonl", "[a-c]*.jpeg", False, 2, ""), - (1, "abc_manifest.jsonl", "[d]*.jpeg", False, 1, ""), - (1, "abc_manifest.jsonl", "[e-z]*.jpeg", False, 0, ""), + ("manifest.jsonl", "*", True, 3), # public bucket + ("manifest.jsonl", "test/*", True, 3), + ("manifest.jsonl", "test/sub*1.jpeg", True, 1), + ("manifest.jsonl", "*image*.jpeg", True, 3), + ("manifest.jsonl", "wrong_pattern", True, 0), + ("abc_manifest.jsonl", "[a-c]*.jpeg", False, 2), + ("abc_manifest.jsonl", "[d]*.jpeg", False, 1), + ("abc_manifest.jsonl", "[e-z]*.jpeg", False, 0), ], ) + @pytest.mark.parametrize("org", [""]) def test_create_task_with_file_pattern( - self, cloud_storage_id, manifest, filename_pattern, sub_dir, task_size, org, cloud_storages + self, + cloud_storage_id, + manifest, + filename_pattern, + sub_dir, + task_size, + org, + cloud_storages, + request, ): # prepare dataset on the bucket prefixes = ("test_image_",) * 3 if sub_dir else ("a_", "b_", "d_") @@ -708,6 +718,11 @@ def test_create_task_with_file_pattern( bucket=cloud_storage["resource"], filename=f"{'test/sub/' if sub_dir else ''}{image.name}", ) + request.addfinalizer( + partial( + s3_client.remove_file, bucket=cloud_storage["resource"], filename=image.name + ) + ) with TemporaryDirectory() as tmp_dir: for image in images: @@ -731,13 +746,20 @@ def test_create_task_with_file_pattern( "/local", "/local", ] - subprocess.run(command, check=True, stdout=subprocess.DEVNULL) + subprocess.run(command, check=True, stdout=subprocess.PIPE) with open(osp.join(tmp_dir, "manifest.jsonl"), mode="rb") as m_file: s3_client.create_file( data=m_file.read(), bucket=cloud_storage["resource"], filename=f"test/sub/{manifest}" if sub_dir else manifest, ) + request.addfinalizer( + partial( + s3_client.remove_file, + bucket=cloud_storage["resource"], + filename=f"test/sub/{manifest}" if sub_dir else manifest, + ) + ) task_spec = { "name": f"Task with files from cloud storage {cloud_storage_id}", @@ -769,17 +791,6 @@ def test_create_task_with_file_pattern( status = self._test_cannot_create_task(self._USERNAME, task_spec, data_spec) assert "No media data found" in status.message - with ExitStack() as stack: - files = [f"{'test/sub/' if sub_dir else ''}{image.name}" for image in images] + [ - f"test/sub/{manifest}" if sub_dir else manifest - ] - for f in files: - stack.callback( - s3_client.remove_file, - bucket=cloud_storage["resource"], - filename=f, - ) - @pytest.mark.parametrize( "cloud_storage_id, manifest, org", [(1, "manifest.jsonl", "")], # public bucket From 1a0c1b90f21d2dda89519a25e5ac0045b0548ecb Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 10:15:35 +0100 Subject: [PATCH 15/21] Remove validation method --- cvat/apps/engine/serializers.py | 12 ------------ tests/python/rest_api/test_tasks.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index d1ae4e8050f7..c2c5965da27f 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -393,18 +393,6 @@ def validate_chunk_size(self, value): raise serializers.ValidationError('Chunk size must be a positive integer') return value - # def validate_filename_pattern(self, value): - # supported_wildcards = set(["*", "[", "]", "."]) - # non_escaped_symbols = set(['!', '"', '%', "'", ',', '/', ':', ';', '<', '=', '>', '@', "`", "_"]) - # allowed_special_chars = supported_wildcards.union(non_escaped_symbols) - # for c in set(string.punctuation) - allowed_special_chars: - # if c in value: - # value.replace(c, r'\\{}'.format(c)) - - # if '*' in value and value != "*": - # value = r'.*'.join(value.split('*')) - # return value - # pylint: disable=no-self-use def validate(self, attrs): if 'start_frame' in attrs and 'stop_frame' in attrs \ diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 1d229c8ca87d..c020ac82bd71 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -740,7 +740,7 @@ def test_create_task_with_file_pattern( f"{tmp_dir}:/local", "--entrypoint", "python3", - "cvat/server:dev", + "cvat/server", "utils/dataset_manifest/create.py", "--output-dir", "/local", From 95ae6186b144002e4f835ceac0802278d9888d2c Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 10:19:55 +0100 Subject: [PATCH 16/21] Fix linters issues --- cvat/apps/engine/serializers.py | 1 - tests/python/rest_api/test_tasks.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index c2c5965da27f..abb29bd36e3e 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -6,7 +6,6 @@ import os import re import shutil -import string from tempfile import NamedTemporaryFile from typing import OrderedDict diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index c86819fa2849..6e76f30fc3ac 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -6,8 +6,8 @@ import json import os.path as osp import subprocess -from functools import partial from copy import deepcopy +from functools import partial from http import HTTPStatus from tempfile import TemporaryDirectory from time import sleep From d619b9786e614275de0eddb075c2aeafdbb02131 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 11:21:12 +0100 Subject: [PATCH 17/21] Small fix && debug --- tests/python/rest_api/test_tasks.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 6e76f30fc3ac..26403845b4a1 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -722,7 +722,7 @@ def test_create_task_with_file_pattern( ) request.addfinalizer( partial( - s3_client.remove_file, bucket=cloud_storage["resource"], filename=image.name + s3_client.remove_file, bucket=cloud_storage["resource"], filename=f"{'test/sub/' if sub_dir else ''}{image.name}" ) ) @@ -748,7 +748,12 @@ def test_create_task_with_file_pattern( "/local", "/local", ] - subprocess.run(command, check=True, stdout=subprocess.PIPE) + try: + subprocess.check_output(command) + except subprocess.CalledProcessError as ex: + print(ex.returncode) + print(ex.output) + raise with open(osp.join(tmp_dir, "manifest.jsonl"), mode="rb") as m_file: s3_client.create_file( data=m_file.read(), From d9ebfd2fbe9b80a23199613c514bd988d0a7d0d7 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 12:29:41 +0100 Subject: [PATCH 18/21] d --- tests/python/rest_api/test_tasks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 26403845b4a1..6efdaed99650 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -722,7 +722,9 @@ def test_create_task_with_file_pattern( ) request.addfinalizer( partial( - s3_client.remove_file, bucket=cloud_storage["resource"], filename=f"{'test/sub/' if sub_dir else ''}{image.name}" + s3_client.remove_file, + bucket=cloud_storage["resource"], + filename=f"{'test/sub/' if sub_dir else ''}{image.name}", ) ) @@ -737,7 +739,7 @@ def test_create_task_with_file_pattern( "-it", "--rm", "-u", - "django:django", + "root:root", "-v", f"{tmp_dir}:/local", "--entrypoint", @@ -749,6 +751,8 @@ def test_create_task_with_file_pattern( "/local", ] try: + result = subprocess.run('docker image ls | grep cvat/server', stdout=subprocess.PIPE, shell=True) + print('result: ', result) subprocess.check_output(command) except subprocess.CalledProcessError as ex: print(ex.returncode) From b40af24f7d12920ec4d455052b432822d24865fc Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 13:14:18 +0100 Subject: [PATCH 19/21] d --- tests/python/rest_api/test_tasks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index 6efdaed99650..b42dc6f10675 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -736,7 +736,6 @@ def test_create_task_with_file_pattern( command = [ "docker", "run", - "-it", "--rm", "-u", "root:root", @@ -751,8 +750,10 @@ def test_create_task_with_file_pattern( "/local", ] try: - result = subprocess.run('docker image ls | grep cvat/server', stdout=subprocess.PIPE, shell=True) - print('result: ', result) + result = subprocess.run( + "docker image ls | grep cvat/server", stdout=subprocess.PIPE, shell=True + ) + print("result: ", result) subprocess.check_output(command) except subprocess.CalledProcessError as ex: print(ex.returncode) From 7cce69b0a987429d2fa7c5b939700d640b31b581 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 23 Dec 2022 13:50:48 +0100 Subject: [PATCH 20/21] revert debug --- tests/python/rest_api/test_tasks.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index b42dc6f10675..08800aef9caa 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -749,16 +749,7 @@ def test_create_task_with_file_pattern( "/local", "/local", ] - try: - result = subprocess.run( - "docker image ls | grep cvat/server", stdout=subprocess.PIPE, shell=True - ) - print("result: ", result) - subprocess.check_output(command) - except subprocess.CalledProcessError as ex: - print(ex.returncode) - print(ex.output) - raise + subprocess.run(command, check=True) with open(osp.join(tmp_dir, "manifest.jsonl"), mode="rb") as m_file: s3_client.create_file( data=m_file.read(), From 9210794eb6ce75994b0f51a637c12847f3af7e02 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 27 Dec 2022 10:17:15 +0100 Subject: [PATCH 21/21] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2fa0b2dca4c..7a0fc6897d5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## \[2.4.0] - Unreleased ### Added -- TDB +- Filename pattern to simplify uploading cloud storage data for a task () ### Changed - TDB