5
5
# SPDX-License-Identifier: MIT
6
6
7
7
import itertools
8
+ import fnmatch
8
9
import os
9
10
import sys
10
11
from rest_framework .serializers import ValidationError
@@ -127,7 +128,7 @@ def _save_task_to_db(db_task, extractor):
127
128
db_task .data .save ()
128
129
db_task .save ()
129
130
130
- def _count_files (data , manifest_files = None ):
131
+ def _count_files (data ):
131
132
share_root = settings .SHARE_ROOT
132
133
server_files = []
133
134
@@ -158,7 +159,7 @@ def count_files(file_mapping, counter):
158
159
if mime in counter :
159
160
counter [mime ].append (rel_path )
160
161
elif rel_path .endswith ('.jsonl' ):
161
- manifest_files . append ( rel_path )
162
+ continue
162
163
else :
163
164
slogger .glob .warn ("Skip '{}' file (its mime type doesn't "
164
165
"correspond to supported MIME file type)" .format (full_path ))
@@ -177,6 +178,12 @@ def count_files(file_mapping, counter):
177
178
178
179
return counter
179
180
181
def _find_manifest_files(data):
    """Collect every ``.jsonl`` entry listed in the task's file sources.

    The ``client_files``, ``server_files`` and ``remote_files`` entries of
    ``data`` are scanned in that order, and the relative order of the names
    inside each entry is kept.

    Args:
        data: mapping holding the three file lists. Each key must be present;
            a value of ``None`` is treated as an empty list.

    Returns:
        A new list with the names that end with ``.jsonl``.

    Raises:
        KeyError: if one of the three keys is missing from ``data``.
    """
    manifest_files = []
    for source in ('client_files', 'server_files', 'remote_files'):
        # `or ()` keeps a source that was explicitly set to None from
        # raising a TypeError while it is being iterated.
        manifest_files.extend(
            name for name in (data[source] or ()) if name.endswith('.jsonl')
        )
    return manifest_files
186
+
180
187
def _validate_data (counter , manifest_files = None ):
181
188
unique_entries = 0
182
189
multiple_entries = 0
@@ -207,10 +214,10 @@ def _validate_data(counter, manifest_files=None):
207
214
208
215
return counter , task_modes [0 ]
209
216
210
- def _validate_manifest (manifests , root_dir , is_in_cloud , db_cloud_storage ):
217
+ def _validate_manifest (manifests , root_dir , is_in_cloud , db_cloud_storage , data_storage_method ):
211
218
if manifests :
212
219
if len (manifests ) != 1 :
213
- raise Exception ('Only one manifest file can be attached with data' )
220
+ raise ValidationError ('Only one manifest file can be attached to data' )
214
221
manifest_file = manifests [0 ]
215
222
full_manifest_path = os .path .join (root_dir , manifests [0 ])
216
223
if is_in_cloud :
@@ -221,8 +228,10 @@ def _validate_manifest(manifests, root_dir, is_in_cloud, db_cloud_storage):
221
228
< cloud_storage_instance .get_file_last_modified (manifest_file ):
222
229
cloud_storage_instance .download_file (manifest_file , full_manifest_path )
223
230
if is_manifest (full_manifest_path ):
231
+ if not (settings .USE_CACHE or data_storage_method != models .StorageMethodChoice .CACHE ):
232
+ raise ValidationError ("Manifest file can be uploaded only if 'Use cache' option is also selected" )
224
233
return manifest_file
225
- raise Exception ('Invalid manifest was uploaded' )
234
+ raise ValidationError ('Invalid manifest was uploaded' )
226
235
return None
227
236
228
237
def _validate_url (url ):
@@ -291,6 +300,26 @@ def _download_data(urls, upload_dir):
291
300
def _get_manifest_frame_indexer(start_frame=0, frame_step=1):
    """Build a callable that maps a frame id to ``start_frame + frame_id * frame_step``.

    Args:
        start_frame: value added to every scaled frame id (default ``0``).
        frame_step: multiplier applied to the frame id (default ``1``).

    Returns:
        A one-argument function taking ``frame_id``.
    """
    def frame_index(frame_id):
        offset = frame_id * frame_step
        return start_frame + offset

    return frame_index
293
302
303
def _create_task_manifest_based_on_cloud_storage_manifest(
    sorted_media,
    cloud_storage_manifest_prefix,
    cloud_storage_manifest,
    manifest
):
    """Fill ``manifest`` with the entries of ``cloud_storage_manifest`` that match ``sorted_media``.

    Args:
        sorted_media: file names to look up in the cloud storage manifest.
        cloud_storage_manifest_prefix: directory part of the manifest path.
            When non-empty, it is stripped from every media name before the
            lookup and joined back onto the ``name`` of every returned entry.
        cloud_storage_manifest: object whose ``get_subset(names)`` returns a
            ``(sequence, content)`` pair of equal-length iterables.
        manifest: object whose ``create(content)`` receives the entries
            ordered by their ``sequence`` value.
    """
    if cloud_storage_manifest_prefix:
        sorted_media_without_manifest_prefix = [
            os.path.relpath(i, cloud_storage_manifest_prefix) for i in sorted_media
        ]
        sequence, raw_content = cloud_storage_manifest.get_subset(
            sorted_media_without_manifest_prefix
        )

        def _add_prefix(properties):
            # The entry is updated in place, as before, and handed back so
            # the function can be used with map().
            file_name = properties['name']
            properties['name'] = os.path.join(cloud_storage_manifest_prefix, file_name)
            return properties

        content = list(map(_add_prefix, raw_content))
    else:
        sequence, content = cloud_storage_manifest.get_subset(sorted_media)

    # Order on the sequence value only. Sorting the bare (sequence, entry)
    # tuples would compare the entries themselves whenever two sequence
    # values tie, which raises TypeError for dict entries.
    ordered_pairs = sorted(zip(sequence, content), key=lambda pair: pair[0])
    sorted_content = (entry for _, entry in ordered_pairs)
    manifest.create(sorted_content)
294
323
295
324
@transaction .atomic
296
325
def _create_thread (db_task , data , isBackupRestore = False , isDatasetImport = False ):
@@ -300,69 +329,80 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False):
300
329
slogger .glob .info ("create task #{}" .format (db_task .id ))
301
330
302
331
db_data = db_task .data
303
- upload_dir = db_data .get_upload_dirname ()
332
+ upload_dir = db_data .get_upload_dirname () if db_data . storage != models . StorageChoice . SHARE else settings . SHARE_ROOT
304
333
is_data_in_cloud = db_data .storage == models .StorageChoice .CLOUD_STORAGE
305
334
306
335
if data ['remote_files' ] and not isDatasetImport :
307
336
data ['remote_files' ] = _download_data (data ['remote_files' ], upload_dir )
308
337
309
- manifest_files = []
310
- media = _count_files (data , manifest_files )
311
- media , task_mode = _validate_data (media , manifest_files )
312
-
313
- if data ['server_files' ]:
314
- if db_data .storage == models .StorageChoice .LOCAL :
315
- _copy_data_from_source (data ['server_files' ], upload_dir , data .get ('server_files_path' ))
316
- elif db_data .storage == models .StorageChoice .SHARE :
317
- upload_dir = settings .SHARE_ROOT
318
-
338
+ # find and validate manifest file
339
+ manifest_files = _find_manifest_files (data )
319
340
manifest_root = None
320
- if db_data .storage in {models .StorageChoice .LOCAL , models .StorageChoice .SHARE }:
341
+
342
+ # we should also handle this case because files from the share source have not been downloaded yet
343
+ if data ['copy_data' ]:
344
+ manifest_root = settings .SHARE_ROOT
345
+ elif db_data .storage in {models .StorageChoice .LOCAL , models .StorageChoice .SHARE }:
321
346
manifest_root = upload_dir
322
347
elif is_data_in_cloud :
323
348
manifest_root = db_data .cloud_storage .get_storage_dirname ()
324
349
325
350
manifest_file = _validate_manifest (
326
351
manifest_files , manifest_root ,
327
- is_data_in_cloud , db_data .cloud_storage if is_data_in_cloud else None
352
+ is_data_in_cloud , db_data .cloud_storage if is_data_in_cloud else None ,
353
+ db_data .storage_method ,
328
354
)
329
- if manifest_file and (not settings .USE_CACHE or db_data .storage_method != models .StorageMethodChoice .CACHE ):
330
- raise Exception ("File with meta information can be uploaded if 'Use cache' option is also selected" )
331
355
332
- if data [ 'server_files' ] and is_data_in_cloud :
356
+ if is_data_in_cloud :
333
357
cloud_storage_instance = db_storage_to_storage_instance (db_data .cloud_storage )
334
- sorted_media = sort (media ['image' ], data ['sorting_method' ])
335
-
336
- data_size = len (sorted_media )
337
- segment_step , * _ = _get_task_segment_data (db_task , data_size )
338
- for start_frame in range (0 , data_size , segment_step ):
339
- first_sorted_media_image = sorted_media [start_frame ]
340
- cloud_storage_instance .download_file (first_sorted_media_image , os .path .join (upload_dir , first_sorted_media_image ))
341
358
342
- # prepare task manifest file from cloud storage manifest file
343
- # NOTE we should create manifest before defining chunk_size
344
- # FIXME in the future when will be implemented archive support
345
359
manifest = ImageManifestManager (db_data .get_manifest_path ())
346
360
cloud_storage_manifest = ImageManifestManager (
347
361
os .path .join (db_data .cloud_storage .get_storage_dirname (), manifest_file ),
348
362
db_data .cloud_storage .get_storage_dirname ()
349
363
)
350
- cloud_storage_manifest_prefix = os .path .dirname (manifest_file )
351
364
cloud_storage_manifest .set_index ()
352
- if cloud_storage_manifest_prefix :
353
- sorted_media_without_manifest_prefix = [
354
- os .path .relpath (i , cloud_storage_manifest_prefix ) for i in sorted_media
355
- ]
356
- sequence , raw_content = cloud_storage_manifest .get_subset (sorted_media_without_manifest_prefix )
357
- def _add_prefix (properties ):
358
- file_name = properties ['name' ]
359
- properties ['name' ] = os .path .join (cloud_storage_manifest_prefix , file_name )
360
- return properties
361
- content = list (map (_add_prefix , raw_content ))
365
+ cloud_storage_manifest_prefix = os .path .dirname (manifest_file )
366
+
367
+ # update list with server files if task creation approach with pattern and manifest file is used
368
+ if is_data_in_cloud and data ['filename_pattern' ]:
369
+ if 1 != len (data ['server_files' ]):
370
+ l = len (data ['server_files' ]) - 1
371
+ raise ValidationError (
372
+ 'Using a filename_pattern is only supported with a manifest file, '
373
+ f'but others { l } file{ "s" if l > 1 else "" } { "were" if l > 1 else "was" } found'
374
+ 'Please remove extra files and keep only manifest file in server_files field.'
375
+ )
376
+
377
+ cloud_storage_manifest_data = list (cloud_storage_manifest .data ) if not cloud_storage_manifest_prefix \
378
+ else [os .path .join (cloud_storage_manifest_prefix , f ) for f in cloud_storage_manifest .data ]
379
+ if data ['filename_pattern' ] == '*' :
380
+ server_files = cloud_storage_manifest_data
362
381
else :
363
- sequence , content = cloud_storage_manifest .get_subset (sorted_media )
364
- sorted_content = (i [1 ] for i in sorted (zip (sequence , content )))
365
- manifest .create (sorted_content )
382
+ server_files = fnmatch .filter (cloud_storage_manifest_data , data ['filename_pattern' ])
383
+ data ['server_files' ].extend (server_files )
384
+
385
+ # count and validate uploaded files
386
+ media = _count_files (data )
387
+ media , task_mode = _validate_data (media , manifest_files )
388
+
389
+ if data ['server_files' ]:
390
+ if db_data .storage == models .StorageChoice .LOCAL :
391
+ _copy_data_from_source (data ['server_files' ], upload_dir , data .get ('server_files_path' ))
392
+ elif is_data_in_cloud :
393
+ sorted_media = sort (media ['image' ], data ['sorting_method' ])
394
+
395
+ # download previews from cloud storage
396
+ data_size = len (sorted_media )
397
+ segment_step , * _ = _get_task_segment_data (db_task , data_size )
398
+ for preview_frame in range (0 , data_size , segment_step ):
399
+ preview = sorted_media [preview_frame ]
400
+ cloud_storage_instance .download_file (preview , os .path .join (upload_dir , preview ))
401
+
402
+ # Define task manifest content based on cloud storage manifest content and uploaded files
403
+ _create_task_manifest_based_on_cloud_storage_manifest (
404
+ sorted_media , cloud_storage_manifest_prefix ,
405
+ cloud_storage_manifest , manifest )
366
406
367
407
av_scan_paths (upload_dir )
368
408
0 commit comments