|
1 | 1 | """This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
|
2 | 2 | """
|
3 | 3 | import base64
|
| 4 | +import concurrent.futures |
| 5 | +import itertools |
4 | 6 | import json
|
5 | 7 | import logging
|
| 8 | +import os |
6 | 9 | import traceback as tb
|
| 10 | +from concurrent.futures import ThreadPoolExecutor |
7 | 11 | from datetime import datetime
|
8 | 12 | from typing import Union
|
9 | 13 | from urllib.parse import urljoin
|
@@ -526,10 +530,23 @@ def storage_background_failure(*args, **kwargs):
|
526 | 530 | storage.info_set_failed()
|
527 | 531 |
|
528 | 532 |
|
| 533 | +# note: this is available in python 3.12 , #TODO to switch to builtin function when we move to it. |
| 534 | +def _batched(iterable, n): |
| 535 | + # batched('ABCDEFG', 3) --> ABC DEF G |
| 536 | + if n < 1: |
| 537 | + raise ValueError('n must be at least one') |
| 538 | + it = iter(iterable) |
| 539 | + while batch := tuple(itertools.islice(it, n)): |
| 540 | + yield batch |
| 541 | + |
| 542 | + |
529 | 543 | class ExportStorage(Storage, ProjectStorageMixin):
|
530 | 544 | can_delete_objects = models.BooleanField(
|
531 | 545 | _('can_delete_objects'), null=True, blank=True, help_text='Deletion from storage enabled'
|
532 | 546 | )
|
| 547 | + # Use 8 threads, unless we know we only have a single core |
| 548 | + # TODO from testing, more than 8 seems to cause problems. revisit to add more parallelism. |
| 549 | + max_workers = min(8, (os.cpu_count() or 2) * 4) |
533 | 550 |
|
534 | 551 | def _get_serialized_data(self, annotation):
|
535 | 552 | user = self.project.organization.created_by
|
@@ -557,13 +574,24 @@ def save_annotations(self, annotations: models.QuerySet[Annotation]):
|
557 | 574 | self.info_set_in_progress()
|
558 | 575 | self.cached_user = self.project.organization.created_by
|
559 | 576 |
|
560 |
| - for annotation in annotations.iterator(chunk_size=settings.STORAGE_EXPORT_CHUNK_SIZE): |
561 |
| - annotation.cached_user = self.cached_user |
562 |
| - self.save_annotation(annotation) |
563 |
| - |
564 |
| - # update progress counters |
565 |
| - annotation_exported += 1 |
566 |
| - self.info_update_progress(last_sync_count=annotation_exported, total_annotations=total_annotations) |
| 577 | + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: |
| 578 | + # Batch annotations so that we update progress before having to submit every future. |
| 579 | + # Updating progress in thread requires coordinating on count and db writes, so just |
| 580 | + # batching to keep it simpler. |
| 581 | + for annotation_batch in _batched( |
| 582 | + Annotation.objects.filter(project=self.project).iterator( |
| 583 | + chunk_size=settings.STORAGE_EXPORT_CHUNK_SIZE |
| 584 | + ), |
| 585 | + settings.STORAGE_EXPORT_CHUNK_SIZE, |
| 586 | + ): |
| 587 | + futures = [] |
| 588 | + for annotation in annotation_batch: |
| 589 | + annotation.cached_user = self.cached_user |
| 590 | + futures.append(executor.submit(self.save_annotation, annotation)) |
| 591 | + |
| 592 | + for future in concurrent.futures.as_completed(futures): |
| 593 | + annotation_exported += 1 |
| 594 | + self.info_update_progress(last_sync_count=annotation_exported, total_annotations=total_annotations) |
567 | 595 |
|
568 | 596 | self.info_set_completed(last_sync_count=annotation_exported, total_annotations=total_annotations)
|
569 | 597 |
|
|
0 commit comments