Commit d83c4ba

[Feature] add multi-source sampler (#1938)
1 parent 2a92b26 commit d83c4ba

File tree

2 files changed: +112 −1 lines


mmpose/datasets/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 from .builder import build_dataset
 from .dataset_wrappers import CombinedDataset
 from .datasets import *  # noqa
+from .samplers import MultiSourceSampler
 from .transforms import *  # noqa

-__all__ = ['build_dataset', 'CombinedDataset']
+__all__ = ['build_dataset', 'CombinedDataset', 'MultiSourceSampler']
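
With MultiSourceSampler re-exported through __all__, the sampler is importable directly from the package root. A minimal import check, assuming an environment where this commit is installed:

from mmpose.datasets import CombinedDataset, MultiSourceSampler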

mmpose/datasets/samplers.py

Lines changed: 110 additions & 0 deletions
# Copyright (c) OpenMMLab. All rights reserved.
import itertools
import math
from typing import Iterator, List, Optional, Sized, Union

import torch
from mmengine.dist import get_dist_info, sync_random_seed
from torch.utils.data import Sampler

from mmpose.datasets import CombinedDataset
from mmpose.registry import DATA_SAMPLERS


@DATA_SAMPLERS.register_module()
class MultiSourceSampler(Sampler):
    r"""Multi-Source Sampler. According to the sampling ratio, sample data
    from different datasets to form batches.

    Args:
        dataset (Sized): The dataset
        batch_size (int): Size of mini-batch
        source_ratio (list[int | float]): The sampling ratio of different
            source datasets in a mini-batch
        shuffle (bool): Whether to shuffle the dataset or not. Defaults to
            ``True``
        seed (int, optional): Random seed. If ``None``, set a random seed.
            Defaults to ``None``
    """

    def __init__(self,
                 dataset: Sized,
                 batch_size: int,
                 source_ratio: List[Union[int, float]],
                 shuffle: bool = True,
                 seed: Optional[int] = None) -> None:

        assert isinstance(dataset, CombinedDataset), \
            f'The dataset must be CombinedDataset, but got {dataset}'
        assert isinstance(batch_size, int) and batch_size > 0, \
            'batch_size must be a positive integer value, ' \
            f'but got batch_size={batch_size}'
        assert isinstance(source_ratio, list), \
            f'source_ratio must be a list, but got source_ratio={source_ratio}'
        assert len(source_ratio) == len(dataset._lens), \
            'The length of source_ratio must be equal to ' \
            f'the number of datasets, but got source_ratio={source_ratio}'

        rank, world_size = get_dist_info()
        self.rank = rank
        self.world_size = world_size

        self.dataset = dataset
        self.cumulative_sizes = [0] + list(itertools.accumulate(dataset._lens))
        self.batch_size = batch_size
        self.source_ratio = source_ratio

        self.num_samples = math.ceil(len(self.dataset) / world_size)

        self.num_per_source = [
            int(batch_size * sr / sum(source_ratio)) for sr in source_ratio
        ]
        self.num_per_source[0] = batch_size - sum(self.num_per_source[1:])

        assert sum(self.num_per_source) == batch_size, \
            'The sum of num_per_source must be equal to ' \
            f'batch_size, but got {self.num_per_source}'

        self.seed = sync_random_seed() if seed is None else seed
        self.shuffle = shuffle
        self.source2inds = {
            source: self._indices_of_rank(len(ds))
            for source, ds in enumerate(dataset.datasets)
        }

    def _infinite_indices(self, sample_size: int) -> Iterator[int]:
        """Infinitely yield a sequence of indices."""
        g = torch.Generator()
        g.manual_seed(self.seed)
        while True:
            if self.shuffle:
                yield from torch.randperm(sample_size, generator=g).tolist()
            else:
                yield from torch.arange(sample_size).tolist()

    def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
        """Slice the infinite indices by rank."""
        yield from itertools.islice(
            self._infinite_indices(sample_size), self.rank, None,
            self.world_size)

    def __iter__(self) -> Iterator[int]:
        batch_buffer = []
        while True:
            for source, num in enumerate(self.num_per_source):
                batch_buffer_per_source = []
                for idx in self.source2inds[source]:
                    idx += self.cumulative_sizes[source]
                    batch_buffer_per_source.append(idx)
                    if len(batch_buffer_per_source) == num:
                        batch_buffer += batch_buffer_per_source
                        break
            yield from batch_buffer
            batch_buffer = []

    def __len__(self) -> int:
        return self.num_samples

    def set_epoch(self, epoch: int) -> None:
        """Compatible with epoch-based runners."""
        pass
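
Note that __iter__ loops forever: each pass through the source loop fills a fixed-composition batch (num indices from each source, shifted by cumulative_sizes to address the concatenated dataset), so the sampler is suited to iteration-based training, and set_epoch is only a no-op stub for epoch-based compatibility. Below is a hypothetical sketch of how the sampler might be wired up in MMEngine registry-config style; the two source dataset configs and the CombinedDataset keys shown are placeholders, not part of this commit. With batch_size=6 and source_ratio=[1, 2], num_per_source works out to [2, 4] (int(6 * 1 / 3) = 2, int(6 * 2 / 3) = 4), so every mini-batch mixes 2 samples from the first source with 4 from the second.

# Hypothetical training-config sketch (MMEngine registry style).
# dataset_a and dataset_b stand in for real source dataset configs;
# only the sampler wiring reflects the code added in this commit.
dataset_a = dict(type='CocoDataset')  # placeholder: first source config
dataset_b = dict(type='MpiiDataset')  # placeholder: second source config

train_dataloader = dict(
    batch_size=6,
    sampler=dict(
        type='MultiSourceSampler',
        batch_size=6,          # must match the dataloader batch size so
                               # each fetched batch keeps the intended mix
        source_ratio=[1, 2],   # -> num_per_source == [2, 4]
        shuffle=True),
    dataset=dict(
        type='CombinedDataset',
        datasets=[dataset_a, dataset_b]))  # placeholder source list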
