[MRG] Add DomainAndLabelStratifiedSubsampleTransformer + Fix DomainStratifiedSubsampleTransformer #268
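This PR adds a subsample transformer that stratifies jointly by domain and label, and extends the `transform()` signature of the existing subsample transformers so that `allow_source` is accepted and `predict(..., allow_source=True)` works through a pipeline. A minimal usage sketch, mirroring the tests added below; the synthetic data and variable names are illustrative assumptions, not part of the diff:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import DomainAndLabelStratifiedSubsampleTransformer

# Illustrative toy data (assumption): 20 labeled source samples (domain 1)
# and 20 unlabeled target samples (domain -2, labels masked as -1).
rng = np.random.default_rng(0)
X = rng.normal(size=(40, 2))
y = np.concatenate([np.tile([0, 1], 10), -np.ones(20, dtype=int)])
sample_domain = np.concatenate([np.ones(20, dtype=int), -2 * np.ones(20, dtype=int)])

# Keep 10 training points, stratified by (domain, label), then adapt with CORAL.
pipe = make_da_pipeline(
    StandardScaler(),
    DomainAndLabelStratifiedSubsampleTransformer(train_size=10, random_state=42),
    CORAL(),
)
pipe.fit(X, y, sample_domain=sample_domain)

# The transform() signature fix lets allow_source flow through the pipeline,
# so prediction on a mix of source and target samples works as in the tests.
ypred = pipe.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]
```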

Merged
2 changes: 1 addition & 1 deletion skada/__init__.py
@@ -61,7 +61,7 @@
OTLabelProp,
JCPOTLabelPropAdapter,
JCPOTLabelProp)
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer, DomainAndLabelStratifiedSubsampleTransformer
from ._self_labeling import DASVMClassifier
from ._pipeline import make_da_pipeline
from .utils import source_target_split, per_domain_split
51 changes: 50 additions & 1 deletion skada/tests/test_transformers.py
@@ -7,6 +7,7 @@

from skada import CORAL, make_da_pipeline
from skada.transformers import (
DomainAndLabelStratifiedSubsampleTransformer,
DomainStratifiedSubsampleTransformer,
SubsampleTransformer,
)
@@ -47,7 +48,9 @@ def test_SubsampleTransformer(da_dataset):

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]


def test_DomainStratifiedSubsampleTransformer(da_dataset):
@@ -89,4 +92,50 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset):

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]


def test_DomainAndLabelStratifiedSubsampleTransformer(da_dataset):
X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
sample_weight = np.ones_like(y)

train_size = 10

# test size of output on fit_transform
transformer = DomainAndLabelStratifiedSubsampleTransformer(
train_size=train_size, random_state=42
)

X_subsampled, y_subsampled, params = transformer.fit_transform(
X, y, sample_domain=sample_domain, sample_weight=sample_weight
)

assert X_subsampled.shape == (train_size, X.shape[1])
assert y_subsampled.shape[0] == train_size
assert params["sample_domain"].shape[0] == train_size
assert params["sample_weight"].shape[0] == train_size
# check stratification
assert sum(params["sample_domain"] == 1) == train_size // 2

# test size of output on transform
X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

X_target_subsampled = transformer.transform(
X_target, y_target, sample_domain=sample_domain_target
)

assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline including a final estimator
transformer = DomainAndLabelStratifiedSubsampleTransformer(train_size=train_size)
pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

pipeline.fit(X, y, sample_domain=sample_domain)

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]
73 changes: 71 additions & 2 deletions skada/transformers.py
@@ -6,6 +6,7 @@
from sklearn.utils import check_random_state

from .base import BaseAdapter
from .model_selection import StratifiedDomainShuffleSplit
from .utils import check_X_y_domain


@@ -67,7 +68,9 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
)
return X_subsampled, y_subsampled, params

def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X

@@ -129,6 +132,72 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
)
return X_subsampled, y_subsampled, params

def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X


class DomainAndLabelStratifiedSubsampleTransformer(BaseAdapter):
    """Transformer that subsamples the data in a domain- and label-stratified way.
This transformer is useful to speed up computations when the data is too
large. It randomly selects a subset of the data to work with during training
but does not change the data during testing.

.. note::
This transformer should not be used as the last step of a pipeline
        because it returns non-standard output.

Parameters
----------
    train_size : int or float
        Number of samples to keep (all samples are kept if the data has fewer)
        if an integer, or the proportion of training samples to keep if a
        float with 0 <= train_size <= 1.
random_state : int, RandomState instance or None, default=None
Controls the random resampling of the data.
"""

def __init__(self, train_size, random_state=None):
self.train_size = train_size
self.random_state = random_state

def _pack_params(self, idx, **params):
return {
k: (v[idx] if idx is not None else v)
for k, v in params.items()
if v is not None
}

def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
"""Fit and transform the data."""
X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

self.rng_ = check_random_state(self.random_state)

if self.train_size >= X.shape[0]:
return (
X,
y,
self._pack_params(
None, sample_domain=sample_domain, sample_weight=sample_weight
),
)

splitter = StratifiedDomainShuffleSplit(
n_splits=1, train_size=self.train_size, random_state=self.rng_
)

train_idx, _ = next(splitter.split(X, y, sample_domain))
X_subsampled = X[train_idx]
y_subsampled = y[train_idx] if y is not None else None
params = self._pack_params(
train_idx, sample_domain=sample_domain, sample_weight=sample_weight
)
return X_subsampled, y_subsampled, params

def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X
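For reference, a small sketch of the train/test asymmetry described in the docstring above: `fit_transform` returns a domain- and label-stratified subsample together with the routed params, while `transform` is a pass-through at prediction time. The toy arrays are assumptions for illustration.

```python
import numpy as np

from skada.transformers import DomainAndLabelStratifiedSubsampleTransformer

# Illustrative data (assumption): 20 source samples (domain 1, labels 0/1)
# and 20 target samples (domain -2, labels masked as -1).
rng = np.random.default_rng(42)
X = rng.normal(size=(40, 3))
y = np.concatenate([np.tile([0, 1], 10), -np.ones(20, dtype=int)])
sample_domain = np.concatenate([np.ones(20, dtype=int), -2 * np.ones(20, dtype=int)])

sub = DomainAndLabelStratifiedSubsampleTransformer(train_size=10, random_state=0)

# Training: a stratified subsample of 10 points; sample_domain (and
# sample_weight, when provided) is subsampled consistently via the params dict.
X_sub, y_sub, params = sub.fit_transform(X, y, sample_domain=sample_domain)
assert X_sub.shape[0] == 10
assert params["sample_domain"].shape[0] == 10

# Prediction: transform leaves the data untouched, so no samples are dropped.
assert sub.transform(X).shape[0] == X.shape[0]
```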