Commit 9a976ce

[MRG] Add DomainAndLabelStratifiedSubsampleTransformer + Fix DomainStratifiedSubsampleTransformer (#268)
* Add DomainAndLabelStratifiedSubsampleTransformer + fix DomainStratifiedSubsampleTransformer
* Add test to check stratification proportions
* Rename subsamplers

Co-authored-by: Antoine Collas <[email protected]>
1 parent 65f1659 commit 9a976ce
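
In short, SubsampleTransformer and DomainStratifiedSubsampleTransformer become Subsampler and DomainSubsampler, and a new StratifiedDomainSubsampler subsamples while preserving the joint (domain, label) proportions. A minimal usage sketch follows, based on the updated tests; it is not part of the commit, and the synthetic arrays, the domain labels (1 for source, -2 for target), the masked target labels, and train_size=16 are illustrative assumptions:

import numpy as np

from skada.transformers import StratifiedDomainSubsampler

# Illustrative toy data: 80 labeled source samples (domain 1) and 80 target
# samples (domain -2) whose labels are masked with -1, as in skada training data.
rng = np.random.default_rng(0)
X = rng.normal(size=(160, 5))
y = np.concatenate([np.tile([0, 1], 40), -np.ones(80, dtype=int)])
sample_domain = np.concatenate([np.ones(80, dtype=int), -2 * np.ones(80, dtype=int)])

# Keep 16 training samples while roughly preserving (domain, label) proportions.
subsampler = StratifiedDomainSubsampler(train_size=16, random_state=42)
X_sub, y_sub, params = subsampler.fit_transform(X, y, sample_domain=sample_domain)
print(X_sub.shape)                    # (16, 5)
print(params["sample_domain"].shape)  # (16,)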

File tree

3 files changed (+141 −16 lines)

skada/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -61,7 +61,7 @@
     OTLabelProp,
     JCPOTLabelPropAdapter,
     JCPOTLabelProp)
-from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer
+from .transformers import Subsampler, DomainSubsampler, StratifiedDomainSubsampler
 from ._self_labeling import DASVMClassifier
 from ._pipeline import make_da_pipeline
 from .utils import source_target_split, per_domain_split

skada/tests/test_transformers.py

Lines changed: 67 additions & 11 deletions

@@ -2,24 +2,27 @@
 #
 # License: BSD 3-Clause

+from collections import Counter
+
 import numpy as np
 from sklearn.preprocessing import StandardScaler

 from skada import CORAL, make_da_pipeline
 from skada.transformers import (
-    DomainStratifiedSubsampleTransformer,
-    SubsampleTransformer,
+    DomainSubsampler,
+    StratifiedDomainSubsampler,
+    Subsampler,
 )


-def test_SubsampleTransformer(da_dataset):
+def test_Subsampler(da_dataset):
     X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
     sample_weight = np.ones_like(y)

     train_size = 10

     # test size of output on fit_transform
-    transformer = SubsampleTransformer(train_size=train_size, random_state=42)
+    transformer = Subsampler(train_size=train_size, random_state=42)

     X_subsampled, y_subsampled, params = transformer.fit_transform(
         X, y, sample_domain=sample_domain, sample_weight=sample_weight
@@ -40,26 +43,26 @@ def test_SubsampleTransformer(da_dataset):
     assert X_target_subsampled.shape[0] == X_target.shape[0]

     # now with a pipeline with end task
-    transformer = SubsampleTransformer(train_size=train_size)
+    transformer = Subsampler(train_size=train_size)
     pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

     pipeline.fit(X, y, sample_domain=sample_domain)

     ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
     assert ypred.shape[0] == X_target.shape[0]
-    assert ypred.shape[0] == X_target.shape[0]
+
+    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
+    assert ypred.shape[0] == X.shape[0]


-def test_DomainStratifiedSubsampleTransformer(da_dataset):
+def test_DomainSubsampler(da_dataset):
     X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
     sample_weight = np.ones_like(y)

     train_size = 10

     # test size of output on fit_transform
-    transformer = DomainStratifiedSubsampleTransformer(
-        train_size=train_size, random_state=42
-    )
+    transformer = DomainSubsampler(train_size=train_size, random_state=42)

     X_subsampled, y_subsampled, params = transformer.fit_transform(
         X, y, sample_domain=sample_domain, sample_weight=sample_weight
@@ -82,11 +85,64 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset):
     assert X_target_subsampled.shape[0] == X_target.shape[0]

     # now with a pipeline with end task
-    transformer = DomainStratifiedSubsampleTransformer(train_size=train_size)
+    transformer = DomainSubsampler(train_size=train_size)
     pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

     pipeline.fit(X, y, sample_domain=sample_domain)

     ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
     assert ypred.shape[0] == X_target.shape[0]
+
+    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
+    assert ypred.shape[0] == X.shape[0]
+
+
+def test_StratifiedDomainSubsampler(da_dataset):
+    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
+    sample_weight = np.ones_like(y)
+
+    train_size = 10
+
+    # test size of output on fit_transform
+    transformer = StratifiedDomainSubsampler(train_size=train_size, random_state=42)
+
+    X_subsampled, y_subsampled, params = transformer.fit_transform(
+        X, y, sample_domain=sample_domain, sample_weight=sample_weight
+    )
+
+    assert X_subsampled.shape == (train_size, X.shape[1])
+    assert y_subsampled.shape[0] == train_size
+    assert params["sample_domain"].shape[0] == train_size
+    assert params["sample_weight"].shape[0] == train_size
+
+    # Check stratification proportions
+    original_freq = Counter(zip(sample_domain, y))
+    subsampled_freq = Counter(zip(params["sample_domain"], y_subsampled))
+
+    for key in original_freq:
+        original_ratio = original_freq[key] / len(y)
+        subsampled_ratio = subsampled_freq[key] / train_size
+        assert np.isclose(
+            original_ratio, subsampled_ratio, atol=0.1
+        ), f"Stratification not preserved for {key}"
+
+    # test size of output on transform
+    X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])
+
+    X_target_subsampled = transformer.transform(
+        X_target, y_target, sample_domain=sample_domain_target
+    )
+
+    assert X_target_subsampled.shape[0] == X_target.shape[0]
+
+    # now with a pipeline with end task
+    transformer = StratifiedDomainSubsampler(train_size=train_size)
+    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())
+
+    pipeline.fit(X, y, sample_domain=sample_domain)
+
+    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
     assert ypred.shape[0] == X_target.shape[0]
+
+    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
+    assert ypred.shape[0] == X.shape[0]
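
The core of the new test is the proportion check: the joint (domain, label) frequencies of the subsampled set should stay close to those of the full set. Below is a standalone sketch of that check, not part of the commit; the helper name and the atol value are illustrative:

from collections import Counter

import numpy as np


def stratification_preserved(sample_domain, y, sample_domain_sub, y_sub, atol=0.1):
    """Return True if (domain, label) proportions survive subsampling within atol."""
    original = Counter(zip(sample_domain, y))
    subsampled = Counter(zip(sample_domain_sub, y_sub))
    return all(
        np.isclose(original[key] / len(y), subsampled[key] / len(y_sub), atol=atol)
        for key in original
    )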

skada/transformers.py

Lines changed: 73 additions & 4 deletions

@@ -6,10 +6,11 @@
 from sklearn.utils import check_random_state

 from .base import BaseAdapter
+from .model_selection import StratifiedDomainShuffleSplit
 from .utils import check_X_y_domain


-class SubsampleTransformer(BaseAdapter):
+class Subsampler(BaseAdapter):
     """Transformer that subsamples the data.

     This transformer is useful to speed up computations when the data is too
@@ -67,12 +68,14 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
         )
         return X_subsampled, y_subsampled, params

-    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
+    def transform(
+        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
+    ):
         """Transform the data."""
         return X


-class DomainStratifiedSubsampleTransformer(BaseAdapter):
+class DomainSubsampler(BaseAdapter):
     """Transformer that subsamples the data in a domain stratified way.

     This transformer is useful to speed up computations when the data is too
@@ -129,6 +132,72 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
         )
         return X_subsampled, y_subsampled, params

-    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
+    def transform(
+        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
+    ):
+        """Transform the data."""
+        return X
+
+
+class StratifiedDomainSubsampler(BaseAdapter):
+    """Transformer that subsamples the data in a domain and label stratified way.
+    This transformer is useful to speed up computations when the data is too
+    large. It randomly selects a subset of the data to work with during training
+    but does not change the data during testing.
+
+    .. note::
+        This transformer should not be used as the last step of a pipeline
+        because it returns non standard output.
+
+    Parameters
+    ----------
+    train_size : int, float
+        Number of samples to keep (keep all if data smaller) if integer, or
+        proportion of train sample if float 0 <= train_size <= 1.
+    random_state : int, RandomState instance or None, default=None
+        Controls the random resampling of the data.
+    """
+
+    def __init__(self, train_size, random_state=None):
+        self.train_size = train_size
+        self.random_state = random_state
+
+    def _pack_params(self, idx, **params):
+        return {
+            k: (v[idx] if idx is not None else v)
+            for k, v in params.items()
+            if v is not None
+        }
+
+    def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
+        """Fit and transform the data."""
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+
+        self.rng_ = check_random_state(self.random_state)
+
+        if self.train_size >= X.shape[0]:
+            return (
+                X,
+                y,
+                self._pack_params(
+                    None, sample_domain=sample_domain, sample_weight=sample_weight
+                ),
+            )
+
+        splitter = StratifiedDomainShuffleSplit(
+            n_splits=1, train_size=self.train_size, random_state=self.rng_
+        )
+
+        train_idx, _ = next(splitter.split(X, y, sample_domain))
+        X_subsampled = X[train_idx]
+        y_subsampled = y[train_idx] if y is not None else None
+        params = self._pack_params(
+            train_idx, sample_domain=sample_domain, sample_weight=sample_weight
+        )
+        return X_subsampled, y_subsampled, params
+
+    def transform(
+        self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
+    ):
         """Transform the data."""
         return X
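
As the diff shows, StratifiedDomainSubsampler delegates index selection to skada.model_selection.StratifiedDomainShuffleSplit, so the same stratified draw can be reproduced with the splitter directly. A small sketch, reusing the illustrative data layout from the sketch near the top of this page (train_size=16 is again an assumption):

import numpy as np
from sklearn.utils import check_random_state

from skada.model_selection import StratifiedDomainShuffleSplit

# Same illustrative data layout as the earlier sketch: 80 source and 80 target samples.
rng = np.random.default_rng(0)
X = rng.normal(size=(160, 5))
y = np.concatenate([np.tile([0, 1], 40), -np.ones(80, dtype=int)])
sample_domain = np.concatenate([np.ones(80, dtype=int), -2 * np.ones(80, dtype=int)])

# One shuffle split stratified on the joint (sample_domain, y) combination;
# train_idx is what fit_transform uses to index X, y and the fit params.
splitter = StratifiedDomainShuffleSplit(
    n_splits=1, train_size=16, random_state=check_random_state(42)
)
train_idx, _ = next(splitter.split(X, y, sample_domain))
print(train_idx.shape)  # expected: (16,)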
