[MRG] Allow model selection cv to handle nd inputs (#225)

YanisLalou · antoinecollas · web-flow · commit 85550f8976ea · 2024-09-04T09:13:20.000+02:00
* Allow model selection cv to handle nd inputs

* Add test_check_X_y_domain_multi_nd

* Add test_cv_with_nd_dimensional_X

* Remove unused comment

* Add nd support to scorers

* Fix ImportanceWeightedScorer (IW) when data have more than 2 dimensions

* test all scorers

* repeat data to test scorers with multi dimensional data

* test SupervisedScorer with multidimensional data

* Fix DeepEmbeddedValidation (DEV) scorer for multidimensional features

* fix PredictionEntropyScorer when proba == 0

* Remove raise error in test_scorer_with_nd_input cv

---------

Co-authored-by: Antoine Collas <22830806+antoinecollas@users.noreply.github.com>
diff --git a/skada/metrics.py b/skada/metrics.py
@@ -72,7 +72,7 @@ def _score(
     ):
         scorer = check_scoring(estimator, self.scoring)
 
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
         source_idx = extract_source_indices(sample_domain)
 
         return self._sign * scorer(
@@ -136,9 +136,11 @@ def _fit(self, X_source, X_target):
 
         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
+        X_source : array-like, shape (n_samples, *), where * is any number
+            of dimensions of at least 1
             The source data.
-        X_target : array-like, shape (n_samples, n_features)
+        X_target : array-like, shape (n_samples, *), where * is any number
+            of dimensions of at least 1
             The target data.
 
         Returns
@@ -151,6 +153,8 @@ def _fit(self, X_source, X_target):
             weight_estimator = KernelDensity()
         self.weight_estimator_source_ = clone(weight_estimator)
         self.weight_estimator_target_ = clone(weight_estimator)
+        X_source = X_source.reshape(X_source.shape[0], -1)
+        X_target = X_target.reshape(X_target.shape[0], -1)
         self.weight_estimator_source_.fit(X_source)
         self.weight_estimator_target_.fit(X_target)
         return self
@@ -165,10 +169,12 @@ def _score(self, estimator, X, y, sample_domain=None, **params):
                 f"The estimator {estimator!r} does not."
             )
 
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
         X_source, X_target, y_source, _ = source_target_split(
             X, y, sample_domain=sample_domain
         )
+        X_source = X_source.reshape(X_source.shape[0], -1)
+        X_target = X_target.reshape(X_target.shape[0], -1)
         self._fit(X_source, X_target)
         ws = self.weight_estimator_source_.score_samples(X_source)
         wt = self.weight_estimator_target_.score_samples(X_source)
@@ -239,7 +245,7 @@ def _score(self, estimator, X, y, sample_domain=None, **params):
                 "The estimator passed should have a 'predict_proba' method. "
                 f"The estimator {estimator!r} does not."
             )
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
         source_idx = extract_source_indices(sample_domain)
         proba = estimator.predict_proba(
             X[~source_idx], sample_domain=sample_domain[~source_idx], **params
@@ -250,7 +256,9 @@ def _score(self, estimator, X, y, sample_domain=None, **params):
             )
         else:
             log_proba = np.log(proba + 1e-7)
+        infty_mask = np.isneginf(log_proba)
         entropy_per_sample = -proba * log_proba
+        entropy_per_sample[infty_mask] = 0  # x*log(x) -> 0 as x -> 0
         if self.reduction == "none":
             return self._sign * entropy_per_sample
         elif self.reduction == "sum":
@@ -298,7 +306,7 @@ def _score(self, estimator, X, y, sample_domain=None, **params):
                 f"The estimator {estimator!r} does not."
             )
 
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
         source_idx = extract_source_indices(sample_domain)
         proba = estimator.predict_proba(
             X[~source_idx], sample_domain=sample_domain[~source_idx], **params
@@ -403,7 +411,8 @@ def identity(x):
             # We use the input data as features
             transformer = identity
 
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
+        X = X.reshape(X.shape[0], -1)
         source_idx = extract_source_indices(sample_domain)
         rng = check_random_state(self.random_state)
         X_train, X_val, _, y_val, _, sample_domain_val = train_test_split(
@@ -550,7 +559,7 @@ def _score(self, estimator, X, y, sample_domain=None):
         float
             The computed score.
         """
-        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, y, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
 
         try:
             _check_y_masking(y)
@@ -707,7 +716,7 @@ def _score(self, estimator, X, y=None, sample_domain=None, **params):
         """
         scorer = check_scoring(estimator, self.scoring)
 
-        X, _, sample_domain = check_X_y_domain(X, y, sample_domain)
+        X, _, sample_domain = check_X_y_domain(X, y, sample_domain, allow_nd=True)
         source_idx = extract_source_indices(sample_domain)
         X_target = X[~source_idx]
 
diff --git a/skada/model_selection.py b/skada/model_selection.py
@@ -80,7 +80,10 @@ def split(self, X, y=None, sample_domain=None):
         """
         # automatically derive sample_domain if it is not provided
         X, sample_domain = check_X_domain(
-            X, sample_domain, allow_auto_sample_domain=True
+            X,
+            sample_domain,
+            allow_auto_sample_domain=True,
+            allow_nd=True,
         )
         X, y, sample_domain = indexable(X, y, sample_domain)
         yield from self._iter_indices(X, y, sample_domain=sample_domain)
@@ -138,7 +141,7 @@ def __init__(
         self._default_test_size = 0.1
 
     def _iter_indices(self, X, y=None, sample_domain=None):
-        X, sample_domain = check_X_domain(X, sample_domain)
+        X, sample_domain = check_X_domain(X, sample_domain, allow_nd=True)
         indices = extract_source_indices(sample_domain)
         (source_idx,) = np.where(indices)
         (target_idx,) = np.where(~indices)
@@ -225,7 +228,10 @@ def split(self, X, y=None, sample_domain=None):
         """
         # automatically derive sample_domain if it is not provided
         X, sample_domain = check_X_domain(
-            X, sample_domain, allow_auto_sample_domain=True
+            X,
+            sample_domain,
+            allow_auto_sample_domain=True,
+            allow_nd=True,
         )
         X, y, sample_domain = indexable(X, y, sample_domain)
         # xxx(okachaiev): make sure all domains are given both as sources and targets
@@ -253,7 +259,7 @@ def split(self, X, y=None, sample_domain=None):
                 yield split_idx[train_idx], split_idx[test_idx]
 
     def _iter_indices(self, X, y=None, sample_domain=None):
-        X, sample_domain = check_X_domain(X, sample_domain)
+        X, sample_domain = check_X_domain(X, sample_domain, allow_nd=True)
         indices = extract_source_indices(sample_domain)
         (source_idx,) = np.where(indices)
         (target_idx,) = np.where(~indices)
@@ -383,7 +389,10 @@ def _iter_indices(self, X, y, sample_domain=None):
         # License: BSD
 
         X, sample_domain = check_X_domain(
-            X, sample_domain, allow_auto_sample_domain=True
+            X,
+            sample_domain,
+            allow_auto_sample_domain=True,
+            allow_nd=True,
         )
         X, y, sample_domain = indexable(X, y, sample_domain)
 
@@ -532,7 +541,7 @@ def __init__(
             raise ValueError("under_sampling should be between 0 and 1")
 
     def _iter_indices(self, X, y=None, sample_domain=None):
-        X, sample_domain = check_X_domain(X, sample_domain)
+        X, sample_domain = check_X_domain(X, sample_domain, allow_nd=True)
         domain_source_idx_dict, domain_target_idx_dict = extract_domains_indices(
             sample_domain, split_source_target=True
         )
diff --git a/skada/tests/test_cv.py b/skada/tests/test_cv.py
@@ -166,3 +166,38 @@ def test_stratified_domain_shuffle_split_exceptions():
     splitter = StratifiedDomainShuffleSplit(n_splits=4, test_size=0.1, random_state=0)
     with pytest.raises(ValueError):
         next(iter(splitter.split(X, y, sample_domain)))
+
+
+@pytest.mark.parametrize(
+    "cv",
+    [
+        (GroupShuffleSplit(n_splits=2, test_size=0.3, random_state=0)),
+        (GroupKFold(n_splits=2)),
+        (LeaveOneGroupOut()),
+        (SourceTargetShuffleSplit(n_splits=2, test_size=0.3, random_state=0)),
+        (
+            DomainShuffleSplit(
+                n_splits=2, test_size=0.3, random_state=0, under_sampling=1
+            )
+        ),
+        (StratifiedDomainShuffleSplit(n_splits=2, test_size=0.3, random_state=0)),
+    ],
+)
+def test_cv_with_nd_dimensional_X(da_dataset, cv):
+    X, y, sample_domain = da_dataset.pack_lodo()
+    # Transform X from 2D to 3D
+    X = X.reshape(X.shape[0], -1, 1)  # Reshape to (n_samples, n_features, 1)
+    assert X.ndim == 3, "X should be 3-dimensional after reshaping"
+
+    splits = list(cv.split(X, y, sample_domain))
+
+    for train, test in splits:
+        assert isinstance(train, np.ndarray) and isinstance(
+            test, np.ndarray
+        ), "split indices should be numpy arrays"
+        assert len(train) + len(test) == len(
+            X
+        ), "train and test indices should cover all samples"
+        assert (
+            len(np.intersect1d(train, test)) == 0
+        ), "train and test indices should not overlap"
diff --git a/skada/tests/test_scorer.py b/skada/tests/test_scorer.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pytest
-from sklearn.dummy import DummyRegressor
+from sklearn.dummy import DummyClassifier, DummyRegressor
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import ShuffleSplit, cross_validate
@@ -307,3 +307,47 @@ def test_mixval_scorer_regression(da_reg_dataset):
     scorer = MixValScorer(alpha=0.55, random_state=42)
     with pytest.raises(ValueError):
         scorer(estimator, X, y, sample_domain)
+
+
+@pytest.mark.parametrize(
+    "scorer",
+    [
+        SupervisedScorer(),
+        ImportanceWeightedScorer(),
+        PredictionEntropyScorer(),
+        SoftNeighborhoodDensity(),
+        DeepEmbeddedValidation(),
+        CircularValidation(),
+        MixValScorer(alpha=0.55, random_state=42),
+    ],
+)
+def test_scorer_with_nd_input(scorer, da_dataset):
+    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
+
+    # Repeat data to have a 3D input
+    X_3d = np.repeat(X[:, :, None], repeats=3, axis=2)
+
+    estimator = make_da_pipeline(
+        DummyClassifier(strategy="stratified", random_state=42)
+        .set_fit_request(sample_weight=True)
+        .set_score_request(sample_weight=True),
+    )
+    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
+    if isinstance(scorer, SupervisedScorer):
+        _, target_labels, _ = da_dataset.pack(
+            as_sources=["s"], as_targets=["t"], train=False
+        )
+        params = {"sample_domain": sample_domain, "target_labels": target_labels}
+    else:
+        params = {"sample_domain": sample_domain}
+    scores = cross_validate(
+        estimator,
+        X_3d,
+        y,
+        cv=cv,
+        params=params,
+        scoring=scorer,
+    )["test_score"]
+
+    assert scores.shape[0] == 3, "evaluate 3 splits"
+    assert np.all(~np.isnan(scores)), "all scores are computed"
diff --git a/skada/tests/test_utils.py b/skada/tests/test_utils.py
@@ -337,6 +337,34 @@ def test_check_X_allow_exceptions():
         )
 
 
+def test_check_X_domain_multi_nd():
+    # Create a 3D array (10 samples, 2 features, 3 channels)
+    X = np.random.rand(10, 2, 3)
+    sample_domain = np.array([1] * 5 + [-1] * 5)
+
+    # Test with allow_nd=True
+    check_X_domain(X, sample_domain=sample_domain, allow_nd=True)
+
+    # Test with allow_nd=False (should raise an error)
+    with pytest.raises(ValueError, match="Found array with dim 3. None expected <= 2."):
+        check_X_domain(X, sample_domain=sample_domain, allow_nd=False)
+
+
+def test_check_X_y_domain_multi_nd():
+    # Create a 3D array for X (10 samples, 2 features, 3 channels)
+    X = np.random.rand(10, 2, 3)
+    # Create a 2D array for y (10 samples, 2 outputs)
+    y = np.random.rand(10, 2)
+    sample_domain = np.array([1] * 5 + [-1] * 5)
+
+    # Test with allow_nd=True
+    check_X_y_domain(X, y, sample_domain=sample_domain, allow_nd=True)
+
+    # Test with allow_nd=False (should raise an error for X)
+    with pytest.raises(ValueError, match="Found array with dim 3. None expected <= 2."):
+        check_X_y_domain(X, y, sample_domain=sample_domain, allow_nd=False)
+
+
 def test_extract_source_indices():
     n_samples_source = 50
     n_samples_target = 20