Skip to content

[MRG] Subsampling transformer #259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 24, 2024
4 changes: 4 additions & 0 deletions skada/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
OTLabelProp,
JCPOTLabelPropAdapter,
JCPOTLabelProp)
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer
from ._self_labeling import DASVMClassifier
from ._pipeline import make_da_pipeline
from .utils import source_target_split, per_domain_split
Expand Down Expand Up @@ -131,6 +132,9 @@
"JDOTClassifier",
"OTLabelPropAdapter",

"SubsampleTransformer",
"DomainStratifiedSubsampleTransformer",

"make_da_pipeline",

"source_target_split",
Expand Down
98 changes: 98 additions & 0 deletions skada/tests/test_transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Author: Yanis Lalou <[email protected]>
#
# License: BSD 3-Clause


from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import (
DomainStratifiedSubsampleTransformer,
SubsampleTransformer,
)


def test_SubsampleTransformer(da_dataset):
    """Check SubsampleTransformer alone and inside DA pipelines.

    Verifies that fit_transform reduces the train data to ``train_size``
    samples (and forwards ``sample_domain`` in the params dict), that
    ``transform`` is a no-op at test time, and that the transformer
    composes with ``make_da_pipeline`` both as the last step and
    followed by a final estimator.
    """
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])

    train_size = 10

    # Standalone: training data is subsampled down to train_size.
    transformer = SubsampleTransformer(train_size=train_size)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain
    )

    assert X_subsampled.shape[0] == train_size
    assert y_subsampled.shape[0] == train_size
    assert "sample_domain" in params

    # At test time, transform must leave the data untouched.
    X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

    X_target_subsampled = transformer.transform(
        X_target, y_target, sample_domain=sample_domain_target
    )

    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # Within a pipeline ending on the transformer.
    transformer = SubsampleTransformer(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer)

    temp = pipeline.fit_transform(X, y, sample_domain=sample_domain)

    assert temp is not None

    # Within a pipeline followed by a final estimator.
    transformer = SubsampleTransformer(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    # Prediction is made on the full (non-subsampled) target set.
    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]


def test_DomainStratifiedSubsampleTransformer(da_dataset):
    """Check DomainStratifiedSubsampleTransformer alone and in DA pipelines.

    Same contract as ``test_SubsampleTransformer`` but with the
    domain-stratified variant: fit_transform subsamples the train data to
    ``train_size`` (stratified over ``sample_domain``), ``transform`` is a
    no-op at test time, and the transformer composes with
    ``make_da_pipeline``.
    """
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])

    train_size = 10

    # Standalone: training data is subsampled down to train_size.
    transformer = DomainStratifiedSubsampleTransformer(train_size=train_size)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain
    )

    assert X_subsampled.shape[0] == train_size
    assert y_subsampled.shape[0] == train_size
    assert "sample_domain" in params

    # At test time, transform must leave the data untouched.
    X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

    X_target_subsampled = transformer.transform(
        X_target, y_target, sample_domain=sample_domain_target
    )

    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # Within a pipeline ending on the transformer.
    transformer = DomainStratifiedSubsampleTransformer(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer)

    temp = pipeline.fit_transform(X, y, sample_domain=sample_domain)

    assert temp is not None

    # Within a pipeline followed by a final estimator.
    transformer = DomainStratifiedSubsampleTransformer(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    # Prediction is made on the full (non-subsampled) target set.
    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]
134 changes: 134 additions & 0 deletions skada/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Author: Remi Flamary <[email protected]>
#
# License: BSD 3-Clause

from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.utils import check_random_state

from .base import BaseAdapter
from .utils import check_X_y_domain


class SubsampleTransformer(BaseAdapter):
    """Transformer that subsamples the data.

    This transformer is useful to speed up computations when the data is too
    large. It randomly selects a subset of the data to work with during training
    but does not change the data during testing.

    .. note::
        This transformer should not be used as the last step of a pipeline
        because it returns non standard output.

    Parameters
    ----------
    train_size : int, float
        Number of samples to keep (keep all if data smaller) if integer, or
        proportion of train sample if float 0<= train_size <= 1.
    random_state : int, RandomState instance or None, default=None
        Controls the random resampling of the data.
    """

    def __init__(self, train_size, random_state=None):
        self.train_size = train_size
        self.random_state = random_state

    def _pack_params(self, idx, **params):
        """Index the optional fit params with ``idx``.

        Parameters that are ``None`` are dropped; when ``idx`` is ``None``
        the remaining parameters are passed through unchanged.
        """
        return {
            k: (v[idx] if idx is not None else v)
            for k, v in params.items()
            if v is not None
        }

    def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        """Fit and transform the data.

        Randomly keeps ``train_size`` samples of ``X`` (all of them if the
        data is already smaller) and returns the subsampled ``X``, ``y`` and
        a dict of routed params (``sample_domain``, ``sample_weight``)
        restricted to the same indices.
        """
        # Validate inputs and normalize sample_domain; the validated value is
        # used consistently below (the raw argument is not reused).
        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

        self.rng_ = check_random_state(self.random_state)

        # Nothing to subsample: the requested size covers the whole dataset.
        if self.train_size >= X.shape[0]:
            return (
                X,
                y,
                self._pack_params(
                    None, sample_domain=sample_domain, sample_weight=sample_weight
                ),
            )

        splitter = ShuffleSplit(
            n_splits=1, train_size=self.train_size, random_state=self.rng_
        )

        # Only the train indices of the single split are used.
        idx = next(splitter.split(X))[0]
        X_subsampled = X[idx]
        y_subsampled = y[idx] if y is not None else None
        params = self._pack_params(
            idx, sample_domain=sample_domain, sample_weight=sample_weight
        )
        return X_subsampled, y_subsampled, params

    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        """Transform the data (identity: subsampling only happens at fit time)."""
        return X


class DomainStratifiedSubsampleTransformer(BaseAdapter):
    """Transformer that subsamples the data in a domain stratified way.

    This transformer is useful to speed up computations when the data is too
    large. It randomly selects a subset of the data to work with during training
    but does not change the data during testing.

    .. note::
        This transformer should not be used as the last step of a pipeline
        because it returns non standard output.

    Parameters
    ----------
    train_size : int, float
        Number of samples to keep (keep all if data smaller) if integer, or
        proportion of train sample if float 0<= train_size <= 1.
    random_state : int, RandomState instance or None, default=None
        Controls the random resampling of the data.
    """

    def __init__(self, train_size, random_state=None):
        self.train_size = train_size
        self.random_state = random_state

    def _pack_params(self, idx, **params):
        """Index the optional fit params with ``idx``.

        Parameters that are ``None`` are dropped; when ``idx`` is ``None``
        the remaining parameters are passed through unchanged.
        """
        return {
            k: (v[idx] if idx is not None else v)
            for k, v in params.items()
            if v is not None
        }

    def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        """Fit and transform the data.

        Randomly keeps ``train_size`` samples of ``X`` (all of them if the
        data is already smaller), stratified over ``sample_domain`` so every
        domain keeps its proportion, and returns the subsampled ``X``, ``y``
        and a dict of routed params restricted to the same indices.
        """
        # Validate inputs and normalize sample_domain; the validated value is
        # used consistently below (the raw argument is not reused).
        X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

        self.rng_ = check_random_state(self.random_state)

        # Nothing to subsample: the requested size covers the whole dataset.
        if self.train_size >= X.shape[0]:
            return (
                X,
                y,
                self._pack_params(
                    None, sample_domain=sample_domain, sample_weight=sample_weight
                ),
            )

        # Stratify on the domain labels so each domain keeps its proportion.
        splitter = StratifiedShuffleSplit(
            n_splits=1, train_size=self.train_size, random_state=self.rng_
        )
        idx = next(splitter.split(X, sample_domain))[0]
        X_subsampled = X[idx]
        y_subsampled = y[idx] if y is not None else None
        params = self._pack_params(
            idx, sample_domain=sample_domain, sample_weight=sample_weight
        )
        return X_subsampled, y_subsampled, params

    def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
        """Transform the data (identity: subsampling only happens at fit time)."""
        return X
Loading