[MRG] Add DomainAndLabelStratifiedSubsampleTransformer + Fix DomainStratifiedSubsampleTransformer #268
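This PR adds a subsample transformer that stratifies jointly by domain and label, and extends the `transform()` signature of the existing subsample transformers so that `allow_source` is accepted and `predict(..., allow_source=True)` works through a pipeline. A minimal usage sketch, mirroring the tests added below; the synthetic data and variable names are illustrative assumptions, not part of the diff:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import DomainAndLabelStratifiedSubsampleTransformer

# Illustrative toy data (assumption): 20 labeled source samples (domain 1)
# and 20 unlabeled target samples (domain -2, labels masked as -1).
rng = np.random.default_rng(0)
X = rng.normal(size=(40, 2))
y = np.concatenate([np.tile([0, 1], 10), -np.ones(20, dtype=int)])
sample_domain = np.concatenate([np.ones(20, dtype=int), -2 * np.ones(20, dtype=int)])

# Keep 10 training points, stratified by (domain, label), then adapt with CORAL.
pipe = make_da_pipeline(
    StandardScaler(),
    DomainAndLabelStratifiedSubsampleTransformer(train_size=10, random_state=42),
    CORAL(),
)
pipe.fit(X, y, sample_domain=sample_domain)

# The transform() signature fix lets allow_source flow through the pipeline,
# so prediction on a mix of source and target samples works as in the tests.
ypred = pipe.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]
```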

Merged
2 changes: 1 addition & 1 deletion skada/__init__.py
@@ -61,7 +61,7 @@
OTLabelProp,
JCPOTLabelPropAdapter,
JCPOTLabelProp)
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer
from .transformers import SubsampleTransformer, DomainStratifiedSubsampleTransformer, DomainAndLabelStratifiedSubsampleTransformer
from ._self_labeling import DASVMClassifier
from ._pipeline import make_da_pipeline
from .utils import source_target_split, per_domain_split
51 changes: 50 additions & 1 deletion skada/tests/test_transformers.py
@@ -7,6 +7,7 @@

from skada import CORAL, make_da_pipeline
from skada.transformers import (
DomainAndLabelStratifiedSubsampleTransformer,
DomainStratifiedSubsampleTransformer,
SubsampleTransformer,
)
@@ -47,7 +48,9 @@ def test_SubsampleTransformer(da_dataset):

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]


def test_DomainStratifiedSubsampleTransformer(da_dataset):
@@ -89,4 +92,50 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset):

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]


def test_DomainAndLabelStratifiedSubsampleTransformer(da_dataset):
X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
sample_weight = np.ones_like(y)

train_size = 10

# test size of output on fit_transform
transformer = DomainAndLabelStratifiedSubsampleTransformer(
train_size=train_size, random_state=42
)

X_subsampled, y_subsampled, params = transformer.fit_transform(
X, y, sample_domain=sample_domain, sample_weight=sample_weight
)

assert X_subsampled.shape == (train_size, X.shape[1])
assert y_subsampled.shape[0] == train_size
assert params["sample_domain"].shape[0] == train_size
assert params["sample_weight"].shape[0] == train_size
# check stratification
assert sum(params["sample_domain"] == 1) == train_size // 2

# test size of output on transform
X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

X_target_subsampled = transformer.transform(
X_target, y_target, sample_domain=sample_domain_target
)

assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline including a final estimator
transformer = DomainAndLabelStratifiedSubsampleTransformer(train_size=train_size)
pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

pipeline.fit(X, y, sample_domain=sample_domain)

ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
assert ypred.shape[0] == X_target.shape[0]

ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
assert ypred.shape[0] == X.shape[0]
73 changes: 71 additions & 2 deletions skada/transformers.py
@@ -6,6 +6,7 @@
from sklearn.utils import check_random_state

from .base import BaseAdapter
from .model_selection import StratifiedDomainShuffleSplit
from .utils import check_X_y_domain


@@ -67,7 +68,9 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
)
return X_subsampled, y_subsampled, params

def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X

@@ -129,6 +132,72 @@ def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
)
return X_subsampled, y_subsampled, params

def transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X


class DomainAndLabelStratifiedSubsampleTransformer(BaseAdapter):
    """Transformer that subsamples the data in a domain- and label-stratified way.
This transformer is useful to speed up computations when the data is too
large. It randomly selects a subset of the data to work with during training
but does not change the data during testing.

.. note::
This transformer should not be used as the last step of a pipeline
        because it returns non-standard output.

Parameters
----------
    train_size : int or float
        Number of samples to keep (all samples are kept if the data has fewer)
        if an integer, or the proportion of training samples to keep if a
        float with 0 <= train_size <= 1.
random_state : int, RandomState instance or None, default=None
Controls the random resampling of the data.
"""

def __init__(self, train_size, random_state=None):
self.train_size = train_size
self.random_state = random_state

def _pack_params(self, idx, **params):
return {
k: (v[idx] if idx is not None else v)
for k, v in params.items()
if v is not None
}

def fit_transform(self, X, y=None, *, sample_domain=None, sample_weight=None):
"""Fit and transform the data."""
X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

self.rng_ = check_random_state(self.random_state)

if self.train_size >= X.shape[0]:
return (
X,
y,
self._pack_params(
None, sample_domain=sample_domain, sample_weight=sample_weight
),
)

splitter = StratifiedDomainShuffleSplit(
n_splits=1, train_size=self.train_size, random_state=self.rng_
)

train_idx, _ = next(splitter.split(X, y, sample_domain))
X_subsampled = X[train_idx]
y_subsampled = y[train_idx] if y is not None else None
params = self._pack_params(
train_idx, sample_domain=sample_domain, sample_weight=sample_weight
)
return X_subsampled, y_subsampled, params

def transform(
self, X, y=None, *, sample_domain=None, sample_weight=None, allow_source=None
):
"""Transform the data."""
return X
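For reference, a small sketch of the train/test asymmetry described in the docstring above: `fit_transform` returns a domain- and label-stratified subsample together with the routed params, while `transform` is a pass-through at prediction time. The toy arrays are assumptions for illustration.

```python
import numpy as np

from skada.transformers import DomainAndLabelStratifiedSubsampleTransformer

# Illustrative data (assumption): 20 source samples (domain 1, labels 0/1)
# and 20 target samples (domain -2, labels masked as -1).
rng = np.random.default_rng(42)
X = rng.normal(size=(40, 3))
y = np.concatenate([np.tile([0, 1], 10), -np.ones(20, dtype=int)])
sample_domain = np.concatenate([np.ones(20, dtype=int), -2 * np.ones(20, dtype=int)])

sub = DomainAndLabelStratifiedSubsampleTransformer(train_size=10, random_state=0)

# Training: a stratified subsample of 10 points; sample_domain (and
# sample_weight, when provided) is subsampled consistently via the params dict.
X_sub, y_sub, params = sub.fit_transform(X, y, sample_domain=sample_domain)
assert X_sub.shape[0] == 10
assert params["sample_domain"].shape[0] == 10

# Prediction: transform leaves the data untouched, so no samples are dropped.
assert sub.transform(X).shape[0] == X.shape[0]
```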