#
# License: BSD 3-Clause
from collections import Counter

import numpy as np
from sklearn.preprocessing import StandardScaler

from skada import CORAL, make_da_pipeline
from skada.transformers import (
    DomainSubsampler,
    StratifiedDomainSubsampler,
    Subsampler,
)
13
16
14
17
15
- def test_SubsampleTransformer (da_dataset ):
18
+ def test_Subsampler (da_dataset ):
16
19
X , y , sample_domain = da_dataset .pack_train (as_sources = ["s" ], as_targets = ["t" ])
17
20
sample_weight = np .ones_like (y )
18
21
19
22
train_size = 10
20
23
21
24
# test size of output on fit_transform
22
- transformer = SubsampleTransformer (train_size = train_size , random_state = 42 )
25
+ transformer = Subsampler (train_size = train_size , random_state = 42 )
23
26
24
27
X_subsampled , y_subsampled , params = transformer .fit_transform (
25
28
X , y , sample_domain = sample_domain , sample_weight = sample_weight
@@ -40,26 +43,26 @@ def test_SubsampleTransformer(da_dataset):
40
43
assert X_target_subsampled .shape [0 ] == X_target .shape [0 ]
41
44
42
45
# now with a pipeline with end task
43
- transformer = SubsampleTransformer (train_size = train_size )
46
+ transformer = Subsampler (train_size = train_size )
44
47
pipeline = make_da_pipeline (StandardScaler (), transformer , CORAL ())
45
48
46
49
pipeline .fit (X , y , sample_domain = sample_domain )
47
50
48
51
ypred = pipeline .predict (X_target , sample_domain = sample_domain_target )
49
52
assert ypred .shape [0 ] == X_target .shape [0 ]
50
- assert ypred .shape [0 ] == X_target .shape [0 ]
53
+
54
+ ypred = pipeline .predict (X , sample_domain = sample_domain , allow_source = True )
55
+ assert ypred .shape [0 ] == X .shape [0 ]
51
56
52
57
53
- def test_DomainStratifiedSubsampleTransformer (da_dataset ):
58
+ def test_DomainSubsampler (da_dataset ):
54
59
X , y , sample_domain = da_dataset .pack_train (as_sources = ["s" ], as_targets = ["t" ])
55
60
sample_weight = np .ones_like (y )
56
61
57
62
train_size = 10
58
63
59
64
# test size of output on fit_transform
60
- transformer = DomainStratifiedSubsampleTransformer (
61
- train_size = train_size , random_state = 42
62
- )
65
+ transformer = DomainSubsampler (train_size = train_size , random_state = 42 )
63
66
64
67
X_subsampled , y_subsampled , params = transformer .fit_transform (
65
68
X , y , sample_domain = sample_domain , sample_weight = sample_weight
@@ -82,11 +85,64 @@ def test_DomainStratifiedSubsampleTransformer(da_dataset):
82
85
assert X_target_subsampled .shape [0 ] == X_target .shape [0 ]
83
86
84
87
# now with a pipeline with end task
85
- transformer = DomainStratifiedSubsampleTransformer (train_size = train_size )
88
+ transformer = DomainSubsampler (train_size = train_size )
86
89
pipeline = make_da_pipeline (StandardScaler (), transformer , CORAL ())
87
90
88
91
pipeline .fit (X , y , sample_domain = sample_domain )
89
92
90
93
ypred = pipeline .predict (X_target , sample_domain = sample_domain_target )
91
94
assert ypred .shape [0 ] == X_target .shape [0 ]
95
+
96
+ ypred = pipeline .predict (X , sample_domain = sample_domain , allow_source = True )
97
+ assert ypred .shape [0 ] == X .shape [0 ]
98
+
99
+
100
def test_StratifiedDomainSubsampler(da_dataset):
    """Check StratifiedDomainSubsampler output sizes, stratification, and pipeline use.

    Verifies that:
    - ``fit_transform`` subsamples X, y, and the routed params down to
      ``train_size`` rows;
    - the joint (domain, label) class proportions of the subsample stay
      close to the original proportions (stratification);
    - ``transform`` on test data is a pass-through (no subsampling);
    - the transformer composes with ``make_da_pipeline`` for both target-only
      and source-included (``allow_source=True``) prediction.
    """
    X, y, sample_domain = da_dataset.pack_train(as_sources=["s"], as_targets=["t"])
    sample_weight = np.ones_like(y)

    train_size = 10

    # test size of output on fit_transform
    transformer = StratifiedDomainSubsampler(train_size=train_size, random_state=42)

    X_subsampled, y_subsampled, params = transformer.fit_transform(
        X, y, sample_domain=sample_domain, sample_weight=sample_weight
    )

    assert X_subsampled.shape == (train_size, X.shape[1])
    assert y_subsampled.shape[0] == train_size
    assert params["sample_domain"].shape[0] == train_size
    assert params["sample_weight"].shape[0] == train_size

    # Check stratification proportions: each (domain, label) pair should keep
    # roughly its original share of the data after subsampling.
    original_freq = Counter(zip(sample_domain, y))
    subsampled_freq = Counter(zip(params["sample_domain"], y_subsampled))

    for key in original_freq:
        original_ratio = original_freq[key] / len(y)
        subsampled_ratio = subsampled_freq[key] / train_size
        assert np.isclose(
            original_ratio, subsampled_ratio, atol=0.1
        ), f"Stratification not preserved for {key}"

    # test size of output on transform (should not subsample at predict time)
    X_target, y_target, sample_domain_target = da_dataset.pack_test(as_targets=["t"])

    X_target_subsampled = transformer.transform(
        X_target, y_target, sample_domain=sample_domain_target
    )

    assert X_target_subsampled.shape[0] == X_target.shape[0]

    # now with a pipeline with end task
    transformer = StratifiedDomainSubsampler(train_size=train_size)
    pipeline = make_da_pipeline(StandardScaler(), transformer, CORAL())

    pipeline.fit(X, y, sample_domain=sample_domain)

    ypred = pipeline.predict(X_target, sample_domain=sample_domain_target)
    assert ypred.shape[0] == X_target.shape[0]

    ypred = pipeline.predict(X, sample_domain=sample_domain, allow_source=True)
    assert ypred.shape[0] == X.shape[0]
0 commit comments