Skip to content

Commit 05675ad

Browse files
committed
Fixed imputer for sparse matrices
1 parent be80a2b commit 05675ad

File tree

2 files changed

+20
-16
lines changed

2 files changed

+20
-16
lines changed

autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -29,23 +29,25 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
2929
y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
3030
import sklearn.impute
3131

32-
fill_value = None
3332
if hasattr(X, 'columns'):
3433
kind = X[X.columns[-1]].dtype.kind
3534
else:
3635
# Series, sparse and numpy have dtype
3736
# Only DataFrame does not
3837
kind = X.dtype.kind
3938

40-
if kind in ("i", "u", "f"):
39+
number_kinds = ("i", "u", "f")
40+
if kind in number_kinds:
4141
# We do not want to impute a category with the default
42-
# value (0 is the default) in case such default is in the
43-
# train data already!
44-
if issparse(X):
45-
# X.data doesn't return 0's
46-
fill_value = min([*X.data, 0]) - 1
47-
else:
48-
fill_value = min(np.unique(X)) - 1
42+
# value (0 is the default).
43+
# Hence we take one less than the min
44+
unique = np.unique([*X.data, 0]) if issparse(X) else np.unique(X)
45+
print(unique)
46+
fill_value = min(unique) - 1
47+
else:
48+
fill_value = None
49+
50+
print(fill_value)
4951

5052
self.preprocessor = sklearn.impute.SimpleImputer(
5153
strategy='constant', copy=False, fill_value=fill_value)

test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,16 @@ def test_default_imputation(input_data_imputation, categorical):
3434
X = X.astype('str').astype('object')
3535
X[mask] = np.nan
3636
else:
37-
imputation_value = 0
37+
imputation_value = min(np.unique(X)) - 1
38+
3839
Y = CategoricalImputation().fit_transform(X.copy())
39-
assert ((np.argwhere(Y == imputation_value) == np.argwhere(mask)).all())
40-
assert ((np.argwhere(Y != imputation_value) == np.argwhere(np.logical_not(mask))).all())
40+
41+
assert np.array_equal(Y == imputation_value, mask)
42+
assert np.array_equal(Y != imputation_value, ~mask)
4143

4244

4345
@pytest.mark.parametrize('format_type', ('numpy', 'pandas'))
4446
def test_nonzero_numerical_imputation(format_type):
45-
4647
# First try with an array with 0 as only valid category. The imputation should
4748
# happen with -1
4849
X = np.full(fill_value=np.nan, shape=(10, 10))
@@ -69,8 +70,9 @@ def test_nonzero_numerical_imputation(format_type):
6970
@pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True)
7071
def test_default_sparse(input_data_imputation):
7172
X, mask = input_data_imputation
72-
X = sparse.csc_matrix(X)
73+
X = sparse.csr_matrix(X)
7374
Y = CategoricalImputation().fit_transform(X)
7475
Y = Y.todense()
75-
assert (np.argwhere(Y == 0) == np.argwhere(mask)).all()
76-
assert (np.argwhere(Y != 0) == np.argwhere(np.logical_not(mask))).all()
76+
77+
assert np.array_equal(Y == -1, mask)
78+
assert np.array_equal(Y != -1, ~mask)

0 commit comments

Comments
 (0)