Fixed imputer for sparse matrices

eddiebergman · eddiebergman · commit 05675ad7d692 · 2021-11-04T17:51:23.000+01:00
diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py
@@ -29,23 +29,25 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
             y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
         import sklearn.impute
 
-        fill_value = None
         if hasattr(X, 'columns'):
             kind = X[X.columns[-1]].dtype.kind
         else:
             # Series, sparse and numpy have dtype
             # Only DataFrame does not
             kind = X.dtype.kind
 
-        if kind in ("i", "u", "f"):
+        number_kinds = ("i", "u", "f")
+        if kind in number_kinds:
             # We do not want to impute a category with the default
-            # value (0 is the default) in case such default is in the
-            # train data already!
-            if issparse(X):
-                # X.data doesn't return 0's
-                fill_value = min([*X.data, 0]) - 1
-            else:
-                fill_value = min(np.unique(X)) - 1
+            # value (0 is the default).
+            # Hence we take one less than the min of the observed values
+            unique = np.unique([*X.data, 0]) if issparse(X) else np.unique(X)
+            print(unique)
+            fill_value = min(unique) - 1
+        else:
+            fill_value = None
+
+        print(fill_value)
 
         self.preprocessor = sklearn.impute.SimpleImputer(
             strategy='constant', copy=False, fill_value=fill_value)
diff --git a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py
@@ -34,15 +34,16 @@ def test_default_imputation(input_data_imputation, categorical):
         X = X.astype('str').astype('object')
         X[mask] = np.nan
     else:
-        imputation_value = 0
+        imputation_value = min(np.unique(X)) - 1
+
     Y = CategoricalImputation().fit_transform(X.copy())
-    assert ((np.argwhere(Y == imputation_value) == np.argwhere(mask)).all())
-    assert ((np.argwhere(Y != imputation_value) == np.argwhere(np.logical_not(mask))).all())
+
+    assert np.array_equal(Y == imputation_value, mask)
+    assert np.array_equal(Y != imputation_value, ~mask)
 
 
 @pytest.mark.parametrize('format_type', ('numpy', 'pandas'))
 def test_nonzero_numerical_imputation(format_type):
-
     # First try with an array with 0 as only valid category. The imputation should
     # happen with -1
     X = np.full(fill_value=np.nan, shape=(10, 10))
@@ -69,8 +70,9 @@ def test_nonzero_numerical_imputation(format_type):
 @pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True)
 def test_default_sparse(input_data_imputation):
     X, mask = input_data_imputation
-    X = sparse.csc_matrix(X)
+    X = sparse.csr_matrix(X)
     Y = CategoricalImputation().fit_transform(X)
     Y = Y.todense()
-    assert (np.argwhere(Y == 0) == np.argwhere(mask)).all()
-    assert (np.argwhere(Y != 0) == np.argwhere(np.logical_not(mask))).all()
+
+    assert np.array_equal(Y == -1, mask)
+    assert np.array_equal(Y != -1, ~mask)