Revert "Fixed imputer for sparse matrices"

eddiebergman · eddiebergman · commit d031b0d0fb3a · 2021-11-05T14:11:17.000+01:00
This reverts commit 05675ad.
diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py
@@ -29,25 +29,23 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
             y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
         import sklearn.impute
 
+        fill_value = None
         if hasattr(X, 'columns'):
             kind = X[X.columns[-1]].dtype.kind
         else:
             # Series, sparse and numpy have dtype
             # Only DataFrame does not
             kind = X.dtype.kind
 
-        number_kinds = ("i", "u", "f")
-        if kind in number_kinds:
+        if kind in ("i", "u", "f"):
             # We do not want to impute a category with the default
-            # value (0 is the default).
-            # Hence we take one greater than the max
-            unique = np.unique([*X.data, 0]) if issparse(X) else np.unique(X)
-            print(unique)
-            fill_value = min(unique) - 1
-        else:
-            fill_value = None
-
-        print(fill_value)
+            # value (0 is the default), because 0 may already be a valid
+            # category in the training data; use min - 1 instead.
+            if issparse(X):
+                # X.data omits implicit zeros, so add 0 explicitly
+                fill_value = min([*X.data, 0]) - 1
+            else:
+                fill_value = min(np.unique(X)) - 1
 
         self.preprocessor = sklearn.impute.SimpleImputer(
             strategy='constant', copy=False, fill_value=fill_value)
diff --git a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py
@@ -34,16 +34,15 @@ def test_default_imputation(input_data_imputation, categorical):
         X = X.astype('str').astype('object')
         X[mask] = np.nan
     else:
-        imputation_value = min(np.unique(X)) - 1
-
+        imputation_value = 0
     Y = CategoricalImputation().fit_transform(X.copy())
-
-    assert np.array_equal(Y == imputation_value, mask)
-    assert np.array_equal(Y != imputation_value, ~mask)
+    assert ((np.argwhere(Y == imputation_value) == np.argwhere(mask)).all())
+    assert ((np.argwhere(Y != imputation_value) == np.argwhere(np.logical_not(mask))).all())
 
 
 @pytest.mark.parametrize('format_type', ('numpy', 'pandas'))
 def test_nonzero_numerical_imputation(format_type):
+
     # First try with an array with 0 as only valid category. The imputation should
     # happen with -1
     X = np.full(fill_value=np.nan, shape=(10, 10))
@@ -70,9 +69,8 @@ def test_nonzero_numerical_imputation(format_type):
 @pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True)
 def test_default_sparse(input_data_imputation):
     X, mask = input_data_imputation
-    X = sparse.csr_matrix(X)
+    X = sparse.csc_matrix(X)
     Y = CategoricalImputation().fit_transform(X)
     Y = Y.todense()
-
-    assert np.array_equal(Y == -1, mask)
-    assert np.array_equal(Y != -1, ~mask)
+    assert (np.argwhere(Y == 0) == np.argwhere(mask)).all()
+    assert (np.argwhere(Y != 0) == np.argwhere(np.logical_not(mask))).all()