Skip to content

Commit ba8206d

Browse files
authored
Update sklearnex estimators to support sklearn 1.5 (#1794)
* Update TSNE and PCA to support sklearn 1.5 pre-release * Update PCA solver branching * Add PCA parameter in test_n_jobs_support.py * Fix n_iter check in TSNE for sklearn<1.2 * Apply isort * Move to sklearnex.utils.get_namespace * Fix PCA.fit_transform * Update PCA solver selection * Update PCA algorithm doc * Fix solver name in tests * Fix svd solver constraint * Fix PCA solver constraint * Fix check_feature_names warning and deselect RF feature importance tests * Change KMeans estimator doc strings * Update solver selection * Update solver selection * Revert test de-deselection
1 parent 8c7a928 commit ba8206d

File tree

9 files changed

+132
-175
lines changed

9 files changed

+132
-175
lines changed

daal4py/sklearn/cluster/k_means.py

Lines changed: 15 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -258,23 +258,6 @@ def is_string(s, target_str):
258258

259259

260260
def _fit(self, X, y=None, sample_weight=None):
261-
"""Compute k-means clustering.
262-
263-
Parameters
264-
----------
265-
X : array-like or sparse matrix, shape=(n_samples, n_features)
266-
Training instances to cluster. It must be noted that the data
267-
will be converted to C ordering, which will cause a memory
268-
copy if the given data is not C-contiguous.
269-
270-
y : Ignored
271-
not used, present here for API consistency by convention.
272-
273-
sample_weight : array-like, shape (n_samples,), optional
274-
The weights for each observation in X. If None, all observations
275-
are assigned equal weight (default: None)
276-
277-
"""
278261
init = self.init
279262
if sklearn_check_version("1.1"):
280263
if sklearn_check_version("1.2"):
@@ -447,26 +430,6 @@ def _daal4py_check_test_data(self, X):
447430

448431

449432
def _predict(self, X, sample_weight=None):
450-
"""Predict the closest cluster each sample in X belongs to.
451-
452-
In the vector quantization literature, `cluster_centers_` is called
453-
the code book and each value returned by `predict` is the index of
454-
the closest code in the code book.
455-
456-
Parameters
457-
----------
458-
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
459-
New data to predict.
460-
461-
sample_weight : array-like, shape (n_samples,), optional
462-
The weights for each observation in X. If None, all observations
463-
are assigned equal weight (default: None)
464-
465-
Returns
466-
-------
467-
labels : array, shape [n_samples,]
468-
Index of the cluster each sample belongs to.
469-
"""
470433
check_is_fitted(self)
471434

472435
X = _daal4py_check_test_data(self, X)
@@ -614,86 +577,29 @@ def __init__(
614577

615578
@support_usm_ndarray()
616579
def fit(self, X, y=None, sample_weight=None):
617-
"""
618-
Compute k-means clustering.
619-
620-
Parameters
621-
----------
622-
X : {array-like, sparse matrix} of shape (n_samples, n_features)
623-
Training instances to cluster. It must be noted that the data
624-
will be converted to C ordering, which will cause a memory
625-
copy if the given data is not C-contiguous.
626-
If a sparse matrix is passed, a copy will be made if it's not in
627-
CSR format.
628-
629-
y : Ignored
630-
Not used, present here for API consistency by convention.
631-
632-
sample_weight : array-like of shape (n_samples,), default=None
633-
The weights for each observation in X. If None, all observations
634-
are assigned equal weight.
635-
636-
.. versionadded:: 0.20
637-
638-
Returns
639-
-------
640-
self : object
641-
Fitted estimator.
642-
"""
643580
return _fit(self, X, y=y, sample_weight=sample_weight)
644581

645-
@support_usm_ndarray()
646-
def predict(
647-
self, X, sample_weight="deprecated" if sklearn_check_version("1.3") else None
648-
):
649-
"""
650-
Predict the closest cluster each sample in X belongs to.
582+
if sklearn_check_version("1.5"):
651583

652-
In the vector quantization literature, `cluster_centers_` is called
653-
the code book and each value returned by `predict` is the index of
654-
the closest code in the code book.
584+
@support_usm_ndarray()
585+
def predict(self, X):
586+
return _predict(self, X)
655587

656-
Parameters
657-
----------
658-
X : {array-like, sparse matrix} of shape (n_samples, n_features)
659-
New data to predict.
660-
661-
sample_weight : array-like of shape (n_samples,), default=None
662-
The weights for each observation in X. If None, all observations
663-
are assigned equal weight.
588+
else:
664589

665-
Returns
666-
-------
667-
labels : ndarray of shape (n_samples,)
668-
Index of the cluster each sample belongs to.
669-
"""
670-
return _predict(self, X, sample_weight=sample_weight)
590+
@support_usm_ndarray()
591+
def predict(
592+
self, X, sample_weight="deprecated" if sklearn_check_version("1.3") else None
593+
):
594+
return _predict(self, X, sample_weight=sample_weight)
671595

672596
@support_usm_ndarray()
673597
def fit_predict(self, X, y=None, sample_weight=None):
674-
"""
675-
Compute cluster centers and predict cluster index for each sample.
676-
677-
Convenience method; equivalent to calling fit(X) followed by
678-
predict(X).
679-
680-
Parameters
681-
----------
682-
X : {array-like, sparse matrix} of shape (n_samples, n_features)
683-
New data to transform.
684-
685-
y : Ignored
686-
Not used, present here for API consistency by convention.
687-
688-
sample_weight : array-like of shape (n_samples,), default=None
689-
The weights for each observation in X. If None, all observations
690-
are assigned equal weight.
691-
692-
Returns
693-
-------
694-
labels : ndarray of shape (n_samples,)
695-
Index of the cluster each sample belongs to.
696-
"""
697598
return super().fit_predict(X, y, sample_weight)
698599

699600
score = support_usm_ndarray()(KMeans_original.score)
601+
602+
fit.__doc__ = KMeans_original.fit.__doc__
603+
predict.__doc__ = KMeans_original.predict.__doc__
604+
fit_predict.__doc__ = KMeans_original.fit_predict.__doc__
605+
score.__doc__ = KMeans_original.score.__doc__

daal4py/sklearn/manifold/_t_sne.py

Lines changed: 27 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -44,52 +44,15 @@
4444
class TSNE(BaseTSNE):
4545
__doc__ = BaseTSNE.__doc__
4646

47+
if sklearn_check_version("1.2"):
48+
_parameter_constraints: dict = {**BaseTSNE._parameter_constraints}
49+
4750
@support_usm_ndarray()
4851
def fit_transform(self, X, y=None):
49-
"""
50-
Fit X into an embedded space and return that transformed output.
51-
52-
Parameters
53-
----------
54-
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
55-
If the metric is 'precomputed' X must be a square distance
56-
matrix. Otherwise it contains a sample per row. If the method
57-
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
58-
or 'coo'. If the method is 'barnes_hut' and the metric is
59-
'precomputed', X may be a precomputed sparse graph.
60-
61-
y : None
62-
Ignored.
63-
64-
Returns
65-
-------
66-
X_new : ndarray of shape (n_samples, n_components)
67-
Embedding of the training data in low-dimensional space.
68-
"""
6952
return super().fit_transform(X, y)
7053

7154
@support_usm_ndarray()
7255
def fit(self, X, y=None):
73-
"""
74-
Fit X into an embedded space.
75-
76-
Parameters
77-
----------
78-
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
79-
If the metric is 'precomputed' X must be a square distance
80-
matrix. Otherwise it contains a sample per row. If the method
81-
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
82-
or 'coo'. If the method is 'barnes_hut' and the metric is
83-
'precomputed', X may be a precomputed sparse graph.
84-
85-
y : None
86-
Ignored.
87-
88-
Returns
89-
-------
90-
X_new : array of shape (n_samples, n_components)
91-
Embedding of the training data in low-dimensional space.
92-
"""
9356
return super().fit(X, y)
9457

9558
def _daal_tsne(self, P, n_samples, X_embedded):
@@ -101,11 +64,27 @@ def _daal_tsne(self, P, n_samples, X_embedded):
10164
# * final optimization with momentum at 0.8
10265

10366
# N, nnz, n_iter_without_progress, n_iter
104-
size_iter = [[n_samples], [P.nnz], [self.n_iter_without_progress], [self.n_iter]]
67+
size_iter = [
68+
[n_samples],
69+
[P.nnz],
70+
[self.n_iter_without_progress],
71+
[self._max_iter if sklearn_check_version("1.5") else self.n_iter],
72+
]
10573

10674
# Pass params to daal4py backend
10775
if daal_check_version((2023, "P", 1)):
108-
size_iter.extend([[self._EXPLORATION_N_ITER], [self._N_ITER_CHECK]])
76+
size_iter.extend(
77+
[
78+
[
79+
(
80+
self._EXPLORATION_MAX_ITER
81+
if sklearn_check_version("1.5")
82+
else self._EXPLORATION_N_ITER
83+
)
84+
],
85+
[self._N_ITER_CHECK],
86+
]
87+
)
10988

11089
size_iter = np.array(size_iter, dtype=P.dtype)
11190

@@ -255,8 +234,9 @@ def _fit(self, X, skip_num_points=0):
255234
)
256235
)
257236

258-
if self.n_iter < 250:
259-
raise ValueError("n_iter should be at least 250")
237+
if not sklearn_check_version("1.2"):
238+
if self.n_iter < 250:
239+
raise ValueError("n_iter should be at least 250")
260240

261241
n_samples = X.shape[0]
262242

@@ -423,3 +403,6 @@ def _fit(self, X, skip_num_points=0):
423403
neighbors=neighbors_nn,
424404
skip_num_points=skip_num_points,
425405
)
406+
407+
fit.__doc__ = BaseTSNE.fit.__doc__
408+
fit_transform.__doc__ = BaseTSNE.fit_transform.__doc__

deselected_tests.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ deselected_tests:
6060
- inspection/tests/test_partial_dependence.py::test_partial_dependence_easy_target[2-est2] >=0.23 darwin
6161
- inspection/tests/test_partial_dependence.py::test_partial_dependence_easy_target[2-est3] >=0.23 darwin
6262

63+
# Sklearnex RandomForestClassifier RNG is different from scikit-learn and daal4py
64+
# resulting in different feature importances for a small number of trees (10).
65+
# Issue disappears with a larger number of trees (>=20)
66+
- inspection/tests/test_permutation_importance.py::test_permutation_importance_correlated_feature_regression_pandas[0.5-1]
67+
- inspection/tests/test_permutation_importance.py::test_permutation_importance_correlated_feature_regression_pandas[0.5-2]
68+
- inspection/tests/test_permutation_importance.py::test_permutation_importance_correlated_feature_regression_pandas[1.0-1]
69+
- inspection/tests/test_permutation_importance.py::test_permutation_importance_correlated_feature_regression_pandas[1.0-2]
70+
6371
# Random forest classifier selects a different most-important feature
6472
# Feature importances:
6573
# scikit-learn-intelex [0. 0.00553064 0.71323666 0.2812327 ]

doc/sources/algorithms.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ Dimensionality reduction
157157
* - `PCA`
158158
- All parameters are supported except:
159159

160-
- ``svd_solver`` != `'full'`
160+
- ``svd_solver`` not in [`'full'`, `'covariance_eigh'`]
161161
- Sparse data is not supported
162162
* - `TSNE`
163163
- All parameters are supported except:
@@ -340,7 +340,7 @@ Dimensionality reduction
340340
* - `PCA`
341341
- All parameters are supported except:
342342

343-
- ``svd_solver`` != `'full'`
343+
- ``svd_solver`` not in [`'full'`, `'covariance_eigh'`]
344344
- Sparse data is not supported
345345

346346
Nearest Neighbors

0 commit comments

Comments
 (0)