[python-package][sklearn] Support PyArrow Table as an input in scikit-learn methods (#6910)

StrikerRUS · web-flow · commit 72a39817c331 · 2025-05-26T21:07:46.000-05:00
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
@@ -269,6 +269,7 @@ def __init__(self, *args: Any, **kwargs: Any):
     from pyarrow import Array as pa_Array
     from pyarrow import ChunkedArray as pa_ChunkedArray
     from pyarrow import Table as pa_Table
+    from pyarrow import array as pa_array
     from pyarrow import chunked_array as pa_chunked_array
     from pyarrow.types import is_boolean as arrow_is_boolean
     from pyarrow.types import is_floating as arrow_is_floating
@@ -302,6 +303,7 @@ class pa_compute:  # type: ignore
         all = None
         equal = None
 
+    pa_array = None
     pa_chunked_array = None
     arrow_is_boolean = None
     arrow_is_integer = None
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
@@ -41,6 +41,7 @@
     _LGBMRegressorBase,
     _LGBMValidateData,
     _sklearn_version,
+    pa_Table,
     pd_DataFrame,
 )
 from .engine import train
@@ -60,6 +61,7 @@
     List[Union[List[float], List[int]]],
     np.ndarray,
     pd_DataFrame,
+    pa_Table,
     scipy.sparse.spmatrix,
 ]
 _LGBM_ScikitCustomObjectiveFunction = Union[
@@ -943,7 +945,7 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        if not isinstance(X, pd_DataFrame):
+        if not isinstance(X, (pd_DataFrame, pa_Table)):
             _X, _y = _LGBMValidateData(
                 self,
                 X,
@@ -1075,7 +1077,7 @@ def fit(
 
     fit.__doc__ = (
         _lgbmmodel_doc_fit.format(
-            X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+            X_shape="numpy array, pandas DataFrame, pyarrow Table, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
             y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples]",
             sample_weight_shape="numpy array, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples] or None, optional (default=None)",
             init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, list of lists, pyarrow Array, pyarrow ChunkedArray, pyarrow Table of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
@@ -1102,7 +1104,7 @@ def predict(
         """Docstring is set after definition, using a template."""
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
-        if not isinstance(X, pd_DataFrame):
+        if not isinstance(X, (pd_DataFrame, pa_Table)):
             X = _LGBMValidateData(
                 self,
                 X,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
@@ -25,7 +25,11 @@
 from lightgbm.compat import (
     DASK_INSTALLED,
     PANDAS_INSTALLED,
+    PYARROW_INSTALLED,
     _sklearn_version,
+    pa_array,
+    pa_chunked_array,
+    pa_Table,
     pd_DataFrame,
     pd_Series,
 )
@@ -54,6 +58,9 @@
     "regression": lgb.LGBMRegressor,
 }
 all_tasks = tuple(task_to_model_factory.keys())
+all_x_types = ("list2d", "numpy", "pd_DataFrame", "pa_Table", "scipy_csc", "scipy_csr")
+all_y_types = ("list1d", "numpy", "pd_Series", "pd_DataFrame", "pa_Array", "pa_ChunkedArray")
+all_group_types = ("list1d_float", "list1d_int", "numpy", "pd_Series", "pa_Array", "pa_ChunkedArray")
 
 
 def _create_data(task, n_samples=100, n_features=4):
@@ -1884,16 +1891,11 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
         assert preds.shape[0] == y.shape[0]
 
 
-@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
-@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
-def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
-    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
-        pytest.skip("pandas is not installed")
+def run_minimal_test(X_type, y_type, g_type, task, rng):
     X, y, g = _create_data(task, n_samples=2_000)
     weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
 
-    if task == "binary-classification" or task == "regression":
+    if task in {"binary-classification", "regression", "ranking"}:
         init_score = np.full_like(y, np.mean(y))
     elif task == "multiclass-classification":
         init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
@@ -1909,6 +1911,8 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
         X = scipy.sparse.csr_matrix(X)
     elif X_type == "pd_DataFrame":
         X = pd_DataFrame(X)
+    elif X_type == "pa_Table":
+        X = pa_Table.from_pandas(pd_DataFrame(X))
     elif X_type != "numpy":
         raise ValueError(f"Unrecognized X_type: '{X_type}'")
 
@@ -1932,19 +1936,50 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
             init_score = pd_DataFrame(init_score)
         else:
             init_score = pd_Series(init_score)
+    elif y_type == "pa_Array":
+        y = pa_array(y)
+        weights = pa_array(weights)
+        if task == "multiclass-classification":
+            init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
+        else:
+            init_score = pa_array(init_score)
+    elif y_type == "pa_ChunkedArray":
+        y = pa_chunked_array([y])
+        weights = pa_chunked_array([weights])
+        if task == "multiclass-classification":
+            init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
+        else:
+            init_score = pa_chunked_array([init_score])
     elif y_type != "numpy":
         raise ValueError(f"Unrecognized y_type: '{y_type}'")
 
+    if g_type == "list1d_float":
+        g = g.astype("float").tolist()
+    elif g_type == "list1d_int":
+        g = g.astype("int").tolist()
+    elif g_type == "pd_Series":
+        g = pd_Series(g)
+    elif g_type == "pa_Array":
+        g = pa_array(g)
+    elif g_type == "pa_ChunkedArray":
+        g = pa_chunked_array([g])
+    elif g_type != "numpy":
+        raise ValueError(f"Unrecognized g_type: '{g_type}'")
+
     model = task_to_model_factory[task](n_estimators=10, verbose=-1)
-    model.fit(
-        X=X,
-        y=y,
-        sample_weight=weights,
-        init_score=init_score,
-        eval_set=[(X_valid, y)],
-        eval_sample_weight=[weights],
-        eval_init_score=[init_score],
-    )
+    params_fit = {
+        "X": X,
+        "y": y,
+        "sample_weight": weights,
+        "init_score": init_score,
+        "eval_set": [(X_valid, y)],
+        "eval_sample_weight": [weights],
+        "eval_init_score": [init_score],
+    }
+    if task == "ranking":
+        params_fit["group"] = g
+        params_fit["eval_group"] = [g]
+    model.fit(**params_fit)
 
     preds = model.predict(X)
     if task == "binary-classification":
@@ -1953,72 +1988,44 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
         assert accuracy_score(y, preds) >= 0.99
     elif task == "regression":
         assert r2_score(y, preds) > 0.86
+    elif task == "ranking":
+        assert spearmanr(preds, y).correlation >= 0.99
     else:
         raise ValueError(f"Unrecognized task: '{task}'")
 
 
-@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
-@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
-@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
-def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
-    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
+@pytest.mark.parametrize("X_type", all_x_types)
+@pytest.mark.parametrize("y_type", all_y_types)
+@pytest.mark.parametrize("task", [t for t in all_tasks if t != "ranking"])
+def test_classification_and_regression_minimally_work_with_all_accepted_data_types(
+    X_type,
+    y_type,
+    task,
+    rng,
+):
+    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
         pytest.skip("pandas is not installed")
-    X, y, g = _create_data(task="ranking", n_samples=1_000)
-    weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
-    init_score = np.full_like(y, np.mean(y))
-    X_valid = X * 2
+    if any(t.startswith("pa_") for t in [X_type, y_type]) and not PYARROW_INSTALLED:
+        pytest.skip("pyarrow is not installed")
 
-    if X_type == "list2d":
-        X = X.tolist()
-    elif X_type == "scipy_csc":
-        X = scipy.sparse.csc_matrix(X)
-    elif X_type == "scipy_csr":
-        X = scipy.sparse.csr_matrix(X)
-    elif X_type == "pd_DataFrame":
-        X = pd_DataFrame(X)
-    elif X_type != "numpy":
-        raise ValueError(f"Unrecognized X_type: '{X_type}'")
+    run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng)
 
-    # make weights and init_score same types as y, just to avoid
-    # a huge number of combinations and therefore test cases
-    if y_type == "list1d":
-        y = y.tolist()
-        weights = weights.tolist()
-        init_score = init_score.tolist()
-    elif y_type == "pd_DataFrame":
-        y = pd_DataFrame(y)
-        weights = pd_Series(weights)
-        init_score = pd_Series(init_score)
-    elif y_type == "pd_Series":
-        y = pd_Series(y)
-        weights = pd_Series(weights)
-        init_score = pd_Series(init_score)
-    elif y_type != "numpy":
-        raise ValueError(f"Unrecognized y_type: '{y_type}'")
 
-    if g_type == "list1d_float":
-        g = g.astype("float").tolist()
-    elif g_type == "list1d_int":
-        g = g.astype("int").tolist()
-    elif g_type == "pd_Series":
-        g = pd_Series(g)
-    elif g_type != "numpy":
-        raise ValueError(f"Unrecognized g_type: '{g_type}'")
+@pytest.mark.parametrize("X_type", all_x_types)
+@pytest.mark.parametrize("y_type", all_y_types)
+@pytest.mark.parametrize("g_type", all_group_types)
+def test_ranking_minimally_works_with_all_accepted_data_types(
+    X_type,
+    y_type,
+    g_type,
+    rng,
+):
+    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
+        pytest.skip("pandas is not installed")
+    if any(t.startswith("pa_") for t in [X_type, y_type, g_type]) and not PYARROW_INSTALLED:
+        pytest.skip("pyarrow is not installed")
 
-    model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
-    model.fit(
-        X=X,
-        y=y,
-        sample_weight=weights,
-        init_score=init_score,
-        group=g,
-        eval_set=[(X_valid, y)],
-        eval_sample_weight=[weights],
-        eval_init_score=[init_score],
-        eval_group=[g],
-    )
-    preds = model.predict(X)
-    assert spearmanr(preds, y).correlation >= 0.99
+    run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng)
 
 
 def test_classifier_fit_detects_classes_every_time():