diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 550b4a8178..5662e54d6d 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -164,6 +164,40 @@ def fit( return self._fit(X, y) +class TrainableWithEvaluationPredictor(TrainablePredictor): + """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs. + + Additional evaluation data can be provided to measure the model in the fit phase.""" + + @abc.abstractmethod + def _fit(self, X, y, transforms=None, X_eval=None, y_eval=None): + pass + + @abc.abstractmethod + def score(self, X, y): + pass + + +class SupervisedTrainableWithEvaluationPredictor(TrainableWithEvaluationPredictor): + """A BigQuery DataFrames ML Supervised Model base class that can be used to fit and predict outputs. + + Need to provide both X and y in supervised tasks. + + Additional X_eval and y_eval can be provided to measure the model in the fit phase. + """ + + _T = TypeVar("_T", bound="SupervisedTrainableWithEvaluationPredictor") + + def fit( + self: _T, + X: utils.ArrayType, + y: utils.ArrayType, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, + ) -> _T: + return self._fit(X, y, X_eval=X_eval, y_eval=y_eval) + + class UnsupervisedTrainablePredictor(TrainablePredictor): """A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs. 
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 91c14e4336..253ef7c5c1 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -52,7 +52,7 @@ @log_adapter.class_logger class XGBRegressor( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.xgboost.sklearn.XGBRegressor, ): __doc__ = bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ @@ -145,14 +145,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -200,7 +210,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: @log_adapter.class_logger class XGBClassifier( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.xgboost.sklearn.XGBClassifier, ): @@ -294,14 +304,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> XGBClassifier: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - 
options=self._bqml_options, + options=bqml_options, ) return self @@ -347,7 +367,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: @log_adapter.class_logger class RandomForestRegressor( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, ): @@ -430,14 +450,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -503,7 +533,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso @log_adapter.class_logger class RandomForestClassifier( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, ): @@ -586,14 +616,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - 
options=self._bqml_options, + options=bqml_options, ) return self diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 5665507286..85be54e596 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -47,7 +47,7 @@ @log_adapter.class_logger class LinearRegression( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.linear_model._base.LinearRegression, ): __doc__ = bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ @@ -131,14 +131,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -183,7 +193,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: @log_adapter.class_logger class LogisticRegression( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, ): __doc__ = ( @@ -283,15 +293,24 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> LogisticRegression: - """Fit model with transforms.""" X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + 
X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index bdca45e457..8daed169da 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,13 +13,13 @@ # limitations under the License. import typing -from typing import Any, Generator, Literal, Mapping, Optional, Union +from typing import Any, Generator, Literal, Mapping, Optional, Tuple, Union import bigframes_vendored.constants as constants from google.cloud import bigquery import pandas as pd -from bigframes.core import blocks +from bigframes.core import blocks, guid import bigframes.pandas as bpd from bigframes.session import Session @@ -155,3 +155,37 @@ def retrieve_params_from_bq_model( kwargs[bf_param] = bf_param_type(last_fitting[bqml_param]) return kwargs + + +def combine_training_and_evaluation_data( + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + X_eval: bpd.DataFrame, + y_eval: bpd.DataFrame, + bqml_options: dict, +) -> Tuple[bpd.DataFrame, bpd.DataFrame, dict]: + """ + Combine training data and labels with evaluation data and labels, and keep + them differentiated through a split column in the combined data and labels. + """ + + assert X_train.columns.equals(X_eval.columns) + assert y_train.columns.equals(y_eval.columns) + + # create a custom split column for BQML and supply the evaluation + # data along with the training data in a combined single table + # https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models#data_split_col. 
+ split_col = guid.generate_guid() + assert split_col not in X_train.columns + + X_train[split_col] = False + X_eval[split_col] = True + X = bpd.concat([X_train, X_eval]) + y = bpd.concat([y_train, y_eval]) + + # create options copy to not mutate the incoming one + bqml_options = bqml_options.copy() + bqml_options["data_split_method"] = "CUSTOM" + bqml_options["data_split_col"] = split_col + + return X, y, bqml_options diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 273da97bc5..f6ca26e7e4 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd + +from bigframes.ml import model_selection import bigframes.ml.linear_model from tests.system import utils @@ -58,6 +61,85 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.tol == 0.01 +def test_linear_regression_configure_fit_with_eval_score( + penguins_df_default_index, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() + + df = penguins_df_default_index.dropna() + X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y = df[["body_mass_g"]] + + X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y) + + model.fit(X_train, y_train, X_eval=X_eval, y_eval=y_eval) + + # Check score to ensure the model was fitted + result = model.score(X_eval, y_eval).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + bq_model_name = f"{dataset_id}.temp_configured_model" + reloaded_model = model.to_gbq(bq_model_name, replace=True) + assert reloaded_model._bqml_model is not None + assert ( + 
f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 + + # make sure the bqml model was internally created with custom split + bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name) + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + assert last_fitting["dataSplitMethod"] == "CUSTOM" + assert "dataSplitColumn" in last_fitting + + # make sure the bqml model has the same evaluation metrics attached as + # returned by model.score() + bq_model_expected_eval_metrics = result[utils.ML_REGRESSION_METRICS[:5]] + bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ + "regressionMetrics" + ] + bq_model_eval_metrics = pd.DataFrame( + [ + [ + bq_model_eval_metrics["meanAbsoluteError"], + bq_model_eval_metrics["meanSquaredError"], + bq_model_eval_metrics["meanSquaredLogError"], + bq_model_eval_metrics["medianAbsoluteError"], + bq_model_eval_metrics["rSquared"], + ] + ], + columns=utils.ML_REGRESSION_METRICS[:5], + ) + pd.testing.assert_frame_equal( + bq_model_expected_eval_metrics, + bq_model_eval_metrics, + check_dtype=False, + check_index_type=False, + ) + + def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): @@ -216,6 +298,80 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data assert reloaded_model.class_weight is None +def test_logistic_regression_configure_fit_with_eval_score( + 
penguins_df_default_index, dataset_id +): + model = bigframes.ml.linear_model.LogisticRegression() + + df = penguins_df_default_index.dropna() + df = df[df["sex"].isin(["MALE", "FEMALE"])] + + X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + y = df[["sex"]] + + X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y) + + model.fit(X_train, y_train, X_eval=X_eval, y_eval=y_eval) + + # Check score to ensure the model was fitted + result = model.score(X_eval, y_eval).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + bq_model_name = f"{dataset_id}.temp_configured_logistic_reg_model" + reloaded_model = model.to_gbq(bq_model_name, replace=True) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_logistic_reg_model" + in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.fit_intercept is True + assert reloaded_model.class_weight is None + + # make sure the bqml model was internally created with custom split + bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name) + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + assert last_fitting["dataSplitMethod"] == "CUSTOM" + assert "dataSplitColumn" in last_fitting + + # make sure the bqml model has the same evaluation metrics attached as + # returned by model.score() + bq_model_expected_eval_metrics = result + bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ + "binaryClassificationMetrics" + ]["aggregateClassificationMetrics"] + bq_model_eval_metrics = pd.DataFrame( + [ + [ + bq_model_eval_metrics["precision"], + bq_model_eval_metrics["recall"], + bq_model_eval_metrics["accuracy"], + bq_model_eval_metrics["f1Score"], + bq_model_eval_metrics["logLoss"], + 
bq_model_eval_metrics["rocAuc"], + ] + ], + columns=utils.ML_CLASSFICATION_METRICS, + ) + pd.testing.assert_frame_equal( + bq_model_expected_eval_metrics, + bq_model_eval_metrics, + check_dtype=False, + check_index_type=False, + ) + + def test_logistic_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 1f6284c146..fb81bd6684 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -54,6 +54,13 @@ def fit(self, X, y): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. + + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: ForestModel: Fitted estimator. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index fa8f28a656..d6b8a473bd 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -108,6 +108,13 @@ def fit( Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. 
+ + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: LinearRegression: Fitted estimator. """ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index f3419ba8a9..479be19596 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -79,6 +79,14 @@ def fit( y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation vector, + where `n_samples` is the number of samples and `n_features` is + the number of features. + + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + DataFrame of shape (n_samples,). Target vector relative to X_eval. + Returns: LogisticRegression: Fitted estimator. diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index da1396af02..60a22e83d0 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -37,6 +37,13 @@ def fit(self, X, y): DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. 
+ + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: XGBModel: Fitted estimator. """