Skip to content

feat: allow fit to take additional eval data in linear and ensemble models #1096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion bigframes/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"""

import abc
from typing import cast, Optional, TypeVar
from typing import cast, Optional, TypeVar, Union

import bigframes_vendored.sklearn.base

Expand Down Expand Up @@ -164,6 +164,40 @@ def fit(
return self._fit(X, y)


class TrainableWithEvaluationPredictor(TrainablePredictor):
    """Base class for BigQuery DataFrames ML models that can fit and predict.

    In addition to the training inputs, extra evaluation data can be supplied
    so that the model is measured during the fit phase.
    """

    # Training hook implemented by concrete models. X_eval / y_eval carry the
    # optional evaluation data and default to None, as does transforms.
    @abc.abstractmethod
    def _fit(self, X, y, transforms=None, X_eval=None, y_eval=None):
        ...

    # Scoring hook implemented by concrete models.
    @abc.abstractmethod
    def score(self, X, y):
        ...


class SupervisedTrainableWithEvaluationPredictor(TrainableWithEvaluationPredictor):
    """Base class for supervised BigQuery DataFrames ML models that can fit and predict.

    Supervised tasks require both X and y.

    X_eval and y_eval may additionally be passed so that the model is
    measured against them during the fit phase.
    """

    # Self-type used so that fit() is annotated as returning the concrete
    # subclass it was called on.
    _T = TypeVar("_T", bound="SupervisedTrainableWithEvaluationPredictor")

    def fit(
        self: _T,
        X: Union[bpd.DataFrame, bpd.Series],
        y: Union[bpd.DataFrame, bpd.Series],
        X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
        y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
    ) -> _T:
        # Hand everything to the subclass-provided _fit. The evaluation data
        # is forwarded by keyword and `transforms` keeps its default.
        fitted = self._fit(X, y, X_eval=X_eval, y_eval=y_eval)
        return fitted


class UnsupervisedTrainablePredictor(TrainablePredictor):
"""A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs.

Expand Down
58 changes: 49 additions & 9 deletions bigframes/ml/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from __future__ import annotations

from typing import Dict, List, Literal, Optional
from typing import Dict, List, Literal, Optional, Union

import bigframes_vendored.sklearn.ensemble._forest
import bigframes_vendored.xgboost.sklearn
Expand Down Expand Up @@ -52,7 +52,7 @@

@log_adapter.class_logger
class XGBRegressor(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.xgboost.sklearn.XGBRegressor,
):
__doc__ = bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__
Expand Down Expand Up @@ -145,14 +145,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> XGBRegressor:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -200,7 +210,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor:

@log_adapter.class_logger
class XGBClassifier(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.xgboost.sklearn.XGBClassifier,
):

Expand Down Expand Up @@ -294,14 +304,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> XGBClassifier:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -347,7 +367,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier:

@log_adapter.class_logger
class RandomForestRegressor(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor,
):

Expand Down Expand Up @@ -430,14 +450,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> RandomForestRegressor:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -503,7 +533,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso

@log_adapter.class_logger
class RandomForestClassifier(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier,
):

Expand Down Expand Up @@ -586,14 +616,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> RandomForestClassifier:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down
29 changes: 24 additions & 5 deletions bigframes/ml/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

@log_adapter.class_logger
class LinearRegression(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.linear_model._base.LinearRegression,
):
__doc__ = bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__
Expand Down Expand Up @@ -131,14 +131,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> LinearRegression:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -183,7 +193,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression:

@log_adapter.class_logger
class LogisticRegression(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression,
):
__doc__ = (
Expand Down Expand Up @@ -283,15 +293,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None,
) -> LogisticRegression:
"""Fit model with transforms."""
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down
38 changes: 36 additions & 2 deletions bigframes/ml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
# limitations under the License.

import typing
from typing import Any, Generator, Literal, Mapping, Optional, Union
from typing import Any, Generator, Literal, Mapping, Optional, Tuple, Union

import bigframes_vendored.constants as constants
from google.cloud import bigquery
import pandas as pd

from bigframes.core import blocks
from bigframes.core import blocks, guid
import bigframes.pandas as bpd
from bigframes.session import Session

Expand Down Expand Up @@ -155,3 +155,37 @@ def retrieve_params_from_bq_model(
kwargs[bf_param] = bf_param_type(last_fitting[bqml_param])

return kwargs


def combine_training_and_evaluation_data(
    X_train: bpd.DataFrame,
    y_train: bpd.DataFrame,
    X_eval: bpd.DataFrame,
    y_eval: bpd.DataFrame,
    bqml_options: dict,
) -> Tuple[bpd.DataFrame, bpd.DataFrame, dict]:
    """
    Combine training data and labels with evaluation data and labels, and keep
    them differentiated through a split column in the combined data and labels.

    None of the arguments are modified: the split column is added to copies
    of the feature frames and the options are returned as a new dict.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_eval: Evaluation features; must have the same columns as X_train.
        y_eval: Evaluation labels; must have the same columns as y_train.
        bqml_options: BQML options to extend with the custom split settings.

    Returns:
        A tuple ``(X, y, options)`` where ``X`` is the training features
        followed by the evaluation features with one extra boolean split
        column (False for training rows, True for evaluation rows), ``y`` is
        the training labels followed by the evaluation labels, and ``options``
        is a copy of ``bqml_options`` with ``data_split_method`` set to
        ``"CUSTOM"`` and ``data_split_col`` set to the split column name.

    Raises:
        AssertionError: If the training and evaluation columns differ, or if
            the generated split column name already exists in X_train.
    """

    assert X_train.columns.equals(X_eval.columns)
    assert y_train.columns.equals(y_eval.columns)

    # create a custom split column for BQML and supply the evaluation
    # data along with the training data in a combined single table
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models#data_split_col.
    split_col = guid.generate_guid()
    assert split_col not in X_train.columns

    # Use assign() rather than item assignment so the split column is added
    # to new frames; the caller's X_train / X_eval are left untouched.
    X = bpd.concat(
        [
            X_train.assign(**{split_col: False}),
            X_eval.assign(**{split_col: True}),
        ]
    )
    y = bpd.concat([y_train, y_eval])

    # create options copy to not mutate the incoming one
    bqml_options = bqml_options.copy()
    bqml_options["data_split_method"] = "CUSTOM"
    bqml_options["data_split_col"] = split_col

    return X, y, bqml_options
Loading