feat: add support for creating a Matrix Factorization model (#1330)

rey-esp · tswast · web-flow · commit b5297f909b08 · 2025-04-01T15:07:06.000-05:00
* docs: update title of pypi notebook example to reflect use of the PyPI public dataset

In response to feedback on internal change 662899733.

* feat: add support for creating a  Matrix Factorization model

* feat: add support for creating a  Matrix Factorization model

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* rating_col

* (nearly) complete class

* removem print()

* adding recommend

* remove hyper parameter runing references

* swap predict in _mf for recommend

* recommend -&gt; predict

* update predict doc string

* Merge branch 'main' into b338873783-matrix-factorization

* preparing test files

* add test data

* new error: to_gbq column names need to be changed?

* Merge branch 'main' into b338873783-matrix-factorization

* Merge branch 'main' into b338873783-matrix-factorization

* Delete demo.ipynb

* passing system test

* preparing to add unit tests

* 2 out of 3 (so far) passing unit tests

* attempted mocking

* fix tests

* new test file for model creation unit tests

* add unit tests for num_factors, user_col, and item_col

* Update tests/unit/ml/test_matrix_factorization.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update tests/unit/ml/test_matrix_factorization.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* uncomment one test

* uncomment test

* uncomment test

* uncomment test

* nearly all tests

* tests complete and passing

* seeing if test causes kokoro failure

* uncomment test-kokoro still failing

* remove comment

* fix test

* test kokoro

* test_decomposition.py failing and now feedback_type attr does not exist

* passing tests

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update tests/system/large/ml/test_decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* doc attempt - _mf.py example

* feedback_type case ignore

* Update _mf.py - remove global_explain()

* fit

* W

* fix docs (maybe)

* Update test_matrix_factorization.py with updated error messages

* ilnt

* Update test_matrix_factorization.py - add 'f'

* improve errors and update tests

* Update tests/system/large/ml/test_decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py - num_factors error messsage

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py - user_col error message

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py - rating_col error message

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py - l2_reg error msg

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* fix tests to match updated error messages

* Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs df

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs model

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs fit

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py

* remove errors and tests

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* Update bigframes/ml/decomposition.py

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;

* passing system test

* E           AssertionError: expected call not found.
E           Expected: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n  (input_X_sql))', trial_id=['index_column_id'])
E           Actual: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n  (input_X_sql))', index_col=['index_column_id'])

* same # of elements in each

* attempt

* doc fix

* doc fix

---------

Co-authored-by: Tim Sweña (Swast) &lt;swast@google.com&gt;
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
@@ -117,6 +117,12 @@ def model(self) -> bigquery.Model:
         """Get the BQML model associated with this wrapper"""
         return self._model
 
+    def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
+        return self._apply_ml_tvf(
+            input_data,
+            self._model_manipulation_sql_generator.ml_recommend,
+        )
+
     def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
         return self._apply_ml_tvf(
             input_data,
diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py
@@ -19,6 +19,7 @@
 
 from typing import List, Literal, Optional, Union
 
+import bigframes_vendored.sklearn.decomposition._mf
 import bigframes_vendored.sklearn.decomposition._pca
 from google.cloud import bigquery
 
@@ -27,7 +28,15 @@
 import bigframes.pandas as bpd
 import bigframes.session
 
-_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"}
+_BQML_PARAMS_MAPPING = {
+    "svd_solver": "pcaSolver",
+    "feedback_type": "feedbackType",
+    "num_factors": "numFactors",
+    "user_col": "userColumn",
+    "item_col": "itemColumn",
+    "_input_label_columns": "inputLabelColumns",
+    "l2_reg": "l2Regularization",
+}
 
 
 @log_adapter.class_logger
@@ -197,3 +206,159 @@ def score(
 
         # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
         return self._bqml_model.evaluate()
+
+
+@log_adapter.class_logger
+class MatrixFactorization(
+    base.UnsupervisedTrainablePredictor,
+    bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization,
+):
+    __doc__ = bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization.__doc__
+
+    def __init__(
+        self,
+        *,
+        feedback_type: Literal["explicit", "implicit"] = "explicit",
+        num_factors: int,
+        user_col: str,
+        item_col: str,
+        rating_col: str = "rating",
+        # TODO: Add support for hyperparameter tuning.
+        l2_reg: float = 1.0,
+    ):
+
+        feedback_type = feedback_type.lower()  # type: ignore
+        if feedback_type not in ("explicit", "implicit"):
+            raise ValueError("Expected feedback_type to be `explicit` or `implicit`.")
+
+        self.feedback_type = feedback_type
+
+        if not isinstance(num_factors, int):
+            raise TypeError(
+                f"Expected num_factors to be an int, but got {type(num_factors)}."
+            )
+
+        if num_factors < 0:
+            raise ValueError(
+                f"Expected num_factors to be a positive integer, but got {num_factors}."
+            )
+
+        self.num_factors = num_factors
+
+        if not isinstance(user_col, str):
+            raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.")
+
+        self.user_col = user_col
+
+        if not isinstance(item_col, str):
+            raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.")
+
+        self.item_col = item_col
+
+        if not isinstance(rating_col, str):
+            raise TypeError(
+                f"Expected rating_col to be a str, but got {type(rating_col)}."
+            )
+
+        self._input_label_columns = [rating_col]
+
+        if not isinstance(l2_reg, (float, int)):
+            raise TypeError(
+                f"Expected l2_reg to be a float or int, but got {type(l2_reg)}."
+            )
+
+        self.l2_reg = l2_reg
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+
+    @property
+    def rating_col(self) -> str:
+        """str: The rating column name. Defaults to 'rating'."""
+        return self._input_label_columns[0]
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.session.Session, bq_model: bigquery.Model
+    ) -> MatrixFactorization:
+        assert bq_model.model_type == "MATRIX_FACTORIZATION"
+
+        kwargs = utils.retrieve_params_from_bq_model(
+            cls, bq_model, _BQML_PARAMS_MAPPING
+        )
+
+        model = cls(**kwargs)
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    @property
+    def _bqml_options(self) -> dict:
+        """The model options as they will be set for BQML"""
+        options: dict = {
+            "model_type": "matrix_factorization",
+            "feedback_type": self.feedback_type,
+            "user_col": self.user_col,
+            "item_col": self.item_col,
+            "rating_col": self.rating_col,
+            "l2_reg": self.l2_reg,
+        }
+
+        if self.num_factors is not None:
+            options["num_factors"] = self.num_factors
+
+        return options
+
+    def _fit(
+        self,
+        X: utils.ArrayType,
+        y=None,
+        transforms: Optional[List[str]] = None,
+    ) -> MatrixFactorization:
+        if y is not None:
+            raise ValueError(
+                "Label column not supported for Matrix Factorization model but y was not `None`"
+            )
+
+        (X,) = utils.batch_convert_to_dataframe(X)
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X_train=X,
+            transforms=transforms,
+            options=self._bqml_options,
+        )
+        return self
+
+    def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before recommend")
+
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.recommend(X)
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            MatrixFactorization: Saved model."""
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before it can be saved")
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
+
+    def score(
+        self,
+        X=None,
+        y=None,
+    ) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
+        return self._bqml_model.evaluate()
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py
@@ -42,6 +42,7 @@
         "LINEAR_REGRESSION": linear_model.LinearRegression,
         "LOGISTIC_REGRESSION": linear_model.LogisticRegression,
         "KMEANS": cluster.KMeans,
+        "MATRIX_FACTORIZATION": decomposition.MatrixFactorization,
         "PCA": decomposition.PCA,
         "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor,
         "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier,
@@ -80,6 +81,7 @@
 def from_bq(
     session: bigframes.session.Session, bq_model: bigquery.Model
 ) -> Union[
+    decomposition.MatrixFactorization,
     decomposition.PCA,
     cluster.KMeans,
     linear_model.LinearRegression,
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
@@ -299,6 +299,11 @@ def alter_model(
         return "\n".join(parts)
 
     # ML prediction TVFs
+    def ml_recommend(self, source_sql: str) -> str:
+        """Encode ML.RECOMMEND for BQML"""
+        return f"""SELECT * FROM ML.RECOMMEND(MODEL {self._model_ref_sql()},
+  ({source_sql}))"""
+
     def ml_predict(self, source_sql: str) -> str:
         """Encode ML.PREDICT for BQML"""
         return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()},
diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl
@@ -0,0 +1,20 @@
+{"user_id": 1, "item_id": 2, "rating": 4.0}
+{"user_id": 1, "item_id": 5, "rating": 3.0}
+{"user_id": 2, "item_id": 1, "rating": 5.0}
+{"user_id": 2, "item_id": 3, "rating": 2.0}
+{"user_id": 3, "item_id": 4, "rating": 4.5}
+{"user_id": 3, "item_id": 7, "rating": 3.5}
+{"user_id": 4, "item_id": 2, "rating": 1.0}
+{"user_id": 4, "item_id": 8, "rating": 5.0}
+{"user_id": 5, "item_id": 3, "rating": 4.0}
+{"user_id": 5, "item_id": 9, "rating": 2.5}
+{"user_id": 6, "item_id": 1, "rating": 3.0}
+{"user_id": 6, "item_id": 6, "rating": 4.5}
+{"user_id": 7, "item_id": 5, "rating": 5.0}
+{"user_id": 7, "item_id": 10, "rating": 1.5}
+{"user_id": 8, "item_id": 4, "rating": 2.0}
+{"user_id": 8, "item_id": 7, "rating": 4.0}
+{"user_id": 9, "item_id": 2, "rating": 3.5}
+{"user_id": 9, "item_id": 9, "rating": 5.0}
+{"user_id": 10, "item_id": 3, "rating": 4.5}
+{"user_id": 10, "item_id": 8, "rating": 2.5}
diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json
@@ -0,0 +1,17 @@
+[
+    {
+      "mode": "NULLABLE",
+      "name": "user_id",
+      "type": "STRING"
+    },
+    {
+      "mode": "NULLABLE",
+      "name": "item_id",
+      "type": "INT64"
+    },
+    {
+      "mode": "NULLABLE",
+      "name": "rating",
+      "type": "FLOAT"
+    }
+]
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -320,6 +320,7 @@ def load_test_data_tables(
         ("repeated", "repeated_schema.json", "repeated.jsonl"),
         ("json", "json_schema.json", "json.jsonl"),
         ("penguins", "penguins_schema.json", "penguins.jsonl"),
+        ("ratings", "ratings_schema.json", "ratings.jsonl"),
         ("time_series", "time_series_schema.json", "time_series.jsonl"),
         ("hockey_players", "hockey_players.json", "hockey_players.jsonl"),
         ("matrix_2by3", "matrix_2by3.json", "matrix_2by3.jsonl"),
@@ -416,6 +417,11 @@ def penguins_table_id(test_data_tables) -> str:
     return test_data_tables["penguins"]
 
 
+@pytest.fixture(scope="session")
+def ratings_table_id(test_data_tables) -> str:
+    return test_data_tables["ratings"]
+
+
 @pytest.fixture(scope="session")
 def urban_areas_table_id(test_data_tables) -> str:
     return test_data_tables["urban_areas"]
@@ -769,6 +775,14 @@ def penguins_df_null_index(
     return unordered_session.read_gbq(penguins_table_id)
 
 
+@pytest.fixture(scope="session")
+def ratings_df_default_index(
+    ratings_table_id: str, session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+    """DataFrame pointing at test data."""
+    return session.read_gbq(ratings_table_id)
+
+
 @pytest.fixture(scope="session")
 def time_series_df_default_index(
     time_series_table_id: str, session: bigframes.Session
diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py
@@ -163,3 +163,49 @@ def test_decomposition_configure_fit_load_none_component(
         in reloaded_model._bqml_model.model_name
     )
     assert reloaded_model.n_components == 7
+
+
+def test_decomposition_mf_configure_fit_load(
+    session, ratings_df_default_index, dataset_id
+):
+    model = decomposition.MatrixFactorization(
+        num_factors=6,
+        feedback_type="explicit",
+        user_col="user_id",
+        item_col="item_id",
+        rating_col="rating",
+        l2_reg=9.83,
+    )
+
+    model.fit(ratings_df_default_index)
+
+    reloaded_model = model.to_gbq(
+        f"{dataset_id}.temp_configured_mf_model", replace=True
+    )
+
+    new_ratings = session.read_pandas(
+        pd.DataFrame(
+            {
+                "user_id": ["11", "12", "13"],
+                "item_id": [1, 2, 3],
+                "rating": [1.0, 2.0, 3.0],
+            }
+        )
+    )
+
+    reloaded_model.score(new_ratings)
+
+    result = reloaded_model.predict(new_ratings).to_pandas()
+
+    assert reloaded_model._bqml_model is not None
+    assert (
+        f"{dataset_id}.temp_configured_mf_model"
+        in reloaded_model._bqml_model.model_name
+    )
+    assert result is not None
+    assert reloaded_model.feedback_type == "explicit"
+    assert reloaded_model.num_factors == 6
+    assert reloaded_model.user_col == "user_id"
+    assert reloaded_model.item_col == "item_id"
+    assert reloaded_model.rating_col == "rating"
+    assert reloaded_model.l2_reg == 9.83
diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py
diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py
diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py