Skip to content

Commit 98459aa

Browse files
fthoele and copybara-github authored and committed
feat: Add validation of the BigQuery location when creating a MultimodalDataset
PiperOrigin-RevId: 741515869
1 parent 184cca5 commit 98459aa

File tree

2 files changed

+133
-26
lines changed

2 files changed

+133
-26
lines changed

google/cloud/aiplatform/preview/datasets.py

+55-15
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,46 @@ def _get_metadata_for_bq(
8888
return json_format.ParseDict(input_config, struct_pb2.Value())
8989

9090

91-
def _normalize_table_id(*, table_id: str, project: str):
92-
if table_id.count(".") == 1:
93-
# table_id has the "dataset.table" format, prepend the project
94-
return f"{project}.{table_id}"
95-
elif table_id.count(".") != 2:
96-
raise ValueError(f"invalid table id: {table_id}")
97-
return table_id
91+
def _normalize_and_validate_table_id(
    *,
    table_id: str,
    project: Optional[str] = None,
    vertex_location: Optional[str] = None,
    credentials: Optional[auth_credentials.Credentials] = None,
):
    """Normalizes a BigQuery table id and validates its project and location.

    Args:
        table_id: Table id in "table", "dataset.table", or
            "project.dataset.table" form; missing parts are filled in from
            ``project`` by ``TableReference.from_string``.
        project: Project the multimodal dataset belongs to. Falls back to the
            globally initialized project.
        vertex_location: Location of the multimodal dataset. Falls back to the
            globally initialized location.
        credentials: Credentials for the BigQuery metadata lookup. Falls back
            to the globally initialized credentials.

    Returns:
        The fully qualified "project.dataset.table" id.

    Raises:
        ValueError: If the table's project differs from ``project``, or if the
            containing BigQuery dataset's location differs from
            ``vertex_location``.
    """
    # Deferred import so google-cloud-bigquery stays an optional dependency.
    from google.cloud import bigquery  # pylint: disable=g-import-not-at-top

    if not project:
        project = initializer.global_config.project
    if not vertex_location:
        vertex_location = initializer.global_config.location
    if not credentials:
        credentials = initializer.global_config.credentials

    table_ref = bigquery.TableReference.from_string(table_id, default_project=project)
    if table_ref.project != project:
        raise ValueError(
            f"The BigQuery table "
            f"`{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}`"
            " must be in the same project as the multimodal dataset."
            f" The multimodal dataset is in `{project}`, but the BigQuery table"
            f" is in `{table_ref.project}`."
        )

    # Network call: fetch the dataset's metadata to learn its location.
    dataset_ref = bigquery.DatasetReference(
        project=table_ref.project, dataset_id=table_ref.dataset_id
    )
    client = bigquery.Client(project=project, credentials=credentials)
    bq_dataset = client.get_dataset(dataset_ref=dataset_ref)
    # NOTE(review): this comparison is case-sensitive; BigQuery can report
    # multi-region locations such as "US" while Vertex locations are lowercase
    # regions — confirm case folding is never needed here.
    if bq_dataset.location != vertex_location:
        raise ValueError(
            f"The BigQuery dataset"
            f" `{dataset_ref.project}.{dataset_ref.dataset_id}` must be in the"
            " same location as the multimodal dataset. The multimodal dataset"
            f" is in `{vertex_location}`, but the BigQuery dataset is in"
            f" `{bq_dataset.location}`."
        )
    return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
98131

99132

100133
class GeminiExample:
@@ -577,7 +610,8 @@ def from_pandas(
577610
table id can be in the format of "dataset.table" or
578611
"project.dataset.table". If a table already exists with the
579612
given table id, it will be overwritten. Note that the BigQuery
580-
dataset must already exist.
613+
dataset must already exist and be in the same location as the
614+
multimodal dataset.
581615
display_name (str):
582616
Optional. The user-defined name of the dataset. The name can be
583617
up to 128 characters long and can consist of any UTF-8
@@ -614,12 +648,15 @@ def from_pandas(
614648
The created multimodal dataset.
615649
"""
616650
bigframes = _try_import_bigframes()
617-
if not project:
618-
project = initializer.global_config.project
619651
# TODO(b/400355374): `table_id` should be optional, and if not provided,
620652
# we generate a random table id. Also, check if we can use a default
621653
# dataset that's created from the SDK.
622-
target_table_id = _normalize_table_id(table_id=target_table_id, project=project)
654+
target_table_id = _normalize_and_validate_table_id(
655+
table_id=target_table_id,
656+
project=project,
657+
vertex_location=location,
658+
credentials=credentials,
659+
)
623660

624661
temp_bigframes_df = bigframes.pandas.read_pandas(dataframe)
625662
temp_bigframes_df.to_gbq(
@@ -662,7 +699,8 @@ def from_bigframes(
662699
table id can be in the format of "dataset.table" or
663700
"project.dataset.table". If a table already exists with the
664701
given table id, it will be overwritten. Note that the BigQuery
665-
dataset must already exist.
702+
dataset must already exist and be in the same location as the
703+
multimodal dataset.
666704
display_name (str):
667705
Optional. The user-defined name of the dataset. The name can be
668706
up to 128 characters long and can consist of any UTF-8
@@ -697,12 +735,14 @@ def from_bigframes(
697735
Returns:
698736
The created multimodal dataset.
699737
"""
700-
project_id = project or initializer.global_config.project
701738
# TODO(b/400355374): `table_id` should be optional, and if not provided,
702739
# we generate a random table id. Also, check if we can use a default
703740
# dataset that's created from the SDK.
704-
target_table_id = _normalize_table_id(
705-
table_id=target_table_id, project=project_id
741+
target_table_id = _normalize_and_validate_table_id(
742+
table_id=target_table_id,
743+
project=project,
744+
vertex_location=location,
745+
credentials=credentials,
706746
)
707747
dataframe.to_gbq(
708748
destination_table=target_table_id,

tests/unit/aiplatform/test_multimodal_datasets.py

+78-11
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from google import auth
2222
from google.api_core import operation
2323
from google.auth import credentials as auth_credentials
24+
from google.cloud import bigquery
2425
from google.cloud import aiplatform
2526
from google.cloud.aiplatform import base
2627
from google.cloud.aiplatform import initializer
@@ -42,6 +43,7 @@
4243

4344
_TEST_PROJECT = "test-project"
4445
_TEST_LOCATION = "us-central1"
46+
_TEST_ALTERNATE_LOCATION = "europe-west6"
4547
_TEST_ID = "1028944691210842416"
4648
_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}"
4749
_TEST_NAME = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}/datasets/{_TEST_ID}"
@@ -53,6 +55,8 @@
5355
)
5456

5557
_TEST_SOURCE_URI_BQ = "bq://my-project.my-dataset.table"
58+
_TEST_TARGET_BQ_DATASET = f"{_TEST_PROJECT}.target-dataset"
59+
_TEST_TARGET_BQ_TABLE = f"{_TEST_TARGET_BQ_DATASET}.target-table"
5660
_TEST_DISPLAY_NAME = "my_dataset_1234"
5761
_TEST_METADATA_SCHEMA_URI_MULTIMODAL = (
5862
"gs://google-cloud-aiplatform/schema/dataset/metadata/multimodal_1.0.0.yaml"
@@ -168,6 +172,24 @@ def bigframes_import_mock():
168172
del sys.modules["bigframes.pandas"]
169173

170174

175+
@pytest.fixture
def get_bq_dataset_mock():
    """Patches bigquery.Client.get_dataset to return a dataset located in _TEST_LOCATION."""
    with mock.patch.object(bigquery.Client, "get_dataset") as get_bq_dataset_mock:
        # Only `location` is read by the validation code under test.
        bq_dataset = mock.Mock()
        bq_dataset.location = _TEST_LOCATION
        get_bq_dataset_mock.return_value = bq_dataset
        yield get_bq_dataset_mock
182+
183+
184+
@pytest.fixture
def get_bq_dataset_alternate_location_mock():
    """Patches bigquery.Client.get_dataset to return a dataset in a non-matching location."""
    with mock.patch.object(bigquery.Client, "get_dataset") as get_bq_dataset_mock:
        # `location` deliberately differs from the Vertex location so the
        # location-mismatch ValueError path is exercised.
        bq_dataset = mock.Mock()
        bq_dataset.location = _TEST_ALTERNATE_LOCATION
        get_bq_dataset_mock.return_value = bq_dataset
        yield get_bq_dataset_mock
191+
192+
171193
@pytest.fixture
172194
def update_dataset_with_template_config_mock():
173195
with mock.patch.object(
@@ -259,7 +281,7 @@ def test_create_dataset_from_bigquery(self, create_dataset_mock, sync):
259281
)
260282

261283
@pytest.mark.skip(reason="flaky with other tests mocking bigframes")
262-
@pytest.mark.usefixtures("get_dataset_mock")
284+
@pytest.mark.usefixtures("get_dataset_mock", "get_bq_dataset_mock")
263285
def test_create_dataset_from_pandas(
264286
self, create_dataset_mock, bigframes_import_mock
265287
):
@@ -273,55 +295,100 @@ def test_create_dataset_from_pandas(
273295
"answer": ["answer"],
274296
}
275297
)
276-
bq_table = "my-project.my-dataset.my-table"
277298
ummd.MultimodalDataset.from_pandas(
278299
dataframe=dataframe,
279-
target_table_id=bq_table,
300+
target_table_id=_TEST_TARGET_BQ_TABLE,
280301
display_name=_TEST_DISPLAY_NAME,
281302
)
282303
expected_dataset = gca_dataset.Dataset(
283304
display_name=_TEST_DISPLAY_NAME,
284305
metadata_schema_uri=_TEST_METADATA_SCHEMA_URI_MULTIMODAL,
285-
metadata={"inputConfig": {"bigquerySource": {"uri": f"bq://{bq_table}"}}},
306+
metadata={
307+
"inputConfig": {
308+
"bigquerySource": {"uri": f"bq://{_TEST_TARGET_BQ_TABLE}"}
309+
}
310+
},
286311
)
287312
create_dataset_mock.assert_called_once_with(
288313
dataset=expected_dataset,
289314
parent=_TEST_PARENT,
290315
timeout=None,
291316
)
292317
bigframes_mock.to_gbq.assert_called_once_with(
293-
destination_table=bq_table,
318+
destination_table=_TEST_TARGET_BQ_TABLE,
294319
if_exists="replace",
295320
)
296321

297322
@pytest.mark.skip(reason="flaky with other tests mocking bigframes")
@pytest.mark.usefixtures(
    "bigframes_import_mock", "get_dataset_mock", "get_bq_dataset_mock"
)
def test_create_dataset_from_bigframes(self, create_dataset_mock):
    """Happy path: a bigframes dataframe is written to BigQuery and a dataset created."""
    aiplatform.init(project=_TEST_PROJECT)
    bigframes_df = mock.Mock()
    ummd.MultimodalDataset.from_bigframes(
        dataframe=bigframes_df,
        target_table_id=_TEST_TARGET_BQ_TABLE,
        display_name=_TEST_DISPLAY_NAME,
    )

    # The dataframe must be materialized into the target table, overwriting
    # any existing table with the same id.
    bigframes_df.to_gbq.assert_called_once_with(
        destination_table=_TEST_TARGET_BQ_TABLE,
        if_exists="replace",
    )
    expected_dataset = gca_dataset.Dataset(
        display_name=_TEST_DISPLAY_NAME,
        metadata_schema_uri=_TEST_METADATA_SCHEMA_URI_MULTIMODAL,
        metadata={
            "inputConfig": {
                "bigquerySource": {"uri": f"bq://{_TEST_TARGET_BQ_TABLE}"}
            }
        },
    )
    create_dataset_mock.assert_called_once_with(
        dataset=expected_dataset,
        parent=_TEST_PARENT,
        timeout=None,
    )
324353

354+
@pytest.mark.skip(reason="flaky with other tests mocking bigframes")
@pytest.mark.usefixtures("bigframes_import_mock")
def test_create_dataset_from_bigframes_different_project_throws_error(self):
    """A target table in a different project than the dataset is rejected."""
    aiplatform.init(project=_TEST_PROJECT)
    bigframes_df = mock.Mock()
    with pytest.raises(ValueError):
        ummd.MultimodalDataset.from_bigframes(
            dataframe=bigframes_df,
            # Project part does not match _TEST_PROJECT initialized above.
            target_table_id="another_project.dataset.table",
            display_name=_TEST_DISPLAY_NAME,
        )
365+
366+
@pytest.mark.skip(reason="flaky with other tests mocking bigframes")
@pytest.mark.usefixtures(
    "bigframes_import_mock", "get_bq_dataset_alternate_location_mock"
)
def test_create_dataset_from_bigframes_different_location_throws_error(self):
    """A BigQuery dataset whose location differs from the Vertex location is rejected."""
    aiplatform.init(project=_TEST_PROJECT)
    bigframes_df = mock.Mock()
    # The fixture makes get_dataset report _TEST_ALTERNATE_LOCATION, which
    # mismatches the default Vertex location.
    with pytest.raises(ValueError):
        ummd.MultimodalDataset.from_bigframes(
            dataframe=bigframes_df,
            target_table_id=_TEST_TARGET_BQ_TABLE,
            display_name=_TEST_DISPLAY_NAME,
        )
379+
380+
@pytest.mark.skip(reason="flaky with other tests mocking bigframes")
@pytest.mark.usefixtures("bigframes_import_mock")
def test_create_dataset_from_bigframes_invalid_target_table_id_throws_error(self):
    """A table id that cannot be parsed as a BigQuery table reference is rejected."""
    aiplatform.init(project=_TEST_PROJECT)
    bigframes_df = mock.Mock()
    with pytest.raises(ValueError):
        ummd.MultimodalDataset.from_bigframes(
            dataframe=bigframes_df,
            # Not a valid "dataset.table" / "project.dataset.table" id.
            target_table_id="invalid-table",
            display_name=_TEST_DISPLAY_NAME,
        )
391+
325392
@pytest.mark.usefixtures("get_dataset_mock")
326393
def test_update_dataset(self, update_dataset_mock):
327394
aiplatform.init(project=_TEST_PROJECT)

0 commit comments

Comments (0)