Skip to content

Commit e7a197e

Browse files
vertex-sdk-bot authored and copybara-github committed
fix: Fix experiments failure when backing tensorboard has been deleted.
PiperOrigin-RevId: 599955406
1 parent 3b28d64 commit e7a197e

File tree

5 files changed

+138
-21
lines changed

5 files changed

+138
-21
lines changed

google/cloud/aiplatform/metadata/experiment_resources.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ def _lookup_backing_tensorboard(self) -> Optional[tensorboard_resource.Tensorboa
502502
"""Returns backing tensorboard if one is set.
503503
504504
Returns:
505-
Tensorboard resource if one exists.
505+
Tensorboard resource if one exists, otherwise returns None.
506506
"""
507507
tensorboard_resource_name = self._metadata_context.metadata.get(
508508
constants._BACKING_TENSORBOARD_RESOURCE_KEY
@@ -516,10 +516,16 @@ def _lookup_backing_tensorboard(self) -> Optional[tensorboard_resource.Tensorboa
516516
)
517517

518518
if tensorboard_resource_name:
519-
return tensorboard_resource.Tensorboard(
520-
tensorboard_resource_name,
521-
credentials=self._metadata_context.credentials,
522-
)
519+
try:
520+
return tensorboard_resource.Tensorboard(
521+
tensorboard_resource_name,
522+
credentials=self._metadata_context.credentials,
523+
)
524+
except exceptions.NotFound:
525+
self._metadata_context.update(
526+
metadata={constants._BACKING_TENSORBOARD_RESOURCE_KEY: None}
527+
)
528+
return None
523529

524530
def get_backing_tensorboard_resource(
525531
self,

google/cloud/aiplatform/metadata/metadata.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from typing import Dict, Union, Optional, Any, List
2222

2323
from google.api_core import exceptions
24+
import google.auth
2425
from google.auth import credentials as auth_credentials
2526
from google.protobuf import timestamp_pb2
2627

@@ -216,7 +217,7 @@ def _execution_to_column_named_metadata(
216217

217218

218219
class _ExperimentTracker:
219-
"""Tracks Experiments and Experiment Runs wil high level APIs"""
220+
"""Tracks Experiments and Experiment Runs with high level APIs."""
220221

221222
def __init__(self):
222223
self._experiment: Optional[experiment_resources.Experiment] = None
@@ -229,6 +230,27 @@ def reset(self):
229230
self._experiment = None
230231
self._experiment_run = None
231232

233+
def _get_global_tensorboard(self) -> Optional[tensorboard_resource.Tensorboard]:
    """Helper method to re-fetch the global TensorBoard instance.

    Credentials come from the current experiment's metadata context when
    they are set. Application Default Credentials are resolved only as a
    fallback, so a setup that supplies explicit credentials does not
    require ADC to be configured.

    Returns:
        The global Tensorboard resource, or None when no global
        TensorBoard is set or when constructing it raises NotFound. In the
        NotFound case the cached global TensorBoard is also reset to None.
    """
    if not self._global_tensorboard:
        return None

    experiment = self.experiment
    credentials = experiment._metadata_context.credentials if experiment else None
    if not credentials:
        # Fall back to ADC only when the experiment provides no credentials.
        credentials, _ = google.auth.default()

    try:
        return tensorboard_resource.Tensorboard(
            self._global_tensorboard.resource_name,
            project=self._global_tensorboard.project,
            location=self._global_tensorboard.location,
            credentials=credentials,
        )
    except exceptions.NotFound:
        # Forget the stale reference so later calls skip the lookup.
        self._global_tensorboard = None
        return None
253+
232254
@property
233255
def experiment_name(self) -> Optional[str]:
234256
"""Return the currently set experiment name, if experiment is not set, return None"""
@@ -284,7 +306,7 @@ def set_experiment(
284306
If omitted, or set to `True` or `None`, the global tensorboard is used.
285307
If no global tensorboard is set, the default tensorboard will be used, and created if it does not exist.
286308
287-
To disable using a backign tensorboard, set `backing_tensorboard` to `False`.
309+
To disable using a backing tensorboard, set `backing_tensorboard` to `False`.
288310
To maintain this behavior, set `experiment_tensorboard` to `False` in subsequent calls to aiplatform.init().
289311
"""
290312
self.reset()
@@ -299,7 +321,7 @@ def set_experiment(
299321
backing_tb = None
300322
else:
301323
backing_tb = (
302-
self._global_tensorboard or _get_or_create_default_tensorboard()
324+
self._get_global_tensorboard() or _get_or_create_default_tensorboard()
303325
)
304326

305327
current_backing_tb = experiment.backing_tensorboard_resource_name

google/cloud/aiplatform/tensorboard/uploader_tracker.py

+12-13
Original file line numberDiff line numberDiff line change
@@ -234,24 +234,23 @@ def _create_uploader(
234234
project, location, tensorboard_id
235235
)
236236
else:
237-
if _experiment_tracker._global_tensorboard:
237+
if _experiment_tracker._get_global_tensorboard():
238238
tensorboard_resource_name = (
239-
_experiment_tracker._global_tensorboard.resource_name
239+
_experiment_tracker._get_global_tensorboard().resource_name
240240
)
241-
else:
242-
if _experiment_tracker._experiment:
243-
if _experiment_tracker._experiment._lookup_backing_tensorboard():
244-
tensorboard_resource_name = (
245-
_experiment_tracker._experiment._lookup_backing_tensorboard().resource_name
246-
)
247-
else:
248-
raise ValueError(
249-
f"No TensorBoard associated with experiment {initializer.global_config.experiment_name}. Please provide tensorboard_id in the argument."
250-
)
241+
elif _experiment_tracker._experiment:
242+
if _experiment_tracker._experiment._lookup_backing_tensorboard():
243+
tensorboard_resource_name = (
244+
_experiment_tracker._experiment._lookup_backing_tensorboard().resource_name
245+
)
251246
else:
252247
raise ValueError(
253-
"No TensorBoard found. Please provide tensorboard_id in the argument."
248+
f"No TensorBoard associated with experiment {initializer.global_config.experiment_name}. Please provide tensorboard_id in the argument."
254249
)
250+
else:
251+
raise ValueError(
252+
"No TensorBoard found. Please provide tensorboard_id in the argument."
253+
)
255254

256255
api_client = initializer.global_config.create_client(
257256
client_class=TensorboardClientWithOverride,

tests/system/aiplatform/test_experiments.py

+59
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#
1717
import tempfile
1818

19+
import uuid
1920
import pytest
2021

2122
from google.api_core import exceptions
@@ -618,3 +619,61 @@ def test_init_associates_global_tensorboard_to_experiment(self, shared_state):
618619
)
619620
== tensorboard.resource_name
620621
)
622+
623+
def test_get_backing_tensorboard_resource_returns_tensorboard(self, shared_state):
    """The Tensorboard passed to init() is reported as the backing one."""
    created_tb = aiplatform.Tensorboard.create(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        display_name=self._make_display_name("")[:64],
    )
    # Presumably consumed by the shared teardown to delete the resource;
    # verify against the e2e base class.
    shared_state["resources"] = [created_tb]

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=self._experiment_name,
        experiment_tensorboard=created_tb,
    )
    loaded_experiment = aiplatform.Experiment(
        self._experiment_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    backing_tb = loaded_experiment.get_backing_tensorboard_resource()
    assert backing_tb.resource_name == created_tb.resource_name
646+
647+
def test_get_backing_tensorboard_resource_returns_none(self):
    """An experiment initialised with experiment_tensorboard=False has no backing one."""
    # A unique name so the experiment starts without any prior backing tensorboard.
    fresh_name = f"example-{uuid.uuid1()}"
    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=fresh_name,
        experiment_tensorboard=False,
    )
    fresh_experiment = aiplatform.Experiment(
        fresh_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    backing_tb = fresh_experiment.get_backing_tensorboard_resource()
    assert backing_tb is None
662+
663+
def test_delete_backing_tensorboard_experiment_run_success(self):
    """A run can start and end after the backing tensorboard was deleted."""
    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=self._experiment_name,
    )
    loaded_experiment = aiplatform.Experiment(
        self._experiment_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    # Delete the backing tensorboard out from under the experiment.
    stale_tb = loaded_experiment.get_backing_tensorboard_resource()
    stale_tb.delete()

    started_run = aiplatform.start_run(_RUN)
    aiplatform.end_run()

    # The lookup must now report no backing tensorboard instead of raising.
    assert loaded_experiment.get_backing_tensorboard_resource() is None
    assert started_run.name == _RUN

tests/unit/vertexai/test_remote_training.py

+31
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,14 @@
2323
from unittest.mock import patch
2424

2525
import cloudpickle
26+
from google import auth
2627
from google.api_core import exceptions
28+
from google.auth import credentials as auth_credentials
2729
from google.cloud import aiplatform
2830
from google.cloud.aiplatform import utils
2931
from google.cloud.aiplatform.compat.services import (
3032
job_service_client_v1beta1 as job_service_client,
33+
tensorboard_service_client,
3134
)
3235
from google.cloud.aiplatform.compat.types import (
3336
custom_job_v1beta1 as gca_custom_job_compat,
@@ -89,6 +92,7 @@
8992
_TEST_EXPERIMENT = "test-experiment"
9093
_TEST_EXPERIMENT_RUN = "test-experiment-run"
9194
_TEST_SERVICE_ACCOUNT = f"{_TEST_PROJECT_NUMBER}[email protected]"
95+
_TEST_CREDENTIALS = mock.Mock(spec=auth_credentials.AnonymousCredentials())
9296

9397
# dataset constants
9498
dataset = load_iris()
@@ -707,6 +711,25 @@ def aiplatform_autolog_mock():
707711
yield aiplatform_autolog_mock
708712

709713

714+
@pytest.fixture(scope="module")
def google_auth_mock():
    """Patch `google.auth.default` to return anonymous credentials and a test project."""
    with mock.patch.object(auth, "default") as patched_default:
        anonymous = auth_credentials.AnonymousCredentials()
        patched_default.return_value = (anonymous, "test-project")
        yield patched_default
722+
723+
724+
@pytest.fixture
def get_tensorboard_mock():
    """Patch `TensorboardServiceClient.get_tensorboard` to return the default test tensorboard."""
    client_cls = tensorboard_service_client.TensorboardServiceClient
    with patch.object(client_cls, "get_tensorboard") as patched_get:
        patched_get.return_value = _TEST_DEFAULT_TENSORBOARD_GCA
        yield patched_get
731+
732+
710733
# unittest `assert_any_call` method doesn't work when arguments contain `np.ndarray`
711734
# https://stackoverflow.com/questions/56644729/mock-assert-mock-calls-with-a-numpy-array-as-argument-raises-valueerror-and-np
712735
# tentatively runtime patch `assert_any_call` to solve this issue
@@ -1636,6 +1659,7 @@ def test_remote_training_keras_distributed_no_cuda_no_worker_pool_specs(
16361659
"get_artifact_not_found_mock",
16371660
"update_context_mock",
16381661
"mock_autolog_disabled",
1662+
"get_tensorboard_mock",
16391663
)
16401664
def test_remote_training_sklearn_with_experiment(
16411665
self,
@@ -1647,6 +1671,7 @@ def test_remote_training_sklearn_with_experiment(
16471671
location=_TEST_LOCATION,
16481672
staging_bucket=_TEST_BUCKET_NAME,
16491673
experiment=_TEST_EXPERIMENT,
1674+
credentials=_TEST_CREDENTIALS,
16501675
)
16511676
vertexai.preview.init(remote=True)
16521677

@@ -1720,6 +1745,7 @@ def test_remote_training_sklearn_with_experiment(
17201745
"update_context_mock",
17211746
"aiplatform_autolog_mock",
17221747
"mock_autolog_enabled",
1748+
"get_tensorboard_mock",
17231749
)
17241750
def test_remote_training_sklearn_with_experiment_autolog_enabled(
17251751
self,
@@ -1731,6 +1757,7 @@ def test_remote_training_sklearn_with_experiment_autolog_enabled(
17311757
location=_TEST_LOCATION,
17321758
staging_bucket=_TEST_BUCKET_NAME,
17331759
experiment=_TEST_EXPERIMENT,
1760+
credentials=_TEST_CREDENTIALS,
17341761
)
17351762
vertexai.preview.init(remote=True, autolog=True)
17361763

@@ -1926,6 +1953,7 @@ def test_initialize_existing_persistent_resource_service_account_mismatch(self):
19261953
"aiplatform_autolog_mock",
19271954
"mock_autolog_enabled",
19281955
"persistent_resource_running_mock",
1956+
"get_tensorboard_mock",
19291957
)
19301958
def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_experiment_error(
19311959
self,
@@ -1935,6 +1963,7 @@ def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_
19351963
location=_TEST_LOCATION,
19361964
staging_bucket=_TEST_BUCKET_NAME,
19371965
experiment=_TEST_EXPERIMENT,
1966+
credentials=_TEST_CREDENTIALS,
19381967
)
19391968
vertexai.preview.init(
19401969
remote=True, autolog=True, cluster=_TEST_PERSISTENT_RESOURCE_CONFIG
@@ -1966,6 +1995,7 @@ def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_
19661995
"persistent_resource_service_account_running_mock",
19671996
"mock_timestamped_unique_name",
19681997
"mock_get_custom_job",
1998+
"get_tensorboard_mock",
19691999
)
19702000
def test_remote_training_sklearn_with_persistent_cluster_and_experiment_autologging(
19712001
self,
@@ -1977,6 +2007,7 @@ def test_remote_training_sklearn_with_persistent_cluster_and_experiment_autologg
19772007
location=_TEST_LOCATION,
19782008
staging_bucket=_TEST_BUCKET_NAME,
19792009
experiment=_TEST_EXPERIMENT,
2010+
credentials=_TEST_CREDENTIALS,
19802011
)
19812012
vertexai.preview.init(
19822013
remote=True,

0 commit comments

Comments
 (0)