Skip to content

Commit 339f8b6

Browse files
vertex-sdk-bot and copybara-github
authored and committed
feat: Create Vertex Experiment when uploading Tensorboard logs
PiperOrigin-RevId: 634035336
1 parent b47e6ff commit 339f8b6

File tree

10 files changed

+435
-174
lines changed

10 files changed

+435
-174
lines changed

google/cloud/aiplatform/metadata/context.py

+9-2
Original file line number | Diff line number | Diff line change
@@ -290,6 +290,7 @@ def update(
290290
metadata: Optional[Dict] = None,
291291
description: Optional[str] = None,
292292
credentials: Optional[auth_credentials.Credentials] = None,
293+
location: Optional[str] = None,
293294
):
294295
"""Updates an existing Metadata Context with new metadata.
295296
@@ -307,7 +308,10 @@ def update(
307308
for _ in range(_ETAG_ERROR_MAX_RETRY_COUNT - 1):
308309
try:
309310
super().update(
310-
metadata=metadata, description=description, credentials=credentials
311+
metadata=metadata,
312+
description=description,
313+
credentials=credentials,
314+
location=location,
311315
)
312316
return
313317
except Aborted as aborted_exception:
@@ -322,7 +326,10 @@ def update(
322326

323327
# Expose result/exception directly in the last retry.
324328
super().update(
325-
metadata=metadata, description=description, credentials=credentials
329+
metadata=metadata,
330+
description=description,
331+
credentials=credentials,
332+
location=location,
326333
)
327334

328335
@classmethod

google/cloud/aiplatform/metadata/experiment_resources.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -634,7 +634,8 @@ def assign_backing_tensorboard(
634634
self._metadata_context.update(
635635
metadata={
636636
constants._BACKING_TENSORBOARD_RESOURCE_KEY: tensorboard.resource_name
637-
}
637+
},
638+
location=self._metadata_context.location,
638639
)
639640

640641
def _log_experiment_loggable(self, experiment_loggable: "_ExperimentLoggable"):

google/cloud/aiplatform/metadata/experiment_run_resource.py

+21-4
Original file line number | Diff line number | Diff line change
@@ -716,7 +716,9 @@ def create(
716716
The newly created experiment run.
717717
"""
718718

719-
experiment = cls._get_experiment(experiment)
719+
experiment = cls._get_experiment(
720+
experiment, project=project, location=location, credentials=credentials
721+
)
720722

721723
run_id = _format_experiment_run_resource_id(
722724
experiment_name=experiment.name, run_name=run_name
@@ -760,7 +762,10 @@ def _create_context():
760762
try:
761763
if tensorboard:
762764
cls._assign_backing_tensorboard(
763-
self=experiment_run, tensorboard=tensorboard
765+
self=experiment_run,
766+
tensorboard=tensorboard,
767+
project=project,
768+
location=location,
764769
)
765770
else:
766771
cls._assign_to_experiment_backing_tensorboard(self=experiment_run)
@@ -792,7 +797,10 @@ def _format_tensorboard_experiment_display_name(experiment_name: str) -> str:
792797
return f"{experiment_name} Backing Tensorboard Experiment"
793798

794799
def _assign_backing_tensorboard(
795-
self, tensorboard: Union[tensorboard_resource.Tensorboard, str]
800+
self,
801+
tensorboard: Union[tensorboard_resource.Tensorboard, str],
802+
project: Optional[str] = None,
803+
location: Optional[str] = None,
796804
):
797805
"""Assign tensorboard as the backing tensorboard to this run.
798806
@@ -802,7 +810,10 @@ def _assign_backing_tensorboard(
802810
"""
803811
if isinstance(tensorboard, str):
804812
tensorboard = tensorboard_resource.Tensorboard(
805-
tensorboard, credentials=self._metadata_node.credentials
813+
tensorboard,
814+
project=project,
815+
location=location,
816+
credentials=self._metadata_node.credentials,
806817
)
807818

808819
tensorboard_resource_name_parts = tensorboard._parse_resource_name(
@@ -827,6 +838,8 @@ def _assign_backing_tensorboard(
827838
self._experiment.name
828839
),
829840
tensorboard_name=tensorboard.resource_name,
841+
project=project,
842+
location=location,
830843
credentials=tensorboard.credentials,
831844
labels=constants._VERTEX_EXPERIMENT_TB_EXPERIMENT_LABEL,
832845
)
@@ -849,6 +862,8 @@ def _assign_backing_tensorboard(
849862
tensorboard_run = tensorboard_resource.TensorboardRun.create(
850863
tensorboard_run_id=self._run_name,
851864
tensorboard_experiment_name=tensorboard_experiment.resource_name,
865+
project=project,
866+
location=location,
852867
credentials=tensorboard.credentials,
853868
)
854869

@@ -865,6 +880,8 @@ def _assign_backing_tensorboard(
865880
schema_title=constants._TENSORBOARD_RUN_REFERENCE_ARTIFACT.schema_title,
866881
schema_version=constants._TENSORBOARD_RUN_REFERENCE_ARTIFACT.schema_version,
867882
state=gca_artifact.Artifact.State.LIVE,
883+
project=project,
884+
location=location,
868885
)
869886

870887
self._metadata_node.add_artifacts_and_executions(

google/cloud/aiplatform/metadata/metadata.py

+12-1
Original file line number | Diff line number | Diff line change
@@ -292,6 +292,8 @@ def set_experiment(
292292
backing_tensorboard: Optional[
293293
Union[str, tensorboard_resource.Tensorboard, bool]
294294
] = None,
295+
project: Optional[str] = None,
296+
location: Optional[str] = None,
295297
):
296298
"""Set the experiment. Will retrieve the Experiment if it exists or create one with the provided name.
297299
@@ -309,11 +311,20 @@ def set_experiment(
309311
310312
To disable using a backing tensorboard, set `backing_tensorboard` to `False`.
311313
To maintain this behavior, set `experiment_tensorboard` to `False` in subsequent calls to aiplatform.init().
314+
project (str):
315+
Optional. Project where this experiment will be retrieved from or created. Overrides project set in
316+
aiplatform.init.
317+
location (str):
318+
Optional. Location where this experiment will be retrieved from or created. Overrides location set in
319+
aiplatform.init.
312320
"""
313321
self.reset()
314322

315323
experiment = experiment_resources.Experiment.get_or_create(
316-
experiment_name=experiment, description=description
324+
experiment_name=experiment,
325+
description=description,
326+
project=project,
327+
location=location,
317328
)
318329

319330
if backing_tensorboard and not isinstance(backing_tensorboard, bool):

google/cloud/aiplatform/metadata/resource.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -285,6 +285,7 @@ def update(
285285
metadata: Optional[Dict] = None,
286286
description: Optional[str] = None,
287287
credentials: Optional[auth_credentials.Credentials] = None,
288+
location: Optional[str] = None,
288289
):
289290
"""Updates an existing Metadata resource with new metadata.
290291
@@ -309,7 +310,9 @@ def update(
309310
if description:
310311
gca_resource.description = description
311312

312-
api_client = self._instantiate_client(credentials=credentials)
313+
api_client = self._instantiate_client(
314+
credentials=credentials, location=location
315+
)
313316
# TODO: if etag is not valid sync and retry
314317
update_gca_resource = self._update_resource(
315318
client=api_client,

google/cloud/aiplatform/tensorboard/logdir_loader.py

+3
Original file line number | Diff line number | Diff line change
@@ -63,11 +63,14 @@ def synchronize_runs(self):
6363
6464
In addition, any existing `DirectoryLoader` whose run directory
6565
no longer exists will be deleted.
66+
67+
Modify run name to work with Experiments restrictions.
6668
"""
6769
logger.info("Starting logdir traversal of %s", self._logdir)
6870
runs_seen = set()
6971
for subdir in io_wrapper.GetLogdirSubdirectories(self._logdir):
7072
run = os.path.relpath(subdir, self._logdir)
73+
run = run.replace("/", "-").replace("_", "-")
7174
runs_seen.add(run)
7275
if run not in self._directory_loaders:
7376
logger.info("- Adding run for relative directory %s", run)

google/cloud/aiplatform/tensorboard/uploader.py

+32-35
Original file line number | Diff line number | Diff line change
@@ -20,22 +20,21 @@
2020
from collections import defaultdict
2121
import functools
2222
import logging
23-
import os
2423
import re
2524
import time
2625
from typing import ContextManager, Dict, FrozenSet, Generator, Iterable, Optional, Tuple
2726
import uuid
2827

29-
from google.api_core import exceptions
3028
from google.cloud import storage
3129
from google.cloud.aiplatform import base
3230
from google.cloud.aiplatform.compat.services import (
3331
tensorboard_service_client,
3432
)
3533
from google.cloud.aiplatform.compat.types import tensorboard_data
36-
from google.cloud.aiplatform.compat.types import tensorboard_experiment
3734
from google.cloud.aiplatform.compat.types import tensorboard_service
3835
from google.cloud.aiplatform.compat.types import tensorboard_time_series
36+
from google.cloud.aiplatform.metadata import experiment_resources
37+
from google.cloud.aiplatform.metadata import metadata
3938
from google.cloud.aiplatform.tensorboard import logdir_loader
4039
from google.cloud.aiplatform.tensorboard import upload_tracker
4140
from google.cloud.aiplatform.tensorboard import uploader_constants
@@ -215,47 +214,45 @@ def active_filter(secs):
215214

216215
self._create_additional_senders()
217216

218-
def _create_or_get_experiment(self) -> tensorboard_experiment.TensorboardExperiment:
219-
"""Create an experiment or get an experiment.
220-
221-
Attempts to create an experiment. If the experiment already exists and
222-
creation fails then the experiment will be retrieved.
217+
def create_experiment(self):
218+
"""Creates an Experiment for this upload session.
223219
224-
Returns:
225-
The created or retrieved experiment.
220+
Sets the tensorboard resource and experiment, which will get or create a
221+
Vertex Experiment and associate it with a Tensorboard Experiment.
226222
"""
227-
logger.info("Creating experiment")
223+
m = self._api.parse_tensorboard_path(self._tensorboard_resource_name)
228224

229-
tb_experiment = tensorboard_experiment.TensorboardExperiment(
230-
description=self._description, display_name=self._experiment_display_name
225+
existing_experiment = experiment_resources.Experiment.get(
226+
experiment_name=self._experiment_name,
227+
project=m["project"],
228+
location=m["location"],
231229
)
232-
233-
try:
234-
experiment = self._api.create_tensorboard_experiment(
235-
parent=self._tensorboard_resource_name,
236-
tensorboard_experiment=tb_experiment,
237-
tensorboard_experiment_id=self._experiment_name,
238-
)
230+
if not existing_experiment:
239231
self._is_brand_new_experiment = True
240-
except exceptions.AlreadyExists:
241-
logger.info("Creating experiment failed. Retrieving experiment.")
242-
experiment_name = os.path.join(
243-
self._tensorboard_resource_name, "experiments", self._experiment_name
244-
)
245-
experiment = self._api.get_tensorboard_experiment(name=experiment_name)
246-
return experiment
247232

248-
def create_experiment(self):
249-
"""Creates an Experiment for this upload session and returns the ID."""
233+
metadata._experiment_tracker.reset()
234+
metadata._experiment_tracker.set_tensorboard(
235+
tensorboard=self._tensorboard_resource_name,
236+
project=m["project"],
237+
location=m["location"],
238+
)
239+
metadata._experiment_tracker.set_experiment(
240+
project=m["project"],
241+
location=m["location"],
242+
experiment=self._experiment_name,
243+
description=self._description,
244+
backing_tensorboard=self._tensorboard_resource_name,
245+
)
250246

251-
experiment = self._create_or_get_experiment()
252-
self._experiment = experiment
247+
self._tensorboard_experiment_resource_name = (
248+
f"{self._tensorboard_resource_name}/experiments/{self._experiment_name}"
249+
)
253250
self._one_platform_resource_manager = uploader_utils.OnePlatformResourceManager(
254-
self._experiment.name, self._api
251+
self._tensorboard_experiment_resource_name, self._api
255252
)
256253

257254
self._request_sender = _BatchedRequestSender(
258-
self._experiment.name,
255+
self._tensorboard_experiment_resource_name,
259256
self._api,
260257
allowed_plugins=self._allowed_plugins,
261258
upload_limits=self._upload_limits,
@@ -271,7 +268,7 @@ def create_experiment(self):
271268
# Update partials with experiment name
272269
for sender in self._additional_senders.keys():
273270
self._additional_senders[sender] = self._additional_senders[sender](
274-
experiment_resource_name=self._experiment.name,
271+
experiment_resource_name=self._tensorboard_experiment_resource_name,
275272
)
276273

277274
self._dispatcher = _Dispatcher(
@@ -310,7 +307,7 @@ def _create_additional_senders(self) -> Dict[str, uploader_utils.RequestSender]:
310307
)
311308

312309
def get_experiment_resource_name(self):
313-
return self._experiment.name
310+
return self._tensorboard_experiment_resource_name
314311

315312
def start_uploading(self):
316313
"""Blocks forever to continuously upload data from the logdir.

0 commit comments

Comments
 (0)