Skip to content

Commit 879dbcd

Browse files
vertex-sdk-bot authored and copybara-github committed
fix: Rollback change to tensorboard uploader causing increased latency
PiperOrigin-RevId: 673086243
1 parent 71c6f3c commit 879dbcd

File tree

3 files changed

+234
-378
lines changed

3 files changed

+234
-378
lines changed

google/cloud/aiplatform/tensorboard/tensorboard_resource.py

-8
Original file line numberDiff line numberDiff line change
@@ -858,14 +858,6 @@ def list(
858858

859859
return tensorboard_runs
860860

861-
def get_tensorboard_time_series_id(self, display_name: str) -> str:
862-
"""Returns the TensorboardTimeSeries with the given display name."""
863-
if display_name not in self._time_series_display_name_to_id_mapping:
864-
self._sync_time_series_display_name_to_id_mapping()
865-
866-
time_series_id = self._time_series_display_name_to_id_mapping.get(display_name)
867-
return time_series_id
868-
869861
def write_tensorboard_scalar_data(
870862
self,
871863
time_series_data: Dict[str, float],

google/cloud/aiplatform/tensorboard/uploader_utils.py

+83 -85
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""Shared utils for tensorboard log uploader."""
1919
import abc
2020
import contextlib
21+
import json
2122
import logging
2223
import re
2324
import time
@@ -120,7 +121,7 @@ def batch_create_runs(
120121
"""
121122
created_runs = []
122123
for run_name in run_names:
123-
tb_run = self._get_or_create_run_resource(run_name)
124+
tb_run = self._create_or_get_run_resource(run_name)
124125
created_runs.append(tb_run)
125126
if run_name not in self._run_name_to_run_resource_name:
126127
self._run_name_to_run_resource_name[run_name] = tb_run.resource_name
@@ -195,11 +196,11 @@ def get_run_resource_name(self, run_name: str) -> str:
195196
Resource name of the run.
196197
"""
197198
if run_name not in self._run_name_to_run_resource_name:
198-
tb_run = self._get_or_create_run_resource(run_name)
199+
tb_run = self._create_or_get_run_resource(run_name)
199200
self._run_name_to_run_resource_name[run_name] = tb_run.resource_name
200201
return self._run_name_to_run_resource_name[run_name]
201202

202-
def _get_or_create_run_resource(
203+
def _create_or_get_run_resource(
203204
self, run_name: str
204205
) -> tensorboard_run.TensorboardRun:
205206
"""Creates new experiment run and tensorboard run resources.
@@ -270,7 +271,7 @@ def get_time_series_resource_name(
270271
Resource name of the time series
271272
"""
272273
if (run_name, tag_name) not in self._run_tag_name_to_time_series_name:
273-
time_series = self._get_or_create_time_series(
274+
time_series = self._create_or_get_time_series(
274275
self.get_run_resource_name(run_name),
275276
tag_name,
276277
time_series_resource_creator,
@@ -280,7 +281,7 @@ def get_time_series_resource_name(
280281
] = time_series.name
281282
return self._run_tag_name_to_time_series_name[(run_name, tag_name)]
282283

283-
def _get_or_create_time_series(
284+
def _create_or_get_time_series(
284285
self,
285286
run_resource_name: str,
286287
tag_name: str,
@@ -310,29 +311,45 @@ def _get_or_create_time_series(
310311
ValueError:
311312
More than one time series with the resource name was found.
312313
"""
313-
time_series = None
314-
run_name = run_resource_name.split("/")[-1]
315-
run = self._get_or_create_run_resource(run_name)
316-
time_series_id = run.get_tensorboard_time_series_id(tag_name)
317-
if time_series_id:
318-
time_series = self._api.get_tensorboard_time_series(
319-
request=tensorboard_service.GetTensorboardTimeSeriesRequest(
320-
name=run_resource_name + "/timeSeries/" + time_series_id
321-
)
314+
time_series = time_series_resource_creator()
315+
time_series.display_name = tag_name
316+
try:
317+
time_series = self._api.create_tensorboard_time_series(
318+
parent=run_resource_name, tensorboard_time_series=time_series
322319
)
323-
if not time_series:
324-
time_series = time_series_resource_creator()
325-
time_series.display_name = tag_name
326-
try:
327-
time_series = self._api.create_tensorboard_time_series(
328-
parent=run_resource_name, tensorboard_time_series=time_series
320+
except exceptions.InvalidArgument as e:
321+
# If the time series display name already exists then retrieve it
322+
if "already exist" in e.message:
323+
list_of_time_series = self._api.list_tensorboard_time_series(
324+
request=tensorboard_service.ListTensorboardTimeSeriesRequest(
325+
parent=run_resource_name,
326+
filter="display_name = {}".format(json.dumps(str(tag_name))),
327+
)
329328
)
330-
except exceptions.InvalidArgument as e:
331-
raise ValueError(
332-
"Could not find time series resource with display name: {}".format(
333-
tag_name
329+
num = 0
330+
time_series = None
331+
332+
for ts in list_of_time_series:
333+
num += 1
334+
if num > 1:
335+
break
336+
time_series = ts
337+
338+
if not time_series:
339+
raise ExistingResourceNotFoundError(
340+
"Could not find time series resource with display name: {}".format(
341+
tag_name
342+
)
343+
)
344+
345+
if num != 1:
346+
raise ValueError(
347+
"More than one time series resource found with display_name: {}".format(
348+
tag_name
349+
)
334350
)
335-
) from e
351+
else:
352+
raise
336353
return time_series
337354

338355

@@ -355,45 +372,6 @@ def __init__(self, run_resource_id: str, api: TensorboardServiceClient):
355372
str, tensorboard_time_series.TensorboardTimeSeries
356373
] = {}
357374

358-
def _get_run_resource(self) -> tensorboard_run.TensorboardRun:
359-
"""Gets or creates new experiment run and tensorboard run resources.
360-
361-
The experiment run will be associated with the tensorboard run resource.
362-
This will link all tensorboard run data to the associated experiment.
363-
364-
Returns:
365-
tb_run (tensorboard_run.TensorboardRun):
366-
The TensorboardRun given the run_name.
367-
368-
Raises:
369-
ValueError:
370-
run_resource_id is invalid.
371-
"""
372-
m = re.match(
373-
"projects/(.*)/locations/(.*)/tensorboards/(.*)/experiments/(.*)/runs/(.*)",
374-
self._run_resource_id,
375-
)
376-
project = m[1]
377-
location = m[2]
378-
tensorboard = m[3]
379-
experiment = m[4]
380-
run_name = m[5]
381-
experiment_run = experiment_run_resource.ExperimentRun.get(
382-
project=project, location=location, run_name=run_name
383-
)
384-
if not experiment_run:
385-
experiment_run = experiment_run_resource.ExperimentRun.create(
386-
project=project,
387-
location=location,
388-
run_name=run_name,
389-
experiment=experiment,
390-
tensorboard=tensorboard,
391-
state=gca_execution.Execution.State.RUNNING,
392-
)
393-
tb_run_artifact = experiment_run._backing_tensorboard_run
394-
tb_run = tb_run_artifact.resource
395-
return tb_run
396-
397375
def get_or_create(
398376
self,
399377
tag_name: str,
@@ -416,36 +394,56 @@ def get_or_create(
416394
A new or existing tensorboard_time_series.TensorboardTimeSeries.
417395
418396
Raises:
419-
ValueError:
397+
exceptions.InvalidArgument:
420398
The tag_name or time_series_resource_creator is an invalid argument
421399
to create_tensorboard_time_series api call.
400+
ExistingResourceNotFoundError:
401+
Could not find the resource given the tag name.
402+
ValueError:
403+
More than one time series with the resource name was found.
422404
"""
423405
if tag_name in self._tag_to_time_series_proto:
424406
return self._tag_to_time_series_proto[tag_name]
425407

426-
time_series = None
427-
tb_run = self._get_run_resource()
428-
time_series_id = tb_run.get_tensorboard_time_series_id(tag_name)
429-
if time_series_id:
430-
time_series = self._api.get_tensorboard_time_series(
431-
request=tensorboard_service.GetTensorboardTimeSeriesRequest(
432-
name=self._run_resource_id + "/timeSeries/" + time_series_id
433-
)
408+
time_series = time_series_resource_creator()
409+
time_series.display_name = tag_name
410+
try:
411+
time_series = self._api.create_tensorboard_time_series(
412+
parent=self._run_resource_id, tensorboard_time_series=time_series
434413
)
435-
if not time_series:
436-
time_series = time_series_resource_creator()
437-
time_series.display_name = tag_name
438-
439-
try:
440-
time_series = self._api.create_tensorboard_time_series(
441-
parent=self._run_resource_id, tensorboard_time_series=time_series
414+
except exceptions.InvalidArgument as e:
415+
# If the time series display name already exists then retrieve it
416+
if "already exist" in e.message:
417+
list_of_time_series = self._api.list_tensorboard_time_series(
418+
request=tensorboard_service.ListTensorboardTimeSeriesRequest(
419+
parent=self._run_resource_id,
420+
filter="display_name = {}".format(json.dumps(str(tag_name))),
421+
)
442422
)
443-
except exceptions.InvalidArgument as e:
444-
raise ValueError(
445-
"Could not find time series resource with display name: {}".format(
446-
tag_name
423+
num = 0
424+
time_series = None
425+
426+
for ts in list_of_time_series:
427+
num += 1
428+
if num > 1:
429+
break
430+
time_series = ts
431+
432+
if not time_series:
433+
raise ExistingResourceNotFoundError(
434+
"Could not find time series resource with display name: {}".format(
435+
tag_name
436+
)
437+
)
438+
439+
if num != 1:
440+
raise ValueError(
441+
"More than one time series resource found with display_name: {}".format(
442+
tag_name
443+
)
447444
)
448-
) from e
445+
else:
446+
raise
449447

450448
self._tag_to_time_series_proto[tag_name] = time_series
451449
return time_series

0 commit comments

Comments
 (0)