Skip to content

Commit 5ebd618

Browse files
committed
fix: fix regression caused by join on trials view
1 parent f78b9aa commit 5ebd618

File tree

8 files changed

+185
-28
lines changed

8 files changed

+185
-28
lines changed

master/internal/api_experiment_intg_test.go

+28-6
Original file line numberDiff line numberDiff line change
@@ -516,14 +516,25 @@ func TestPutExperimentRetainLogs(t *testing.T) {
516516
require.NoError(t, err)
517517
require.NotNil(t, res)
518518

519-
var logRetentionDays []int
519+
var taskLogRetentionDays []int
520520
err = db.Bun().NewSelect().Table("tasks").
521521
Column("log_retention_days").
522522
Where("task_id IN (?)", bun.In(taskIDs)).
523-
Scan(ctx, &logRetentionDays)
523+
Scan(ctx, &taskLogRetentionDays)
524524
require.NoError(t, err)
525525

526-
for _, v := range logRetentionDays {
526+
for _, v := range taskLogRetentionDays {
527+
require.Equal(t, v, numDays)
528+
}
529+
530+
var trialLogRetentionDays []int
531+
err = db.Bun().NewSelect().Table("runs").
532+
Column("log_retention_days").
533+
Where("id IN (?)", bun.In(trialIDs)).
534+
Scan(ctx, &trialLogRetentionDays)
535+
require.NoError(t, err)
536+
537+
for _, v := range trialLogRetentionDays {
527538
require.Equal(t, v, numDays)
528539
}
529540
}
@@ -561,14 +572,25 @@ func TestPutExperimentsRetainLogs(t *testing.T) {
561572
_, taskIDs, err := db.ExperimentsTrialAndTaskIDs(ctx, db.Bun(), intExpIDS)
562573
require.NoError(t, err)
563574

564-
var logRetentionDays []int
575+
var taskLogRetentionDays []int
565576
err = db.Bun().NewSelect().Table("tasks").
566577
Column("log_retention_days").
567578
Where("task_id IN (?)", bun.In(taskIDs)).
568-
Scan(ctx, &logRetentionDays)
579+
Scan(ctx, &taskLogRetentionDays)
580+
require.NoError(t, err)
581+
582+
for _, v := range taskLogRetentionDays {
583+
require.Equal(t, v, numDays)
584+
}
585+
586+
var trialLogRetentionDays []int
587+
err = db.Bun().NewSelect().Table("runs").
588+
Column("log_retention_days").
589+
Where("id IN (?)", bun.In(trialIDs)).
590+
Scan(ctx, &trialLogRetentionDays)
569591
require.NoError(t, err)
570592

571-
for _, v := range logRetentionDays {
593+
for _, v := range trialLogRetentionDays {
572594
require.Equal(t, v, numDays)
573595
}
574596
}

master/internal/api_trials.go

+6
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,12 @@ func (a *apiServer) PutTrialRetainLogs(
717717
Exec(ctx); err != nil {
718718
return fmt.Errorf("updating log retention days for tasks: %w", err)
719719
}
720+
if _, err := tx.NewUpdate().Table("runs").
721+
Set("log_retention_days = ?", req.NumDays).
722+
Where("id = ?", req.TrialId).
723+
Exec(ctx); err != nil {
724+
return fmt.Errorf("updating log retention days for trial: %w", err)
725+
}
720726
return nil
721727
})
722728
if err != nil {

master/internal/experiment/bulk_action.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ func BulkUpdateLogRentention(ctx context.Context, database db.DB,
807807
}
808808
}
809809

810-
_, taskIDs, err := db.ExperimentsTrialAndTaskIDs(ctx, db.Bun(), intExpIDs)
810+
trialIDs, taskIDs, err := db.ExperimentsTrialAndTaskIDs(ctx, db.Bun(), intExpIDs)
811811
if err != nil {
812812
return nil, errors.Wrapf(err, "failed to gather trial IDs for experiments")
813813
}
@@ -823,8 +823,15 @@ func BulkUpdateLogRentention(ctx context.Context, database db.DB,
823823
Exec(ctx); err != nil {
824824
return fmt.Errorf("updating log retention days for tasks: %w", err)
825825
}
826+
if _, err := tx.NewUpdate().Table("runs").
827+
Set("log_retention_days = ?", numDays).
828+
Where("id IN (?)", bun.In(trialIDs)).
829+
Exec(ctx); err != nil {
830+
return fmt.Errorf("updating log retention days for tasks: %w", err)
831+
}
826832
return nil
827833
})
834+
828835
if err != nil {
829836
return nil, err
830837
}

master/pkg/model/experiment.go

+2
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,7 @@ func (t *Trial) ToRunAndTrialV2(experimentsProjectID int) (*Run, *TrialV2) {
479479
Restarts: t.Restarts,
480480
RunnerState: t.RunnerState,
481481
LastActivity: t.LastActivity,
482+
LogRetentionDays: t.LogRetentionDays,
482483
}
483484
v2 := &TrialV2{
484485
RunID: t.ID,
@@ -516,6 +517,7 @@ type Run struct {
516517
Restarts int `db:"restarts"`
517518
RunnerState string `db:"runner_state"`
518519
LastActivity *time.Time `db:"last_activity"`
520+
LogRetentionDays *int16 `db:"log_retention_days"`
519521
}
520522

521523
// RunTaskID represents a row from the `run_id_task_id` table.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
ALTER TABLE RUNS DROP COLUMN log_retention_days;
2+
3+
DROP VIEW trials;
4+
CREATE VIEW trials AS
5+
WITH task_log_retention AS (
6+
SELECT
7+
r.run_id,
8+
-- This is written with the assumption that every task related
9+
-- to a trial will have the same log_retention_days. MIN() is
10+
-- just used to aggregate the number of days for each trial.
11+
MIN(t.log_retention_days) as log_retention_days
12+
FROM tasks t
13+
JOIN run_id_task_id as r ON t.task_id = r.task_id
14+
GROUP BY r.run_id
15+
)
16+
SELECT
17+
t.run_id AS id,
18+
19+
-- metrics
20+
r.summary_metrics AS summary_metrics,
21+
r.summary_metrics_timestamp AS summary_metrics_timestamp,
22+
r.latest_validation_id AS latest_validation_id,
23+
r.total_batches AS total_batches,
24+
25+
-- metadata fields
26+
r.state AS state,
27+
r.tags AS tags,
28+
r.external_run_id AS external_trial_id,
29+
r.restart_id AS run_id,
30+
r.last_activity AS last_activity,
31+
r.start_time AS start_time,
32+
r.end_time AS end_time,
33+
r.restarts AS restarts,
34+
l.log_retention_days AS log_retention_days,
35+
36+
-- run_hp_search_stuff
37+
r.hparams AS hparams,
38+
r.searcher_metric_value AS searcher_metric_value,
39+
r.searcher_metric_value_signed AS searcher_metric_value_signed,
40+
r.best_validation_id AS best_validation_id,
41+
42+
-- run_checkpoint_stats
43+
r.checkpoint_size AS checkpoint_size,
44+
r.checkpoint_count AS checkpoint_count,
45+
46+
-- trial_v2 table.
47+
t.request_id AS request_id,
48+
t.seed AS seed,
49+
50+
r.experiment_id AS experiment_id,
51+
r.warm_start_checkpoint_id AS warm_start_checkpoint_id,
52+
53+
-- eventually delete runner state.
54+
r.runner_state AS runner_state
55+
FROM trials_v2 t
56+
JOIN runs r ON t.run_id = r.id
57+
JOIN task_log_retention l ON l.run_id = t.run_id;
58+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
ALTER TABLE runs ADD COLUMN log_retention_days SMALLINT;
2+
3+
DROP VIEW trials;
4+
5+
CREATE VIEW trials AS
6+
SELECT
7+
t.run_id AS id,
8+
9+
-- metrics
10+
r.summary_metrics AS summary_metrics,
11+
r.summary_metrics_timestamp AS summary_metrics_timestamp,
12+
r.latest_validation_id AS latest_validation_id,
13+
r.total_batches AS total_batches,
14+
15+
-- metadata fields
16+
r.state AS state,
17+
r.tags AS tags,
18+
r.external_run_id AS external_trial_id,
19+
r.restart_id AS run_id,
20+
r.last_activity AS last_activity,
21+
r.start_time AS start_time,
22+
r.end_time AS end_time,
23+
r.restarts AS restarts,
24+
r.log_retention_days AS log_retention_days,
25+
26+
-- run_hp_search_stuff
27+
r.hparams AS hparams,
28+
r.searcher_metric_value AS searcher_metric_value,
29+
r.searcher_metric_value_signed AS searcher_metric_value_signed,
30+
r.best_validation_id AS best_validation_id,
31+
32+
-- run_checkpoint_stats
33+
r.checkpoint_size AS checkpoint_size,
34+
r.checkpoint_count AS checkpoint_count,
35+
36+
-- trial_v2 table.
37+
t.request_id AS request_id,
38+
t.seed AS seed,
39+
40+
r.experiment_id AS experiment_id,
41+
r.warm_start_checkpoint_id AS warm_start_checkpoint_id,
42+
43+
-- eventually delete runner state.
44+
r.runner_state AS runner_state
45+
FROM trials_v2 t
46+
JOIN runs r ON t.run_id = r.id;

proto/pkg/runv1/run.pb.go

+35-21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

proto/src/determined/run/v1/run.proto

+2
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,6 @@ message FlatRun {
111111
bool parent_archived = 18;
112112
// Data related the the experiment associated with this run.
113113
optional FlatRunExperiment experiment = 19;
114+
// Number of days to retain logs for.
115+
optional int32 log_retention_days = 20;
114116
}

0 commit comments

Comments
 (0)