Skip to content

Commit 17d48cd

Browse files
committed
Change Log Signal feature behaviors
1 parent a715dec commit 17d48cd

33 files changed

+854
-1110
lines changed

e2e_tests/tests/cluster/test_log_policies.py

Lines changed: 9 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -157,75 +157,30 @@ def test_log_policy_exclude_slurm(should_match: bool) -> None:
157157

158158
@pytest.mark.e2e_cpu
159159
@pytest.mark.parametrize("should_match", [True, False])
160-
def test_log_signal(should_match: bool) -> None:
160+
def test_log_policy_matched(should_match: bool) -> None:
161161
sess = api_utils.user_session()
162162
regex = r"executing.*action.*exit.*code.*7"
163163
if not should_match:
164164
regex = r"(.*) this should not match (.*)"
165165

166-
expected_signal = "Test Signal"
166+
expected_policy = "Test"
167167
config = {
168-
"log_policies": [{"pattern": regex, "actions": [{"signal": expected_signal}]}],
168+
"log_policies": [{"name": expected_policy, "pattern": regex}],
169169
"max_restarts": 1,
170170
}
171171

172172
exp_ref = noop.create_experiment(sess, [noop.Exit(7)], config=config)
173173
assert exp_ref.wait(interval=0.01) == client.ExperimentState.ERROR
174174

175175
searchRes = utils.get_run_by_exp_id(sess, exp_ref.id)
176-
runSignal = searchRes.runs[0].logSignal
176+
runPolicyMatched = searchRes.runs[0].logPolicyMatched
177177

178178
trialRes = bindings.get_GetTrial(sess, trialId=searchRes.runs[0].id)
179-
trialSignal = trialRes.trial.logSignal
179+
trialPolicyMatched = trialRes.trial.logPolicyMatched
180180

181181
if should_match:
182-
assert runSignal == expected_signal
183-
assert trialSignal == expected_signal
182+
assert runPolicyMatched == expected_policy
183+
assert trialPolicyMatched == expected_policy
184184
else:
185-
assert runSignal is None
186-
assert trialSignal is None
187-
188-
189-
@pytest.mark.e2e_cpu
190-
def test_signal_clear_after_exp_continue() -> None:
191-
sess = api_utils.user_session()
192-
regex = r"executing.*action.*exit.*code.*7"
193-
194-
expected_signal = "Test Signal"
195-
config = {
196-
"log_policies": [{"pattern": regex, "actions": [{"signal": expected_signal}]}],
197-
"max_restarts": 0,
198-
}
199-
200-
exp_ref = noop.create_experiment(sess, [noop.Exit(7)], config=config)
201-
assert exp_ref.wait(interval=0.01) == client.ExperimentState.ERROR
202-
203-
searchRes = utils.get_run_by_exp_id(sess, exp_ref.id)
204-
runSignal = searchRes.runs[0].logSignal
205-
206-
trialRes = bindings.get_GetTrial(sess, trialId=searchRes.runs[0].id)
207-
trialSignal = trialRes.trial.logSignal
208-
209-
assert runSignal == expected_signal
210-
assert trialSignal == expected_signal
211-
212-
detproc.check_call(
213-
sess,
214-
[
215-
"det",
216-
"e",
217-
"continue",
218-
str(exp_ref.id),
219-
*noop.cli_config_overrides([noop.Exit(0)]),
220-
],
221-
)
222-
exp.wait_for_experiment_state(sess, exp_ref.id, bindings.experimentv1State.COMPLETED)
223-
224-
searchRes = utils.get_run_by_exp_id(sess, exp_ref.id)
225-
runSignal = searchRes.runs[0].logSignal
226-
227-
trialRes = bindings.get_GetTrial(sess, trialId=searchRes.runs[0].id)
228-
trialSignal = trialRes.trial.logSignal
229-
230-
assert runSignal is None
231-
assert trialSignal is None
185+
assert runPolicyMatched is None
186+
assert trialPolicyMatched is None

harness/determined/common/api/bindings.py

Lines changed: 16 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

master/internal/api_experiment.go

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1616,10 +1616,9 @@ func (a *apiServer) ContinueExperiment(
16161616
if _, err := tx.NewUpdate().Table("runs"). // TODO(nick-runs) call runs package.
16171617
Set("restarts = 0").
16181618
Set("end_time = null").
1619-
Set("log_signal = null").
16201619
Where("id IN (?)", bun.In(trialIDs)).
16211620
Exec(ctx); err != nil {
1622-
return fmt.Errorf("zeroing out trial stats: %w", err)
1621+
return fmt.Errorf("zeroing out trial restarts: %w", err)
16231622
}
16241623
}
16251624

master/internal/api_runs.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ func getRunsColumns(q *bun.SelectQuery) *bun.SelectQuery {
170170
'pachyderm_integration', NULLIF(e.config#>'{integrations,pachyderm}', 'null'),
171171
'id', e.id) AS experiment`).
172172
ColumnExpr("rm.metadata AS metadata").
173-
ColumnExpr("r.log_signal AS log_signal").
173+
ColumnExpr("r.log_policy_matched AS log_policy_matched").
174174
Join("LEFT JOIN experiments AS e ON r.experiment_id=e.id").
175175
Join("LEFT JOIN runs_metadata AS rm ON r.id=rm.run_id").
176176
Join("LEFT JOIN users u ON e.owner_id = u.id").

master/internal/api_tasks_intg_test.go

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -220,12 +220,12 @@ func TestPostTaskLogsLogPattern(t *testing.T) {
220220
require.NoError(t, err)
221221
activeConfig.RawLogPolicies = expconf.LogPoliciesConfig{
222222
expconf.LogPolicy{
223-
RawPattern: "sub",
224-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeCancelRetries}},
223+
RawPattern: ptrs.Ptr("sub"),
224+
RawAction: &expconf.LogActionV0{Type: expconf.LogActionTypeCancelRetries},
225225
},
226226
expconf.LogPolicy{
227-
RawPattern: `\d{5}$`,
228-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeExcludeNode}},
227+
RawPattern: ptrs.Ptr(`\d{5}$`),
228+
RawAction: &expconf.LogActionV0{Type: expconf.LogActionTypeExcludeNode},
229229
},
230230
}
231231

@@ -453,11 +453,10 @@ func TestPostTaskLogsLogSignalDataSaving(t *testing.T) {
453453
activeConfig, err := api.m.db.ActiveExperimentConfig(trial.ExperimentID)
454454
require.NoError(t, err)
455455

456-
signal := "sub"
457456
activeConfig.RawLogPolicies = expconf.LogPoliciesConfig{
458457
expconf.LogPolicy{
459-
RawPattern: "sub",
460-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeSignal, Signal: &signal}},
458+
RawName: ptrs.Ptr("test"),
459+
RawPattern: ptrs.Ptr("sub"),
461460
},
462461
}
463462

@@ -490,30 +489,30 @@ func TestPostTaskLogsLogSignalDataSaving(t *testing.T) {
490489
require.NoError(t, err)
491490

492491
runsOut := struct {
493-
bun.BaseModel `bun:"table:runs"`
494-
LogSignal *string `db:"log_signal"`
492+
bun.BaseModel `bun:"table:runs"`
493+
LogPolicyMatched *string `db:"log_policy_matched"`
495494
}{}
496495

497496
err = db.Bun().NewSelect().Model(&runsOut).
498497
Where("id = ?", trial.ID).
499498
Scan(ctx)
500499
require.NoError(t, err)
501500
require.NotNil(t, runsOut)
502-
require.NotNil(t, runsOut.LogSignal)
501+
require.NotNil(t, runsOut.LogPolicyMatched)
503502

504-
require.Equal(t, "sub", *runsOut.LogSignal)
503+
require.Equal(t, "test", *runsOut.LogPolicyMatched)
505504

506505
tasksOut := struct {
507-
bun.BaseModel `bun:"table:tasks"`
508-
LogSignal *string `db:"log_signal"`
506+
bun.BaseModel `bun:"table:tasks"`
507+
LogPolicyMatched *string `db:"log_policy_matched"`
509508
}{}
510509
err = db.Bun().NewSelect().Model(&tasksOut).
511510
Join("LEFT JOIN run_id_task_id AS rt on tasks.task_id = rt.task_id").
512511
Where("run_id = ?", trial.ID).
513512
Scan(ctx)
514513
require.NoError(t, err)
515514
require.NotNil(t, tasksOut)
516-
require.NotNil(t, tasksOut.LogSignal)
515+
require.NotNil(t, tasksOut.LogPolicyMatched)
517516

518-
require.Equal(t, "sub", *tasksOut.LogSignal)
517+
require.Equal(t, "test", *tasksOut.LogPolicyMatched)
519518
}

master/internal/configpolicy/postgres_task_config_policy.go

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -22,20 +22,7 @@ const (
2222
"resources": {"slots": 4, "max_slots": 8},
2323
"log_policies": [
2424
{
25-
"pattern": ".*CUDA out of memory.*",
26-
"actions": [
27-
{
28-
"signal": "CUDA OOM"
29-
}
30-
]
31-
},
32-
{
33-
"pattern": ".*uncorrectable ECC error encountered.*",
34-
"actions": [
35-
{
36-
"signal": "ECC Error"
37-
}
38-
]
25+
"pattern": "nonrepeat"
3926
}
4027
]
4128
}`

master/internal/configpolicy/task_config_policy_intg_test.go

Lines changed: 4 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -472,23 +472,10 @@ func TestMergeWithInvariantExperimentConfigs(t *testing.T) {
472472
}
473473
],
474474
"log_policies": [
475-
{
476-
"pattern": ".*CUDA out of memory.*",
477-
"actions": [
478-
{
479-
"signal": "CUDA OOM"
480-
}
481-
]
482-
},
483-
{
484-
"pattern": ".*uncorrectable ECC error encountered.*",
485-
"actions": [
486-
{
487-
"signal": "ECC Error"
488-
}
489-
]
490-
}
491-
]
475+
{
476+
"pattern": "nonrepeat"
477+
}
478+
]
492479
}`
493480

494481
var defaultInvariantConfig expconf.ExperimentConfigV0

master/internal/db/postgres_experiments_intg_test.go

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -436,16 +436,14 @@ func TestActiveLogPatternPolicies(t *testing.T) {
436436
policies, err := ActiveLogPolicies(ctx, exp.ID)
437437
require.NoError(t, err)
438438
require.NotEmpty(t, policies)
439-
eccErrorSignal := "ECC Error"
440-
cudaOOMSignal := "CUDA OOM"
441439
expected := expconf.LogPoliciesConfig{
442440
expconf.LogPolicy{
443-
RawPattern: ".*CUDA out of memory.*",
444-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeSignal, Signal: &cudaOOMSignal}},
441+
RawName: ptrs.Ptr(expconf.CUDAOOM),
442+
RawPattern: ptrs.Ptr(expconf.CUDAOOMPattern),
445443
},
446444
expconf.LogPolicy{
447-
RawPattern: ".*uncorrectable ECC error encountered.*",
448-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeSignal, Signal: &eccErrorSignal}},
445+
RawName: ptrs.Ptr(expconf.ECCError),
446+
RawPattern: ptrs.Ptr(expconf.ECCErrorPattern),
449447
},
450448
}
451449

@@ -455,12 +453,12 @@ func TestActiveLogPatternPolicies(t *testing.T) {
455453
require.NoError(t, err)
456454
activeConfig.RawLogPolicies = expconf.LogPoliciesConfig{
457455
expconf.LogPolicy{
458-
RawPattern: `\d{5}$`,
459-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeExcludeNode}},
456+
RawPattern: ptrs.Ptr(`\d{5}$`),
457+
RawAction: &expconf.LogActionV0{Type: expconf.LogActionTypeExcludeNode},
460458
},
461459
expconf.LogPolicy{
462-
RawPattern: "sub",
463-
RawActions: expconf.LogActionsV0{expconf.LogActionV0{Type: expconf.LogActionTypeCancelRetries}},
460+
RawPattern: ptrs.Ptr("sub"),
461+
RawAction: &expconf.LogActionV0{Type: expconf.LogActionTypeCancelRetries},
464462
},
465463
}
466464

0 commit comments

Comments
 (0)