Skip to content

Commit 62584a5

Browse files
committed
refactor, Simplify assessment cleanup for tests
1 parent 3438679 commit 62584a5

File tree

8 files changed

+338
-333
lines changed

8 files changed

+338
-333
lines changed

cmd/eval-dev-quality/cmd/evaluate_test.go

Lines changed: 281 additions & 251 deletions
Large diffs are not rendered by default.

evaluate/evaluate_test.go

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -126,19 +126,12 @@ func TestEvaluate(t *testing.T) {
126126

127127
var actualAssessments metricstesting.AssessmentTuples
128128
require.NoError(t, assessmentStore.Walk(func(m evalmodel.Model, l language.Language, r string, ti task.Identifier, a metrics.Assessments) error {
129-
// Normalize assessments.
130-
if v, ok := a[metrics.AssessmentKeyProcessingTime]; ok {
131-
if assert.Greater(t, v, uint64(0)) {
132-
delete(a, metrics.AssessmentKeyProcessingTime)
133-
}
134-
}
135-
136129
actualAssessments = append(actualAssessments, &metricstesting.AssessmentTuple{
137130
Model: m,
138131
Language: l,
139132
RepositoryPath: r,
140133
Task: ti,
141-
Assessment: a,
134+
Assessment: metricstesting.Clean(a),
142135
})
143136

144137
return nil
@@ -501,7 +494,6 @@ func TestEvaluate(t *testing.T) {
501494
RepositoryPath: repositoryNextPath,
502495
Task: evaluatetask.IdentifierWriteTests,
503496
Assessment: map[metrics.AssessmentKey]uint64{
504-
metrics.AssessmentKeyCoverage: 0,
505497
metrics.AssessmentKeyFilesExecuted: 1,
506498
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
507499
metrics.AssessmentKeyResponseNoError: 1,
@@ -513,7 +505,6 @@ func TestEvaluate(t *testing.T) {
513505
RepositoryPath: repositoryNextPath,
514506
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
515507
Assessment: map[metrics.AssessmentKey]uint64{
516-
metrics.AssessmentKeyCoverage: 0,
517508
metrics.AssessmentKeyFilesExecuted: 1,
518509
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
519510
metrics.AssessmentKeyResponseNoError: 1,
@@ -525,7 +516,6 @@ func TestEvaluate(t *testing.T) {
525516
RepositoryPath: repositoryPlainPath,
526517
Task: evaluatetask.IdentifierWriteTests,
527518
Assessment: map[metrics.AssessmentKey]uint64{
528-
metrics.AssessmentKeyCoverage: 0,
529519
metrics.AssessmentKeyFilesExecuted: 2,
530520
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
531521
metrics.AssessmentKeyResponseNoError: 2,
@@ -537,7 +527,6 @@ func TestEvaluate(t *testing.T) {
537527
RepositoryPath: repositoryPlainPath,
538528
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
539529
Assessment: map[metrics.AssessmentKey]uint64{
540-
metrics.AssessmentKeyCoverage: 0,
541530
metrics.AssessmentKeyFilesExecuted: 2,
542531
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
543532
metrics.AssessmentKeyResponseNoError: 2,
@@ -602,7 +591,6 @@ func TestEvaluate(t *testing.T) {
602591
RepositoryPath: repositoryNextPath,
603592
Task: evaluatetask.IdentifierWriteTests,
604593
Assessment: map[metrics.AssessmentKey]uint64{
605-
metrics.AssessmentKeyCoverage: 0,
606594
metrics.AssessmentKeyFilesExecuted: 2,
607595
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
608596
metrics.AssessmentKeyResponseNoError: 2,
@@ -614,7 +602,6 @@ func TestEvaluate(t *testing.T) {
614602
RepositoryPath: repositoryNextPath,
615603
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
616604
Assessment: map[metrics.AssessmentKey]uint64{
617-
metrics.AssessmentKeyCoverage: 0,
618605
metrics.AssessmentKeyFilesExecuted: 2,
619606
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
620607
metrics.AssessmentKeyResponseNoError: 2,
@@ -626,7 +613,6 @@ func TestEvaluate(t *testing.T) {
626613
RepositoryPath: repositoryPlainPath,
627614
Task: evaluatetask.IdentifierWriteTests,
628615
Assessment: map[metrics.AssessmentKey]uint64{
629-
metrics.AssessmentKeyCoverage: 0,
630616
metrics.AssessmentKeyFilesExecuted: 1,
631617
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
632618
metrics.AssessmentKeyResponseNoError: 1,
@@ -638,7 +624,6 @@ func TestEvaluate(t *testing.T) {
638624
RepositoryPath: repositoryPlainPath,
639625
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
640626
Assessment: map[metrics.AssessmentKey]uint64{
641-
metrics.AssessmentKeyCoverage: 0,
642627
metrics.AssessmentKeyFilesExecuted: 1,
643628
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
644629
metrics.AssessmentKeyResponseNoError: 1,
@@ -762,7 +747,6 @@ func TestEvaluate(t *testing.T) {
762747
RepositoryPath: repositoryPath,
763748
Task: evaluatetask.IdentifierWriteTests,
764749
Assessment: map[metrics.AssessmentKey]uint64{
765-
metrics.AssessmentKeyCoverage: 0,
766750
metrics.AssessmentKeyFilesExecuted: 3,
767751
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
768752
metrics.AssessmentKeyResponseNoError: 3,
@@ -774,7 +758,6 @@ func TestEvaluate(t *testing.T) {
774758
RepositoryPath: repositoryPath,
775759
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
776760
Assessment: map[metrics.AssessmentKey]uint64{
777-
metrics.AssessmentKeyCoverage: 0,
778761
metrics.AssessmentKeyFilesExecuted: 3,
779762
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
780763
metrics.AssessmentKeyResponseNoError: 3,
@@ -834,7 +817,6 @@ func TestEvaluate(t *testing.T) {
834817
RepositoryPath: repositoryPath,
835818
Task: evaluatetask.IdentifierWriteTests,
836819
Assessment: map[metrics.AssessmentKey]uint64{
837-
metrics.AssessmentKeyCoverage: 0,
838820
metrics.AssessmentKeyFilesExecuted: 3,
839821
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
840822
metrics.AssessmentKeyResponseNoError: 3,
@@ -846,7 +828,6 @@ func TestEvaluate(t *testing.T) {
846828
RepositoryPath: repositoryPath,
847829
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
848830
Assessment: map[metrics.AssessmentKey]uint64{
849-
metrics.AssessmentKeyCoverage: 0,
850831
metrics.AssessmentKeyFilesExecuted: 3,
851832
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
852833
metrics.AssessmentKeyResponseNoError: 3,
@@ -935,7 +916,6 @@ func TestEvaluate(t *testing.T) {
935916
RepositoryPath: repositoryPath,
936917
Task: evaluatetask.IdentifierWriteTests,
937918
Assessment: map[metrics.AssessmentKey]uint64{
938-
metrics.AssessmentKeyCoverage: 0,
939919
metrics.AssessmentKeyFilesExecuted: 3,
940920
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
941921
metrics.AssessmentKeyResponseNoError: 3,
@@ -947,7 +927,6 @@ func TestEvaluate(t *testing.T) {
947927
RepositoryPath: repositoryPath,
948928
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
949929
Assessment: map[metrics.AssessmentKey]uint64{
950-
metrics.AssessmentKeyCoverage: 0,
951930
metrics.AssessmentKeyFilesExecuted: 3,
952931
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
953932
metrics.AssessmentKeyResponseNoError: 3,
@@ -1019,7 +998,6 @@ func TestEvaluate(t *testing.T) {
1019998
RepositoryPath: repositoryPath,
1020999
Task: evaluatetask.IdentifierWriteTests,
10211000
Assessment: map[metrics.AssessmentKey]uint64{
1022-
metrics.AssessmentKeyCoverage: 0,
10231001
metrics.AssessmentKeyFilesExecuted: 3,
10241002
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
10251003
metrics.AssessmentKeyResponseNoError: 3,
@@ -1031,7 +1009,6 @@ func TestEvaluate(t *testing.T) {
10311009
RepositoryPath: repositoryPath,
10321010
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
10331011
Assessment: map[metrics.AssessmentKey]uint64{
1034-
metrics.AssessmentKeyCoverage: 0,
10351012
metrics.AssessmentKeyFilesExecuted: 3,
10361013
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
10371014
metrics.AssessmentKeyResponseNoError: 3,
@@ -1085,7 +1062,6 @@ func TestEvaluate(t *testing.T) {
10851062
RepositoryPath: repositoryPath,
10861063
Task: evaluatetask.IdentifierWriteTests,
10871064
Assessment: map[metrics.AssessmentKey]uint64{
1088-
metrics.AssessmentKeyCoverage: 0,
10891065
metrics.AssessmentKeyFilesExecuted: 1,
10901066
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
10911067
metrics.AssessmentKeyResponseNoError: 1,
@@ -1097,7 +1073,6 @@ func TestEvaluate(t *testing.T) {
10971073
RepositoryPath: repositoryPath,
10981074
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
10991075
Assessment: map[metrics.AssessmentKey]uint64{
1100-
metrics.AssessmentKeyCoverage: 0,
11011076
metrics.AssessmentKeyFilesExecuted: 1,
11021077
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
11031078
metrics.AssessmentKeyResponseNoError: 1,

evaluate/metrics/testing/assessments.go

Lines changed: 28 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,50 @@
11
package metricstesting
22

33
import (
4-
"testing"
5-
6-
"golang.org/x/exp/maps"
7-
8-
"github.com/stretchr/testify/assert"
9-
"github.com/stretchr/testify/require"
4+
"maps"
105

116
"github.com/symflower/eval-dev-quality/evaluate/metrics"
127
"github.com/symflower/eval-dev-quality/language"
138
"github.com/symflower/eval-dev-quality/model"
149
"github.com/symflower/eval-dev-quality/task"
1510
)
1611

17-
// AssertAssessmentsEqual checks if the given assessments are equal ignoring default and nondeterministic values.
18-
func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual metrics.Assessments) {
19-
expected = maps.Clone(expected)
20-
actual = maps.Clone(actual)
12+
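// Clean returns a copy of the assessment without the nondeterministic processing time and without any known assessment key whose value is zero; the given assessment itself is not modified.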
13+
func Clean(assessment metrics.Assessments) metrics.Assessments {
14+
copy := metrics.Assessments{}
15+
maps.Copy(copy, assessment)
2116

22-
clearNonDeterministicAssessmentValues(expected)
23-
clearNonDeterministicAssessmentValues(actual)
17+
delete(copy, metrics.AssessmentKeyProcessingTime)
2418

25-
assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual)
26-
}
19+
for _, key := range metrics.AllAssessmentKeysStrings {
20+
if copy[metrics.AssessmentKey(key)] == 0 {
21+
delete(copy, metrics.AssessmentKey(key))
22+
}
23+
}
2724

28-
// AssertTaskAssessmentsEqual checks if the given assessments per task are equal ignoring default and nondeterministic values.
29-
func AssertTaskAssessmentsEqual(t *testing.T, expected map[task.Identifier]metrics.Assessments, actual map[task.Identifier]metrics.Assessments) {
30-
expected = maps.Clone(expected)
31-
actual = maps.Clone(actual)
25+
return copy
26+
}
3227

33-
// The expected and actual maps must have the same task identifiers.
34-
require.ElementsMatch(t, maps.Keys(expected), maps.Keys(actual))
28+
// CleanSlice returns a new slice that holds, in the same order, a cleaned copy of each given assessment (see Clean); the given assessments are not modified.
29+
func CleanSlice(assessments []metrics.Assessments) []metrics.Assessments {
30+
copy := make([]metrics.Assessments, len(assessments))
3531

36-
// Ignore non-deterministic values.
37-
for _, assessment := range expected {
38-
clearNonDeterministicAssessmentValues(assessment)
39-
}
40-
for _, assessment := range actual {
41-
clearNonDeterministicAssessmentValues(assessment)
32+
for i, assessment := range assessments {
33+
copy[i] = Clean(assessment)
4234
}
4335

44-
for task, expectedAssessment := range expected {
45-
actualAssessment := actual[task]
46-
assert.Truef(t, expectedAssessment.Equal(actualAssessment), "task:%s\nexpected:%s\nactual:%s", task, expected, actual)
47-
}
36+
return copy
4837
}
4938

50-
// clearNonDeterministicAssessmentValues ignores non-deterministic values such as processing time and response character count.
51-
func clearNonDeterministicAssessmentValues(assessment metrics.Assessments) {
52-
assessment[metrics.AssessmentKeyProcessingTime] = 0
53-
assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0
54-
assessment[metrics.AssessmentKeyResponseCharacterCount] = 0
39+
// CleanMap returns a new map that holds, under the same keys, a cleaned copy of each given assessment (see Clean); the given assessments are not modified.
40+
func CleanMap[E comparable](assessments map[E]metrics.Assessments) map[E]metrics.Assessments {
41+
copy := map[E]metrics.Assessments{}
42+
43+
for key, assessment := range assessments {
44+
copy[key] = Clean(assessment)
45+
}
46+
47+
return copy
5548
}
5649

5750
// AssessmentsWithProcessingTime is an empty assessment collection with positive processing time.

evaluate/report/collection_test.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,12 @@ func TestAssessmentPerModelPerLanguagePerRepositoryWalk(t *testing.T) {
3434

3535
assert.NoError(t, assessmentStore.Walk(func(m model.Model, l language.Language, r string, ti task.Identifier, a metrics.Assessments) (err error) {
3636
actualOrder = append(actualOrder, a)
37-
metricstesting.AssertAssessmentsEqual(t, assessmentLookup[m][l][r][ti], a)
37+
assert.Equal(t, metricstesting.Clean(assessmentLookup[m][l][r][ti]), metricstesting.Clean(a))
3838

3939
return nil
4040
}))
4141

42-
if assert.Equal(t, len(tc.ExpectedOrder), len(actualOrder)) {
43-
for i := range tc.ExpectedOrder {
44-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedOrder[i], actualOrder[i])
45-
}
46-
}
42+
assert.Equal(t, metricstesting.CleanSlice(tc.ExpectedOrder), metricstesting.CleanSlice(actualOrder))
4743
})
4844
}
4945

evaluate/task/testing/task.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ func (tc *TestCaseTask) Validate(t *testing.T, createRepository createRepository
5959
}
6060
actualRepositoryAssessment, actualProblems, actualErr := tc.Task.Run(taskContext)
6161

62-
metricstesting.AssertTaskAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment)
62+
assert.Equal(t, metricstesting.CleanMap(tc.ExpectedRepositoryAssessment), metricstesting.CleanMap(actualRepositoryAssessment))
63+
6364
if assert.Equal(t, len(tc.ExpectedProblemContains), len(actualProblems), "problems count") {
6465
for i, expectedProblem := range tc.ExpectedProblemContains {
6566
actualProblem := actualProblems[i]

model/llm/llm_test.go

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ func TestModelGenerateTestsForFile(t *testing.T) {
6868
}
6969
actualAssessment, actualError := llm.WriteTests(ctx)
7070
assert.NoError(t, actualError)
71-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
71+
72+
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
7273

7374
actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
7475
assert.NoError(t, err)
@@ -172,7 +173,8 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
172173
}
173174
actualAssessment, actualError := llm.RepairCode(ctx)
174175
assert.NoError(t, actualError)
175-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
176+
177+
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
176178

177179
actualSourceFileContent, err := os.ReadFile(filepath.Join(repositoryPath, tc.SourceFilePath))
178180
assert.NoError(t, err)
@@ -210,8 +212,10 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
210212
},
211213

212214
ExpectedAssessment: metrics.Assessments{
213-
metrics.AssessmentKeyResponseNoExcess: 1,
214-
metrics.AssessmentKeyResponseWithCode: 1,
215+
metrics.AssessmentKeyResponseNoExcess: 1,
216+
metrics.AssessmentKeyResponseWithCode: 1,
217+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 134,
218+
metrics.AssessmentKeyResponseCharacterCount: 143,
215219
},
216220
ExpectedSourceFileContent: `
217221
package openingBracketMissing
@@ -260,8 +264,10 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
260264
},
261265

262266
ExpectedAssessment: metrics.Assessments{
263-
metrics.AssessmentKeyResponseNoExcess: 1,
264-
metrics.AssessmentKeyResponseWithCode: 1,
267+
metrics.AssessmentKeyResponseNoExcess: 1,
268+
metrics.AssessmentKeyResponseWithCode: 1,
269+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 186,
270+
metrics.AssessmentKeyResponseCharacterCount: 195,
265271
},
266272
ExpectedSourceFileContent: `
267273
package com.eval;
@@ -514,7 +520,8 @@ func TestModelTranspile(t *testing.T) {
514520

515521
actualAssessment, actualError := llm.Transpile(ctx)
516522
assert.NoError(t, actualError)
517-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
523+
524+
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
518525

519526
actualStubFileContent, err := os.ReadFile(filepath.Join(repositoryPath, tc.StubFilePath))
520527
assert.NoError(t, err)
@@ -562,8 +569,10 @@ func TestModelTranspile(t *testing.T) {
562569
StubFilePath: filepath.Join("binarySearch.go"),
563570

564571
ExpectedAssessment: metrics.Assessments{
565-
metrics.AssessmentKeyResponseNoExcess: 1,
566-
metrics.AssessmentKeyResponseWithCode: 1,
572+
metrics.AssessmentKeyResponseNoExcess: 1,
573+
metrics.AssessmentKeyResponseWithCode: 1,
574+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 280,
575+
metrics.AssessmentKeyResponseCharacterCount: 289,
567576
},
568577
ExpectedStubFileContent: transpiledFileContent,
569578
})
@@ -610,8 +619,10 @@ func TestModelTranspile(t *testing.T) {
610619
StubFilePath: filepath.Join("src", "main", "java", "com", "eval", "BinarySearch.java"),
611620

612621
ExpectedAssessment: metrics.Assessments{
613-
metrics.AssessmentKeyResponseNoExcess: 1,
614-
metrics.AssessmentKeyResponseWithCode: 1,
622+
metrics.AssessmentKeyResponseNoExcess: 1,
623+
metrics.AssessmentKeyResponseWithCode: 1,
624+
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 348,
625+
metrics.AssessmentKeyResponseCharacterCount: 357,
615626
},
616627
ExpectedStubFileContent: transpiledFileContent,
617628
})

model/llm/prompt/parse_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ func TestParseResponse(t *testing.T) {
3131
assert.Error(t, err)
3232
}
3333

34-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
34+
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
3535
assert.Equal(t, strings.TrimSpace(tc.ExpectedCode), actualCode)
3636
})
3737
}

model/symflower/symflower_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ func TestModelGenerateTestsForFile(t *testing.T) {
7878
} else {
7979
require.NoError(t, actualError)
8080

81-
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
82-
81+
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
8382
actualTestResult, actualProblems, err := tc.Language.ExecuteTests(logger, repositoryPath)
8483
require.NoError(t, err)
8584
require.Empty(t, actualProblems)

0 commit comments

Comments
 (0)