Skip to content

Commit

Permalink
refactor, Simplify assessment cleanup for tests
Browse files Browse the repository at this point in the history
  • Loading branch information
bauersimon committed Oct 3, 2024
1 parent 3438679 commit 3f9055a
Show file tree
Hide file tree
Showing 8 changed files with 338 additions and 333 deletions.
532 changes: 281 additions & 251 deletions cmd/eval-dev-quality/cmd/evaluate_test.go

Large diffs are not rendered by default.

27 changes: 1 addition & 26 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,12 @@ func TestEvaluate(t *testing.T) {

var actualAssessments metricstesting.AssessmentTuples
require.NoError(t, assessmentStore.Walk(func(m evalmodel.Model, l language.Language, r string, ti task.Identifier, a metrics.Assessments) error {
// Normalize assessments.
if v, ok := a[metrics.AssessmentKeyProcessingTime]; ok {
if assert.Greater(t, v, uint64(0)) {
delete(a, metrics.AssessmentKeyProcessingTime)
}
}

actualAssessments = append(actualAssessments, &metricstesting.AssessmentTuple{
Model: m,
Language: l,
RepositoryPath: r,
Task: ti,
Assessment: a,
Assessment: metricstesting.Clean(a),
})

return nil
Expand Down Expand Up @@ -501,7 +494,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryNextPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 1,
Expand All @@ -513,7 +505,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryNextPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 1,
Expand All @@ -525,7 +516,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPlainPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
Expand All @@ -537,7 +527,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPlainPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
Expand Down Expand Up @@ -602,7 +591,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryNextPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
Expand All @@ -614,7 +602,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryNextPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 2,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 2,
Expand All @@ -626,7 +613,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPlainPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 1,
Expand All @@ -638,7 +624,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPlainPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 2,
metrics.AssessmentKeyResponseNoError: 1,
Expand Down Expand Up @@ -762,7 +747,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand All @@ -774,7 +758,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand Down Expand Up @@ -834,7 +817,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand All @@ -846,7 +828,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand Down Expand Up @@ -935,7 +916,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand All @@ -947,7 +927,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand Down Expand Up @@ -1019,7 +998,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand All @@ -1031,7 +1009,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 3,
metrics.AssessmentKeyResponseNoError: 3,
Expand Down Expand Up @@ -1085,7 +1062,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTests,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
Expand All @@ -1097,7 +1073,6 @@ func TestEvaluate(t *testing.T) {
RepositoryPath: repositoryPath,
Task: evaluatetask.IdentifierWriteTestsSymflowerFix,
Assessment: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyCoverage: 0,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
Expand Down
63 changes: 28 additions & 35 deletions evaluate/metrics/testing/assessments.go
Original file line number Diff line number Diff line change
@@ -1,57 +1,50 @@
package metricstesting

import (
"testing"

"golang.org/x/exp/maps"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"maps"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/language"
"github.com/symflower/eval-dev-quality/model"
"github.com/symflower/eval-dev-quality/task"
)

// AssertAssessmentsEqual checks if the given assessments are equal ignoring default and nondeterministic values.
func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual metrics.Assessments) {
expected = maps.Clone(expected)
actual = maps.Clone(actual)
// Clean deletes all empty and nondeterministic keys from the assessment.
func Clean(assessment metrics.Assessments) metrics.Assessments {
copy := metrics.Assessments{}
maps.Copy(copy, assessment)

clearNonDeterministicAssessmentValues(expected)
clearNonDeterministicAssessmentValues(actual)
delete(copy, metrics.AssessmentKeyProcessingTime)

assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual)
}
for _, key := range metrics.AllAssessmentKeysStrings {
if copy[metrics.AssessmentKey(key)] == 0 {
delete(copy, metrics.AssessmentKey(key))
}
}

// AssertTaskAssessmentsEqual checks if the given assessments per task are equal ignoring default and nondeterministic values.
func AssertTaskAssessmentsEqual(t *testing.T, expected map[task.Identifier]metrics.Assessments, actual map[task.Identifier]metrics.Assessments) {
expected = maps.Clone(expected)
actual = maps.Clone(actual)
return copy
}

// The expected and actual maps must have the same task identifiers.
require.ElementsMatch(t, maps.Keys(expected), maps.Keys(actual))
// CleanSlice deletes all empty and nondeterministic keys from the assessments.
func CleanSlice(assessments []metrics.Assessments) []metrics.Assessments {
copy := make([]metrics.Assessments, len(assessments))

// Ignore non-deterministic values.
for _, assessment := range expected {
clearNonDeterministicAssessmentValues(assessment)
}
for _, assessment := range actual {
clearNonDeterministicAssessmentValues(assessment)
for i, assessment := range assessments {
copy[i] = Clean(assessment)
}

for task, expectedAssessment := range expected {
actualAssessment := actual[task]
assert.Truef(t, expectedAssessment.Equal(actualAssessment), "task:%s\nexpected:%s\nactual:%s", task, expected, actual)
}
return copy
}

// clearNonDeterministicAssessmentValues ignores non-deterministic values such as processing time and response character count.
func clearNonDeterministicAssessmentValues(assessment metrics.Assessments) {
assessment[metrics.AssessmentKeyProcessingTime] = 0
assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0
assessment[metrics.AssessmentKeyResponseCharacterCount] = 0
// CleanMap deletes all empty and nondeterministic keys from the assessments.
func CleanMap[E comparable](assessments map[E]metrics.Assessments) map[E]metrics.Assessments {
copy := map[E]metrics.Assessments{}

for key, assessment := range assessments {
copy[key] = Clean(assessment)
}

return copy
}

// AssessmentsWithProcessingTime is an empty assessment collection with positive processing time.
Expand Down
8 changes: 2 additions & 6 deletions evaluate/report/collection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,12 @@ func TestAssessmentPerModelPerLanguagePerRepositoryWalk(t *testing.T) {

assert.NoError(t, assessmentStore.Walk(func(m model.Model, l language.Language, r string, ti task.Identifier, a metrics.Assessments) (err error) {
actualOrder = append(actualOrder, a)
metricstesting.AssertAssessmentsEqual(t, assessmentLookup[m][l][r][ti], a)
assert.Equal(t, metricstesting.Clean(assessmentLookup[m][l][r][ti]), metricstesting.Clean(a))

return nil
}))

if assert.Equal(t, len(tc.ExpectedOrder), len(actualOrder)) {
for i := range tc.ExpectedOrder {
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedOrder[i], actualOrder[i])
}
}
assert.Equal(t, metricstesting.CleanSlice(tc.ExpectedOrder), metricstesting.CleanSlice(actualOrder))
})
}

Expand Down
3 changes: 2 additions & 1 deletion evaluate/task/testing/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ func (tc *TestCaseTask) Validate(t *testing.T, createRepository createRepository
}
actualRepositoryAssessment, actualProblems, actualErr := tc.Task.Run(taskContext)

metricstesting.AssertTaskAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment)
assert.Equal(t, metricstesting.CleanMap(tc.ExpectedRepositoryAssessment), metricstesting.CleanMap(actualRepositoryAssessment))

if assert.Equal(t, len(tc.ExpectedProblemContains), len(actualProblems), "problems count") {
for i, expectedProblem := range tc.ExpectedProblemContains {
actualProblem := actualProblems[i]
Expand Down
33 changes: 22 additions & 11 deletions model/llm/llm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ func TestModelGenerateTestsForFile(t *testing.T) {
}
actualAssessment, actualError := llm.WriteTests(ctx)
assert.NoError(t, actualError)
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)

assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))

actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
assert.NoError(t, err)
Expand Down Expand Up @@ -172,7 +173,8 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
}
actualAssessment, actualError := llm.RepairCode(ctx)
assert.NoError(t, actualError)
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)

assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))

actualSourceFileContent, err := os.ReadFile(filepath.Join(repositoryPath, tc.SourceFilePath))
assert.NoError(t, err)
Expand Down Expand Up @@ -210,8 +212,10 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
},

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 134,
metrics.AssessmentKeyResponseCharacterCount: 143,
},
ExpectedSourceFileContent: `
package openingBracketMissing
Expand Down Expand Up @@ -260,8 +264,10 @@ func TestModelRepairSourceCodeFile(t *testing.T) {
},

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 186,
metrics.AssessmentKeyResponseCharacterCount: 195,
},
ExpectedSourceFileContent: `
package com.eval;
Expand Down Expand Up @@ -514,7 +520,8 @@ func TestModelTranspile(t *testing.T) {

actualAssessment, actualError := llm.Transpile(ctx)
assert.NoError(t, actualError)
metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)

assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))

actualStubFileContent, err := os.ReadFile(filepath.Join(repositoryPath, tc.StubFilePath))
assert.NoError(t, err)
Expand Down Expand Up @@ -562,8 +569,10 @@ func TestModelTranspile(t *testing.T) {
StubFilePath: filepath.Join("binarySearch.go"),

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 280,
metrics.AssessmentKeyResponseCharacterCount: 289,
},
ExpectedStubFileContent: transpiledFileContent,
})
Expand Down Expand Up @@ -610,8 +619,10 @@ func TestModelTranspile(t *testing.T) {
StubFilePath: filepath.Join("src", "main", "java", "com", "eval", "BinarySearch.java"),

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 348,
metrics.AssessmentKeyResponseCharacterCount: 357,
},
ExpectedStubFileContent: transpiledFileContent,
})
Expand Down
2 changes: 1 addition & 1 deletion model/llm/prompt/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func TestParseResponse(t *testing.T) {
assert.Error(t, err)
}

metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)
assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
assert.Equal(t, strings.TrimSpace(tc.ExpectedCode), actualCode)
})
}
Expand Down
3 changes: 1 addition & 2 deletions model/symflower/symflower_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ func TestModelGenerateTestsForFile(t *testing.T) {
} else {
require.NoError(t, actualError)

metricstesting.AssertAssessmentsEqual(t, tc.ExpectedAssessment, actualAssessment)

assert.Equal(t, metricstesting.Clean(tc.ExpectedAssessment), metricstesting.Clean(actualAssessment))
actualTestResult, actualProblems, err := tc.Language.ExecuteTests(logger, repositoryPath)
require.NoError(t, err)
require.Empty(t, actualProblems)
Expand Down

0 comments on commit 3f9055a

Please sign in to comment.