Skip to content

Commit

Permalink
More automatic assessments based on a model response
Browse files Browse the repository at this point in the history
Part of #32
  • Loading branch information
bauersimon committed Apr 18, 2024
1 parent 4e6c4dd commit 41aebee
Show file tree
Hide file tree
Showing 9 changed files with 178 additions and 23 deletions.
24 changes: 24 additions & 0 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ var (
// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement")

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
// AssessmentKeyResponseNotEmpty indicates that a model response was not empty.
AssessmentKeyResponseNotEmpty = RegisterAssessmentKey("response-not-empty")
// AssessmentKeyResponseWithCode indicates that a model responded with code.
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
)
Expand All @@ -57,6 +63,24 @@ func (a Assessments) Add(x Assessments) {
}
}

// IsEqual reports whether "a" and "x" hold the same value for every key listed in "allAssessmentKeys".
// A nil collection is replaced by a fresh one from "NewAssessments" before comparing.
func (a Assessments) IsEqual(x Assessments) bool {
	left, right := a, x
	if left == nil {
		left = NewAssessments()
	}
	if right == nil {
		right = NewAssessments()
	}

	for _, k := range allAssessmentKeys {
		if left[k] == right[k] {
			continue
		}

		// The first differing key settles the comparison.
		return false
	}

	return true
}

// Merge combines two assessment collections into a new assessment collection and returns the new assessment collection.
func Merge(a Assessments, b Assessments) (c Assessments) {
c = NewAssessments()
Expand Down
106 changes: 96 additions & 10 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func TestAssessmentString(t *testing.T) {

Assessment: NewAssessments(),

ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, response-no-excess=0",
ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, response-no-error=0, response-not-empty=0, response-with-code=0, response-no-excess=0",
})

validate(t, &testCase{
Expand All @@ -148,10 +148,13 @@ func TestAssessmentString(t *testing.T) {
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},

ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, response-no-excess=4",
ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, response-no-error=4, response-not-empty=6, response-with-code=7, response-no-excess=5",
})
}

Expand Down Expand Up @@ -181,8 +184,8 @@ func TestFormatStringCSV(t *testing.T) {
},

ExpectedString: `
model,files-executed,files-problems,coverage-statement,response-no-excess
Model,0,0,0,0
model,files-executed,files-problems,coverage-statement,response-no-error,response-not-empty,response-with-code,response-no-excess
Model,0,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -193,20 +196,103 @@ func TestFormatStringCSV(t *testing.T) {
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
"ModelB": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyFilesProblems: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
},

ExpectedString: `
model,files-executed,files-problems,coverage-statement,response-no-excess
ModelA,2,3,1,4
ModelB,2,3,1,4
model,files-executed,files-problems,coverage-statement,response-no-error,response-not-empty,response-with-code,response-no-excess
ModelA,2,3,1,4,6,7,5
ModelB,2,3,1,4,6,7,5
`,
})
}

// TestAssessmentsIsEqual exercises "Assessments.IsEqual" with empty, nil, matching, explicitly-zero and differing collections.
func TestAssessmentsIsEqual(t *testing.T) {
	type testCase struct {
		Name string

		Assessments Assessments
		X           Assessments

		ExpectedBool bool
	}

	testCases := []*testCase{
		{
			Name: "Empty",

			Assessments: NewAssessments(),
			X:           NewAssessments(),

			ExpectedBool: true,
		},
		{
			Name: "Nil",

			Assessments: nil,
			X:           nil,

			ExpectedBool: true,
		},
		{
			Name: "Equal Values",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: true,
		},
		{
			// An explicitly stored zero must compare equal to a key that is not set at all.
			Name: "Default Value",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 2,
				AssessmentKeyResponseNoError:  0,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: true,
		},
		{
			Name: "Different Values",

			Assessments: Assessments{
				AssessmentKeyResponseWithCode: 3,
			},
			X: Assessments{
				AssessmentKeyResponseWithCode: 2,
			},

			ExpectedBool: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			assert.Equal(t, tc.ExpectedBool, tc.Assessments.IsEqual(tc.X))
		})
	}
}
10 changes: 8 additions & 2 deletions evaluate/repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,15 @@ func EvaluateRepository(resultPath string, model model.Model, language language.

repositoryAssessment = metrics.NewAssessments()
for _, filePath := range filePaths {
assessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath)
generationAssessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))
repositoryAssessment[metrics.AssessmentKeyFilesProblems]++

continue
}
repositoryAssessment.Add(assessments)
repositoryAssessment.Add(generationAssessments)
repositoryAssessment[metrics.AssessmentKeyResponseNoError]++

coverage, err := language.Execute(temporaryRepositoryPath)
if err != nil {
Expand All @@ -75,6 +76,11 @@ func EvaluateRepository(resultPath string, model model.Model, language language.
if coverage == 100 {
repositoryAssessment[metrics.AssessmentKeyCoverageStatement]++
}

// If we weren't able to determine if the response contained any code but the execution was successful, correct that now by incrementing the "AssessmentKeyResponseWithCode".
if generationAssessments[metrics.AssessmentKeyResponseWithCode] == 0 {
repositoryAssessment[metrics.AssessmentKeyResponseWithCode]++
}
}

return repositoryAssessment, problems, nil
Expand Down
4 changes: 3 additions & 1 deletion model/llm/llm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {

actualAssessment, actualError := llm.GenerateTestsForFile(tc.Language, temporaryPath, tc.SourceFilePath)
assert.NoError(t, actualError)
assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)

actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
assert.NoError(t, err)
Expand Down Expand Up @@ -89,6 +89,8 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedTestFileContent: `
package native
Expand Down
15 changes: 13 additions & 2 deletions model/llm/prompt/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,39 @@ var (
func ParseResponse(response string) (assessment metrics.Assessments, code string) {
assessment = metrics.Assessments{}

if isOnlyWhitespace(response) {
return assessment, response
}
assessment[metrics.AssessmentKeyResponseNotEmpty] = 1

// Some models produce duplicated code tags, so unify them if needed.
response = codeTagDuplicatedMatch.ReplaceAllString(response, "```")

blocks := bytesutil.GuardedBlocks(response, codeTagMatch, codeTagMatch)

// When no code blocks are found, assume that just the code is returned.
if len(blocks) == 0 {
assessment[metrics.AssessmentKeyNoExcessResponse] = 1
// If we cannot distinguish between code and text, we sadly also cannot check if the response contains actual code or if there is any excess response content.

return assessment, strings.TrimSpace(response)
}
assessment[metrics.AssessmentKeyResponseWithCode] = 1

// Assume the first code block contains the response code fragment.
block := blocks[0]

// Check if the response contained only that single code block.
responseWithoutBlock := strings.Replace(response, block, "", 1)
if len(strings.TrimSpace(responseWithoutBlock)) == 0 {
if isOnlyWhitespace(responseWithoutBlock) {
assessment[metrics.AssessmentKeyResponseNoExcess] = 1
} else {
assessment[metrics.AssessmentKeyResponseNoExcess] = 0
}

return assessment, strings.TrimSpace(codeTagMatch.ReplaceAllString(block, ""))
}

// isOnlyWhitespace reports whether nothing is left of "data" once leading and trailing Unicode whitespace is trimmed, which includes the empty string.
func isOnlyWhitespace(data string) bool {
	trimmed := strings.TrimSpace(data)

	return trimmed == ""
}
27 changes: 25 additions & 2 deletions model/llm/prompt/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func TestParseResponse(t *testing.T) {
t.Run(tc.Name, func(t *testing.T) {
actualAssessment, actualCode := ParseResponse(tc.Response)

assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)
assert.Equal(t, strings.TrimSpace(tc.ExpectedCode), actualCode)
})
}
Expand All @@ -38,13 +38,26 @@ func TestParseResponse(t *testing.T) {
}
`)

validate(t, &testCase{
Name: "Empty Response",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 0,
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 0,
},
ExpectedCode: "",
})

validate(t, &testCase{
Name: "Only Code",

Response: code,

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 0,
},
ExpectedCode: code,
})
Expand All @@ -56,7 +69,9 @@ func TestParseResponse(t *testing.T) {
Response: "```\n" + code + "\n```\n",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -67,7 +82,9 @@ func TestParseResponse(t *testing.T) {
Response: "Some text...\n\n```\n" + code + "\n```\n\nSome more text...",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -79,7 +96,9 @@ func TestParseResponse(t *testing.T) {
Response: "```go\n" + code + "\n```\n",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -89,7 +108,9 @@ func TestParseResponse(t *testing.T) {

Response: " ```\n" + code + "\n\t```\n",
ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -99,7 +120,9 @@ func TestParseResponse(t *testing.T) {

Response: "```\n```\n" + code + "\n```\n```\n",
ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand Down
6 changes: 4 additions & 2 deletions model/symflower/symflower.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ func (m *ModelSymflower) GenerateTestsForFile(language language.Language, reposi
return nil, pkgerrors.WithStack(err)
}

return metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1, // Symflower only generates code, never additional explanations.
return metrics.Assessments{ // Symflower always generates just source code when it does not fail, so no need to check the assessment properties.
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
}, nil
}
4 changes: 3 additions & 1 deletion model/symflower/symflower_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) {
} else if actualError != nil || tc.ExpectedErrorText != "" {
assert.ErrorContains(t, actualError, tc.ExpectedErrorText)
}
assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)

actualCoverage, err := tc.Language.Execute(repositoryPath)
require.NoError(t, err)
Expand All @@ -67,6 +67,8 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) {

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCoverage: 100,
})
Expand Down
Loading

0 comments on commit 41aebee

Please sign in to comment.