Skip to content

Commit

Permalink
More automatic assessments based on a model response
Browse files Browse the repository at this point in the history
Part of #32
  • Loading branch information
bauersimon committed Apr 18, 2024
1 parent 28a6440 commit 367ca89
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 21 deletions.
6 changes: 6 additions & 0 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ var (
// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement")

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
// AssessmentKeyResponseNotEmpty indicates that a model response was not empty.
AssessmentKeyResponseNotEmpty = RegisterAssessmentKey("response-not-empty")
// AssessmentKeyResponseWithCode indicates that a model responded with code.
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
)
Expand Down
34 changes: 21 additions & 13 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,23 +134,26 @@ func TestAssessmentString(t *testing.T) {
}

validate(t, &testCase{
Name: "Initial Metrics",
Name: "Empty Metrics",

Assessment: NewAssessments(),

ExpectedString: "coverage-statement=0, files-executed=0, response-no-excess=0",
ExpectedString: "coverage-statement=0, files-executed=0, response-no-error=0, response-no-excess=0, response-not-empty=0, response-with-code=0",
})

validate(t, &testCase{
Name: "Empty Metrics",
Name: "Non-empty Metrics",

Assessment: Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},

ExpectedString: "coverage-statement=1, files-executed=2, response-no-excess=4",
ExpectedString: "coverage-statement=1, files-executed=2, response-no-error=4, response-no-excess=5, response-not-empty=6, response-with-code=7",
})
}

Expand Down Expand Up @@ -180,8 +183,8 @@ func TestFormatStringCSV(t *testing.T) {
},

ExpectedString: `
model,coverage-statement,files-executed,response-no-excess
Model,0,0,0
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
Model,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -191,19 +194,24 @@ func TestFormatStringCSV(t *testing.T) {
"ModelA": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
"ModelB": Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyFilesExecuted: 2, AssessmentKeyResponseNoError: 4,
AssessmentKeyResponseNoExcess: 5,
AssessmentKeyResponseNotEmpty: 6,
AssessmentKeyResponseWithCode: 7,
},
},

ExpectedString: `
model,coverage-statement,files-executed,response-no-excess
ModelA,1,2,4
ModelB,1,2,4
model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
ModelA,1,2,4,5,6,7
ModelB,1,2,4,5,6,7
`,
})
}
Expand Down
1 change: 1 addition & 0 deletions evaluate/repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func EvaluateRepository(resultPath string, model model.Model, language language.
continue
}
repositoryAssessment.Add(assessments)
repositoryAssessment[metrics.AssessmentKeyResponseNoError]++

coverage, err := language.Execute(temporaryRepositoryPath)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions model/llm/llm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedTestFileContent: `
package native
Expand Down
15 changes: 10 additions & 5 deletions model/llm/prompt/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,32 @@ var (
func ParseResponse(response string) (assessment metrics.Assessments, code string) {
assessment = metrics.Assessments{}

// Check for empty responses.
if bytesutil.IsWhitespace(response) {
return assessment, response
}
assessment[metrics.AssessmentKeyResponseNotEmpty]++

// Some models produce duplicated code tags, so unify them if needed.
response = codeTagDuplicatedMatch.ReplaceAllString(response, "```")

blocks := bytesutil.GuardedBlocks(response, codeTagMatch, codeTagMatch)

// When no code blocks are found, assume that just the code is returned.
if len(blocks) == 0 {
assessment[metrics.AssessmentKeyResponseNoExcess] = 1
// If we cannot distinguish between code and text, we sadly also cannot check if the response contains actual code or if there is any excess response content.

return assessment, strings.TrimSpace(response)
}
assessment[metrics.AssessmentKeyResponseWithCode]++

// Assume the first code block contains the response code fragment.
block := blocks[0]

// Check if the response contained only that single code block.
responseWithoutBlock := strings.Replace(response, block, "", 1)
if len(strings.TrimSpace(responseWithoutBlock)) == 0 {
assessment[metrics.AssessmentKeyResponseNoExcess] = 1
} else {
assessment[metrics.AssessmentKeyResponseNoExcess] = 0
if bytesutil.IsWhitespace(responseWithoutBlock) {
assessment[metrics.AssessmentKeyResponseNoExcess]++
}

return assessment, strings.TrimSpace(codeTagMatch.ReplaceAllString(block, ""))
Expand Down
26 changes: 25 additions & 1 deletion model/llm/prompt/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,27 @@ func TestParseResponse(t *testing.T) {
}
`)

validate(t, &testCase{
Name: "Empty Response",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 0,
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 0,
},
ExpectedCode: "",
})

validate(t, &testCase{
Name: "Only Code",

Response: code,

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
// If there are no code fences, we currently cannot determine what is code and what is (excessive) text.
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 0,
},
ExpectedCode: code,
})
Expand All @@ -58,7 +72,9 @@ func TestParseResponse(t *testing.T) {
Response: "```\n" + code + "\n```\n",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -69,7 +85,9 @@ func TestParseResponse(t *testing.T) {
Response: "Some text...\n\n```\n" + code + "\n```\n\nSome more text...",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 0,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -81,7 +99,9 @@ func TestParseResponse(t *testing.T) {
Response: "```go\n" + code + "\n```\n",

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -91,7 +111,9 @@ func TestParseResponse(t *testing.T) {

Response: " ```\n" + code + "\n\t```\n",
ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand All @@ -101,7 +123,9 @@ func TestParseResponse(t *testing.T) {

Response: "```\n```\n" + code + "\n```\n```\n",
ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCode: code,
})
Expand Down
6 changes: 4 additions & 2 deletions model/symflower/symflower.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ func (m *ModelSymflower) GenerateTestsForFile(language language.Language, reposi
return nil, pkgerrors.WithStack(err)
}

return metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1, // Symflower only generates code, never additional explanations.
return metrics.Assessments{ // Symflower always generates just source code when it does not fail, so no need to check the assessment properties.
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
}, nil
}
2 changes: 2 additions & 0 deletions model/symflower/symflower_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) {

ExpectedAssessment: metrics.Assessments{
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
ExpectedCoverage: 100,
})
Expand Down

0 comments on commit 367ca89

Please sign in to comment.