From 367ca8961d22524c5d925aa14c6179a0537d9d0f Mon Sep 17 00:00:00 2001 From: Simon Bauer Date: Thu, 18 Apr 2024 12:43:54 +0200 Subject: [PATCH] More automatic assessments based on a model response Part of #32 --- evaluate/metrics/assessment.go | 6 +++++ evaluate/metrics/assessment_test.go | 34 ++++++++++++++++++----------- evaluate/repository.go | 1 + model/llm/llm_test.go | 2 ++ model/llm/prompt/parse.go | 15 ++++++++----- model/llm/prompt/parse_test.go | 26 +++++++++++++++++++++- model/symflower/symflower.go | 6 +++-- model/symflower/symflower_test.go | 2 ++ 8 files changed, 71 insertions(+), 21 deletions(-) diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go index edbbf0fd3..11ff95334 100644 --- a/evaluate/metrics/assessment.go +++ b/evaluate/metrics/assessment.go @@ -39,6 +39,12 @@ var ( // AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached. AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement") + // AssessmentKeyResponseNoError indicates that a model responded without error. + AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error") + // AssessmentKeyResponseNotEmpty indicates that a model response was not empty. + AssessmentKeyResponseNotEmpty = RegisterAssessmentKey("response-not-empty") + // AssessmentKeyResponseWithCode indicates that a model responded with code. + AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code") // AssessmentKeyResponseNoExcess indicates that a model did not produce more content as requested. AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess") ) diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go index 960b76adf..c1f3d8942 100644 --- a/evaluate/metrics/assessment_test.go +++ b/evaluate/metrics/assessment_test.go @@ -134,23 +134,26 @@ func TestAssessmentString(t *testing.T) { } validate(t, &testCase{ - Name: "Initial Metrics", + Name: "Empty Metrics", Assessment: NewAssessments(), - ExpectedString: "coverage-statement=0, files-executed=0, response-no-excess=0", + ExpectedString: "coverage-statement=0, files-executed=0, response-no-error=0, response-no-excess=0, response-not-empty=0, response-with-code=0", }) validate(t, &testCase{ - Name: "Empty Metrics", + Name: "Non-empty Metrics", Assessment: Assessments{ AssessmentKeyCoverageStatement: 1, AssessmentKeyFilesExecuted: 2, - AssessmentKeyResponseNoExcess: 4, + AssessmentKeyResponseNoError: 4, + AssessmentKeyResponseNoExcess: 5, + AssessmentKeyResponseNotEmpty: 6, + AssessmentKeyResponseWithCode: 7, }, - ExpectedString: "coverage-statement=1, files-executed=2, response-no-excess=4", + ExpectedString: "coverage-statement=1, files-executed=2, response-no-error=4, response-no-excess=5, response-not-empty=6, response-with-code=7", }) } @@ -180,8 +183,8 @@ func TestFormatStringCSV(t *testing.T) { }, ExpectedString: ` - model,coverage-statement,files-executed,response-no-excess - Model,0,0,0 + model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code + Model,0,0,0,0,0,0 `, }) validate(t, &testCase{ @@ -191,19 +194,24 @@ func TestFormatStringCSV(t *testing.T) { "ModelA": Assessments{ AssessmentKeyCoverageStatement: 1, AssessmentKeyFilesExecuted: 2, - AssessmentKeyResponseNoExcess: 4, + AssessmentKeyResponseNoError: 4, + AssessmentKeyResponseNoExcess: 5, + AssessmentKeyResponseNotEmpty: 6, + AssessmentKeyResponseWithCode: 7, }, "ModelB": Assessments{ AssessmentKeyCoverageStatement: 1, - AssessmentKeyFilesExecuted: 2, - AssessmentKeyResponseNoExcess: 4, + AssessmentKeyFilesExecuted: 2, AssessmentKeyResponseNoError: 4, + AssessmentKeyResponseNoExcess: 5, + AssessmentKeyResponseNotEmpty: 6, + AssessmentKeyResponseWithCode: 7, }, }, ExpectedString: ` - model,coverage-statement,files-executed,response-no-excess - ModelA,1,2,4 - ModelB,1,2,4 + model,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code + ModelA,1,2,4,5,6,7 + ModelB,1,2,4,5,6,7 `, }) } diff --git a/evaluate/repository.go b/evaluate/repository.go index 83f7e17fa..cc97685ae 100644 --- a/evaluate/repository.go +++ b/evaluate/repository.go @@ -62,6 +62,7 @@ func EvaluateRepository(resultPath string, model model.Model, language language. continue } repositoryAssessment.Add(assessments) + repositoryAssessment[metrics.AssessmentKeyResponseNoError]++ coverage, err := language.Execute(temporaryRepositoryPath) if err != nil { diff --git a/model/llm/llm_test.go b/model/llm/llm_test.go index c248b6293..5085e8246 100644 --- a/model/llm/llm_test.go +++ b/model/llm/llm_test.go @@ -91,6 +91,8 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) { ExpectedAssessment: metrics.Assessments{ metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedTestFileContent: ` package native diff --git a/model/llm/prompt/parse.go b/model/llm/prompt/parse.go index 02ae51352..852743798 100644 --- a/model/llm/prompt/parse.go +++ b/model/llm/prompt/parse.go @@ -17,6 +17,12 @@ var ( func ParseResponse(response string) (assessment metrics.Assessments, code string) { assessment = metrics.Assessments{} + // Check for empty responses. + if bytesutil.IsWhitespace(response) { + return assessment, response + } + assessment[metrics.AssessmentKeyResponseNotEmpty]++ + // Some models produce duplicated code tags, so unify them if needed. response = codeTagDuplicatedMatch.ReplaceAllString(response, "```") @@ -24,20 +30,19 @@ func ParseResponse(response string) (assessment metrics.Assessments, code string // When no code blocks are found, assume that just the code is returned. if len(blocks) == 0 { - assessment[metrics.AssessmentKeyResponseNoExcess] = 1 + // If we cannot distinguish between code and text, we sadly also cannot check if the response contains actual code or if there is any excess response content. return assessment, strings.TrimSpace(response) } + assessment[metrics.AssessmentKeyResponseWithCode]++ // Assume the first code block contains the response code fragment. block := blocks[0] // Check if the response contained only that single code block. responseWithoutBlock := strings.Replace(response, block, "", 1) - if len(strings.TrimSpace(responseWithoutBlock)) == 0 { - assessment[metrics.AssessmentKeyResponseNoExcess] = 1 - } else { - assessment[metrics.AssessmentKeyResponseNoExcess] = 0 + if bytesutil.IsWhitespace(responseWithoutBlock) { + assessment[metrics.AssessmentKeyResponseNoExcess]++ } return assessment, strings.TrimSpace(codeTagMatch.ReplaceAllString(block, "")) diff --git a/model/llm/prompt/parse_test.go b/model/llm/prompt/parse_test.go index 422dc99d2..fae61ff12 100644 --- a/model/llm/prompt/parse_test.go +++ b/model/llm/prompt/parse_test.go @@ -40,13 +40,27 @@ func TestParseResponse(t *testing.T) { } `) + validate(t, &testCase{ + Name: "Empty Response", + + ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 0, + metrics.AssessmentKeyResponseNoExcess: 0, + metrics.AssessmentKeyResponseWithCode: 0, + }, + ExpectedCode: "", + }) + validate(t, &testCase{ Name: "Only Code", Response: code, ExpectedAssessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + // If there are no code fences, we currently cannot determine what is code and what is (excessive) text. + metrics.AssessmentKeyResponseNoExcess: 0, + metrics.AssessmentKeyResponseWithCode: 0, }, ExpectedCode: code, }) @@ -58,7 +72,9 @@ func TestParseResponse(t *testing.T) { Response: "```\n" + code + "\n```\n", ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 1, metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCode: code, }) @@ -69,7 +85,9 @@ func TestParseResponse(t *testing.T) { Response: "Some text...\n\n```\n" + code + "\n```\n\nSome more text...", ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 1, metrics.AssessmentKeyResponseNoExcess: 0, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCode: code, }) @@ -81,7 +99,9 @@ func TestParseResponse(t *testing.T) { Response: "```go\n" + code + "\n```\n", ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 1, metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCode: code, }) @@ -91,7 +111,9 @@ func TestParseResponse(t *testing.T) { Response: " ```\n" + code + "\n\t```\n", ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 1, metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCode: code, }) @@ -101,7 +123,9 @@ func TestParseResponse(t *testing.T) { Response: "```\n```\n" + code + "\n```\n```\n", ExpectedAssessment: metrics.Assessments{ + metrics.AssessmentKeyResponseNotEmpty: 1, metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCode: code, }) diff --git a/model/symflower/symflower.go b/model/symflower/symflower.go index 0edb03182..eb706d1e0 100644 --- a/model/symflower/symflower.go +++ b/model/symflower/symflower.go @@ -35,7 +35,9 @@ func (m *ModelSymflower) GenerateTestsForFile(language language.Language, reposi return nil, pkgerrors.WithStack(err) } - return metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 1, // Symflower only generates code, never additional explanations. + return metrics.Assessments{ // Symflower always generates just source code when it does not fail, so no need to check the assessment properties. + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, nil } diff --git a/model/symflower/symflower_test.go b/model/symflower/symflower_test.go index 9bd7a8761..e3477e188 100644 --- a/model/symflower/symflower_test.go +++ b/model/symflower/symflower_test.go @@ -69,6 +69,8 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) { ExpectedAssessment: metrics.Assessments{ metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, ExpectedCoverage: 100, })