More automatic assessments based on a model response

Part of #32
symflower · Apr 18, 2024 · 41aebee · 41aebee
1 parent 4e6c4dd
commit 41aebee
Show file tree

Hide file tree

Showing 9 changed files with 178 additions and 23 deletions.
diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go
@@ -38,6 +38,12 @@ var (
 	// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
 	AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement")
 
+	// AssessmentKeyResponseNoError indicates that a model responded without error.
+	AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
+	// AssessmentKeyResponseNotEmpty indicates that a model response was not empty.
+	AssessmentKeyResponseNotEmpty = RegisterAssessmentKey("response-not-empty")
+	// AssessmentKeyResponseWithCode indicates that a model responded with code.
+	AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
 	// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
 	AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
 )
@@ -57,6 +63,24 @@ func (a Assessments) Add(x Assessments) {
 	}
 }
 
+// IsEqual checks if both assessment collections are equal.
+func (a Assessments) IsEqual(x Assessments) bool {
+	if a == nil {
+		a = NewAssessments()
+	}
+	if x == nil {
+		x = NewAssessments()
+	}
+
+	for _, key := range allAssessmentKeys {
+		if a[key] != x[key] {
+			return false
+		}
+	}
+
+	return true
+}
+
 // Merge combines two assessment collections into a new assessment collection and returns the new assessment collection.
 func Merge(a Assessments, b Assessments) (c Assessments) {
 	c = NewAssessments()

diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go
@@ -138,7 +138,7 @@ func TestAssessmentString(t *testing.T) {
 
 		Assessment: NewAssessments(),
 
-		ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, response-no-excess=0",
+		ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, response-no-error=0, response-not-empty=0, response-with-code=0, response-no-excess=0",
 	})
 
 	validate(t, &testCase{
@@ -148,10 +148,13 @@ func TestAssessmentString(t *testing.T) {
 			AssessmentKeyCoverageStatement: 1,
 			AssessmentKeyFilesExecuted:     2,
 			AssessmentKeyFilesProblems:     3,
-			AssessmentKeyResponseNoExcess:  4,
+			AssessmentKeyResponseNoError:   4,
+			AssessmentKeyResponseNoExcess:  5,
+			AssessmentKeyResponseNotEmpty:  6,
+			AssessmentKeyResponseWithCode:  7,
 		},
 
-		ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, response-no-excess=4",
+		ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, response-no-error=4, response-not-empty=6, response-with-code=7, response-no-excess=5",
 	})
 }
 
@@ -181,8 +184,8 @@ func TestFormatStringCSV(t *testing.T) {
 		},
 
 		ExpectedString: `
-			model,files-executed,files-problems,coverage-statement,response-no-excess
-			Model,0,0,0,0
+			model,files-executed,files-problems,coverage-statement,response-no-error,response-not-empty,response-with-code,response-no-excess
+			Model,0,0,0,0,0,0,0
 		`,
 	})
 	validate(t, &testCase{
@@ -193,20 +196,103 @@ func TestFormatStringCSV(t *testing.T) {
 				AssessmentKeyCoverageStatement: 1,
 				AssessmentKeyFilesExecuted:     2,
 				AssessmentKeyFilesProblems:     3,
-				AssessmentKeyResponseNoExcess:  4,
+				AssessmentKeyResponseNoError:   4,
+				AssessmentKeyResponseNoExcess:  5,
+				AssessmentKeyResponseNotEmpty:  6,
+				AssessmentKeyResponseWithCode:  7,
 			},
 			"ModelB": Assessments{
 				AssessmentKeyCoverageStatement: 1,
 				AssessmentKeyFilesExecuted:     2,
 				AssessmentKeyFilesProblems:     3,
-				AssessmentKeyResponseNoExcess:  4,
+				AssessmentKeyResponseNoError:   4,
+				AssessmentKeyResponseNoExcess:  5,
+				AssessmentKeyResponseNotEmpty:  6,
+				AssessmentKeyResponseWithCode:  7,
 			},
 		},
 
 		ExpectedString: `
-			model,files-executed,files-problems,coverage-statement,response-no-excess
-			ModelA,2,3,1,4
-			ModelB,2,3,1,4
+			model,files-executed,files-problems,coverage-statement,response-no-error,response-not-empty,response-with-code,response-no-excess
+			ModelA,2,3,1,4,6,7,5
+			ModelB,2,3,1,4,6,7,5
 		`,
 	})
 }
+
+func TestAssessmentsIsEqual(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Assessments Assessments
+		X           Assessments
+
+		ExpectedBool bool
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			actualBool := tc.Assessments.IsEqual(tc.X)
+
+			assert.Equal(t, tc.ExpectedBool, actualBool)
+		})
+	}
+
+	validate(t, &testCase{
+		Name: "Empty",
+
+		Assessments: NewAssessments(),
+		X:           NewAssessments(),
+
+		ExpectedBool: true,
+	})
+
+	validate(t, &testCase{
+		Name: "Nil",
+
+		Assessments: nil,
+		X:           nil,
+
+		ExpectedBool: true,
+	})
+
+	validate(t, &testCase{
+		Name: "Equal Values",
+
+		Assessments: Assessments{
+			AssessmentKeyResponseWithCode: 2,
+		},
+		X: Assessments{
+			AssessmentKeyResponseWithCode: 2,
+		},
+
+		ExpectedBool: true,
+	})
+
+	validate(t, &testCase{
+		Name: "Default Value",
+
+		Assessments: Assessments{
+			AssessmentKeyResponseWithCode: 2,
+			AssessmentKeyResponseNoError:  0,
+		},
+		X: Assessments{
+			AssessmentKeyResponseWithCode: 2,
+		},
+
+		ExpectedBool: true,
+	})
+
+	validate(t, &testCase{
+		Name: "Different Values",
+
+		Assessments: Assessments{
+			AssessmentKeyResponseWithCode: 3,
+		},
+		X: Assessments{
+			AssessmentKeyResponseWithCode: 2,
+		},
+
+		ExpectedBool: false,
+	})
+}
diff --git a/evaluate/repository.go b/evaluate/repository.go
@@ -55,14 +55,15 @@ func EvaluateRepository(resultPath string, model model.Model, language language.
 
 	repositoryAssessment = metrics.NewAssessments()
 	for _, filePath := range filePaths {
-		assessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath)
+		generationAssessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath)
 		if err != nil {
 			problems = append(problems, pkgerrors.WithMessage(err, filePath))
 			repositoryAssessment[metrics.AssessmentKeyFilesProblems]++
 
 			continue
 		}
-		repositoryAssessment.Add(assessments)
+		repositoryAssessment.Add(generationAssessments)
+		repositoryAssessment[metrics.AssessmentKeyResponseNoError]++
 
 		coverage, err := language.Execute(temporaryRepositoryPath)
 		if err != nil {
@@ -75,6 +76,11 @@ func EvaluateRepository(resultPath string, model model.Model, language language.
 		if coverage == 100 {
 			repositoryAssessment[metrics.AssessmentKeyCoverageStatement]++
 		}
+
+		// If we weren't able to determine if the response contained any code but the execution was successful, correct that now by incrementing the "AssessmentKeyResponseWithCode".
+		if generationAssessments[metrics.AssessmentKeyResponseWithCode] == 0 {
+			repositoryAssessment[metrics.AssessmentKeyResponseWithCode]++
+		}
 	}
 
 	return repositoryAssessment, problems, nil

diff --git a/model/llm/llm_test.go b/model/llm/llm_test.go
@@ -46,7 +46,7 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {
 
 			actualAssessment, actualError := llm.GenerateTestsForFile(tc.Language, temporaryPath, tc.SourceFilePath)
 			assert.NoError(t, actualError)
-			assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
+			assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)
 
 			actualTestFileContent, err := os.ReadFile(filepath.Join(temporaryPath, tc.ExpectedTestFilePath))
 			assert.NoError(t, err)
@@ -89,6 +89,8 @@ func TestModelLLMGenerateTestsForFile(t *testing.T) {
 
 		ExpectedAssessment: metrics.Assessments{
 			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseNotEmpty: 1,
+			metrics.AssessmentKeyResponseWithCode: 1,
 		},
 		ExpectedTestFileContent: `
 			package native

diff --git a/model/llm/prompt/parse.go b/model/llm/prompt/parse.go
@@ -17,28 +17,39 @@ var (
 func ParseResponse(response string) (assessment metrics.Assessments, code string) {
 	assessment = metrics.Assessments{}
 
+	if isOnlyWhitespace(response) {
+		return assessment, response
+	}
+	assessment[metrics.AssessmentKeyResponseNotEmpty] = 1
+
 	// Some models produce duplicated code tags, so unify them if needed.
 	response = codeTagDuplicatedMatch.ReplaceAllString(response, "```")
 
 	blocks := bytesutil.GuardedBlocks(response, codeTagMatch, codeTagMatch)
 
 	// When no code blocks are found, assume that just the code is returned.
 	if len(blocks) == 0 {
-		assessment[metrics.AssessmentKeyResponseNoExcess] = 1
+		// If we cannot distinguish between code and text, we sadly also cannot check if the response contains actual code or if there is any excess response content.
 
 		return assessment, strings.TrimSpace(response)
 	}
+	assessment[metrics.AssessmentKeyResponseWithCode] = 1
 
 	// Assume the first code block contains the response code fragment.
 	block := blocks[0]
 
 	// Check if the response contained only that single code block.
 	responseWithoutBlock := strings.Replace(response, block, "", 1)
-	if len(strings.TrimSpace(responseWithoutBlock)) == 0 {
+	if isOnlyWhitespace(responseWithoutBlock) {
 		assessment[metrics.AssessmentKeyResponseNoExcess] = 1
 	} else {
 		assessment[metrics.AssessmentKeyResponseNoExcess] = 0
 	}
 
 	return assessment, strings.TrimSpace(codeTagMatch.ReplaceAllString(block, ""))
 }
+
+// isOnlyWhitespace checks if the string contains only whitespace as defined by Unicode.
+func isOnlyWhitespace(data string) bool {
+	return len(strings.TrimSpace(data)) == 0
+}
diff --git a/model/llm/prompt/parse_test.go b/model/llm/prompt/parse_test.go
@@ -23,7 +23,7 @@ func TestParseResponse(t *testing.T) {
 		t.Run(tc.Name, func(t *testing.T) {
 			actualAssessment, actualCode := ParseResponse(tc.Response)
 
-			assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
+			assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)
 			assert.Equal(t, strings.TrimSpace(tc.ExpectedCode), actualCode)
 		})
 	}
@@ -38,13 +38,26 @@ func TestParseResponse(t *testing.T) {
 		}
 	`)
 
+	validate(t, &testCase{
+		Name: "Empty Response",
+
+		ExpectedAssessment: metrics.Assessments{
+			metrics.AssessmentKeyResponseNotEmpty: 0,
+			metrics.AssessmentKeyResponseNoExcess: 0,
+			metrics.AssessmentKeyResponseWithCode: 0,
+		},
+		ExpectedCode: "",
+	})
+
 	validate(t, &testCase{
 		Name: "Only Code",
 
 		Response: code,
 
 		ExpectedAssessment: metrics.Assessments{
-			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseNotEmpty: 1,
+			metrics.AssessmentKeyResponseNoExcess: 0,
+			metrics.AssessmentKeyResponseWithCode: 0,
 		},
 		ExpectedCode: code,
 	})
@@ -56,7 +69,9 @@ func TestParseResponse(t *testing.T) {
 			Response: "```\n" + code + "\n```\n",
 
 			ExpectedAssessment: metrics.Assessments{
+				metrics.AssessmentKeyResponseNotEmpty: 1,
 				metrics.AssessmentKeyResponseNoExcess: 1,
+				metrics.AssessmentKeyResponseWithCode: 1,
 			},
 			ExpectedCode: code,
 		})
@@ -67,7 +82,9 @@ func TestParseResponse(t *testing.T) {
 			Response: "Some text...\n\n```\n" + code + "\n```\n\nSome more text...",
 
 			ExpectedAssessment: metrics.Assessments{
+				metrics.AssessmentKeyResponseNotEmpty: 1,
 				metrics.AssessmentKeyResponseNoExcess: 0,
+				metrics.AssessmentKeyResponseWithCode: 1,
 			},
 			ExpectedCode: code,
 		})
@@ -79,7 +96,9 @@ func TestParseResponse(t *testing.T) {
 		Response: "```go\n" + code + "\n```\n",
 
 		ExpectedAssessment: metrics.Assessments{
+			metrics.AssessmentKeyResponseNotEmpty: 1,
 			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseWithCode: 1,
 		},
 		ExpectedCode: code,
 	})
@@ -89,7 +108,9 @@ func TestParseResponse(t *testing.T) {
 
 		Response: " ```\n" + code + "\n\t```\n",
 		ExpectedAssessment: metrics.Assessments{
+			metrics.AssessmentKeyResponseNotEmpty: 1,
 			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseWithCode: 1,
 		},
 		ExpectedCode: code,
 	})
@@ -99,7 +120,9 @@ func TestParseResponse(t *testing.T) {
 
 		Response: "```\n```\n" + code + "\n```\n```\n",
 		ExpectedAssessment: metrics.Assessments{
+			metrics.AssessmentKeyResponseNotEmpty: 1,
 			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseWithCode: 1,
 		},
 		ExpectedCode: code,
 	})

diff --git a/model/symflower/symflower.go b/model/symflower/symflower.go
@@ -35,7 +35,9 @@ func (m *ModelSymflower) GenerateTestsForFile(language language.Language, reposi
 		return nil, pkgerrors.WithStack(err)
 	}
 
-	return metrics.Assessments{
-		metrics.AssessmentKeyResponseNoExcess: 1, // Symflower only generates code, never additional explanations.
+	return metrics.Assessments{ // Symflower always generates just source code when it does not fail, so no need to check the assessment properties.
+		metrics.AssessmentKeyResponseNoExcess: 1,
+		metrics.AssessmentKeyResponseNotEmpty: 1,
+		metrics.AssessmentKeyResponseWithCode: 1,
 	}, nil
 }
diff --git a/model/symflower/symflower_test.go b/model/symflower/symflower_test.go
@@ -49,7 +49,7 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) {
 			} else if actualError != nil || tc.ExpectedErrorText != "" {
 				assert.ErrorContains(t, actualError, tc.ExpectedErrorText)
 			}
-			assert.Equal(t, tc.ExpectedAssessment, actualAssessment)
+			assert.Truef(t, tc.ExpectedAssessment.IsEqual(actualAssessment), "expected:%s\nactual:%s", tc.ExpectedAssessment, actualAssessment)
 
 			actualCoverage, err := tc.Language.Execute(repositoryPath)
 			require.NoError(t, err)
@@ -67,6 +67,8 @@ func TestModelSymflowerGenerateTestsForFile(t *testing.T) {
 
 		ExpectedAssessment: metrics.Assessments{
 			metrics.AssessmentKeyResponseNoExcess: 1,
+			metrics.AssessmentKeyResponseNotEmpty: 1,
+			metrics.AssessmentKeyResponseWithCode: 1,
 		},
 		ExpectedCoverage: 100,
 	})