From a8e7e40bfb97b50322288d79bb89d34196a998df Mon Sep 17 00:00:00 2001 From: Simon Bauer Date: Wed, 17 Apr 2024 11:23:13 +0200 Subject: [PATCH] refactor, Move existing metrics to assessments Closes #34 --- cmd/eval-dev-quality/cmd/evaluate.go | 8 +- evaluate/metrics/assessment.go | 74 ++++++++++++++++- evaluate/metrics/assessment_test.go | 96 ++++++++++++++++++++++ evaluate/metrics/metrics.go | 117 --------------------------- evaluate/metrics/metrics_test.go | 70 ---------------- evaluate/repository.go | 24 +++--- 6 files changed, 186 insertions(+), 203 deletions(-) delete mode 100644 evaluate/metrics/metrics.go delete mode 100644 evaluate/metrics/metrics_test.go diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index 15b06dd1..7d21bd8c 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -100,12 +100,12 @@ func (command *Evaluate) Execute(args []string) (err error) { // Check that models and languages can be evaluated by executing the "plain" repositories. log.Printf("Checking that models and languages can be used for evaluation") - metricsPerModel := map[string]metrics.Metrics{} + metricsPerModel := map[string]metrics.Assessments{} problemsPerModel := map[string][]error{} { // Ensure we report metrics for every model even if they are excluded. 
for _, modelID := range command.Models { - metricsPerModel[modelID] = metrics.Metrics{} + metricsPerModel[modelID] = metrics.NewAssessments() } for _, languageID := range command.Languages { @@ -114,7 +114,7 @@ func (command *Evaluate) Execute(args []string) (err error) { language := language.Languages[languageID] metrics, ps, err := evaluate.EvaluateRepository(model, language, filepath.Join(command.TestdataPath, language.ID(), "plain")) - metricsPerModel[modelID] = metricsPerModel[modelID].Add(metrics) + metricsPerModel[modelID].Add(metrics) if err != nil { ps = append(ps, err) } @@ -155,7 +155,7 @@ func (command *Evaluate) Execute(args []string) (err error) { language := language.Languages[languageID] metrics, ps, err := evaluate.EvaluateRepository(model, language, filepath.Join(languagePath, repository.Name())) - metricsPerModel[model.ID()] = metricsPerModel[model.ID()].Add(metrics) + metricsPerModel[model.ID()].Add(metrics) problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...) if err != nil { log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repository.Name(), err) diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go index 4257f87e..e874fc0c 100644 --- a/evaluate/metrics/assessment.go +++ b/evaluate/metrics/assessment.go @@ -1,5 +1,15 @@ package metrics +import ( + "encoding/csv" + "fmt" + "sort" + "strings" + + pkgerrors "github.com/pkg/errors" + "golang.org/x/exp/maps" +) + // AssessmentKey defines a key for a numerical key-value assessment pair. type AssessmentKey string @@ -20,6 +30,14 @@ func RegisterAssessmentKey(key string) AssessmentKey { } var ( + // AssessmentKeyFilesExecuted holds the successfully executed files. + AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed") + // AssessmentKeyFilesProblems holds the files with problems. 
+ AssessmentKeyFilesProblems = RegisterAssessmentKey("files-problems") + + // AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached. + AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement") + // AssessmentKeyNoExcessResponse indicates that a model did not produce more content as requested. AssessmentKeyNoExcessResponse = RegisterAssessmentKey("no-excess-response") ) @@ -27,7 +45,7 @@ var ( // Assessments holds a collection of numerical assessment metrics. type Assessments map[AssessmentKey]uint -// NewAssessments create a new assessment collection. +// NewAssessments creates a new assessment collection. func NewAssessments() Assessments { return map[AssessmentKey]uint{} } @@ -51,3 +69,57 @@ func Merge(a Assessments, b Assessments) (c Assessments) { return c } + +// String returns a string representation of the metrics. +func (a Assessments) String() string { + if a == nil { + a = NewAssessments() + } + metrics := make([]string, len(allAssessmentKeys)) + + for i, key := range allAssessmentKeys { + metrics[i] = fmt.Sprintf("%s=%d", key, a[key]) + } + + return strings.Join(metrics, ", ") +} + +// StringCSV returns a CSV row string representation of the metrics. +func (a Assessments) StringCSV() (row []string) { + if a == nil { + a = NewAssessments() + } + + row = make([]string, len(allAssessmentKeys)) + for i, key := range allAssessmentKeys { + row[i] = fmt.Sprintf("%d", a[key]) + } + + return row +} + +func csvHeader() []string { + return append([]string{"model"}, allAssessmentKeysStrings...) +} + +// FormatStringCSV formats the given metrics as CSV. 
+func FormatStringCSV(metricsPerModel map[string]Assessments) (string, error) { + var out strings.Builder + csv := csv.NewWriter(&out) + + if err := csv.Write(csvHeader()); err != nil { + return "", err + } + models := maps.Keys(metricsPerModel) + sort.Strings(models) + for _, model := range models { + row := metricsPerModel[model].StringCSV() + + if err := csv.Write(append([]string{model}, row...)); err != nil { + return "", pkgerrors.WithStack(err) + } + } + csv.Flush() + + return out.String(), nil +} diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go index 5677d09b..3a6f4ee2 100644 --- a/evaluate/metrics/assessment_test.go +++ b/evaluate/metrics/assessment_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/zimmski/osutil/bytesutil" ) func TestAssessmentsAdd(t *testing.T) { @@ -114,3 +115,98 @@ func TestMerge(t *testing.T) { }, }) } + +func TestAssessmentString(t *testing.T) { + type testCase struct { + Name string + + Assessment Assessments + + ExpectedString string + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualString := tc.Assessment.String() + + assert.Equal(t, tc.ExpectedString, actualString) + }) + } + + validate(t, &testCase{ + Name: "Initial Metrics", + + Assessment: NewAssessments(), + + ExpectedString: "files-executed=0, files-problems=0, coverage-statement=0, no-excess-response=0", + }) + + validate(t, &testCase{ + Name: "Empty Metrics", + + Assessment: Assessments{ + AssessmentKeyCoverageStatement: 1, + AssessmentKeyFilesExecuted: 2, + AssessmentKeyFilesProblems: 3, + AssessmentKeyNoExcessResponse: 4, + }, + + ExpectedString: "files-executed=2, files-problems=3, coverage-statement=1, no-excess-response=4", + }) +} + +func TestFormatStringCSV(t *testing.T) { + type testCase struct { + Name string + + AssessmentPerModel map[string]Assessments + + ExpectedString string + } + + validate := func(t *testing.T, tc *testCase) { 
+ t.Run(tc.Name, func(t *testing.T) { + actualString, err := FormatStringCSV(tc.AssessmentPerModel) + assert.NoError(t, err) + + assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) + }) + } + + validate(t, &testCase{ + Name: "Single Empty Model", + + AssessmentPerModel: map[string]Assessments{ + "Model": Assessments{}, + }, + + ExpectedString: ` + model,files-executed,files-problems,coverage-statement,no-excess-response + Model,0,0,0,0 + `, + }) + validate(t, &testCase{ + Name: "Multiple Models", + + AssessmentPerModel: map[string]Assessments{ + "ModelA": Assessments{ + AssessmentKeyCoverageStatement: 1, + AssessmentKeyFilesExecuted: 2, + AssessmentKeyFilesProblems: 3, + AssessmentKeyNoExcessResponse: 4, + }, + "ModelB": Assessments{ + AssessmentKeyCoverageStatement: 1, + AssessmentKeyFilesExecuted: 2, + AssessmentKeyFilesProblems: 3, + AssessmentKeyNoExcessResponse: 4, + }, + }, + + ExpectedString: ` + model,files-executed,files-problems,coverage-statement,no-excess-response + ModelA,2,3,1,4 + ModelB,2,3,1,4 + `, + }) +} diff --git a/evaluate/metrics/metrics.go b/evaluate/metrics/metrics.go deleted file mode 100644 index 1ae422c8..00000000 --- a/evaluate/metrics/metrics.go +++ /dev/null @@ -1,117 +0,0 @@ -package metrics - -import ( - "encoding/csv" - "fmt" - "math" - "sort" - "strings" - - pkgerrors "github.com/pkg/errors" - "golang.org/x/exp/maps" - "gonum.org/v1/gonum/stat" -) - -// Metrics holds numerical benchmarking metrics. -// TODO Move all metrics to assessment. https://github.com/symflower/eval-dev-quality/issues/34 -type Metrics struct { - // Executed is the number of benchmarking candidates with successful execution. - Executed uint - // Problems is the number of benchmarking candidates with problems. - Problems uint - // Total is the total number of benchmarking candidates. - Total uint - - // Coverage holds the coverage of the benchmarking candidates. 
- Coverage []float64 - - // Assessments holds numerical assessments of a generation. - Assessments Assessments -} - -// Add sums two metrics objects. -func (m Metrics) Add(o Metrics) Metrics { - return Metrics{ - Problems: m.Problems + o.Problems, - Executed: m.Executed + o.Executed, - Total: m.Total + o.Total, - - Coverage: append(m.Coverage, o.Coverage...), - - Assessments: Merge(m.Assessments, o.Assessments), - } -} - -// AverageCoverage returns the average coverage. -func (m Metrics) AverageCoverage() float64 { - averageCoverage := stat.Mean(m.Coverage, nil) - if math.IsNaN(averageCoverage) { - averageCoverage = 0 - } - - return averageCoverage -} - -// String returns a string representation of the metrics. -func (m Metrics) String() string { - problemsPercentage := float64(m.Problems) / float64(m.Total) * 100.0 - if math.IsNaN(problemsPercentage) { - problemsPercentage = 0 - } - executedPercentage := float64(m.Executed) / float64(m.Total) * 100.0 - if math.IsNaN(executedPercentage) { - executedPercentage = 0 - } - return fmt.Sprintf( - "#executed=%3.1f%%(%d/%d), #problems=%3.1f%%(%d/%d), average statement coverage=%3.1f%%", - executedPercentage, - m.Executed, - m.Total, - problemsPercentage, - m.Problems, - m.Total, - m.AverageCoverage(), - ) -} - -// StringCSV returns a CSV row string representation of the metrics. -func (m Metrics) StringCSV() (row []string) { - assessment := m.Assessments - if assessment == nil { - assessment = Assessments{} - } - - row = []string{ - fmt.Sprintf("%d", m.Total), - fmt.Sprintf("%d", m.Executed), - fmt.Sprintf("%d", m.Problems), - fmt.Sprintf("%.0f", m.AverageCoverage()), - } - for _, key := range allAssessmentKeys { - row = append(row, fmt.Sprintf("%d", assessment[key])) - } - - return row -} - -// FormatStringCSV formats the given metrics as CSV. 
-func FormatStringCSV(metricsPerModel map[string]Metrics) (string, error) { - var out strings.Builder - csv := csv.NewWriter(&out) - - if err := csv.Write(append([]string{"model", "files-total", "files-executed", "files-problems", "coverage-statement"}, allAssessmentKeysStrings...)); err != nil { - return "", err - } - categories := maps.Keys(metricsPerModel) - sort.Strings(categories) - for _, category := range categories { - row := metricsPerModel[category].StringCSV() - - if err := csv.Write(append([]string{category}, row...)); err != nil { - return "", pkgerrors.WithStack(err) - } - } - csv.Flush() - - return out.String(), nil -} diff --git a/evaluate/metrics/metrics_test.go b/evaluate/metrics/metrics_test.go deleted file mode 100644 index cf09c10e..00000000 --- a/evaluate/metrics/metrics_test.go +++ /dev/null @@ -1,70 +0,0 @@ -package metrics - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/zimmski/osutil/bytesutil" -) - -func TestFormatStringCSV(t *testing.T) { - type testCase struct { - Name string - - MetricsPerModel map[string]Metrics - - ExpectedString string - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - actualString, err := FormatStringCSV(tc.MetricsPerModel) - assert.NoError(t, err) - - assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) - }) - } - - validate(t, &testCase{ - Name: "Single Empty Model", - - MetricsPerModel: map[string]Metrics{ - "Model": Metrics{}, - }, - - ExpectedString: ` - model,files-total,files-executed,files-problems,coverage-statement,no-excess-response - Model,0,0,0,0,0 - `, - }) - validate(t, &testCase{ - Name: "Multiple Models", - - MetricsPerModel: map[string]Metrics{ - "ModelA": Metrics{ - Total: 5, - Executed: 3, - Problems: 2, - Coverage: []float64{100.0}, - Assessments: Assessments{ - AssessmentKeyNoExcessResponse: 3, - }, - }, - "ModelB": Metrics{ - Total: 4, - Executed: 2, - Problems: 2, - Coverage: 
[]float64{70.0}, - Assessments: Assessments{ - AssessmentKeyNoExcessResponse: 2, - }, - }, - }, - - ExpectedString: ` - model,files-total,files-executed,files-problems,coverage-statement,no-excess-response - ModelA,5,3,2,100,3 - ModelB,4,2,2,70,2 - `, - }) -} diff --git a/evaluate/repository.go b/evaluate/repository.go index ba564912..008955d9 100644 --- a/evaluate/repository.go +++ b/evaluate/repository.go @@ -15,7 +15,7 @@ import ( ) // EvaluateRepository evaluate a repository with the given model and language. -func EvaluateRepository(model model.Model, language language.Language, repositoryPath string) (metrics metrics.Metrics, problems []error, err error) { +func EvaluateRepository(model model.Model, language language.Language, repositoryPath string) (repositoryMetrics metrics.Assessments, problems []error, err error) { log.Printf("Evaluating model %q using language %q and repository %q", model.ID(), language.ID(), repositoryPath) defer func() { log.Printf("Evaluated model %q using language %q and repository %q: encountered %d problems", model.ID(), language.ID(), repositoryPath, len(problems)) @@ -23,7 +23,7 @@ func EvaluateRepository(model model.Model, language language.Language, repositor temporaryPath, err := os.MkdirTemp("", "eval-dev-quality") if err != nil { - return metrics, problems, pkgerrors.WithStack(err) + return repositoryMetrics, problems, pkgerrors.WithStack(err) } defer func() { if e := os.RemoveAll(temporaryPath); e != nil { @@ -36,35 +36,37 @@ func EvaluateRepository(model model.Model, language language.Language, repositor }() temporaryRepositoryPath := filepath.Join(temporaryPath, filepath.Base(repositoryPath)) if err := osutil.CopyTree(repositoryPath, temporaryRepositoryPath); err != nil { - return metrics, problems, pkgerrors.WithStack(err) + return repositoryMetrics, problems, pkgerrors.WithStack(err) } filePaths, err := language.Files(repositoryPath) if err != nil { - return metrics, problems, pkgerrors.WithStack(err) + return 
repositoryMetrics, problems, pkgerrors.WithStack(err) } + repositoryMetrics = metrics.NewAssessments() for _, filePath := range filePaths { - metrics.Total++ assessments, err := model.GenerateTestsForFile(language, temporaryRepositoryPath, filePath) if err != nil { problems = append(problems, pkgerrors.WithMessage(err, filePath)) - metrics.Problems++ + repositoryMetrics[metrics.AssessmentKeyFilesProblems]++ continue } - metrics.Assessments.Add(assessments) + repositoryMetrics.Add(assessments) coverage, err := language.Execute(temporaryRepositoryPath) if err != nil { problems = append(problems, pkgerrors.WithMessage(err, filePath)) - metrics.Problems++ + repositoryMetrics[metrics.AssessmentKeyFilesProblems]++ continue } - metrics.Executed++ - metrics.Coverage = append(metrics.Coverage, coverage) + repositoryMetrics[metrics.AssessmentKeyFilesExecuted]++ + if coverage == 100 { + repositoryMetrics[metrics.AssessmentKeyCoverageStatement]++ + } } - return metrics, problems, nil + return repositoryMetrics, problems, nil }