Skip to content

Commit

Permalink
Merge pull request #369 from symflower/remove-scoring
Browse files Browse the repository at this point in the history
Remove scoring from within the evaluation and remove and lint unused functions
  • Loading branch information
ruiAzevedo19 authored Jan 8, 2025
2 parents 4c78efe + 92cf309 commit 74457ae
Show file tree
Hide file tree
Showing 19 changed files with 146 additions and 955 deletions.
13 changes: 3 additions & 10 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"golang.org/x/exp/maps"

"github.com/symflower/eval-dev-quality/evaluate"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/evaluate/report"
evaltask "github.com/symflower/eval-dev-quality/evaluate/task"
"github.com/symflower/eval-dev-quality/language"
Expand Down Expand Up @@ -531,20 +530,14 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
DateTime: command.timestamp,
Version: evaluate.Version,
Revision: evaluate.Revision,

CSVPath: "./evaluation.csv",
LogPaths: []string{"./evaluation.log"},
ModelLogsPath: ".",
}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
command.logger.Panicf("ERROR: %s", err)
}

assessmentsPerModel := assessments.CollapseByModel()
_ = assessmentsPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) {
command.logger.Printf("Evaluation score for %q: %s", model, assessment)

return nil
})
for _, modelID := range maps.Keys(assessmentsPerModel) {
command.logger.Printf("Evaluation assessments for %q: %s", modelID, assessmentsPerModel[modelID])
}

return nil
}
Expand Down
142 changes: 71 additions & 71 deletions cmd/eval-dev-quality/cmd/evaluate_test.go

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ func TestEvaluate(t *testing.T) {
Context *Context

ExpectedAssessments metricstesting.AssessmentTuples
ExpectedTotalScore uint64
ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string)
}
Expand Down Expand Up @@ -226,7 +225,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -304,7 +302,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -400,7 +397,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -494,7 +490,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -665,7 +660,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 0,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -806,7 +800,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 0,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -893,7 +886,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 0,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -982,7 +974,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -1074,7 +1065,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -1195,7 +1185,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -1299,7 +1288,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down Expand Up @@ -1385,7 +1373,6 @@ func TestEvaluate(t *testing.T) {
},
},
},
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.log": nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil,
Expand Down
65 changes: 16 additions & 49 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,50 +15,45 @@ var (
allAssessmentKeys []AssessmentKey
// AllAssessmentKeysStrings returns all registered assessment keys as strings.
AllAssessmentKeysStrings []string

// multiplierPerAssessment holds the multipliers awarded for a specific assessment.
multiplierPerAssessment = map[AssessmentKey]uint64{}
)

// RegisterAssessmentKey registers a new assessment key.
// If the multiplier for this assessment type is zero, it is ignored for the score computation.
func RegisterAssessmentKey(key string, multiplier uint64) AssessmentKey {
func RegisterAssessmentKey(key string) AssessmentKey {
assessment := AssessmentKey(key)
i := sort.SearchStrings(AllAssessmentKeysStrings, key)

allAssessmentKeys = slices.Insert(allAssessmentKeys, i, assessment)
AllAssessmentKeysStrings = slices.Insert(AllAssessmentKeysStrings, i, key)
multiplierPerAssessment[assessment] = multiplier

return assessment
}

var (
// AssessmentKeyFilesExecuted holds the successfully executed files.
AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed", 1)
AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed")
// AssessmentKeyFilesExecutedMaximumReachable holds the maximum theoretically reachable executed files.
AssessmentKeyFilesExecutedMaximumReachable = RegisterAssessmentKey("files-executed-maximum-reachable", 0)
AssessmentKeyFilesExecutedMaximumReachable = RegisterAssessmentKey("files-executed-maximum-reachable")
// AssessmentKeyProcessingTime holds the time in milliseconds that it took to complete the task.
AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time", 0)
AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time")

// AssessmentKeyCoverage counts execution coverage objects.
AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)
AssessmentKeyCoverage = RegisterAssessmentKey("coverage")

// AssessmentKeyTestsPassing holds the percentage of passing tests.
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing")

// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count")
// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count", 0)
AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count")

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error", 1)
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error")
// AssessmentKeyResponseWithCode indicates that a model responded with code.
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code", 1)
AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code")
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
// TODO Infer if a model produced "too much" code. https://github.com/symflower/eval-dev-quality/issues/44
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess", 1)
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")
)

// Assessments holds a collection of numerical assessment metrics.
Expand Down Expand Up @@ -91,42 +86,14 @@ func (a Assessments) Equal(x Assessments) bool {
return true
}

// Merge builds a new assessment collection out of the two given collections and returns it.
// Each non-nil input is added to the new collection in order ("a" first, then "b"); a nil input is skipped.
func Merge(a Assessments, b Assessments) (c Assessments) {
	c = NewAssessments()

	for _, assessments := range []Assessments{a, b} {
		if assessments == nil {
			continue
		}

		c.Add(assessments)
	}

	return c
}

// Score sums up the values of all assessments in the collection that count towards the score.
// An assessment only counts if the multiplier registered for its key is non-zero, so an empty collection scores zero.
func (a Assessments) Score() (score uint64) {
	for assessmentKey, points := range a {
		// Keys with a zero multiplier are excluded from the score computation.
		if multiplierPerAssessment[assessmentKey] == 0 {
			continue
		}

		score += points
	}

	return score
}

// Award yields the score points defined for the given key.
// Award yields a score point.
func (a Assessments) Award(key AssessmentKey) {
a[key] += multiplierPerAssessment[key]
a[key]++
}

// AwardPoints yields multiple score points defined for the given key.
func (a Assessments) AwardPoints(key AssessmentKey, count uint64) {
a[key] += multiplierPerAssessment[key] * count
// AwardMultiple yields multiple score points.
func (a Assessments) AwardMultiple(key AssessmentKey, count uint64) {
a[key] += count
}

// String returns a string representation of the metrics.
Expand Down
103 changes: 6 additions & 97 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,59 +62,6 @@ func TestAssessmentsAdd(t *testing.T) {
})
}

// TestAssessmentsMerge checks "Merge" for unset inputs, for a key that exists in only one input and for a key that exists in both inputs.
func TestAssessmentsMerge(t *testing.T) {
	type testCase struct {
		Name string

		A Assessments
		B Assessments

		ExpectedC Assessments
	}

	testCases := []*testCase{
		{
			Name: "Empty",

			ExpectedC: NewAssessments(),
		},
		{
			Name: "Non existing key",

			A: NewAssessments(),
			B: map[AssessmentKey]uint64{
				AssessmentKeyResponseNoExcess: 1,
			},

			ExpectedC: map[AssessmentKey]uint64{
				AssessmentKeyResponseNoExcess: 1,
			},
		},
		{
			Name: "Existing key",

			A: map[AssessmentKey]uint64{
				AssessmentKeyResponseNoExcess: 1,
			},
			B: map[AssessmentKey]uint64{
				AssessmentKeyResponseNoExcess: 1,
			},

			ExpectedC: map[AssessmentKey]uint64{
				AssessmentKeyResponseNoExcess: 2,
			},
		},
	}

	// The sub-tests run sequentially, in declaration order, one per test case.
	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			assert.Equal(t, tc.ExpectedC, Merge(tc.A, tc.B))
		})
	}
}

func TestAssessmentString(t *testing.T) {
type testCase struct {
Name string
Expand Down Expand Up @@ -153,10 +100,10 @@ func TestAssessmentString(t *testing.T) {
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyTestsPassing: 70,
AssessmentKeyTestsPassing: 7,
},

ExpectedString: "coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=70",
ExpectedString: "coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=7",
})
}

Expand Down Expand Up @@ -237,44 +184,6 @@ func TestAssessmentsEqual(t *testing.T) {
})
}

// TestAssessmentsScore checks "Assessments.Score" for an empty collection and for a collection holding several assessment values.
func TestAssessmentsScore(t *testing.T) {
	type testCase struct {
		Name string

		Assessments Assessments

		ExpectedScore uint64
	}

	testCases := []*testCase{
		{
			Name: "Empty Assessment",

			Assessments: NewAssessments(),

			ExpectedScore: 0,
		},
		{
			Name: "Values Assessment",

			Assessments: Assessments{
				AssessmentKeyFilesExecuted:  5,
				AssessmentKeyCoverage:       4,
				AssessmentKeyProcessingTime: 200,
			},

			// The expected total is 5 + 4: the processing time value is not part of it.
			ExpectedScore: 9,
		},
	}

	// The sub-tests run sequentially, in declaration order, one per test case.
	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			assert.Equal(t, tc.ExpectedScore, tc.Assessments.Score())
		})
	}
}

func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
type testCase struct {
Name string
Expand Down Expand Up @@ -309,21 +218,21 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
SymflowerFixAssessments: Assessments{
AssessmentKeyFilesExecuted: 1,
AssessmentKeyProcessingTime: uint64(100),
AssessmentKeyCoverage: 10,
AssessmentKeyCoverage: 1,
AssessmentKeyResponseNoError: 1,
AssessmentKeyTestsPassing: 100,
AssessmentKeyTestsPassing: 10,
},

ExpectedAssessments: Assessments{
AssessmentKeyFilesExecuted: 1,
AssessmentKeyProcessingTime: uint64(300),
AssessmentKeyCoverage: 10,
AssessmentKeyCoverage: 1,
AssessmentKeyResponseCharacterCount: 100,
AssessmentKeyGenerateTestsForFileCharacterCount: 50,
AssessmentKeyResponseNoError: 0,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyResponseNoExcess: 1,
AssessmentKeyTestsPassing: 100,
AssessmentKeyTestsPassing: 10,
},
})
}
Loading

0 comments on commit 74457ae

Please sign in to comment.