feat: add meta-evaluation to Python package (#7)
Co-authored-by: António Loison <[email protected]>
Authored by antonioloison and António Loison on Sep 24, 2024
Parent: f227c8f · Commit: 1969e27
Showing 3 changed files with 35 additions and 22 deletions.
grouse/__init__.py (1 addition, 0 deletions)
```diff
@@ -1,2 +1,3 @@
 from grouse.dtos import EvaluationSample, ExpectedGroundedQAEvaluation
 from grouse.grounded_qa_evaluator import GroundedQAEvaluator
+from grouse.meta_evaluator import meta_evaluate_pipeline
```
grouse/main.py (3 additions, 21 deletions)
```diff
@@ -5,9 +5,9 @@
 import click
 import jsonlines
 
-from grouse.dtos import EvaluationSample, MetaTestCase, MetaTestCaseResult
+from grouse.dtos import EvaluationSample, MetaTestCaseResult
 from grouse.grounded_qa_evaluator import GroundedQAEvaluator
-from grouse.meta_evaluator import MetaEvaluator
+from grouse.meta_evaluator import meta_evaluate_pipeline
 from grouse.plot import plot_matrices
 from grouse.register_models import register_models
 from grouse.utils import NanConverter, load_unit_tests
@@ -111,26 +111,8 @@ def meta_evaluate(
     OUTPUT_DIR_PATH (str): Path to directory where results report and
         unit test results are saved.
     """
-    evaluation_samples, conditions = load_unit_tests("train" if train_set else "test")
-
-    evaluator = GroundedQAEvaluator(model_name, prompts_path=prompts_path)
-    evaluations = evaluator.evaluate_multiple_samples(evaluation_samples)
-
-    meta_evaluator = MetaEvaluator()
-
-    meta_test_cases = []
-    for sample, evaluation, condition in zip(
-        evaluation_samples, evaluations, conditions
-    ):
-        meta_test_cases.append(
-            MetaTestCase(
-                evaluation_sample=sample,
-                actual_evaluation=evaluation,
-                expected_evaluation=condition,
-            )
-        )
 
-    meta_evaluations = meta_evaluator.evaluate(meta_test_cases)
+    meta_evaluations = meta_evaluate_pipeline(model_name, prompts_path, train_set)
 
     os.makedirs(output_dir_path, exist_ok=True)
     with open(
```
grouse/meta_evaluator.py (31 additions, 1 deletion)
```diff
@@ -8,7 +8,8 @@
     MetaTestCaseResult,
     Score,
 )
-from grouse.utils import get_positive_acceptance_negative_rejection
+from grouse.grounded_qa_evaluator import GroundedQAEvaluator
+from grouse.utils import get_positive_acceptance_negative_rejection, load_unit_tests
 
 
 class MetaEvaluator:
@@ -153,3 +154,32 @@ def evaluate(self, test_cases: List[MetaTestCase]) -> MetaEvaluationsAndReport:
                 total=total,
             ),
         )
+
+
+def meta_evaluate_pipeline(
+    model_name: str,
+    prompts_path: Optional[str] = None,
+    train_set: bool = False,
+) -> List[MetaEvaluationsAndReport]:
+    evaluation_samples, conditions = load_unit_tests("train" if train_set else "test")
+
+    evaluator = GroundedQAEvaluator(model_name, prompts_path=prompts_path)
+    evaluations = evaluator.evaluate_multiple_samples(evaluation_samples)
+
+    meta_evaluator = MetaEvaluator()
+
+    meta_test_cases = []
+    for sample, evaluation, condition in zip(
+        evaluation_samples, evaluations, conditions
+    ):
+        meta_test_cases.append(
+            MetaTestCase(
+                evaluation_sample=sample,
+                actual_evaluation=evaluation,
+                expected_evaluation=condition,
+            )
+        )
+
+    meta_evaluations = meta_evaluator.evaluate(meta_test_cases)
+
+    return meta_evaluations
```
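For readers of this commit, here is a minimal usage sketch of the function the change exports from the package root. It is not part of the commit itself; the model name is a placeholder for whichever evaluator model the project supports, and only the argument names visible in the diff are assumed.

```python
# Minimal sketch (not from the commit): call the pipeline now exported
# via grouse/__init__.py.
from grouse import meta_evaluate_pipeline

# The model name below is a placeholder; pass whichever evaluator model you use.
report = meta_evaluate_pipeline(
    "gpt-4o",           # model_name
    prompts_path=None,  # fall back to the default prompts
    train_set=False,    # evaluate against the "test" unit tests
)
print(report)
```

This mirrors what the refactored `meta_evaluate` command in `grouse/main.py` now does before writing its results to `output_dir_path`.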
