From 800dbe37f86f8b9a763cb0ef95d87e530e79195d Mon Sep 17 00:00:00 2001
From: antonioloison <48316195+antonioloison@users.noreply.github.com>
Date: Fri, 20 Sep 2024 18:15:53 +0200
Subject: [PATCH] Fix/fix prompts and structured generation (#4)

* fix: fix prompts

* fix: remove structured generation

* fix: fix dataset split in plotting
---
 grouse/gpt4_prompts/faithfulness.txt |  7 ++-----
 grouse/gpt4_prompts/usefulness.txt   |  3 +--
 grouse/grounded_qa_evaluator.py      | 14 +-------------
 grouse/main.py                       |  2 +-
 4 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/grouse/gpt4_prompts/faithfulness.txt b/grouse/gpt4_prompts/faithfulness.txt
index be651ed..6f8b9f3 100644
--- a/grouse/gpt4_prompts/faithfulness.txt
+++ b/grouse/gpt4_prompts/faithfulness.txt
@@ -1,16 +1,13 @@
 [TASK]
 Task: Grounded Question Answering
-Based solely on the content of the references, the objective is to generate a response to the user's query. Each statement must be followed by
-the reference of the source passage, in the format [i] where i is the number of the reference. If no passage seems relevant, the answer should
-begin with "No document seems to precisely answer your question" and may be supplemented with related sourced information.
+Based solely on the content of the references, the objective is to generate a response to the user's query. Each statement must be followed by the reference of the source passage, in the format [i] where i is the number of the reference. If no passage seems relevant, the answer should begin with "No document seems to precisely answer your question" and may be supplemented with related sourced information.
 [/TASK]
 
 [EVALUATION INSTRUCTIONS]
 I will provide you with two answers, numbered 1 and 2, each containing a response to the user request. I want you to assign to each answer a boolean faithfulness grade. An answer is faithful if:
 - Each statement made by the answer is followed by a source indicating the reference from which it is drawn.
 - The information preceding the source is indeed from the corresponding reference.
-- The information preceding the source is in agreement with the corresponding reference, and does not assert facts different from those
-indicated in the reference.
+- The information preceding the source is in agreement with the corresponding reference, and does not assert facts different from those indicated in the reference.
 In all other cases, the response is considered non-faithful.
 Faithfulness is also considered non-measurable if the answer asserts that no document responds to the question, and it does not provide any related information, it is then `null`.
 
diff --git a/grouse/gpt4_prompts/usefulness.txt b/grouse/gpt4_prompts/usefulness.txt
index f59b304..02564c5 100644
--- a/grouse/gpt4_prompts/usefulness.txt
+++ b/grouse/gpt4_prompts/usefulness.txt
@@ -5,8 +5,7 @@ Based solely on the content of the references, the objective is to generate a re
 
 [EVALUATION INSTRUCTIONS]
 I will provide you with two answers, numbered 1 and 2, each containing a response to the user request. I want you to assign to each answer a usefulness grade of 0 or 1:
-- Usefulness is only evaluated when the answer says that no document precisely answers the user's question, but it still provides information
-related to the question.
+- Usefulness is only evaluated when the answer says that no document precisely answers the user's question, but it still provides information related to the question.
 - Usefulness measures how interesting the related information is to know for the user, given that there is no answer in the references.
 - If the answer responds to the user request, usefulness must be `null`.
 - If the answer indicates that no document responds to the user request, without adding other information, usefulness must be `null`.
diff --git a/grouse/grounded_qa_evaluator.py b/grouse/grounded_qa_evaluator.py
index bf36c5d..34f8360 100644
--- a/grouse/grounded_qa_evaluator.py
+++ b/grouse/grounded_qa_evaluator.py
@@ -31,11 +31,6 @@
 )
 from grouse.utils import get_positive_acceptance_negative_rejection
 
-STRUCTURED_OUTPUTS_SUPPORTING_MODELS = [
-    "gpt-4o-mini-2024-07-18",
-    "gpt-4o-2024-08-06",
-]
-
 
 class GroundedQAEvaluator:
     def __init__(
@@ -69,14 +64,7 @@ def __init__(
     async def call_llm(self, prompt: str, pair_model: ScorePair) -> Score | Failed:
         try:
             kwargs = {"temperature": 0.01, "max_tokens": 2048}
-            if self.model_name in STRUCTURED_OUTPUTS_SUPPORTING_MODELS:
-                response = await litellm.acompletion(
-                    model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
-                    response_format=pair_model,
-                    **kwargs,
-                )
-            elif "-turbo" in self.model_name or "4o" in self.model_name:
+            if "-turbo" in self.model_name or "4o" in self.model_name:
                 response = await litellm.acompletion(
                     model=self.model_name,
                     messages=[{"role": "user", "content": prompt}],
diff --git a/grouse/main.py b/grouse/main.py
index 499da74..cbb8f5f 100644
--- a/grouse/main.py
+++ b/grouse/main.py
@@ -156,7 +156,7 @@ def plot(meta_test_results_path: str) -> None:
         META_TEST_RESULTS_PATH (str): Path to meta evaluation results in jsonlines
             format.
     """
-    evaluation_samples, _ = load_unit_tests()
+    evaluation_samples, _ = load_unit_tests(dataset_split="test")
 
     results = []
     with jsonlines.open(meta_test_results_path, "r") as reader:
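
For reference, the branch removed from call_llm above used litellm's structured outputs: the Pydantic ScorePair model was passed as response_format so the provider returns JSON matching that schema. Below is a minimal standalone sketch of that call shape; the ScorePair fields, the model name, and the parsing step are illustrative assumptions rather than grouse's actual definitions, and it assumes a litellm version with Pydantic response_format support plus provider credentials in the environment. After this patch, "-turbo" and "4o" models all go through the single remaining branch, trading schema enforcement for one code path that does not depend on a hard-coded list of supporting model snapshots.

# Minimal sketch of the structured-outputs call removed by this patch.
# ScorePair is a hypothetical stand-in for grouse's own ScorePair model.
import asyncio

import litellm
from pydantic import BaseModel


class ScorePair(BaseModel):
    answer_1_faithful: bool
    answer_2_faithful: bool


async def score_with_structured_outputs(model_name: str, prompt: str) -> ScorePair:
    # Ask the provider to return JSON conforming to the ScorePair schema.
    response = await litellm.acompletion(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        response_format=ScorePair,
        temperature=0.01,
        max_tokens=2048,
    )
    # Parse the JSON completion text back into the schema.
    return ScorePair.model_validate_json(response.choices[0].message.content)


if __name__ == "__main__":
    pair = asyncio.run(
        score_with_structured_outputs(
            "gpt-4o-2024-08-06",
            "Grade the two answers for faithfulness and reply as JSON.",
        )
    )
    print(pair)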