chore: remove instructor

antonioloison committed Sep 4, 2024
1 parent a8ef2e0 commit d553d0f
Showing 8 changed files with 84 additions and 209 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# CHANGELOG

## Unreleased

### Changed

- Remove `instructor` package.

## 0.1.0

### Added
10 changes: 7 additions & 3 deletions grouse/dtos.py
@@ -4,16 +4,20 @@
from typing_extensions import override


# Sentinel class used until PEP 0661 is accepted
class Failed(BaseModel):
"""
A sentinel singleton class used to distinguish failed request results
from results with the value None (which may have different behavior).
A sentinel class used to distinguish failed request results from results
with the value None (which may have different behavior).
"""

error: Optional[str] = None

def __bool__(self) -> Literal[False]:
return False

def __int__(self) -> int:
return 0

@override
def __repr__(self) -> str:
return "FAILED"
81 changes: 55 additions & 26 deletions grouse/grounded_qa_evaluator.py
@@ -1,12 +1,15 @@
import asyncio
import json
import logging
import sys
from typing import List, Optional

import aiohttp
import instructor
import litellm
import numpy as np
from importlib_resources import files
from jinja2 import Environment, FileSystemLoader
from pydantic_core import ValidationError
from tqdm.asyncio import tqdm

from grouse.dtos import (
@@ -26,10 +29,13 @@
Usefulness,
UsefulnessPair,
)
from grouse.llm_calls.cached_instructor import CachedAsyncInstructor
from grouse.llm_calls.tracker import Tracker
from grouse.utils import get_positive_acceptance_negative_rejection

STRUCTURED_OUTPUTS_SUPPORTING_MODELS = [
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
]


class GroundedQAEvaluator:
def __init__(
@@ -46,28 +52,49 @@ def __init__(
else:
self.environment = Environment(loader=FileSystemLoader(prompts_path))

if cache_path is None:
cache = litellm.Cache(type="disk", disk_cache_dir=".grouse_cache/")
else:
cache = litellm.Cache(type="disk", disk_cache_dir=cache_path)

self.tracker = Tracker()
self.async_client = CachedAsyncInstructor(
client=None,
create=instructor.patch(create=litellm.acompletion),
cache=cache,
tracker=self.tracker,
self.logger = logging.getLogger("LLM Call Tracker")
self.logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
self.logger.addHandler(handler)

litellm.enable_cache("disk", cache_path)

self.cost = 0

async def call_llm(self, prompt: str, pair_model: ScorePair) -> Score | Failed:
pair = await self.async_client.chat.completions.create(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_model=pair_model,
)
if pair is None:
return Failed()
return pair.answer_2
try:
if self.model_name in STRUCTURED_OUTPUTS_SUPPORTING_MODELS:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_format=pair_model,
)
elif "-turbo" in self.model_name or "4o" in self.model_name:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"},
)
else:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
)
pair = pair_model(**json.loads(response.choices[0].message.content))
self.cost += litellm.completion_cost(response)
return pair.answer_2

except (ValidationError, json.decoder.JSONDecodeError) as val_error:
logging.debug(
f"Call to {self.model_name} with prompt: {prompt}\n"
f"returned the following error:\n{val_error}"
)
return Failed(error=str(val_error))

async def evaluate_answer_relevancy(
self, eval_sample: EvaluationSample
@@ -121,12 +148,14 @@ async def evaluate_single_sample(
completeness = await self.evaluate_completeness(eval_sample)

if isinstance(answer_relevancy, Failed):
usefulness = Failed()
faithfulness = Failed()
usefulness = Failed(error="answer_relevancy failed")
faithfulness = Failed(error="answer_relevancy failed")
else:
if answer_relevancy.answer_relevancy is None:
usefulness = await self.evaluate_usefulness(eval_sample)
if usefulness.usefulness is None:
if isinstance(usefulness, Failed):
faithfulness = Failed(error="usefulness failed")
elif usefulness.usefulness is None:
faithfulness = Faithfulness(
faithfulness_justification="", faithfulness=None
)
@@ -163,7 +192,7 @@ def evaluate_multiple_samples(
self, eval_samples: List[EvaluationSample]
) -> List[GroundedQAEvaluation]:
results = asyncio.run(self.async_evaluate_multiple_samples(eval_samples))
self.tracker.log_summary()
self.logger.info(f"Cost: {self.cost:.4f}$")
return results

def evaluate(self, eval_samples: List[EvaluationSample]) -> EvaluationsAndReport:
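
For context (not part of the diff), a standalone sketch of the pattern `call_llm` adopts after dropping `instructor`: a direct `litellm.acompletion` call with `response_format`, followed by manual JSON parsing and Pydantic validation. The `SimpleScorePair` model and prompt are simplified placeholders, and running it requires API credentials that LiteLLM can pick up from the environment.

import asyncio
import json

import litellm
from pydantic import BaseModel
from pydantic_core import ValidationError


class SimpleScorePair(BaseModel):
    answer_1: str
    answer_2: str


async def call_llm(prompt: str) -> SimpleScorePair | None:
    # gpt-4o-2024-08-06 supports structured outputs, so the Pydantic model can be
    # passed directly as response_format; other models would fall back to JSON mode
    # or a plain completion, as in the branches of call_llm above.
    response = await litellm.acompletion(
        model="gpt-4o-2024-08-06",
        messages=[{"role": "user", "content": prompt}],
        response_format=SimpleScorePair,
    )
    try:
        return SimpleScorePair(**json.loads(response.choices[0].message.content))
    except (ValidationError, json.JSONDecodeError):
        return None  # the evaluator returns Failed(error=...) here instead


if __name__ == "__main__":
    # Requires an API key in the environment (e.g. OPENAI_API_KEY).
    pair = asyncio.run(call_llm("Answer twice, as JSON with keys answer_1 and answer_2."))
    print(pair)
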
76 changes: 0 additions & 76 deletions grouse/llm_calls/cached_instructor.py

This file was deleted.

89 changes: 0 additions & 89 deletions grouse/llm_calls/tracker.py

This file was deleted.

20 changes: 8 additions & 12 deletions grouse/main.py
@@ -32,7 +32,10 @@ def cli() -> None:
@click.option(
"--prompts_path",
type=str,
help="Path to the evaluation prompts folder.",
help=(
"Path to the folder containing the prompts of the evaluator. "
"By default, the prompts are those optimized for GPT-4."
),
default=None,
)
def evaluate(
@@ -49,12 +52,6 @@ def evaluate(
actual_output (generation from the model to evaluate) and expected_output.
OUTPUT_DIR_PATH (str): Path to directory where results report and
evaluations are saved.
Options:
--evaluator_model_name (str): Name of the evaluator model. It can be any
LiteLLM model. The default model is gpt-4.
--prompts_path (str): Path to the folder containing the prompts of the evaluator
for each metric. By default, the prompts are those optimized for GPT-4.
"""
evaluator = GroundedQAEvaluator(
model_name=evaluator_model_name, prompts_path=prompts_path
@@ -85,7 +82,10 @@ def evaluate(
@click.option(
"--prompts_path",
type=str,
help="Path to the evaluation prompts folder.",
help=(
"Path to the folder containing the prompts of the evaluator. "
"By default, the prompts are those optimized for GPT-4."
),
default=None,
)
def meta_evaluate(
@@ -97,10 +97,6 @@ def meta_evaluate(
MODEL_NAME (str): Name of model available through LiteLLM.
OUTPUT_DIR_PATH (str): Path to directory where results report and
unit test results are saved.
Options:
--prompts_path (str): Path to the folder containing the prompts of the
evaluator. By default, the prompts are those optimized for GPT-4.
"""
evaluation_samples, conditions = load_unit_tests()

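
For context (not part of the diff), a hedged sketch of the programmatic path behind the `evaluate` command (the CLI additionally writes the report and evaluations to OUTPUT_DIR_PATH). The dataset path is a placeholder, and each JSONL line is assumed to carry exactly the fields `EvaluationSample` expects (the docstring above names `actual_output` and `expected_output` among them); check `grouse/dtos.py` for the real schema.

import json
from pathlib import Path

from grouse.dtos import EvaluationSample
from grouse.grounded_qa_evaluator import GroundedQAEvaluator

# dataset.jsonl is a hypothetical path; one JSON object per line.
samples = [
    EvaluationSample(**json.loads(line))
    for line in Path("dataset.jsonl").read_text().splitlines()
    if line.strip()
]

evaluator = GroundedQAEvaluator(
    model_name="gpt-4o-2024-08-06",  # any model name LiteLLM accepts
    prompts_path=None,               # None -> default prompts, optimized for GPT-4
)
result = evaluator.evaluate(samples)  # EvaluationsAndReport
print(result)
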
2 changes: 1 addition & 1 deletion grouse/meta_evaluator.py
@@ -37,7 +37,7 @@ def compare(value: Optional[float], condition: str) -> bool:

def __get_result(self, score: Score, score_name: str, condition: str) -> bool:
if isinstance(score, Failed):
return Failed()
return Failed(error=score.error)
return self.compare(getattr(score, score_name), condition)

def evaluate_single_test_case(self, test_case: MetaTestCase) -> MetaTestCaseResult: