diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9ec6df7..e5eb042 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # CHANGELOG
 
+## Unreleased
+
+### Changed
+
+- Remove `instructor` package.
+
 ## 0.1.0
 
 ### Added
diff --git a/grouse/dtos.py b/grouse/dtos.py
index 1d7f1b9..ad1f636 100644
--- a/grouse/dtos.py
+++ b/grouse/dtos.py
@@ -4,16 +4,20 @@
 from typing_extensions import override
 
 
-# Sentinel class used until PEP 0661 is accepted
 class Failed(BaseModel):
     """
-    A sentinel singleton class used to distinguish failed request results
-    from results with the value None (which may have different behavior).
+    A sentinel class used to distinguish failed request results from results
+    with the value None (which may have different behavior).
     """
 
+    error: Optional[str] = None
+
     def __bool__(self) -> Literal[False]:
         return False
 
+    def __int__(self) -> int:
+        return 0
+
     @override
     def __repr__(self) -> str:
         return "FAILED"
diff --git a/grouse/grounded_qa_evaluator.py b/grouse/grounded_qa_evaluator.py
index 70c483f..8b0a884 100644
--- a/grouse/grounded_qa_evaluator.py
+++ b/grouse/grounded_qa_evaluator.py
@@ -1,12 +1,15 @@
 import asyncio
+import json
+import logging
+import sys
 from typing import List, Optional
 
 import aiohttp
-import instructor
 import litellm
 import numpy as np
 from importlib_resources import files
 from jinja2 import Environment, FileSystemLoader
+from pydantic_core import ValidationError
 from tqdm.asyncio import tqdm
 
 from grouse.dtos import (
@@ -26,10 +29,13 @@
     Usefulness,
     UsefulnessPair,
 )
-from grouse.llm_calls.cached_instructor import CachedAsyncInstructor
-from grouse.llm_calls.tracker import Tracker
 from grouse.utils import get_positive_acceptance_negative_rejection
 
+STRUCTURED_OUTPUTS_SUPPORTING_MODELS = [
+    "gpt-4o-mini-2024-07-18",
+    "gpt-4o-2024-08-06",
+]
+
 
 class GroundedQAEvaluator:
     def __init__(
@@ -46,28 +52,49 @@ def __init__(
         else:
             self.environment = Environment(loader=FileSystemLoader(prompts_path))
 
-        if cache_path is None:
-            cache = litellm.Cache(type="disk", disk_cache_dir=".grouse_cache/")
-        else:
-            cache = litellm.Cache(type="disk", disk_cache_dir=cache_path)
-
-        self.tracker = Tracker()
-        self.async_client = CachedAsyncInstructor(
-            client=None,
-            create=instructor.patch(create=litellm.acompletion),
-            cache=cache,
-            tracker=self.tracker,
+        self.logger = logging.getLogger("LLM Call Tracker")
+        self.logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+
+        litellm.enable_cache("disk", cache_path)
+
+        self.cost = 0
 
     async def call_llm(self, prompt: str, pair_model: ScorePair) -> Score | Failed:
-        pair = await self.async_client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
-            response_model=pair_model,
-        )
-        if pair is None:
-            return Failed()
-        return pair.answer_2
+        try:
+            if self.model_name in STRUCTURED_OUTPUTS_SUPPORTING_MODELS:
+                response = await litellm.acompletion(
+                    model=self.model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                    response_format=pair_model,
+                )
+            elif "-turbo" in self.model_name or "4o" in self.model_name:
+                response = await litellm.acompletion(
+                    model=self.model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                    response_format={"type": "json_object"},
+                )
+            else:
+                response = await litellm.acompletion(
+                    model=self.model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                )
+            pair = pair_model(**json.loads(response.choices[0].message.content))
+            self.cost += litellm.completion_cost(response)
+            return pair.answer_2
+
+        except (ValidationError, json.decoder.JSONDecodeError) as val_error:
+            logging.debug(
+                f"Call to {self.model_name} with prompt: {prompt}\n"
+                f"returned the following error:\n{val_error}"
+            )
+            return Failed(error=str(val_error))
 
     async def evaluate_answer_relevancy(
         self, eval_sample: EvaluationSample
@@ -121,12 +148,14 @@ async def evaluate_single_sample(
         completeness = await self.evaluate_completeness(eval_sample)
 
         if isinstance(answer_relevancy, Failed):
-            usefulness = Failed()
-            faithfulness = Failed()
+            usefulness = Failed(error="answer_relevancy failed")
+            faithfulness = Failed(error="answer_relevancy failed")
         else:
             if answer_relevancy.answer_relevancy is None:
                 usefulness = await self.evaluate_usefulness(eval_sample)
-                if usefulness.usefulness is None:
+                if isinstance(usefulness, Failed):
+                    faithfulness = Failed(error="usefulness failed")
+                elif usefulness.usefulness is None:
                     faithfulness = Faithfulness(
                         faithfulness_justification="", faithfulness=None
                     )
@@ -163,7 +192,7 @@ def evaluate_multiple_samples(
         self, eval_samples: List[EvaluationSample]
     ) -> List[GroundedQAEvaluation]:
         results = asyncio.run(self.async_evaluate_multiple_samples(eval_samples))
-        self.tracker.log_summary()
+        self.logger.info(f"Cost: {self.cost:.4f}$")
         return results
 
     def evaluate(self, eval_samples: List[EvaluationSample]) -> EvaluationsAndReport:
diff --git a/grouse/llm_calls/cached_instructor.py b/grouse/llm_calls/cached_instructor.py
deleted file mode 100644
index 11ccf42..0000000
--- a/grouse/llm_calls/cached_instructor.py
+++ /dev/null
@@ -1,76 +0,0 @@
-from copy import deepcopy
-from typing import Any, Awaitable, Callable, Iterable, TypeVar, Union
-
-import instructor
-import litellm
-import tenacity
-from instructor.dsl.partial import Partial
-from instructor.exceptions import InstructorRetryException
-from openai.types.chat import ChatCompletionMessageParam
-from pydantic import BaseModel, TypeAdapter
-
-from grouse.llm_calls.tracker import Tracker
-
-T = TypeVar("T", bound=Union[BaseModel, "Iterable[Any]", "Partial[Any]"])
-
-
-class CachedAsyncInstructor(instructor.AsyncInstructor):
-    def __init__(
-        self,
-        client: Any | None,
-        create: Callable[..., Any],
-        cache: litellm.Cache,
-        tracker: Tracker,
-        mode: instructor.Mode = instructor.Mode.TOOLS,
-        provider: instructor.Provider = instructor.Provider.OPENAI,
-        **kwargs: Any,
-    ):
-        super().__init__(client, create, mode, provider, **kwargs)
-        self.cache = cache
-        self.tracker = tracker
-
-    async def create(
-        self,
-        response_model: type[T],
-        messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
-        validation_context: dict[str, Any] | None = None,
-        strict: bool = True,
-        **kwargs: Any,
-    ) -> T | Awaitable[T]:
-        kwargs = self.handle_kwargs(kwargs)
-        # When using JSON Schema, a system message gets prepended by instructor,
-        # messing with the cache. We keep the original parameters and pass a deepcopy
-        # to avoid storing the wrong key in the cache.
-        original_messages = deepcopy(messages)
-
-        result = await self.cache.async_get_cache(messages=original_messages, **kwargs)
-
-        if result is not None:  # Cache hits
-            result = TypeAdapter(response_model).validate_python(result)
-            self.tracker.increment_cache_hit()
-            return result
-
-        # Cache misses
-        try:
-            result = await self.create_fn(
-                response_model=response_model,
-                messages=messages,
-                max_retries=tenacity.AsyncRetrying(
-                    stop=tenacity.stop_after_attempt(max_retries),
-                    after=lambda _: self.tracker.increment_parsing_failure(),
-                ),
-                validation_context=validation_context,
-                strict=strict,
-                **kwargs,
-            )
-        except InstructorRetryException as _:
-            result = None
-            self.tracker.increment_parsing_failure()
-        await self.cache.async_add_cache(
-            TypeAdapter(response_model).dump_json(result),
-            messages=original_messages,
-            **kwargs,
-        )
-        self.tracker.increment_parsing_successes()
-        return result
diff --git a/grouse/llm_calls/tracker.py b/grouse/llm_calls/tracker.py
deleted file mode 100644
index 23f6125..0000000
--- a/grouse/llm_calls/tracker.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import logging
-import sys
-from typing import Optional
-
-import litellm
-from litellm.integrations.custom_logger import CustomLogger
-
-
-class Tracker:
-    def __init__(self) -> None:
-        litellm.callbacks.append(TrackingHandler(self))
-
-        self._api_calls = 0
-        self._api_successes = 0
-        self._api_failures = 0
-        self._parsing_successes = 0
-        self._parsing_failures = 0
-        self._cache_hits = 0
-        self._cost = 0.0
-
-        self.logger = logging.getLogger("LLM Call Tracker")
-        self.logger.setLevel(logging.INFO)
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(logging.INFO)
-        formatter = logging.Formatter(
-            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-        )
-        handler.setFormatter(formatter)
-        self.logger.addHandler(handler)
-
-    def increment_cost(self, cost: Optional[float]) -> None:
-        if cost is not None:
-            self._cost += cost
-
-    def increment_cache_hit(self) -> None:
-        self._cache_hits += 1
-
-    def increment_api_success(self) -> None:
-        self._api_calls += 1
-        self._api_successes += 1
-
-    def increment_api_failure(self) -> None:
-        self._api_calls += 1
-        self._api_failures += 1
-
-    def increment_parsing_successes(self) -> None:
-        self._parsing_successes += 1
-
-    def increment_parsing_failure(self) -> None:
-        self._parsing_failures += 1
-
-    def log_summary(self) -> None:
-        self.logger.info(f"API calls: {self._api_calls}")
-        self.logger.info(f"API successes: {self._api_successes}")
-        self.logger.info(f"API failures: {self._api_failures}")
-        self.logger.info(f"Parsing successes: {self._parsing_successes}")
-        self.logger.info(f"Parsing failures: {self._parsing_failures}")
-        self.logger.info(f"Cache hits: {self._cache_hits}")
-        self.logger.info(f"Cost: {self._cost:.2f}$")
-
-
-class TrackingHandler(CustomLogger):
-    def __init__(self, tracker: Tracker) -> None:
-        super().__init__()
-        self.tracker = tracker
-
-    def log_success_event(self, kwargs, response_obj, start_time, end_time):
-        if kwargs.get("cache_hit", False):
-            self.tracker.increment_cache_hit()
-            return
-
-        self.tracker.increment_api_success()
-        self.tracker.increment_cost(kwargs.get("response_cost", 0.0))
-
-    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        self.tracker.increment_api_failure()
-        self.tracker.increment_cost(kwargs.get("response_cost", 0.0))
-
-    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
-        if kwargs.get("cache_hit", False):
-            self.tracker.increment_cache_hit()
-            return
-
-        self.tracker.increment_api_success()
-        self.tracker.increment_cost(kwargs.get("response_cost", 0.0))
-
-    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        self.tracker.increment_api_failure()
-        self.tracker.increment_cost(kwargs.get("response_cost", 0.0))
diff --git a/grouse/main.py b/grouse/main.py
index 2faa51b..bbff41c 100644
--- a/grouse/main.py
+++ b/grouse/main.py
@@ -32,7 +32,10 @@ def cli() -> None:
 @click.option(
     "--prompts_path",
     type=str,
-    help="Path to the evaluation prompts folder.",
+    help=(
+        "Path to the folder containing the prompts of the evaluator. "
+        "By default, the prompts are those optimized for GPT-4."
+    ),
     default=None,
 )
 def evaluate(
@@ -49,12 +52,6 @@ def evaluate(
     actual_output (generation from the model to evaluate) and expected_output.
     OUTPUT_DIR_PATH (str): Path to directory where results report and evaluations
     are saved.
-
-    Options:
-        --evaluator_model_name (str): Name of the evaluator model. It can be any
-            LiteLLM model. The default model is gpt-4.
-        --prompts_path (str): Path to the folder containing the prompts of the evaluator
-            for each metric. By default, the prompts are those optimized for GPT-4.
     """
     evaluator = GroundedQAEvaluator(
         model_name=evaluator_model_name, prompts_path=prompts_path
@@ -85,7 +82,10 @@ def evaluate(
 @click.option(
     "--prompts_path",
     type=str,
-    help="Path to the evaluation prompts folder.",
+    help=(
+        "Path to the folder containing the prompts of the evaluator. "
+        "By default, the prompts are those optimized for GPT-4."
+    ),
     default=None,
 )
 def meta_evaluate(
@@ -97,10 +97,6 @@ def meta_evaluate(
     MODEL_NAME (str): Name of model available through LiteLLM.
     OUTPUT_DIR_PATH (str): Path to directory where results report and unit test
     results are saved.
-
-    Options:
-        --prompts_path (str): Path to the folder containing the prompts of the
-            evaluator. By default, the prompts are those optimized for GPT-4.
     """
     evaluation_samples, conditions = load_unit_tests()
 
diff --git a/grouse/meta_evaluator.py b/grouse/meta_evaluator.py
index 9fe6f35..ba38856 100644
--- a/grouse/meta_evaluator.py
+++ b/grouse/meta_evaluator.py
@@ -37,7 +37,7 @@ def compare(value: Optional[float], condition: str) -> bool:
 
     def __get_result(self, score: Score, score_name: str, condition: str) -> bool:
         if isinstance(score, Failed):
-            return Failed()
+            return Failed(error=score.error)
         return self.compare(getattr(score, score_name), condition)
 
     def evaluate_single_test_case(self, test_case: MetaTestCase) -> MetaTestCaseResult:
diff --git a/grouse/utils.py b/grouse/utils.py
index 105c9c8..4dc59bf 100644
--- a/grouse/utils.py
+++ b/grouse/utils.py
@@ -37,8 +37,12 @@ def get_positive_acceptance_negative_rejection(
     answer_relevancy: AnswerRelevancy | Failed,
     completeness: Completeness | Failed,
 ) -> Tuple[Optional[int] | Failed, Optional[int] | Failed]:
-    if isinstance(answer_relevancy, Failed) or isinstance(completeness, Failed):
-        return Failed(), Failed()
+    if isinstance(answer_relevancy, Failed):
+        return Failed(error="answer relevancy failed"), Failed(
+            error="answer relevancy failed"
+        )
+    elif isinstance(completeness, Failed):
+        return Failed(error="completeness failed"), Failed(error="completeness failed")
     else:
         if answer_relevancy.answer_relevancy is None:
             if completeness.completeness is None:
@@ -75,4 +79,5 @@ def load_unit_tests() -> (
             )
         )
         conditions.append(ExpectedGroundedQAEvaluation(**unit_test["conditions"]))
+
     return evaluation_samples, conditions