chore: remove instructor

antonioloison committed Sep 4, 2024
1 parent a8ef2e0 commit d553d0f
Showing 8 changed files with 84 additions and 209 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# CHANGELOG

## Unreleased

### Changed

- Remove `instructor` package.

## 0.1.0

### Added
10 changes: 7 additions & 3 deletions grouse/dtos.py
@@ -4,16 +4,20 @@
from typing_extensions import override


# Sentinel class used until PEP 0661 is accepted
class Failed(BaseModel):
"""
A sentinel singleton class used to distinguish failed request results
from results with the value None (which may have different behavior).
A sentinel class used to distinguish failed request results from results
with the value None (which may have different behavior).
"""

error: Optional[str] = None

def __bool__(self) -> Literal[False]:
return False

def __int__(self) -> int:
return 0

@override
def __repr__(self) -> str:
return "FAILED"
81 changes: 55 additions & 26 deletions grouse/grounded_qa_evaluator.py
@@ -1,12 +1,15 @@
import asyncio
import json
import logging
import sys
from typing import List, Optional

import aiohttp
import instructor
import litellm
import numpy as np
from importlib_resources import files
from jinja2 import Environment, FileSystemLoader
from pydantic_core import ValidationError
from tqdm.asyncio import tqdm

from grouse.dtos import (
@@ -26,10 +29,13 @@
Usefulness,
UsefulnessPair,
)
from grouse.llm_calls.cached_instructor import CachedAsyncInstructor
from grouse.llm_calls.tracker import Tracker
from grouse.utils import get_positive_acceptance_negative_rejection

STRUCTURED_OUTPUTS_SUPPORTING_MODELS = [
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
]


class GroundedQAEvaluator:
def __init__(
@@ -46,28 +52,49 @@ def __init__(
else:
self.environment = Environment(loader=FileSystemLoader(prompts_path))

if cache_path is None:
cache = litellm.Cache(type="disk", disk_cache_dir=".grouse_cache/")
else:
cache = litellm.Cache(type="disk", disk_cache_dir=cache_path)

self.tracker = Tracker()
self.async_client = CachedAsyncInstructor(
client=None,
create=instructor.patch(create=litellm.acompletion),
cache=cache,
tracker=self.tracker,
self.logger = logging.getLogger("LLM Call Tracker")
self.logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
self.logger.addHandler(handler)

litellm.enable_cache("disk", cache_path)

self.cost = 0

async def call_llm(self, prompt: str, pair_model: ScorePair) -> Score | Failed:
pair = await self.async_client.chat.completions.create(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_model=pair_model,
)
if pair is None:
return Failed()
return pair.answer_2
try:
if self.model_name in STRUCTURED_OUTPUTS_SUPPORTING_MODELS:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_format=pair_model,
)
elif "-turbo" in self.model_name or "4o" in self.model_name:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"},
)
else:
response = await litellm.acompletion(
model=self.model_name,
messages=[{"role": "user", "content": prompt}],
)
pair = pair_model(**json.loads(response.choices[0].message.content))
self.cost += litellm.completion_cost(response)
return pair.answer_2

except (ValidationError, json.decoder.JSONDecodeError) as val_error:
logging.debug(
f"Call to {self.model_name} with prompt: {prompt}\n"
f"returned the following error:\n{val_error}"
)
return Failed(error=str(val_error))

async def evaluate_answer_relevancy(
self, eval_sample: EvaluationSample
@@ -121,12 +148,14 @@ async def evaluate_single_sample(
completeness = await self.evaluate_completeness(eval_sample)

if isinstance(answer_relevancy, Failed):
usefulness = Failed()
faithfulness = Failed()
usefulness = Failed(error="answer_relevancy failed")
faithfulness = Failed(error="answer_relevancy failed")
else:
if answer_relevancy.answer_relevancy is None:
usefulness = await self.evaluate_usefulness(eval_sample)
if usefulness.usefulness is None:
if isinstance(usefulness, Failed):
faithfulness = Failed(error="usefulness failed")
elif usefulness.usefulness is None:
faithfulness = Faithfulness(
faithfulness_justification="", faithfulness=None
)
@@ -163,7 +192,7 @@ def evaluate_multiple_samples(
self, eval_samples: List[EvaluationSample]
) -> List[GroundedQAEvaluation]:
results = asyncio.run(self.async_evaluate_multiple_samples(eval_samples))
self.tracker.log_summary()
self.logger.info(f"Cost: {self.cost:.4f}$")
return results

def evaluate(self, eval_samples: List[EvaluationSample]) -> EvaluationsAndReport:
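
For context (not part of the diff), a standalone sketch of the pattern `call_llm` adopts after dropping `instructor`: a direct `litellm.acompletion` call with `response_format`, followed by manual JSON parsing and Pydantic validation. The `SimpleScorePair` model and prompt are simplified placeholders, and running it requires API credentials that LiteLLM can pick up from the environment.

import asyncio
import json

import litellm
from pydantic import BaseModel
from pydantic_core import ValidationError


class SimpleScorePair(BaseModel):
    answer_1: str
    answer_2: str


async def call_llm(prompt: str) -> SimpleScorePair | None:
    # gpt-4o-2024-08-06 supports structured outputs, so the Pydantic model can be
    # passed directly as response_format; other models would fall back to JSON mode
    # or a plain completion, as in the branches of call_llm above.
    response = await litellm.acompletion(
        model="gpt-4o-2024-08-06",
        messages=[{"role": "user", "content": prompt}],
        response_format=SimpleScorePair,
    )
    try:
        return SimpleScorePair(**json.loads(response.choices[0].message.content))
    except (ValidationError, json.JSONDecodeError):
        return None  # the evaluator returns Failed(error=...) here instead


if __name__ == "__main__":
    # Requires an API key in the environment (e.g. OPENAI_API_KEY).
    pair = asyncio.run(call_llm("Answer twice, as JSON with keys answer_1 and answer_2."))
    print(pair)
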
76 changes: 0 additions & 76 deletions grouse/llm_calls/cached_instructor.py

This file was deleted.

89 changes: 0 additions & 89 deletions grouse/llm_calls/tracker.py

This file was deleted.

20 changes: 8 additions & 12 deletions grouse/main.py
@@ -32,7 +32,10 @@ def cli() -> None:
@click.option(
"--prompts_path",
type=str,
help="Path to the evaluation prompts folder.",
help=(
"Path to the folder containing the prompts of the evaluator. "
"By default, the prompts are those optimized for GPT-4."
),
default=None,
)
def evaluate(
@@ -49,12 +52,6 @@ def evaluate(
actual_output (generation from the model to evaluate) and expected_output.
OUTPUT_DIR_PATH (str): Path to directory where results report and
evaluations are saved.
Options:
--evaluator_model_name (str): Name of the evaluator model. It can be any
LiteLLM model. The default model is gpt-4.
--prompts_path (str): Path to the folder containing the prompts of the evaluator
for each metric. By default, the prompts are those optimized for GPT-4.
"""
evaluator = GroundedQAEvaluator(
model_name=evaluator_model_name, prompts_path=prompts_path
@@ -85,7 +82,10 @@ def evaluate(
@click.option(
"--prompts_path",
type=str,
help="Path to the evaluation prompts folder.",
help=(
"Path to the folder containing the prompts of the evaluator. "
"By default, the prompts are those optimized for GPT-4."
),
default=None,
)
def meta_evaluate(
@@ -97,10 +97,6 @@ def meta_evaluate(
MODEL_NAME (str): Name of model available through LiteLLM.
OUTPUT_DIR_PATH (str): Path to directory where results report and
unit test results are saved.
Options:
--prompts_path (str): Path to the folder containing the prompts of the
evaluator. By default, the prompts are those optimized for GPT-4.
"""
evaluation_samples, conditions = load_unit_tests()

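
For context (not part of the diff), a hedged sketch of the programmatic path behind the `evaluate` command (the CLI additionally writes the report and evaluations to OUTPUT_DIR_PATH). The dataset path is a placeholder, and each JSONL line is assumed to carry exactly the fields `EvaluationSample` expects (the docstring above names `actual_output` and `expected_output` among them); check `grouse/dtos.py` for the real schema.

import json
from pathlib import Path

from grouse.dtos import EvaluationSample
from grouse.grounded_qa_evaluator import GroundedQAEvaluator

# dataset.jsonl is a hypothetical path; one JSON object per line.
samples = [
    EvaluationSample(**json.loads(line))
    for line in Path("dataset.jsonl").read_text().splitlines()
    if line.strip()
]

evaluator = GroundedQAEvaluator(
    model_name="gpt-4o-2024-08-06",  # any model name LiteLLM accepts
    prompts_path=None,               # None -> default prompts, optimized for GPT-4
)
result = evaluator.evaluate(samples)  # EvaluationsAndReport
print(result)
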
2 changes: 1 addition & 1 deletion grouse/meta_evaluator.py
@@ -37,7 +37,7 @@ def compare(value: Optional[float], condition: str) -> bool:

def __get_result(self, score: Score, score_name: str, condition: str) -> bool:
if isinstance(score, Failed):
return Failed()
return Failed(error=score.error)
return self.compare(getattr(score, score_name), condition)

def evaluate_single_test_case(self, test_case: MetaTestCase) -> MetaTestCaseResult: