Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: missing embeddings argument in testset and some E2E tests #1690

Merged
merged 5 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions src/ragas/metrics/_bleu_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def init(self, run_config: RunConfig):
async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: Callbacks
) -> float:

assert (
self.sentence_segmenter is not None
), "Sentence segmenter is not initialized"
Expand All @@ -56,6 +55,3 @@ async def _single_turn_ascore(

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)


bleu_score = BleuScore()
32 changes: 24 additions & 8 deletions src/ragas/testset/synthesizers/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from ragas._analytics import TestsetGenerationEvent, track
from ragas.callbacks import new_group
from ragas.cost import TokenUsageParser
from ragas.embeddings.base import BaseRagasEmbeddings, LlamaIndexEmbeddingsWrapper
from ragas.embeddings.base import (
BaseRagasEmbeddings,
LangchainEmbeddingsWrapper,
LlamaIndexEmbeddingsWrapper,
)
from ragas.executor import Executor
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper
from ragas.run_config import RunConfig
Expand All @@ -24,6 +28,7 @@
if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
from langchain_core.documents import Document as LCDocument
from langchain_core.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM
from llama_index.core.base.embeddings.base import (
BaseEmbedding as LlamaIndexEmbedding,
Expand Down Expand Up @@ -55,13 +60,15 @@ class TestsetGenerator:
"""

llm: BaseRagasLLM
embedding_model: BaseRagasEmbeddings
knowledge_graph: KnowledgeGraph = field(default_factory=KnowledgeGraph)
persona_list: t.Optional[t.List[Persona]] = None

@classmethod
def from_langchain(
cls,
llm: LangchainLLM,
embedding_model: LangchainEmbeddings,
knowledge_graph: t.Optional[KnowledgeGraph] = None,
) -> TestsetGenerator:
"""
Expand All @@ -70,13 +77,15 @@ def from_langchain(
knowledge_graph = knowledge_graph or KnowledgeGraph()
return cls(
LangchainLLMWrapper(llm),
LangchainEmbeddingsWrapper(embedding_model),
knowledge_graph,
)

@classmethod
def from_llama_index(
cls,
llm: LlamaIndexLLM,
embedding_model: LlamaIndexEmbedding,
knowledge_graph: t.Optional[KnowledgeGraph] = None,
) -> TestsetGenerator:
"""
Expand All @@ -85,6 +94,7 @@ def from_llama_index(
knowledge_graph = knowledge_graph or KnowledgeGraph()
return cls(
LlamaIndexLLMWrapper(llm),
LlamaIndexEmbeddingsWrapper(embedding_model),
knowledge_graph,
)

Expand Down Expand Up @@ -145,7 +155,7 @@ def generate_with_langchain_docs(
Provide an LLM on TestsetGenerator instantiation or as an argument for transforms_llm parameter.
Alternatively you can provide your own transforms through the `transforms` parameter."""
)
if not transforms_embedding_model:
if not self.embedding_model and not transforms_embedding_model:
raise ValueError(
"""An embedding client was not provided. Provide an embedding through the transforms_embedding_model parameter. Alternatively you can provide your own transforms through the `transforms` parameter."""
)
Expand All @@ -154,7 +164,7 @@ def generate_with_langchain_docs(
transforms = default_transforms(
documents=list(documents),
llm=transforms_llm or self.llm,
embedding_model=transforms_embedding_model,
embedding_model=transforms_embedding_model or self.embedding_model,
)

# convert the documents to Ragas nodes
Expand Down Expand Up @@ -208,19 +218,25 @@ def generate_with_llamaindex_docs(
raise ValueError(
"An llm client was not provided. Provide an LLM on TestsetGenerator instantiation or as an argument for transforms_llm parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)
if not transforms_embedding_model:
if not self.embedding_model and not transforms_embedding_model:
raise ValueError(
"An embedding client was not provided. Provide an embedding through the transforms_embedding_model parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)

if not transforms:
# use TestsetGenerator's LLM and embedding model if no transforms_llm or transforms_embedding_model is provided
if transforms_llm is None:
llm_for_transforms = self.llm
else:
llm_for_transforms = LlamaIndexLLMWrapper(transforms_llm)
embedding_model_for_transforms = LlamaIndexEmbeddingsWrapper(
transforms_embedding_model
)
if transforms_embedding_model is None:
embedding_model_for_transforms = self.embedding_model
else:
embedding_model_for_transforms = LlamaIndexEmbeddingsWrapper(
transforms_embedding_model
)

# create the transforms
transforms = default_transforms(
documents=[LCDocument(page_content=doc.text) for doc in documents],
llm=llm_for_transforms,
Expand Down Expand Up @@ -371,7 +387,7 @@ def generate(

# generate scenarios
exec = Executor(
"Generating Scenarios",
desc="Generating Scenarios",
raise_exceptions=raise_exceptions,
run_config=run_config,
keep_progress_bar=False,
Expand Down
21 changes: 4 additions & 17 deletions tests/benchmarks/benchmark_testsetgen.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
import time

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from llama_index.core import download_loader

from ragas.testset.evolutions import conditional, multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator
from ragas.testset.synthesizers.generate import TestsetGenerator

generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
generator_llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

distributions = {simple: 0.5, multi_context: 0.3, reasoning: 0.1, conditional: 0.1}
generator = TestsetGenerator.from_langchain(generator_llm, embeddings)


def get_documents():
Expand All @@ -31,14 +25,7 @@ def get_documents():

if __name__ == "__main__":
documents = get_documents()

# asyncio
print("Starting [Asyncio]")
start = time.time()
generator.generate_with_llamaindex_docs(
documents=documents,
test_size=50,
distributions=distributions,
is_async=True,
testset_size=50,
)
print(f"Time taken: {time.time() - start:.2f}s")
7 changes: 4 additions & 3 deletions tests/e2e/test_adaptation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from ragas import adapt
from ragas.llms import llm_factory
from ragas.metrics import context_recall


def test_adapt():
adapt([context_recall], language="spanish")
async def test_adapt():
llm = llm_factory("gpt-4o")
await context_recall.adapt_prompts(llm=llm, language="spanish")
assert context_recall.context_recall_prompt.language == "spanish"
19 changes: 11 additions & 8 deletions tests/e2e/test_amnesty_in_ci.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import typing as t

import pytest
from datasets import load_dataset

from ragas import evaluate
from ragas import EvaluationDataset, evaluate
from ragas.metrics import (
answer_relevancy,
context_precision,
context_recall,
faithfulness,
)

if t.TYPE_CHECKING:
from datasets import Dataset

# loading the V2 dataset
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")["eval"]
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"] # type: ignore


def assert_in_range(score: float, value: float, plus_or_minus: float):
Expand All @@ -23,16 +28,14 @@ def assert_in_range(score: float, value: float, plus_or_minus: float):
@pytest.mark.ragas_ci
def test_amnesty_e2e():
result = evaluate(
amnesty_qa,
EvaluationDataset.from_hf_dataset(t.cast("Dataset", amnesty_qa))[:1],
metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
in_ci=True,
show_progress=False,
)
assert result["answer_relevancy"] >= 0.9
assert result["context_recall"] >= 0.95
assert result["context_precision"] >= 0.95
assert_in_range(result["faithfulness"], value=0.4, plus_or_minus=0.1)
assert result is not None


@pytest.mark.ragas_ci
def test_assert_in_range():
assert_in_range(0.5, value=0.1, plus_or_minus=0.1)
assert_in_range(0.51, value=0.5, plus_or_minus=0.1)
129 changes: 0 additions & 129 deletions tests/e2e/test_evaluation_in_jupyter.ipynb

This file was deleted.

14 changes: 10 additions & 4 deletions tests/e2e/test_fullflow.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import typing as t

from datasets import load_dataset

from ragas import evaluate
from ragas import EvaluationDataset, evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness
from ragas.metrics.critique import harmfulness
from ragas.metrics._aspect_critic import harmfulness

if t.TYPE_CHECKING:
from datasets import Dataset


def test_evaluate_e2e():
ds = load_dataset("explodinggradients/fiqa", "ragas_eval")["baseline"]
ds = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"] # type: ignore
result = evaluate(
ds.select(range(3)),
EvaluationDataset.from_hf_dataset(t.cast("Dataset", ds))[:1],
metrics=[answer_relevancy, context_precision, faithfulness, harmfulness],
show_progress=False,
)
assert result is not None
Loading
Loading