diff --git a/docs/concepts/metrics/available_metrics/index.md b/docs/concepts/metrics/available_metrics/index.md
index dbb9fc68c..c9c537e75 100644
--- a/docs/concepts/metrics/available_metrics/index.md
+++ b/docs/concepts/metrics/available_metrics/index.md
@@ -11,6 +11,8 @@ Each metric are essentially paradigms that are designed to evaluate a particular
 - [Noise Sensitivity](noise_sensitivity.md)
 - [Response Relevancy](answer_relevance.md)
 - [Faithfulness](faithfulness.md)
+- [Multimodal Faithfulness](multi_modal_faithfulness.md)
+- [Multimodal Relevance](multi_modal_relevance.md)
 
 ## Agents or Tool use cases
 
diff --git a/docs/concepts/metrics/available_metrics/multi_modal_faithfulness.md b/docs/concepts/metrics/available_metrics/multi_modal_faithfulness.md
new file mode 100644
index 000000000..701ff26da
--- /dev/null
+++ b/docs/concepts/metrics/available_metrics/multi_modal_faithfulness.md
@@ -0,0 +1,50 @@
+## MultiModalFaithfulness
+
+The `MultiModalFaithfulness` metric measures the factual consistency of the generated answer against both visual and textual context. It is calculated from the answer, the retrieved textual context, and the visual context. The score lies in the (0, 1) range, with higher scores indicating better faithfulness.
+
+The generated answer is regarded as faithful if all the claims made in the answer can be inferred from either the visual or textual context provided. To determine this, the response is directly evaluated against the provided contexts, and the faithfulness score is either 0 or 1.
+
+### Example
+
+```python
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics import MultiModalFaithfulness
+
+sample = SingleTurnSample(
+    user_input="What about the Tesla Model X?",
+    response="Cats are cute.",
+    retrieved_contexts=[
+        "custom_eval/multimodal/images/tesla.jpg"
+    ]
+)
+scorer = MultiModalFaithfulness()
+await scorer.single_turn_ascore(sample)
+```
+
+### How It’s Calculated
+
+!!! example
+    **Question**: What about the Tesla Model X?
+
+    **Context (visual)**:
+    - An image of the Tesla Model X (custom_eval/multimodal/images/tesla.jpg)
+
+    **High faithfulness answer**: The Tesla Model X is an electric SUV manufactured by Tesla.
+
+    **Low faithfulness answer**: Cats are cute.
+
+Let's examine how faithfulness was calculated using the low faithfulness answer:
+
+- **Step 1:** Evaluate the generated response against the given contexts.
+    - Response: "Cats are cute."
+
+- **Step 2:** Verify if the response can be inferred from the given context.
+    - Verdict: No
+
+- **Step 3:** Use the result to determine the faithfulness score.
+
+    $$
+    \text{Faithfulness} = 0
+    $$
+
+In this example, the response "Cats are cute" cannot be inferred from the image of the Tesla Model X, so the faithfulness score is 0.
\ No newline at end of file
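The doc example above calls `single_turn_ascore` without showing how the evaluator LLM is supplied. A minimal sketch of one way to wire it up, assuming `langchain-openai` is installed and a multimodal-capable chat model is available (the model name is a placeholder, not something this PR prescribes):

```python
from langchain_openai import ChatOpenAI

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import MultiModalFaithfulness

# Assumed wiring: any multimodal-capable chat model wrapped for ragas should work;
# "gpt-4o" is only an illustrative choice.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

sample = SingleTurnSample(
    user_input="What about the Tesla Model X?",
    response="Cats are cute.",
    retrieved_contexts=["custom_eval/multimodal/images/tesla.jpg"],
)

scorer = MultiModalFaithfulness(llm=evaluator_llm)
score = await scorer.single_turn_ascore(sample)  # expected to come back as 0.0 here
```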
diff --git a/docs/concepts/metrics/available_metrics/multi_modal_relevance.md b/docs/concepts/metrics/available_metrics/multi_modal_relevance.md
new file mode 100644
index 000000000..d2e4420c2
--- /dev/null
+++ b/docs/concepts/metrics/available_metrics/multi_modal_relevance.md
@@ -0,0 +1,50 @@
+## MultiModalRelevance
+
+The `MultiModalRelevance` metric measures the relevance of the generated answer against both visual and textual context. It is calculated from the user input, response, and retrieved contexts (both visual and textual). The score lies in the (0, 1) range, with higher scores indicating better relevance.
+
+The generated answer is regarded as relevant if it aligns with the visual or textual context provided. To determine this, the response is directly evaluated against the provided contexts, and the relevance score is either 0 or 1.
+
+### Example
+
+```python
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics import MultiModalRelevance
+
+sample = SingleTurnSample(
+    user_input="What about the Tesla Model X?",
+    response="Cats are cute.",
+    retrieved_contexts=[
+        "custom_eval/multimodal/images/tesla.jpg"
+    ]
+)
+scorer = MultiModalRelevance()
+await scorer.single_turn_ascore(sample)
+```
+
+### How It’s Calculated
+
+!!! example
+    **Question**: What about the Tesla Model X?
+
+    **Context (visual)**:
+    - An image of the Tesla Model X (custom_eval/multimodal/images/tesla.jpg)
+
+    **High relevance answer**: The Tesla Model X is an electric SUV manufactured by Tesla.
+
+    **Low relevance answer**: Cats are cute.
+
+Let's examine how relevance was calculated using the low relevance answer:
+
+- **Step 1:** Evaluate the generated response against the given contexts.
+    - Response: "Cats are cute."
+
+- **Step 2:** Verify if the response aligns with the given context.
+    - Verdict: No
+
+- **Step 3:** Use the result to determine the relevance score.
+
+    $$
+    \text{Relevance} = 0
+    $$
+
+In this example, the response "Cats are cute" does not align with the image of the Tesla Model X, so the relevance score is 0.
\ No newline at end of file
diff --git a/src/ragas/exceptions.py b/src/ragas/exceptions.py
index 122553639..09782c7a1 100644
--- a/src/ragas/exceptions.py
+++ b/src/ragas/exceptions.py
@@ -27,9 +27,7 @@ class RagasOutputParserException(RagasException):
     """
 
     def __init__(self):
-        msg = (
-            "The output parser failed to parse the output including retries."
-        )
+        msg = "The output parser failed to parse the output including retries."
         super().__init__(msg)
 
 
diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 300d5b267..12862288e 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -21,8 +21,8 @@
 from ragas.metrics._context_precision import (
     ContextPrecision,
     ContextUtilization,
-    LLMContextPrecisionWithReference,
     LLMContextPrecisionWithoutReference,
+    LLMContextPrecisionWithReference,
     NonLLMContextPrecisionWithReference,
     context_precision,
 )
@@ -47,6 +47,14 @@
     InstanceRubricsScoreWithoutReference,
     InstanceRubricsWithReference,
 )
+from ragas.metrics._multi_modal_faithfulness import (
+    MultiModalFaithfulness,
+    multimodal_faithness,
+)
+from ragas.metrics._multi_modal_relevance import (
+    MultiModalRelevance,
+    multimodal_relevance,
+)
 from ragas.metrics._noise_sensitivity import NoiseSensitivity
 from ragas.metrics._rogue_score import RougeScore
 from ragas.metrics._sql_semantic_equivalence import LLMSQLEquivalence
@@ -107,6 +115,10 @@
     "DistanceMeasure",
     "TopicAdherenceScore",
     "LLMSQLEquivalence",
+    "MultiModalFaithfulness",
+    "multimodal_faithness",
+    "MultiModalRelevance",
+    "multimodal_relevance",
 ]
 
 current_module = sys.modules[__name__]
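With `MultiModalFaithfulness` and `MultiModalRelevance` exported from `ragas.metrics`, they can be combined with the existing evaluation entry point like any other metric. A hedged sketch (the `samples` list and `evaluator_llm` wiring are assumed, not shown in this diff):

```python
from ragas import EvaluationDataset, evaluate
from ragas.metrics import MultiModalFaithfulness, MultiModalRelevance

# Assumes `samples` is a list of SingleTurnSample objects carrying user_input,
# response and retrieved_contexts (image paths/URLs and/or text snippets), and
# `evaluator_llm` is a multimodal-capable ragas LLM wrapper configured elsewhere.
dataset = EvaluationDataset(samples=samples)
result = evaluate(
    dataset=dataset,
    metrics=[MultiModalFaithfulness(), MultiModalRelevance()],
    llm=evaluator_llm,
)
print(result)
```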
diff --git a/src/ragas/metrics/_multi_modal_faithfulness.py b/src/ragas/metrics/_multi_modal_faithfulness.py
new file mode 100644
index 000000000..d1deabb3b
--- /dev/null
+++ b/src/ragas/metrics/_multi_modal_faithfulness.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import typing as t
+from dataclasses import dataclass, field
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.prompt import ImageTextPrompt
+
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
+
+class FaithfulnessInput(BaseModel):
+    response: str = Field(description="response from AI")
+    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+    def to_string_list(self):
+        return [
+            "inputs:",
+            self.response,
+            "retrieved_contexts: ",
+        ] + self.retrieved_contexts
+
+
+class FaithfulnessOutput(BaseModel):
+    faithful: bool = Field(description="boolean indicating if request was faithful")
+
+
+class MultiModalFaithfulnessPrompt(
+    ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
+):
+    # refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
+    instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
+    input_model = FaithfulnessInput
+    output_model = FaithfulnessOutput
+    examples = [
+        (
+            FaithfulnessInput(
+                response="Apple pie is generally double-crusted.",
+                retrieved_contexts=[
+                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                ],
+            ),
+            FaithfulnessOutput(faithful=True),
+        ),
+        (
+            FaithfulnessInput(
+                response="Apple pies tastes bad.",
+                retrieved_contexts=[
+                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                ],
+            ),
+            FaithfulnessOutput(faithful=False),
+        ),
+    ]
+
+
+@dataclass
+class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
+    name: str = "faithful_rate"  # type: ignore
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "response",
+                "retrieved_contexts",
+            }
+        }
+    )
+    faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        prompt_input = FaithfulnessInput(
+            response=row["response"], retrieved_contexts=row["retrieved_contexts"]
+        )
+        assert self.llm is not None, "LLM is not set"
+        prompt_response = await self.faithfulness_prompt.generate(
+            data=prompt_input, llm=self.llm, callbacks=callbacks
+        )
+        if prompt_response is None:
+            return np.nan
+        return float(prompt_response.faithful)
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks: Callbacks
+    ) -> float:
+        row = sample.to_dict()
+        return await self._ascore(row, callbacks)
+
+
+multimodal_faithness = MultiModalFaithfulness()
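For reference, `FaithfulnessInput.to_string_list` simply interleaves the response with the raw context items; each item is later classified as text or image by `ImageTextPromptValue`. A quick illustration of the output, based on the code above (the example values are placeholders):

```python
from ragas.metrics._multi_modal_faithfulness import FaithfulnessInput

prompt_input = FaithfulnessInput(
    response="The Tesla Model X is an electric SUV.",
    retrieved_contexts=[
        "custom_eval/multimodal/images/tesla.jpg",  # image path: forwarded as-is
        "Tesla manufactures electric vehicles.",    # plain text context
    ],
)
print(prompt_input.to_string_list())
# ['inputs:', 'The Tesla Model X is an electric SUV.', 'retrieved_contexts: ',
#  'custom_eval/multimodal/images/tesla.jpg', 'Tesla manufactures electric vehicles.']
```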
diff --git a/src/ragas/metrics/_multi_modal_relevance.py b/src/ragas/metrics/_multi_modal_relevance.py
new file mode 100644
index 000000000..6a85d68a1
--- /dev/null
+++ b/src/ragas/metrics/_multi_modal_relevance.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import typing as t
+from dataclasses import dataclass, field
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.prompt import ImageTextPrompt
+
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
+
+class RelevanceInput(BaseModel):
+    user_input: str = Field(description="user input")
+    response: str = Field(description="response from AI")
+    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+    def to_string_list(self):
+        return [
+            f"Question: {self.user_input}",
+            f"Response: {self.response}",
+            "retrieved_contexts: ",
+        ] + self.retrieved_contexts
+
+
+class RelevanceOutput(BaseModel):
+    relevance: bool = Field(description="boolean indicating if request was relevance")
+
+
+class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
+    # refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
+    instruction = """
+Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
+You have two options to answer. Either True / False.
+Answer - True, if the response for the query is in line with context information otherwise False.
+"""
+    input_model = RelevanceInput
+    output_model = RelevanceOutput
+    examples = [
+        (
+            RelevanceInput(
+                user_input="What is the primary ingredient in a traditional Margherita pizza?",
+                response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
+                retrieved_contexts=[
+                    "A traditional Margherita pizza consists of a thin crust.",
+                    "The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
+                    "It is one of the simplest and most classic types of pizza.",
+                ],
+            ),
+            RelevanceOutput(relevance=True),
+        ),
+        (
+            RelevanceInput(
+                user_input="Who won the Best Actor award at the Oscars in 2021?",
+                response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
+                retrieved_contexts=[
+                    "The 93rd Academy Awards were held in 2021.",
+                    "Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
+                    "The event was unique due to COVID-19 restrictions.",
+                ],
+            ),
+            RelevanceOutput(relevance=False),
+        ),
+    ]
+
+
+@dataclass
+class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
+    name: str = "relevance_rate"  # type: ignore
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "user_input",
+                "response",
+                "retrieved_contexts",
+            }
+        }
+    )
+    relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        prompt_input = RelevanceInput(
+            user_input=row["user_input"],
+            response=row["response"],
+            retrieved_contexts=row["retrieved_contexts"],
+        )
+        assert self.llm is not None, "LLM is not set"
+        prompt_response = await self.relevance_prompt.generate(
+            data=prompt_input, llm=self.llm, callbacks=callbacks
+        )
+        if prompt_response is None:
+            return np.nan
+        return float(prompt_response.relevance)
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks: Callbacks
+    ) -> float:
+        row = sample.to_dict()
+        return await self._ascore(row, callbacks)
+
+
+multimodal_relevance = MultiModalRelevance()
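Both metrics reduce the parsed boolean verdict to the 0/1 score described in the docs via a plain `float(...)` cast, as in `_ascore` above. For illustration, using the `RelevanceOutput` model from this file:

```python
from ragas.metrics._multi_modal_relevance import RelevanceOutput

# The parsed verdict is cast straight to the metric score.
print(float(RelevanceOutput(relevance=True).relevance))   # 1.0 -> relevant
print(float(RelevanceOutput(relevance=False).relevance))  # 0.0 -> the "low relevance" case in the docs
```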
diff --git a/src/ragas/prompt/__init__.py b/src/ragas/prompt/__init__.py
index 8244c68ba..5743ea22c 100644
--- a/src/ragas/prompt/__init__.py
+++ b/src/ragas/prompt/__init__.py
@@ -1,5 +1,6 @@
 from .base import BasePrompt, BoolIO, StringIO, StringPrompt
 from .mixin import PromptMixin
+from .multi_modal_prompt import ImageTextPrompt, ImageTextPromptValue
 from .pydantic_prompt import InputModel, OutputModel, PydanticPrompt
 
 __all__ = [
@@ -11,4 +12,6 @@
     "PromptMixin",
     "InputModel",
     "OutputModel",
+    "ImageTextPrompt",
+    "ImageTextPromptValue",
 ]
diff --git a/src/ragas/prompt/multi_modal_prompt.py b/src/ragas/prompt/multi_modal_prompt.py
new file mode 100644
index 000000000..6ac103495
--- /dev/null
+++ b/src/ragas/prompt/multi_modal_prompt.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+import base64
+import logging
+import mimetypes
+import typing as t
+import urllib.request
+from urllib.parse import urlparse
+
+from langchain_core.messages import BaseMessage, HumanMessage
+from langchain_core.prompt_values import PromptValue
+from pydantic import BaseModel
+
+from ragas.callbacks import ChainType, new_group
+from ragas.exceptions import RagasOutputParserException
+from ragas.prompt.pydantic_prompt import PydanticPrompt, RagasOutputParser
+
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
+    from ragas.llms.base import BaseRagasLLM
+
+
+# type variables for input and output models
+InputModel = t.TypeVar("InputModel", bound=BaseModel)
+OutputModel = t.TypeVar("OutputModel", bound=BaseModel)
+
+logger = logging.getLogger(__name__)
+
+
+class ImageTextPrompt(PydanticPrompt, t.Generic[InputModel, OutputModel]):
+    def _generate_examples(self):
+        if self.examples:
+            example_strings = []
+            for e in self.examples:
+                input_data, output_data = e
+                example_strings.append(
+                    self.instruction
+                    + "\n"
+                    + "input: "
+                    + input_data.model_dump_json(indent=4)
+                    + "\n"
+                    + "output: "
+                    + output_data.model_dump_json(indent=4)
+                )
+
+            return (
+                "Some examples are provided below with only text context, but please do use any images for context if they are provided.\n"
+                + "\n\n".join(example_strings)
+            )
+        # if no examples are provided
+        else:
+            return ""
+
+    def to_prompt_value(self, data: t.Optional[InputModel] = None):
+        text = [
+            self._generate_instruction(),
+            self._generate_output_signature(),
+            self._generate_examples(),
+            "Now perform the above instruction with the following",
+        ] + data.to_string_list()  # type: ignore
+        return ImageTextPromptValue(items=text)
+
+    async def generate_multiple(
+        self,
+        llm: BaseRagasLLM,
+        data: InputModel,
+        n: int = 1,
+        temperature: t.Optional[float] = None,
+        stop: t.Optional[t.List[str]] = None,
+        callbacks: t.Optional[Callbacks] = None,
+        retries_left: int = 3,
+    ) -> t.List[OutputModel]:
+        """
+        Generate multiple outputs using the provided language model and input data.
+
+        Parameters
+        ----------
+        llm : BaseRagasLLM
+            The language model to use for generation.
+        data : InputModel
+            The input data for generation.
+        n : int, optional
+            The number of outputs to generate. Default is 1.
+        temperature : float, optional
+            The temperature parameter for controlling randomness in generation.
+        stop : List[str], optional
+            A list of stop sequences to end generation.
+        callbacks : Callbacks, optional
+            Callback functions to be called during the generation process.
+
+        Returns
+        -------
+        List[OutputModel]
+            A list of generated outputs.
+
+        Raises
+        ------
+        RagasOutputParserException
+            If there's an error parsing the output.
+        """
+        callbacks = callbacks or []
+        processed_data = self.process_input(data)
+        prompt_rm, prompt_cb = new_group(
+            name=self.name,
+            inputs={"data": processed_data},
+            callbacks=callbacks,
+            metadata={"type": ChainType.RAGAS_PROMPT},
+        )
+        prompt_value = self.to_prompt_value(processed_data)
+        resp = await llm.generate(
+            prompt_value,
+            n=n,
+            temperature=temperature,
+            stop=stop,
+            callbacks=prompt_cb,
+        )
+
+        output_models = []
+        parser = RagasOutputParser(pydantic_object=self.output_model)  # type: ignore
+        for i in range(n):
+            output_string = resp.generations[0][i].text
+            try:
+                answer = await parser.parse_output_string(
+                    output_string=output_string,
+                    prompt_value=prompt_value,  # type: ignore
+                    llm=llm,
+                    callbacks=prompt_cb,
+                    retries_left=retries_left,
+                )
+                processed_output = self.process_output(answer, data)  # type: ignore
+                output_models.append(processed_output)
+            except RagasOutputParserException as e:
+                prompt_rm.on_chain_error(error=e)
+                logger.error("Prompt %s failed to parse output: %s", self.name, e)
+                raise e
+
+        prompt_rm.on_chain_end({"output": output_models})
+        return output_models
+
+
+class ImageTextPromptValue(PromptValue):
+    items: t.List[str]
+
+    def to_messages(self) -> t.List[BaseMessage]:
+        messages = []
+        for item in self.items:
+            if self.is_image(item):
+                messages.append(self.get_image(item))
+            else:
+                messages.append(self.get_text(item))
+        return [HumanMessage(content=messages)]
+
+    def get_text(self, item):
+        return {"type": "text", "text": item}
+
+    def get_image(self, item):
+        if self.is_base64(item):
+            encoded_image = item
+        elif self.is_valid_url(item):
+            encoded_image = self.download_and_encode_image(item)
+        else:
+            encoded_image = self.encode_image_to_base64(item)
+
+        return {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
+        }
+
+    def to_string(self):
+        string_representation = ""
+        for item in self.items:
+            if self.is_image(item):
+                string_representation += "[Image]"
+            else:
+                string_representation += item
+            string_representation += " "
+        return string_representation.strip()
+
+    def is_base64(self, s):
+        try:
+            if isinstance(s, str):
+                # Try to decode the string
+                if base64.b64encode(base64.b64decode(s)).decode("utf-8") == s:
+                    return True
+            return False
+        except Exception:
+            return False
+
+    def is_valid_url(self, url):
+        try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except ValueError:
+            return False
+
+    def encode_image_to_base64(self, file_path):
+        with open(file_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def download_and_encode_image(self, url):
+        with urllib.request.urlopen(url) as response:
+            return base64.b64encode(response.read()).decode("utf-8")
+
+    def is_image(self, item):
+        if self.is_base64(item):
+            return True
+        elif self.is_valid_url(item):
+            mime_type, _ = mimetypes.guess_type(item)
+            return mime_type and mime_type.startswith("image")
+        elif isinstance(item, str):
+            mime_type, _ = mimetypes.guess_type(item)
+            return mime_type and mime_type.startswith("image")
+        return False
diff --git a/src/ragas/prompt/pydantic_prompt.py b/src/ragas/prompt/pydantic_prompt.py
index 1bbf75563..950252ec8 100644
--- a/src/ragas/prompt/pydantic_prompt.py
+++ b/src/ragas/prompt/pydantic_prompt.py
@@ -417,7 +417,7 @@ async def parse_output_string(
                     prompt_value=prompt_value.to_string(),
                 ),
                 callbacks=retry_cb,
-                retries_left = retries_left - 1,
+                retries_left=retries_left - 1,
             )
             retry_rm.on_chain_end({"fixed_output_string": fixed_output_string})
             result = fixed_output_string
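The routing between text and image items in `ImageTextPromptValue` relies on the base64/URL/mimetype checks above. A small illustration of the resulting behaviour (the file path is a placeholder and only its extension matters for `is_image` and `to_string`):

```python
from ragas.prompt import ImageTextPromptValue

pv = ImageTextPromptValue(
    items=[
        "Now perform the above instruction with the following",
        "custom_eval/multimodal/images/tesla.jpg",  # placeholder path; classified by its extension
    ]
)

print(pv.is_image("custom_eval/multimodal/images/tesla.jpg"))  # True (mimetypes reports image/jpeg)
print(pv.to_string())
# Now perform the above instruction with the following [Image]

# pv.to_messages() would return a single HumanMessage mixing text and image_url parts;
# unlike to_string(), it reads and base64-encodes the file, so the image must exist on disk.
```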
diff --git a/src/ragas/testset/synthesizers/generate.py b/src/ragas/testset/synthesizers/generate.py
index 010f0fbf9..d36de35a3 100644
--- a/src/ragas/testset/synthesizers/generate.py
+++ b/src/ragas/testset/synthesizers/generate.py
@@ -9,10 +9,10 @@
 from ragas._analytics import TestsetGenerationEvent, track
 from ragas.callbacks import new_group
 from ragas.cost import TokenUsageParser
+from ragas.embeddings.base import BaseRagasEmbeddings, LangchainEmbeddingsWrapper
 from ragas.executor import Executor
 from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
 from ragas.run_config import RunConfig
-from ragas.embeddings.base import BaseRagasEmbeddings, LangchainEmbeddingsWrapper
 from ragas.testset.graph import KnowledgeGraph, Node, NodeType
 from ragas.testset.synthesizers import default_query_distribution
 from ragas.testset.synthesizers.testset_schema import Testset, TestsetSample
@@ -22,8 +22,8 @@
 if t.TYPE_CHECKING:
     from langchain_core.callbacks import Callbacks
     from langchain_core.documents import Document as LCDocument
-    from langchain_core.language_models import BaseLanguageModel as LangchainLLM
     from langchain_core.embeddings.embeddings import Embeddings as LangchainEmbeddings
+    from langchain_core.language_models import BaseLanguageModel as LangchainLLM
 
     from ragas.embeddings.base import BaseRagasEmbeddings
     from ragas.llms.base import BaseRagasLLM
@@ -66,10 +66,10 @@ def from_langchain(
         """
         knowledge_graph = knowledge_graph or KnowledgeGraph()
         return cls(
-             LangchainLLMWrapper(llm),
-             LangchainEmbeddingsWrapper(embedding_model),
-             knowledge_graph
-             )
+            LangchainLLMWrapper(llm),
+            LangchainEmbeddingsWrapper(embedding_model),
+            knowledge_graph,
+        )
 
     def generate_with_langchain_docs(
         self,
@@ -91,22 +91,22 @@
         # force the user to provide an llm and embedding client to prevent use of default LLMs
         if not self.llm and not transforms_llm:
             raise ValueError(
-                '''An llm client was not provided.
+                """An llm client was not provided.
                 Provide an LLM on TestsetGenerator instantiation or as an argument for transforms_llm parameter.
-                Alternatively you can provide your own transforms through the `transforms` parameter.'''
-                )
+                Alternatively you can provide your own transforms through the `transforms` parameter."""
+            )
         if not self.embedding_model and not transforms_embedding_model:
             raise ValueError(
-                '''An embedding client was not provided.
+                """An embedding client was not provided.
                 Provide an embedding model on TestsetGenerator instantiation or as an argument for transforms_llm parameter.
-                Alternatively you can provide your own transforms through the `transforms` parameter.'''
-                )
+                Alternatively you can provide your own transforms through the `transforms` parameter."""
+            )
 
         if not transforms:
             transforms = default_transforms(
-                  llm=transforms_llm or self.llm,
-                  embedding_model=transforms_embedding_model or self.embedding_model
-                )
+                llm=transforms_llm or self.llm,
+                embedding_model=transforms_embedding_model or self.embedding_model,
+            )
 
         # convert the documents to Ragas nodes
         nodes = []
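The re-wrapped error messages fire when `generate_with_langchain_docs` is called without an LLM or embeddings. For context, a minimal sketch of supplying them via `from_langchain` (the model names and the `documents` list are placeholders, not part of this change):

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.testset import TestsetGenerator

# Placeholder models; any LangChain chat model / embeddings should work here.
generator = TestsetGenerator.from_langchain(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    embedding_model=OpenAIEmbeddings(),
)

# `documents` is assumed to be a list of LangChain Document objects loaded elsewhere.
testset = generator.generate_with_langchain_docs(documents, testset_size=10)
```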
diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py
index 10db3b432..8be1dc867 100644
--- a/tests/unit/test_prompt.py
+++ b/tests/unit/test_prompt.py
@@ -208,8 +208,8 @@ def test_prompt_class_attributes():
 
 @pytest.mark.asyncio
 async def test_prompt_parse_retry():
-    from ragas.prompt import PydanticPrompt, StringIO
     from ragas.exceptions import RagasOutputParserException
+    from ragas.prompt import PydanticPrompt, StringIO
 
     class OutputModel(BaseModel):
         example: str
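The `retries_left` keyword fixed above is what the parse-retry test exercises: once retries are exhausted, `RagasOutputParserException` is raised rather than swallowed, so metric callers may want to handle it. A hedged sketch, reusing `scorer` and `sample` from the multimodal metric docs above:

```python
import numpy as np

from ragas.exceptions import RagasOutputParserException

# Run inside an async context, as in the doc examples.
try:
    score = await scorer.single_turn_ascore(sample)
except RagasOutputParserException:
    # All parse retries were exhausted; treat this sample as un-scoreable.
    score = np.nan
```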