Feat: add multimodal eval support #1559

Merged: 7 commits, Oct 25, 2024

Changes from 3 commits
2 changes: 2 additions & 0 deletions docs/concepts/metrics/available_metrics/index.md
@@ -11,6 +11,8 @@ Each metric are essentially paradigms that is designed to evaluate a particular
- [Noise Sensitivity](noise_sensitivity.md)
- [Response Relevancy](answer_relevance.md)
- [Faithfulness](faithfulness.md)
- [Multimodal Faithfulness](multi_modal_faithfulness.md)
- [Multimodal Relevance](multi_modal_relevance.md)

## Agents or Tool use cases

50 changes: 50 additions & 0 deletions docs/concepts/metrics/available_metrics/multi_modal_faithfulness.md
@@ -0,0 +1,50 @@
## MultiModalFaithfulness

The `MultiModalFaithfulness` metric measures the factual consistency of the generated answer against both visual and textual context. It is calculated from the response and the retrieved contexts (textual and visual). The score ranges from 0 to 1, with higher scores indicating better faithfulness.

The generated answer is regarded as faithful if all the claims made in the answer can be inferred from either the visual or textual context provided. To determine this, the response is directly evaluated against the provided contexts, and the faithfulness score is either 0 or 1.

### Example

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import MultiModalFaithfulness

sample = SingleTurnSample(
    user_input="What about the Tesla Model X?",
    response="Cats are cute.",
    retrieved_contexts=[
        "custom_eval/multimodal/images/tesla.jpg"
    ]
)
scorer = MultiModalFaithfulness()
await scorer.single_turn_ascore(sample)
```
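
Note that the scorer needs an evaluator LLM attached before `single_turn_ascore` can do anything. Below is a minimal sketch of one way to wire that up, assuming a multimodal LangChain chat model wrapped with ragas' `LangchainLLMWrapper`; the model name is illustrative, not something this PR prescribes.

```python
# Sketch only: attach an evaluator LLM, then score a single sample.
# The model name and the LangchainLLMWrapper choice are assumptions for
# illustration; any multimodal-capable LLM wrapper should work.
from langchain_openai import ChatOpenAI

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import MultiModalFaithfulness

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

sample = SingleTurnSample(
    user_input="What about the Tesla Model X?",
    response="The Tesla Model X is an electric SUV manufactured by Tesla.",
    retrieved_contexts=["custom_eval/multimodal/images/tesla.jpg"],
)

scorer = MultiModalFaithfulness(llm=evaluator_llm)
score = await scorer.single_turn_ascore(sample)  # 1.0 if supported by the contexts, else 0.0
```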

### How It’s Calculated

!!! example
**Question**: What about the Tesla Model X?

**Context (visual)**:
- An image of the Tesla Model X (custom_eval/multimodal/images/tesla.jpg)

**High faithfulness answer**: The Tesla Model X is an electric SUV manufactured by Tesla.

**Low faithfulness answer**: Cats are cute.

Let's examine how faithfulness was calculated using the low faithfulness answer:

- **Step 1:** Evaluate the generated response against the given contexts.
- Response: "Cats are cute."

- **Step 2:** Verify if the response can be inferred from the given context.
- Response: No

- **Step 3:** Use the result to determine the faithfulness score.

$$
\text{Faithfulness} = 0
$$

In this example, the response "Cats are cute" cannot be inferred from the image of the Tesla Model X, so the faithfulness score is 0.
50 changes: 50 additions & 0 deletions docs/concepts/metrics/available_metrics/multi_modal_relevance.md
@@ -0,0 +1,50 @@
## MultiModalRelevance

The `MultiModalRelevance` metric measures the relevance of the generated answer against both visual and textual context. It is calculated from the user input, the response, and the retrieved contexts (textual and visual). The score ranges from 0 to 1, with higher scores indicating better relevance.

The generated answer is regarded as relevant if it aligns with the visual or textual context provided. To determine this, the response is directly evaluated against the provided contexts, and the relevance score is either 0 or 1.

### Example

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import MultiModalRelevance

sample = SingleTurnSample(
    user_input="What about the Tesla Model X?",
    response="Cats are cute.",
    retrieved_contexts=[
        "custom_eval/multimodal/images/tesla.jpg"
    ]
)
scorer = MultiModalRelevance()
await scorer.single_turn_ascore(sample)
```

### How It’s Calculated

!!! example
**Question**: What about the Tesla Model X?

**Context (visual)**:
- An image of the Tesla Model X (custom_eval/multimodal/images/tesla.jpg)

**High relevance answer**: The Tesla Model X is an electric SUV manufactured by Tesla.

**Low relevance answer**: Cats are cute.

Let's examine how relevance was calculated using the low relevance answer:

- **Step 1:** Evaluate the generated response against the given contexts.
- Response: "Cats are cute."

- **Step 2:** Verify if the response aligns with the given context.
- Response: No

- **Step 3:** Use the result to determine the relevance score.

$$
\text{Relevance} = 0
$$

In this example, the response "Cats are cute" does not align with the image of the Tesla Model X, so the relevance score is 0.
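
To score more than one sample, both multimodal metrics can also be run through ragas' `evaluate()` helper. The sketch below shows one possible wiring, assuming an `EvaluationDataset` built from `SingleTurnSample` objects and a LangChain chat model wrapped with `LangchainLLMWrapper`; the dataset contents and model name are illustrative.

```python
# Rough sketch: batch-score a dataset with both multimodal metrics.
# The model name and wrapper choice are assumptions for illustration.
from langchain_openai import ChatOpenAI

from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import MultiModalFaithfulness, MultiModalRelevance

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

samples = [
    SingleTurnSample(
        user_input="What about the Tesla Model X?",
        response="The Tesla Model X is an electric SUV manufactured by Tesla.",
        retrieved_contexts=["custom_eval/multimodal/images/tesla.jpg"],
    ),
]

results = evaluate(
    dataset=EvaluationDataset(samples=samples),
    metrics=[MultiModalFaithfulness(), MultiModalRelevance()],
    llm=evaluator_llm,
)
print(results)  # reports the scores under the metric names, faithful_rate and relevance_rate
```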
12 changes: 12 additions & 0 deletions src/ragas/metrics/__init__.py
@@ -58,6 +58,14 @@
from ragas.metrics._summarization import SummarizationScore, summarization_score
from ragas.metrics._tool_call_accuracy import ToolCallAccuracy
from ragas.metrics._topic_adherence import TopicAdherenceScore
from ragas.metrics._multi_modal_faithfulness import (
MultiModalFaithfulness,
multimodal_faithness,
)
from ragas.metrics._multi_modal_relevance import (
MultiModalRelevance,
multimodal_relevance,
)

__all__ = [
"AnswerCorrectness",
@@ -105,6 +113,10 @@
"DistanceMeasure",
"TopicAdherenceScore",
"LLMSQLEquivalence",
"MultiModalFaithfulness",
"multimodal_faithness",
"MultiModalRelevance",
"multimodal_relevance",
]

current_module = sys.modules[__name__]
91 changes: 91 additions & 0 deletions src/ragas/metrics/_multi_modal_faithfulness.py
@@ -0,0 +1,91 @@
import typing as t
from dataclasses import dataclass, field
from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
from pydantic import BaseModel, Field
from ragas.prompt import ImageTextPrompt
from ragas.dataset_schema import SingleTurnSample
import numpy as np


class FaithfulnessInput(BaseModel):
response: str = Field(description="response from AI")
    retrieved_contexts: list[str] = Field(description="retrieved contexts: text chunks or image file paths")

def to_string_list(self):
return [
"inputs:",
self.response,
"retrieved_contexts: ",
] + self.retrieved_contexts


class FaithfulnessOutput(BaseModel):
    faithful: bool = Field(description="boolean indicating if the response is faithful to the retrieved contexts")


class MultiModalFaithfulnessPrompt(
ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
):
# refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
input_model = FaithfulnessInput
output_model = FaithfulnessOutput
examples = [
(
FaithfulnessInput(
response="Apple pie is generally double-crusted.",
retrieved_contexts=[
"An apple pie is a fruit pie in which the principal filling ingredient is apples.",
"Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
"It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
],
),
FaithfulnessOutput(faithful=True),
),
(
FaithfulnessInput(
response="Apple pies tastes bad.",
retrieved_contexts=[
"An apple pie is a fruit pie in which the principal filling ingredient is apples.",
"Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
"It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
],
),
FaithfulnessOutput(faithful=False),
),
]


@dataclass
class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
name: str = "faithful_rate" # type: ignore
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"response",
"retrieved_contexts",
}
}
)
faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()

async def _ascore(self, row: t.Dict, callbacks: t.Any) -> float:
prompt_input = FaithfulnessInput(
response=row["response"], retrieved_contexts=row["retrieved_contexts"]
)
assert self.llm is not None, "LLM is not set"
prompt_response = await self.faithfulness_prompt.generate(
data=prompt_input, llm=self.llm
)
if prompt_response is None:
return np.nan
return float(prompt_response.faithful)

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: t.Any
) -> float:
row = sample.to_dict()
return await self._ascore(row, callbacks)


multimodal_faithness = MultiModalFaithfulness()
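
A quick illustration of how `FaithfulnessInput` serializes a sample before the prompt consumes it; the values are made up, and the output follows directly from `to_string_list()` above. Image paths in `retrieved_contexts` are passed through as plain strings here and presumably resolved to images later by `ImageTextPrompt`.

```python
# Illustrative only: what to_string_list() produces for a hypothetical input.
from ragas.metrics._multi_modal_faithfulness import FaithfulnessInput

prompt_input = FaithfulnessInput(
    response="The Tesla Model X is an electric SUV.",
    retrieved_contexts=["custom_eval/multimodal/images/tesla.jpg"],
)
print(prompt_input.to_string_list())
# ['inputs:', 'The Tesla Model X is an electric SUV.',
#  'retrieved_contexts: ', 'custom_eval/multimodal/images/tesla.jpg']
```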
100 changes: 100 additions & 0 deletions src/ragas/metrics/_multi_modal_relevance.py
@@ -0,0 +1,100 @@
import typing as t
from dataclasses import dataclass, field
from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
from pydantic import BaseModel, Field
from ragas.prompt import ImageTextPrompt
from ragas.dataset_schema import SingleTurnSample
import numpy as np


class RelevanceInput(BaseModel):
user_input: str = Field(description="user input")
response: str = Field(description="response from AI")
    retrieved_contexts: list[str] = Field(description="retrieved contexts: text chunks or image file paths")

def to_string_list(self):
return [
f"Question: {self.user_input}",
f"Response: {self.response}",
"retrieved_contexts: ",
] + self.retrieved_contexts


class RelevanceOutput(BaseModel):
    relevance: bool = Field(description="boolean indicating if the response is relevant to the retrieved contexts")


class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
# refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
instruction = """
Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
You have two options to answer. Either True / False.
Answer - True, if the response for the query is in line with context information otherwise False.
"""
input_model = RelevanceInput
output_model = RelevanceOutput
examples = [
(
RelevanceInput(
user_input="What is the primary ingredient in a traditional Margherita pizza?",
response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
retrieved_contexts=[
"A traditional Margherita pizza consists of a thin crust.",
"The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
"It is one of the simplest and most classic types of pizza.",
],
),
RelevanceOutput(relevance=True),
),
(
RelevanceInput(
user_input="Who won the Best Actor award at the Oscars in 2021?",
response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
retrieved_contexts=[
"The 93rd Academy Awards were held in 2021.",
"Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
"The event was unique due to COVID-19 restrictions.",
],
),
RelevanceOutput(relevance=False),
),
]


@dataclass
class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
name: str = "relevance_rate" # type: ignore
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"user_input",
"response",
"retrieved_contexts",
}
}
)
relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()

async def _ascore(self, row: t.Dict, callbacks: t.Any) -> float:

prompt_input = RelevanceInput(
user_input=row["user_input"],
response=row["response"],
retrieved_contexts=row["retrieved_contexts"],
)
assert self.llm is not None, "LLM is not set"
prompt_response = await self.relevance_prompt.generate(
data=prompt_input, llm=self.llm
)
if prompt_response is None:
return np.nan
return float(prompt_response.relevance)

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: t.Any
) -> float:
row = sample.to_dict()
return await self._ascore(row, callbacks)


multimodal_relevance = MultiModalRelevance()
3 changes: 3 additions & 0 deletions src/ragas/prompt/__init__.py
@@ -1,6 +1,7 @@
from .base import BasePrompt, BoolIO, StringIO, StringPrompt
from .mixin import PromptMixin
from .pydantic_prompt import InputModel, OutputModel, PydanticPrompt
from .multi_modal_prompt import ImageTextPrompt, ImageTextPromptValue

__all__ = [
"BasePrompt",
@@ -11,4 +12,6 @@
"PromptMixin",
"InputModel",
"OutputModel",
"ImageTextPrompt",
"ImageTextPromptValue",
]
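
Exporting `ImageTextPrompt` also makes it available for custom multimodal prompts outside the two new metrics. Below is a hedged sketch of subclassing it, modeled on `MultiModalFaithfulnessPrompt` above; `CaptionInput`, `CaptionOutput`, and `CaptionPrompt` are hypothetical names, not part of this PR.

```python
# Hypothetical sketch of a custom multimodal prompt following the same pattern
# as the prompts added in this PR.
from pydantic import BaseModel, Field

from ragas.prompt import ImageTextPrompt


class CaptionInput(BaseModel):
    retrieved_contexts: list[str] = Field(description="text chunks or image file paths")

    def to_string_list(self):
        return ["retrieved_contexts: "] + self.retrieved_contexts


class CaptionOutput(BaseModel):
    caption: str = Field(description="one-sentence description of the supplied context")


class CaptionPrompt(ImageTextPrompt[CaptionInput, CaptionOutput]):
    instruction = "Describe the supplied textual and visual context in one sentence."
    input_model = CaptionInput
    output_model = CaptionOutput
    examples = []  # few-shot examples can be added as (CaptionInput, CaptionOutput) pairs
```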