feat: sql semantic equivalence #1323

Merged
merged 7 commits into from
Sep 18, 2024
104 changes: 104 additions & 0 deletions docs/concepts/metrics/sql.md
@@ -0,0 +1,104 @@
# SQL


## Execution-based metrics
In these metrics, the SQL query in `response` is executed against the database and the resulting data is compared with the expected results.

### DataCompy Score

DataCompy is a Python library that compares two pandas DataFrames. It provides a simple interface for the comparison and produces a detailed report of the differences. In this metric, the data produced by executing the `response` query on the database is compared with the expected data, i.e. the `reference`. To enable the comparison, both `response` and `reference` must be provided as comma-separated values (CSV), as shown in the example below.

DataFrames can be compared across rows or columns; this is configured with the `mode` parameter.

If `mode` is `rows`, the comparison is done row-wise; if `mode` is `columns`, it is done column-wise.

```{math}
:label: precision
\text{Precision } = {|\text{Number of matching rows in response and reference}| \over |\text{Total number of rows in response}|}
```

```{math}
:label: recall
\text{Recall } = {|\text{Number of matching rows in response and reference}| \over |\text{Total number of rows in reference}|}
```

By default, `mode` is set to `rows` and the metric is the F1 score, which is the harmonic mean of precision and recall.
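
For a concrete sense of how these quantities combine, here is a tiny worked example. The counts are hypothetical and purely for illustration; they are not derived from the CSVs used later.

```{code-block} python
# Hypothetical counts, purely for illustration.
matching_rows = 3    # rows identical in response and reference
response_rows = 5    # total rows returned by the response query
reference_rows = 6   # total rows in the reference result

precision = matching_rows / response_rows            # 0.6
recall = matching_rows / reference_rows              # 0.5
f1 = 2 * precision * recall / (precision + recall)   # ≈ 0.545
```

The full metric usage with CSV inputs is shown below.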


```{code-block} python
from ragas.metrics._datacompy_score import DataCompyScore
from ragas.dataset_schema import SingleTurnSample

data1 = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.45,George Maharis,14530.1555,2017-01-01
10000001235,0.45,Michael Bluth,1,2017-01-01
10000001236,1345,George Bluth,,2017-01-01
10000001237,123456,Bob Loblaw,345.12,2017-01-01
10000001238,1.05,Lucille Bluth,,2017-01-01
10000001238,1.05,Loose Seal Bluth,,2017-01-01
"""

data2 = """acct_id,dollar_amt,name,float_fld
10000001234,123.4,George Michael Bluth,14530.155
10000001235,0.45,Michael Bluth,
10000001236,1345,George Bluth,1
10000001237,123456,Robert Loblaw,345.12
10000001238,1.05,Loose Seal Bluth,111
"""
sample = SingleTurnSample(response=data1, reference=data2)
scorer = DataCompyScore()
await scorer.single_turn_ascore(sample)
```
To change to column-wise comparison, set the `mode` parameter to `columns`. In column-wise mode a column counts as matching only when all of its values agree, and precision and recall are computed over column counts rather than row counts.


```{code-block} python
scorer = DataCompyScore(mode="columns", metric="recall")
```

## Non-execution-based metrics

Executing SQL queries against a database can be time-consuming and is sometimes not feasible. In such cases, non-execution-based metrics can be used to evaluate SQL queries: they compare the queries directly, without running them against the database.

### SQL Query Semantic Equivalence

SQL query semantic equivalence evaluates whether the `response` query is equivalent to the `reference` query. The metric also needs the database schema to use when comparing the queries; this is supplied through `reference_contexts`. It is a binary metric: 1 indicates that the SQL queries are semantically equivalent and 0 indicates that they are not.

```{code-block} python
from ragas.metrics._sql_semantic_equivalence import LLMSqlEquivalenceWithReference
from ragas.dataset_schema import SingleTurnSample

sample = SingleTurnSample(
    response="""
        SELECT p.product_name, SUM(oi.quantity) AS total_quantity
        FROM order_items oi
        JOIN products p ON oi.product_id = p.product_id
        GROUP BY p.product_name;
    """,
    reference="""
        SELECT p.product_name, COUNT(oi.quantity) AS total_quantity
        FROM order_items oi
        JOIN products p ON oi.product_id = p.product_id
        GROUP BY p.product_name;
    """,
    reference_contexts=[
        """
        Table order_items:
        - order_item_id: INT
        - order_id: INT
        - product_id: INT
        - quantity: INT
        """,
        """
        Table products:
        - product_id: INT
        - product_name: VARCHAR
        - price: DECIMAL
        """,
    ],
)

scorer = LLMSqlEquivalenceWithReference()
scorer.llm = openai_model
await scorer.single_turn_ascore(sample)
```
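
The `openai_model` above is assumed to be an LLM already wrapped for use with ragas. A minimal sketch of one possible way to construct it, assuming the `langchain-openai` package is installed and using ragas' `LangchainLLMWrapper`:

```{code-block} python
# Hedged sketch: one way to obtain an `openai_model` for the example above.
# Assumes langchain-openai is installed; the model name is only an example.
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

openai_model = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
```

Any other chat model wrapped for ragas can be substituted here.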
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -23,6 +23,8 @@ all = [
"rouge_score",
"fuzzywuzzy",
"rapidfuzz",
"pandas",
"datacompy",
]

[tool.setuptools]
4 changes: 3 additions & 1 deletion requirements/dev.txt
@@ -13,4 +13,6 @@ graphene
fuzzywuzzy
rouge_score
nltk
rapidfuzz
rapidfuzz
pandas
datacompy
78 changes: 78 additions & 0 deletions src/ragas/metrics/_datacompy_score.py
@@ -0,0 +1,78 @@
import logging
import typing as t
from dataclasses import dataclass, field
from io import StringIO

import numpy as np
from langchain_core.callbacks import Callbacks

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType, SingleTurnMetric
from ragas.run_config import RunConfig

logger = logging.getLogger(__name__)


@dataclass
class DataCompyScore(SingleTurnMetric):
    name: str = "data_compare_score"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {MetricType.SINGLE_TURN: {"reference", "response"}}
    )
    mode: t.Literal["rows", "columns"] = "rows"
    metric: t.Literal["precision", "recall", "f1"] = "f1"

    def __post_init__(self):
        try:
            import datacompy
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                f"{e.name} is required for DataCompyScore. Please install it using `pip install {e.name}`"
            )

        self.data_compare = datacompy
        self.pd = pd
        if self.mode not in ["rows", "columns"]:
            raise ValueError("Mode should be either rows or columns")

        if self.metric not in ["precision", "recall", "f1"]:
            raise ValueError("Metric should be either precision, recall or f1")

    def init(self, run_config: RunConfig):
        pass

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        reference = sample.reference
        response = sample.response
        assert isinstance(reference, str), "Expecting a string"
        assert isinstance(response, str), "Expecting a string"
        try:
            reference_df = self.pd.read_csv(StringIO(reference))
            response_df = self.pd.read_csv(StringIO(response))
        except Exception as e:
            logger.error(f"Error in reading csv: {e}")
            return np.nan

        compare = self.data_compare.Compare(reference_df, response_df, on_index=True)
        if self.mode == "rows":
            # Row-wise: a row matches when all of its column values agree across frames.
            recall = compare.count_matching_rows() / reference_df.shape[0]
            precision = compare.count_matching_rows() / response_df.shape[0]
        else:
            # Column-wise: a column matches when it has no unequal values at all.
            matched_cols = len(
                [col for col in compare.column_stats if col["unequal_cnt"] == 0]
            )
            recall = matched_cols / reference_df.shape[1]
            precision = matched_cols / response_df.shape[1]

        if self.metric == "precision":
            return precision
        elif self.metric == "recall":
            return recall
        else:
            # F1 is the harmonic mean; guard against division by zero when nothing matches.
            if precision + recall == 0:
                return 0.0
            return 2 * (precision * recall) / (precision + recall)

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)
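
For reference, a minimal standalone sketch of the `datacompy` calls this metric relies on (`Compare`, `count_matching_rows`, `column_stats`), using two small hypothetical result sets:

```python
from io import StringIO

import datacompy
import pandas as pd

# Hypothetical result sets: the row with acct_id 2 differs in dollar_amt,
# and the row with acct_id 3 is missing from the response.
reference_df = pd.read_csv(StringIO("acct_id,dollar_amt\n1,10.0\n2,20.0\n3,30.0\n"))
response_df = pd.read_csv(StringIO("acct_id,dollar_amt\n1,10.0\n2,25.0\n"))

compare = datacompy.Compare(reference_df, response_df, on_index=True)

# Row-wise scores, mirroring mode="rows" in DataCompyScore.
precision = compare.count_matching_rows() / response_df.shape[0]
recall = compare.count_matching_rows() / reference_df.shape[0]

# Column-wise counterpart: columns with no unequal values, mirroring mode="columns".
matched_cols = len([col for col in compare.column_stats if col["unequal_cnt"] == 0])
print(precision, recall, matched_cols)
```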
97 changes: 97 additions & 0 deletions src/ragas/metrics/_sql_semantic_equivalence.py
@@ -0,0 +1,97 @@
from __future__ import annotations

import logging
import typing as t
from dataclasses import dataclass, field

from pydantic import BaseModel, Field

from ragas.dataset_schema import SingleTurnSample
from ragas.experimental.llms.prompt import PydanticPrompt
from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric

if t.TYPE_CHECKING:
    from langchain_core.callbacks import Callbacks


logger = logging.getLogger(__name__)


class EquivalenceInput(BaseModel):
    reference: str = Field(..., description="Reference SQL")
    response: str = Field(..., description="Generated SQL")
    database_schema: str = Field(..., description="Reference SQL schema")


class EquivalenceOutput(BaseModel):
    response_query_explanation: str = Field(
        ..., description="Explanation of the generated SQL"
    )
    reference_query_explanation: str = Field(
        ..., description="Explanation of the reference SQL"
    )
    equivalence: bool = Field(
        ..., description="Whether the generated SQL is equivalent to the reference SQL"
    )


class EquivalencePrompt(PydanticPrompt[EquivalenceInput, EquivalenceOutput]):
    instruction = """
    Explain and compare two SQL queries (Q1 and Q2) based on the provided database schema. First, explain each query, then determine if they have significant logical differences.
    """
    input_model = EquivalenceInput
    output_model = EquivalenceOutput
    examples = [
        (
            EquivalenceInput(
                reference="SELECT id, name FROM users WHERE active = 1;",
                response="SELECT id, name FROM users WHERE active = true;",
                database_schema="""
                Table users:
                - id: INT
                - name: VARCHAR
                - active: BOOLEAN
                """,
            ),
            EquivalenceOutput(
                response_query_explanation="The generated SQL query retrieves the id and name of users where the active field is true.",
                reference_query_explanation="The reference SQL query retrieves the id and name of users where the active field equals 1.",
                equivalence=True,
            ),
        )
    ]


@dataclass
class LLMSqlEquivalenceWithReference(MetricWithLLM, SingleTurnMetric):
    name: str = "llm_sql_equivalence_with_reference"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {"response", "reference", "reference_contexts"}
        }
    )
    equivalence_prompt: PydanticPrompt = EquivalencePrompt()

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        assert self.llm is not None, "LLM is not initialized"
        assert isinstance(sample.reference, str), "Sample reference must be a string"
        assert isinstance(sample.response, str), "Sample response must be a string"
        assert isinstance(
            sample.reference_contexts, list
        ), "Sample reference_contexts must be a List"

        # The schema fragments in reference_contexts are joined into one schema string.
        database_schema = "\n".join(sample.reference_contexts)
        input_data = EquivalenceInput(
            reference=sample.reference,
            response=sample.response,
            database_schema=database_schema,
        )
        response = await self.equivalence_prompt.generate(
            data=input_data, llm=self.llm, callbacks=callbacks
        )
        # Binary score: 1 if the LLM judges the queries semantically equivalent, else 0.
        return int(response.equivalence)

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)
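
To see the data shapes involved, here is a small illustrative sketch that constructs the prompt's input and output models directly; the query, schema, and explanation values are invented for illustration only.

```python
# Illustrative only: the Pydantic models defined above, filled with invented values.
inp = EquivalenceInput(
    reference="SELECT COUNT(*) FROM orders;",
    response="SELECT COUNT(1) FROM orders;",
    database_schema="Table orders:\n- order_id: INT",
)
out = EquivalenceOutput(
    response_query_explanation="Counts every row in the orders table.",
    reference_query_explanation="Counts every row in the orders table.",
    equivalence=True,
)
print(int(out.equivalence))  # 1 -- the value the metric returns when queries match
```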