explodinggradients · shahules786 · Jul 20, 2023 · Jul 14, 2023 · Jul 19, 2023 · Jul 19, 2023
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -38,7 +38,6 @@ jobs:
             ragas:
               - "src/ragas/**"
               - "tests/**"
-              - "examples/**"
             docs:
               - *related
               - requirements/docs-requirements.txt
@@ -52,7 +51,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.8", "3.9", "3.10"]
 
     if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.ragas == 'true') || github.event_name == 'push' }}
     name: python${{ matrix.python-version }}_unit_tests (${{ matrix.os }})
@@ -86,6 +85,7 @@ jobs:
           pip install "."
           pip install -r requirements/test.txt
 
+
       - name: Run unit tests
         run: |
           # OPTS=(--cov-config pyproject.toml --cov=src/bentoml --cov-append)
@@ -94,7 +94,7 @@ jobs:
             OPTS=(--dist loadfile -n auto)
           fi
           # Now run the unit tests
-          pytest tests/unit "${OPTS[@]}"
+          OPENAI_API_KEY="test" pytest tests/unit "${OPTS[@]}"
 
   codestyle_check:
     runs-on: ubuntu-latest

diff --git a/experiments/assesments/metrics_assesments.ipynb b/experiments/assesments/metrics_assesments.ipynb
@@ -106,16 +106,17 @@
    "source": [
     "import os\n",
     "import openai\n",
+    "\n",
     "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
     "\n",
     "completion = openai.ChatCompletion.create(\n",
-    "  model=\"gpt-3.5-turbo\",\n",
-    "  messages=[\n",
-    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "  ]\n",
+    "    model=\"gpt-3.5-turbo\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "    ],\n",
     ")\n",
     "\n",
-    "print(completion.choices[0].message)\n"
+    "print(completion.choices[0].message)"
    ]
   },
   {
@@ -125,11 +126,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "def llm2(prompt, **kwargs):\n",
     "    response = openai.ChatCompletion.create(\n",
-    "        model=kwargs.get(\"model\",\"gpt-3.5-turbo-16k\"),\n",
-    "        messages=[{\"role\": \"system\", \"content\":prompt}],\n",
+    "        model=kwargs.get(\"model\", \"gpt-3.5-turbo-16k\"),\n",
+    "        messages=[{\"role\": \"system\", \"content\": prompt}],\n",
     "        temperature=kwargs.get(\"temperature\", 0),\n",
     "        top_p=kwargs.get(\"top_p\", 1),\n",
     "        frequency_penalty=kwargs.get(\"frequency_penalty\", 0.0),\n",
@@ -139,6 +139,7 @@
     "    )\n",
     "    return response\n",
     "\n",
+    "\n",
     "def llm(prompt, **kwargs):\n",
     "    response = openai.Completion.create(\n",
     "        model=kwargs.get(\"model\", \"text-davinci-003\"),\n",
@@ -375,7 +376,7 @@
     }
    ],
    "source": [
-    "llm2([Question_generation.format(2,answer)])"
+    "llm2([Question_generation.format(2, answer)])"
    ]
   },
   {
@@ -1039,10 +1040,12 @@
    ],
    "source": [
     "def get_all_facts(item):\n",
-    "    all_facts = item['context']['sentences']\n",
+    "    all_facts = item[\"context\"][\"sentences\"]\n",
     "    all_facts = [sent for para in all_facts for sent in para]\n",
-    "    return {\"full_context\":''.join(all_facts)}\n",
-    "hotpot_qa = hotpot_qa.map(get_all_facts, batched=False)   "
+    "    return {\"full_context\": \"\".join(all_facts)}\n",
+    "\n",
+    "\n",
+    "hotpot_qa = hotpot_qa.map(get_all_facts, batched=False)"
    ]
   },
   {
@@ -1090,8 +1093,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "i=15\n",
-    "q,c = hotpot_qa[i]['question'],hotpot_qa[i]['full_context']"
+    "i = 15\n",
+    "q, c = hotpot_qa[i][\"question\"], hotpot_qa[i][\"full_context\"]"
    ]
   },
   {
@@ -1112,7 +1115,7 @@
    "outputs": [],
    "source": [
     "q = \"what is general relativity?\"\n",
-    "n=2"
+    "n = 2"
    ]
   },
   {
@@ -1123,20 +1126,21 @@
    "outputs": [],
    "source": [
     "import wikipediaapi\n",
+    "\n",
     "wiki_wiki = wikipediaapi.Wikipedia(\n",
-    "        language='en',\n",
-    "        extract_format=wikipediaapi.ExtractFormat.WIKI\n",
+    "    language=\"en\", extract_format=wikipediaapi.ExtractFormat.WIKI\n",
     ")\n",
     "\n",
     "p_wiki = wiki_wiki.page(\"Black hole\")\n",
     "\n",
+    "\n",
     "def get_page_section(page, section):\n",
     "    all_text = \"\"\n",
     "    p_wiki = wiki_wiki.page(page)\n",
     "    sections = p_wiki.sections_by_title(section)\n",
     "    for s in sections:\n",
     "        all_text += s.full_text()\n",
-    "    return all_text\n"
+    "    return all_text"
    ]
   },
   {
@@ -1152,48 +1156,42 @@
     "\n",
     "cross_encoder = CrossEncoder(\"cross-encoder/stsb-TinyBERT-L-4\")\n",
     "\n",
-    "        \n",
+    "\n",
     "def sent_tokenize(sent):\n",
-    "    return [s[:-1] if  s.endswith('.') else s for s in sent.strip().split('. ')]\n",
+    "    return [s[:-1] if s.endswith(\".\") else s for s in sent.strip().split(\". \")]\n",
+    "\n",
     "\n",
     "class SentenceAgreement:\n",
-    "    \n",
     "    def __init__(self, scoring=\"bert_score\"):\n",
-    "        \n",
     "        self.scoring = scoring\n",
     "\n",
-    "        \n",
     "    @staticmethod\n",
     "    def bert_score(para1, para2):\n",
-    "        \n",
     "        sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n",
     "        scores = cross_encoder.predict(list(itertools.product(sentences1, sentences2)))\n",
     "        scores = scores.reshape(len(sentences1), len(sentences2))\n",
     "        return scores.max(axis=1).mean()\n",
     "\n",
     "    @staticmethod\n",
     "    def jaccard_score(para1, para2):\n",
-    "        \n",
     "        sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n",
     "        intersect = len(np.intersect1d(sentences1, sentences2))\n",
     "        union = len(np.union1d(sentences1, sentences2))\n",
-    "        return intersect/union\n",
-    "    \n",
-    "    def evaluate(self,answers:List[List[str]]):\n",
-    "        \n",
+    "        return intersect / union\n",
+    "\n",
+    "    def evaluate(self, answers: List[List[str]]):\n",
     "        \"\"\"\n",
     "        eval nC2 combinations\n",
     "        \"\"\"\n",
     "        scores = []\n",
-    "        groups = combinations(answers,2)\n",
+    "        groups = combinations(answers, 2)\n",
     "        for group in groups:\n",
     "            if self.scoring == \"jaccard\":\n",
     "                score = self.jaccard_score(*group)\n",
     "            elif self.scoring == \"bert_score\":\n",
     "                score = self.bert_score(*group)\n",
     "            scores.append(score)\n",
-    "        return np.mean(scores)\n",
-    "            "
+    "        return np.mean(scores)"
    ]
   },
   {
@@ -1204,26 +1202,30 @@
    "outputs": [],
    "source": [
     "class ContextRelevacy:\n",
-    "    \n",
-    "    def __init__(self, strictness = 2, agreement_metric=\"bert_score\"):\n",
-    "        \n",
+    "    def __init__(self, strictness=2, agreement_metric=\"bert_score\"):\n",
     "        self.strictness = strictness\n",
     "        self.sent_agreement = SentenceAgreement(agreement_metric)\n",
-    "        \n",
-    "    def score(self,question,context):\n",
+    "\n",
+    "    def score(self, question, context):\n",
     "        scores = []\n",
-    "        outputs = llm(Context_relevency.format(q,c),n=self.strictness,temperature=1)\n",
-    "        outputs = [outputs['choices'][i]['text'].strip() for i in range(self.strictness)]\n",
+    "        outputs = llm(Context_relevency.format(q, c), n=self.strictness, temperature=1)\n",
+    "        outputs = [\n",
+    "            outputs[\"choices\"][i][\"text\"].strip() for i in range(self.strictness)\n",
+    "        ]\n",
     "        context_sents = sent_tokenize(context)\n",
     "        for output in outputs:\n",
-    "            indices = [context.find(sent) for sent in sent_tokenize(output) if context.find(sent)!=-1]\n",
-    "            scores.append(len(indices)/len(context_sents))\n",
-    "        \n",
+    "            indices = [\n",
+    "                context.find(sent)\n",
+    "                for sent in sent_tokenize(output)\n",
+    "                if context.find(sent) != -1\n",
+    "            ]\n",
+    "            scores.append(len(indices) / len(context_sents))\n",
+    "\n",
     "        if self.strictness > 1:\n",
     "            agr_score = self.sent_agreement.evaluate(outputs)\n",
     "        else:\n",
-    "            agr_score =1 \n",
-    "        return agr_score * np.mean(scores)\n"
+    "            agr_score = 1\n",
+    "        return agr_score * np.mean(scores)"
    ]
   },
   {
@@ -1234,7 +1236,7 @@
    "outputs": [],
    "source": [
     "c = get_page_section(\"HIV/AIDS\", \"Prevention\")\n",
-    "c = ' '.join(c.split(' ')[:500])\n",
+    "c = \" \".join(c.split(\" \")[:500])\n",
     "q = \"When was the first HIV case detected?\""
    ]
   },
@@ -1245,7 +1247,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "output = llm([Context_relevency.format(q,c), Context_relevency.format(\"How to prevent AIDS?\",c)],n=n,temperature=1)"
+    "output = llm(\n",
+    "    [\n",
+    "        Context_relevency.format(q, c),\n",
+    "        Context_relevency.format(\"How to prevent AIDS?\", c),\n",
+    "    ],\n",
+    "    n=n,\n",
+    "    temperature=1,\n",
+    ")"
    ]
   },
   {
@@ -1397,7 +1406,7 @@
     }
    ],
    "source": [
-    "context_relevancy.score(dataset[\"baseline\"].select(range(0,3)))"
+    "context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))"
    ]
   },
   {
@@ -1491,7 +1500,7 @@
     }
    ],
    "source": [
-    "context_relevancy.score(dataset[\"baseline\"].select(range(0,3)))"
+    "context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))"
    ]
   },
   {

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,8 +6,9 @@ dependencies = [
     "sentence-transformers",
     "datasets",
     "protobuf<=3.20.0",
-    "backoff",
+    "langchain>=0.0.218",
     "openai",
+    "pydantic<2.0"
 ]
 dynamic = ["version", "readme"]
 

diff --git a/src/ragas/async_utils.py b/src/ragas/async_utils.py
@@ -0,0 +1,39 @@
+"""Async utils."""
+import asyncio
+from typing import Any, Coroutine, List
+
+
+def run_async_tasks(
+    tasks: List[Coroutine],
+    show_progress: bool = False,
+    progress_bar_desc: str = "Running async tasks",
+) -> List[Any]:
+    """Run a list of async tasks."""
+
+    tasks_to_execute: List[Any] = tasks
+    if show_progress:
+        try:
+            import nest_asyncio
+            from tqdm.asyncio import tqdm
+
+            # jupyter notebooks already have an event loop running
+            # we need to reuse it instead of creating a new one
+            nest_asyncio.apply()
+            loop = asyncio.get_event_loop()
+
+            async def _tqdm_gather() -> List[Any]:
+                return await tqdm.gather(*tasks_to_execute, desc=progress_bar_desc)
+
+            tqdm_outputs: List[Any] = loop.run_until_complete(_tqdm_gather())
+            return tqdm_outputs
+        # run the operation w/o tqdm on hitting a fatal
+        # may occur in some environments where tqdm.asyncio
+        # is not supported
+        except Exception:
+            pass
+
+    async def _gather() -> List[Any]:
+        return await asyncio.gather(*tasks_to_execute)
+
+    outputs: List[Any] = asyncio.run(_gather())
+    return outputs
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
@@ -12,6 +12,8 @@
 from math import floor
 
 from datasets import Dataset
+from langchain.chat_models.base import BaseChatModel
+from langchain.llms.base import BaseLLM
 
 
 def make_batches(total_size: int, batch_size: int) -> list[range]:
@@ -31,17 +33,18 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 @dataclass
 class Metric(ABC):
-    @property
-    @abstractmethod
-    def batch_size(self: t.Self) -> int:
-        ...
+    batch_size: int
+    llm: t.Optional[BaseLLM | BaseChatModel] = None
+
+    def __post_init__(self: t.Self):
+        if self.llm is None:
+            from langchain.chat_models import ChatOpenAI
+
+            self.llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k")  # type: ignore
 
     @property
     @abstractmethod
-    def name(self: t.Self) -> str:
-        """
-        the metric name
-        """
+    def name(self) -> str:
         ...
 
     @abstractmethod